如何在解析大型文本文件时提高性能 - StreamReader + Regex - How to improve performance when parsing large text file

我正在开发一个Windows表单应用程序，该应用程序采用其他软件生成的机器人程序并对其进行修改。修改过程如下：

StreamReader.ReadLine() 用于逐行解析文件
正则表达式用于搜索文件中的特定关键字。如果获得匹配，则匹配的字符串将复制到另一个字符串中，并替换为新的机器人代码行。
修改后的代码保存在字符串中，最后写入新文件。
使用正则表达式获得的所有匹配字符串集合也保存在字符串中，最后写入新文件。

我已经能够成功地做到这一点

private void Form1_Load(object sender, EventArgs e)
{
string NextLine = null;
string CurrLine = null;
string MoveL_Pos_Data = null;
string MoveL_Ref_Data = null;
string MoveLFull = null;
string ModCode = null;
string TAB = "t";
string NewLine = "rn";
string SavePath = null;
string ExtCode_1 = null;
string ExtCode_2 = null;
string ExtCallMod = null;
int MatchCount = 0;
int NumRoutines = 0;
try
{
// Ask user location of the source file
// Displays an OpenFileDialog so the user can select a Cursor.  
OpenFileDialog openFileDialog1 = new OpenFileDialog
{
Filter = "MOD Files|*.mod",
Title = "Select an ABB RAPID MOD File"
};
// Show the Dialog.  
// If the user clicked OK in the dialog and  
// a .MOD file was selected, open it.  
if (openFileDialog1.ShowDialog() == System.Windows.Forms.DialogResult.OK)
{
// Assign the cursor in the Stream to the Form's Cursor property.  
//this.Cursor = new Cursor(openFileDialog1.OpenFile());
using (StreamReader sr = new StreamReader(openFileDialog1.FileName))
{
// define a regular expression to search for extr calls 
Regex Extr_Ex = new Regex(@"bExtr(-?d*.d*);", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Multiline);
Regex MoveL_Ex = new Regex(@"bMoveLs+(.*)(zd.*)", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Multiline);
Match MoveLString = null;
while (sr.Peek() >= 0)
{
CurrLine = sr.ReadLine();
//Console.WriteLine(sr.ReadLine());
// check if the line is a match 
if (Extr_Ex.IsMatch(CurrLine))
{
// Keep a count for total matches
MatchCount++;
// Save extr calls in a string
ExtCode_1 += NewLine + TAB + TAB + Extr_Ex.Match(CurrLine).ToString();

// Read next line (always a MoveL) to get Pos data for TriggL
NextLine = sr.ReadLine();
//Console.WriteLine(NextLine);
if (MoveL_Ex.IsMatch(NextLine))
{
// Next Line contains MoveL
// get matched string 
MoveLString = MoveL_Ex.Match(NextLine);
GroupCollection group = MoveLString.Groups;
MoveL_Pos_Data = group[1].Value.ToString();
MoveL_Ref_Data = group[2].Value.ToString();
MoveLFull = MoveL_Pos_Data + MoveL_Ref_Data;                                
}
// replace Extr with follwing commands
ModCode += NewLine + TAB + TAB + "TriggL " + MoveL_Pos_Data + "extr," + MoveL_Ref_Data;
ModCode += NewLine + TAB + TAB + "WaitDI DI1_1,1;";
ModCode += NewLine + TAB + TAB + "MoveL " + MoveLFull;
ModCode += NewLine + TAB + TAB + "Reset DO1_1;";
//break;
}
else
{
// No extr Match
ModCode += "rn" + CurrLine;
}                     
}
Console.WriteLine($"Total Matches: {MatchCount}");
}

}
// Write modified code into a new output file
string SaveDirectoryPath = Path.GetDirectoryName(openFileDialog1.FileName);
string ModName = Path.GetFileNameWithoutExtension(openFileDialog1.FileName);
SavePath = SaveDirectoryPath + @"" + ModName + "_rev.mod";
File.WriteAllText(SavePath, ModCode);
//Write Extr matches into new output file 
//Prepare module
ExtCallMod = "MODULE ExtruderCalls";
// All extr calls in one routine
//Prepare routines
ExtCallMod += NewLine + NewLine + TAB + "PROC Prg_ExtCall"; // + 1;
ExtCallMod += ExtCode_1;
ExtCallMod += NewLine + NewLine + TAB + "ENDPROC";
ExtCallMod += NewLine + NewLine;
//}
ExtCallMod += "ENDMODULE";
// Write to file
string ExtCallSavePath = SaveDirectoryPath + @"ExtrCalls.mod";                
File.WriteAllText(ExtCallSavePath, ExtCallMod);                
}
catch (Exception ex)
{
Console.WriteLine(ex.ToString());                
}
}                    
}

虽然这有助于我实现我想要的，但这个过程非常缓慢。由于我是 C# 编程的新手，我怀疑缓慢来自将原始文件内容复制到字符串并且没有就地替换内容(我不确定原始文件中的内容是否可以直接替换)。对于 20,000 行的输入文件，整个过程需要 5 分钟多一点。

我曾经收到以下错误：消息=托管调试助手"上下文切换死锁"："CLR 无法从 COM 上下文0xb27138转换到 COM 上下文0xb27080 60 秒。拥有目标上下文/单元的线程很可能正在执行非抽取等待或处理长时间运行的操作而不抽取 Windows 消息。这种情况通常会对性能产生负面影响，甚至可能导致应用程序无响应或内存使用量随着时间的推移不断累积。为了避免这个问题，所有单线程单元(STA)线程都应该使用泵送等待原语(如CoWaitForMultipleHandles)，并在长时间运行的操作期间定期泵送消息。

我能够通过在调试器设置中禁用"ContextSwitchDeadlock"设置来克服它。这可能不是最佳做法。

谁能帮助我提高代码的性能？

编辑：我发现机器人控制器对MOD文件(输出文件)中的行数有限制。允许的最大行数为 32768。我想出了一个逻辑来拆分字符串生成器的内容以分隔输出文件，如下所示：

// Split modCodeBuilder into seperate strings based on final size
const int maxSize = 32500;
string result = modCodeBuilder.ToString();
string[] splitResult = result.Split(new string[] { "rn" }, StringSplitOptions.None);
string[] splitModCode = new string[maxSize]; 
// Setup destination directory to be same as source directory
string destDir = Path.GetDirectoryName(fileNames[0]);
for (int count = 0; ; count++)
{
// Get the next batch of text by skipping the amount
// we've taken so far and then taking the maxSize.
string modName = $"PrgMOD_{count + 1}";
string procName = $"Prg_{count + 1}()";
// Use Array Copy to extract first 32500 lines from modCode[]
int src_start_index = count * maxSize;
int srcUpperLimit = splitResult.GetUpperBound(0);
int dataLength = maxSize;
if (src_start_index > srcUpperLimit) break; // Exit loop when there's no text left to take
if (src_start_index > 1)
{
// Make sure calculate right length so that src index is not exceeded
dataLength = srcUpperLimit - maxSize;
}                
Array.Copy(splitResult, src_start_index, splitModCode, 0, dataLength);
string finalModCode = String.Join("rn", splitModCode);
string batch = String.Concat("MODULE ", modName, "rnrntPROC ", procName, "rn", finalModCode, "rnrntENDPROCrnrnENDMODULE");
//if (batch.Length == 0) break; 
// Generate file name based on count
string fileName = $"ABB_R3DP_{count + 1}.mod";
// Write our file text
File.WriteAllText(Path.Combine(destDir, fileName), batch);
// Write status to output textbox
TxtOutput.AppendText("rn");
TxtOutput.AppendText("rn");
TxtOutput.AppendText($"Modified MOD File: {fileName} is generated sucessfully! It is saved to location: {Path.Combine(destDir, fileName)}");
}

字符串连接可能需要很长时间。改用StringBuilder可能会提高您的性能：

private static void GenerateNewFile(string sourceFullPath)
{
string posData = null;
string refData = null;
string fullData = null;
var modCodeBuilder = new StringBuilder();
var extCodeBuilder = new StringBuilder();
var extrRegex = new Regex(@"bExtr(-?d*.d*);", RegexOptions.Compiled | 
RegexOptions.IgnoreCase | RegexOptions.Multiline);
var moveLRegex = new Regex(@"bMoveLs+(.*)(zd.*)", RegexOptions.Compiled | 
RegexOptions.IgnoreCase | RegexOptions.Multiline);
int matchCount = 0;
bool appendModCodeNext = false;
foreach (var line in File.ReadLines(sourceFullPath))
{
if (appendModCodeNext)
{
if (moveLRegex.IsMatch(line))
{
GroupCollection group = moveLRegex.Match(line).Groups;
if (group.Count > 2)
{
posData = group[1].Value;
refData = group[2].Value;
fullData = posData + refData;
}
}
modCodeBuilder.Append("ttTriggL ").Append(posData).Append("extr,")
.Append(refData).Append("rnttWaitDI DI1_1,1;rnttMoveL ")
.Append(fullData).AppendLine("rnttReset DO1_1;");
appendModCodeNext = false;
}
else if (extrRegex.IsMatch(line))
{
matchCount++;
extCodeBuilder.Append("tt").AppendLine(extrRegex.Match(line).ToString());
appendModCodeNext = true;
}
else
{
modCodeBuilder.AppendLine(line);
}
}
Console.WriteLine($"Total Matches: {matchCount}");
string destDir = Path.GetDirectoryName(sourceFullPath);
var savePath = Path.Combine(destDir, Path.GetFileNameWithoutExtension(sourceFullPath), 
"_rev.mod");
File.WriteAllText(savePath, modCodeBuilder.ToString());
var extCallMod = string.Concat("MODULE ExtruderCallsrnrntPROC Prg_ExtCall",
extCodeBuilder.ToString(), "rnrntENDPROCrnrnENDMODULE");
File.WriteAllText(Path.Combine(destDir, "ExtrCalls.mod"), extCallMod);
}

您在评论中提到要批量获取文本并将它们写入单独的文件。一种方法是将字符串视为char[]，然后使用System.Linq扩展方法，Skip和Take。Skip将跳过字符串中一定数量的字符，然后Take将获取一定数量的字符并在IEnumerabe<char>中返回它们。然后，我们可以使用string.Concat将其转换为字符串并将其写入文件。

如果我们有一个代表我们最大大小的常量，以及一个从0开始的计数器，我们可以使用一个for循环来递增计数器并跳过counter * max个字符，然后从字符串中获取max个字符。我们还可以使用counter变量来创建文件名，因为它会在每次迭代时递增：

const int maxSize = 32500;
string result = modCodeBuilder.ToString();
for (int count = 0;; count++)
{
// Get the next batch of text by skipping the amount
// we've taken so far and then taking the maxSize.
string batch = string.Concat(result.Skip(count * maxSize).Take(maxSize));
if (batch.Length == 0) break; // Exit loop when there's no text left to take
// Generate file name based on count
string fileName = $"filename_{count + 1}.mod";
// Write our file text
File.WriteAllText(Path.Combine(destDir, fileName), batch);
}

另一种可能更快的方法是使用string.Substring，并使用count * maxSize作为要获取的子字符串的开始索引。然后我们只需要确保我们的length不超过字符串的边界，并将子字符串写入文件：

for (int count = 0;; count++)
{
// Get the bounds for the substring (startIndex and length)
var startIndex = count * maxSize;
var length = Math.Min(result.Length - startIndex, maxSize);
if (length < 1) break; // Exit loop when there's no text left to take
// Get the substring and file name
var batch = result.Substring(startIndex, length);
string fileName = $"filename_{count + 1}.mod";
// Write our file text  
File.WriteAllText(Path.Combine(destDir, fileName), batch);
}

请注意，这会将文本拆分为正好32500个字符的块(最后一个块除外)。如果你只想采取整条生产线，那需要更多的工作，但仍然不难做到。

如何在解析大型文本文件时提高性能 - StreamReader + Regex

相关内容

最新更新

热门标签：