比较两个电子表格,并使用Google App脚本输出差异



好吧,我正在尝试做标题中描述的事情。这两个电子表格都只有一张我要比较的表格。一个电子表格是另一个电子表格的更新,因此我试图仅获取新内容。(如果是FC(DOS命令(,例如功能,这很容易...(

进行了一些搜索后,我的follo脚本在大多数情况下都应该使用,该脚本使用每个表的数组。

function test() {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  
  var newarray = getNewData(array,old_array);
  Logger.log('there are ' + newarray.length + 'different rows');
}
function getNewData(array1,array2){    
  var diff =array2;   
  for (var i = 0; i<array1.length; i++){
    var duplicate = false;
    for (var j = 0;j<diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        var duplicate= true;
        break;
      }    
    }
    if (duplicate==false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
  }
  return diff;
}

问题是文件太大,几乎30000行,因此脚本超过5分钟的执行限制。

有没有一种方法来改进这一点,例如消除内部循环?还是有一种方法可以在零件中进行?像第一个5000行一样,等等。

问:

编辑:对电子表格进行了稍作分析后,我发现每个行都有一个ID,因此现在我只能将搜索集中在每个电子表格的一列中。所以这是我的新实施:

function test(){
var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId =Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length); 
  //The COlumn has an indicator, so i search for that. I don't control the formatting of the files, so i search in both spreadsheet for the indicator
  var searchString = 'NAME';
  for (var i = 0; i < old_array.length; i++) {    
    for (var j = 0; j < old_array[i].length; j++) {    
      if (old_array[i][j] == searchString) {
        var Row_old = i+1;
        var Column_old = j;
        break;
      }      
    }
    if (Row_old != undefined){
      break;
    }
  }
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }
  Logger.log(Row_old+':::'+Column_old+'n'+Row+':::'+Column);  
  var diff_index =[];
  var row_ind = 0;  
  for (var i=Row;i<array.length;i++){        
    Logger.log(i);
    var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
    if (existe==-1){      
      Logger.log(row_ind+'!!!');
      diff_index[row_ind]=i;
      row_ind++;          
    }
  }
  Logger.log(diff_index);
}

这仍然没有时间...我现在将尝试整合您的评论。

您的脚本有一些主要的瓶颈,可以大大降低其速度:

  • 每次在0中启动两个循环,使其运行时爆炸
  • 每次您找到重复时需要拼接,需要移动阵列
  • 字符串在每个迭代上串联一个数组

我们可以通过:

来规避这些问题
  • 对第二范围进行排序
  • 我敢肯定,通过迭代二进制搜索每一列有聪明的事情,但是我们必须每次求助,所以我们将二进制搜索第一列,然后进行线性搜索。

我们将使用arraylib进行排序(我希望这是一种快速分类算法(。

让我们从函数开始,以找到第一行匹配一个值(当前行的第一列(:

function firstRowMatchingCol1(target, lookupRange) {
  var min = 0;
  var max = lookupRange.length - 1;
  var guess;
  var guessVal;
  while(min <= max) {
    guess = (min + max) / 2 | 0;
    guessVal = lookupRange[guess][0];
    if (guessVal < target) {
      min = guess + 1;
    } else if (guessVal > target) {
      max = guess - 1; 
    } else {
      while (guess > 0 && lookupRange[guess - 1][0] === target) {
        guess -= 1; 
      }
      return guess;
    }
  }
  return -1;
}

现在,我们可以线性地通过每一行进行线性进行检查,并检查列是否匹配到第一列不再匹配。

function matchExists(row, lookupRange) {
  var index = firstRowMatchingCol1(row[0], lookupRange); 
  if (index === -1) {return false;}
  while (index < lookupRange.length && lookupRange[index][0] === row[0]) {
    for (var col = 1; col < row.length; col++) {
      if (row[col] !== lookupRange[index][col]) {break;}
      if (col === row.length - 1) {return true;} // This only works if the ranges are at least two columns wide but if they are one column wide you can just check if index > -1
    }
    index += 1;
  }
  return false;  
}

最后,我们可以得到这样的重复项:

function getNonDuplicates(r1, r2) {
  r2 = ArrayLib.sort(r2, 0, true);  
  return r1.filter(function(row) {return !matchExists(row, r2);});
}

像mtorres的代码一样,这是未经测试的

我提出的解决方案是围绕时间限制的"黑客"。但是,如果您想要一个更清洁的解决方案,则可以在可能的情况下重组并通过以某种方式订购数组来更有效。

您未在array1和array2中指定数据,如果行有某种ID字段,则可以通过此ID订购,并在Array1上订购ARRAY1和ROW i上的行i,而不是将Array1中的每个行与每个行中的每个行进行比较在Array2中(30000行效率极低(。

如果您的数据没有订购行的ID字段,那么您可以根据我提出的解决方案来解决这些问题:为Array1上的每个比较行添加一个曲目。当运行达到时间限制时,您将再次运行该功能,但从上次比较行开始(您会知道哪个是因为您将跟踪比较的行(,而当第二次运行时间您重复时,等等。

每次进行比较时,都会询问它是第一次运行(或使用布尔值 - 我更喜欢问用户,这样,您将不会忘记更改布尔值(,如果是第一次运行,则删除跟踪列,如果不是第一次运行,您将从下一步跟踪的行开始,因此基本上继续脚本结束。我一直在使用这种技术,并取得良好的结果。

在代码中(未经测试,因此请在使用真实数据运行之前检查一下(:

/**
 * Only checks if it's the first run and calls the real work function
 */
function test() {
  var firstRun = "yes" === Browser.msgBox("Question", "Is this the first run?", Browser.Buttons.YES_NO);
  doTest(firstRun);
}
/**
 * Gets the data of the 2 spreadsheets and also the starting
 * row
 */
function doTest(firstRun) {
  var Folder = DriveApp.getFoldersByName('theFolder').next();
  var FolderId = Folder.getId();
  //call old_spreadsheet
  var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var old_file = files.next();   
  var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
  var old_sheet = old_spreadsheet.getSheets()[0];
  var old_sheetname = old_sheet.getName();
  var old_array = old_sheet.getDataRange().getValues();
  /**
   * Here is the code to create the tracking hability
   */
  var strartFromRow = 0; // 0 because row 1 is array 0 index when you getValues();
  var trackSheet = old_spreadsheet.getSheetByName("Tracking");
  if (trackSheet === null) {
    trackSheet = old_spreadsheet.insertSheet("Tracking");
  }
  if (firstRun) {
    trackSheet.getRange("A:A").clearContent();   // make sure there no row is tracked yet
  }
  else {
    // we have to continue from the previous row, keep in mind you're making the comparison 
    // with array which is 0 based, but sheet is 1 based, but you want the next one so getLasRow()
    // should be the first item to compare on your array
    strartFromRow = trackSheet.getLastRow(); 
  }
  Logger.log(old_file.getName() + ' : ' + old_sheetname + ' : ' + old_array.length);
  //call spreadsheet
  var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
  var files = DriveApp.searchFiles(searchFor); 
  var file = files.next();   
  var spreadsheet = SpreadsheetApp.openById(file.getId());
  var sheet = spreadsheet.getSheets()[0];
  var sheetname = sheet.getName();
  var array = sheet.getDataRange().getValues();
  Logger.log(file.getName() + ' : ' + sheetname + ' : ' + array.length);  
  // when you call the DIFF function, pass the tracking sheet and the start Row
  var newarray = getNewData(array,old_array, trackSheet, startFromRow);
  Logger.log('there are ' + newarray.length + 'different rows');
}
/**
 * Creates a diff array using array1 and array2
 * It marks each element on array1 once it has checked if it's in array2
 */
function getNewData(array1, array2, trackingSheet, startFromRow){
  var logRow = trackingSheet.getLastRow();
  var diff = array2;   
  for (var i = startFromRow; i < array1.length; i++){
    var duplicate = false;
    for (var j = 0; j < diff.length;j++){
      if (array1[i].join() == diff[j].join()){
        Logger.log('duplicated line found on rows ' + i + ':' + j);
        diff.splice(j,1);
        duplicate = true;
        break;
      }    
    }
    if (duplicate === false) {
      Logger.log('not duplicated line found on row ' + i);
      diff.push(array1[i]);            
    }
    trackingSheet.getRange(logRow++, 1).setValue("Checked!");  // Mark i row as checked
  }
  return diff;
}

这是一个替代解决方案,可以限制时间限制。创建一个新的专用电子表格以及一个自定义的侧边栏。侧边栏将要求您创建一些HTML,这些HTML最终将嵌入并渲染到客户端上的iframe中。您可以通过脚本标签将纯JavaScript嵌入HTML中。

这种方法的优点在于,这些脚本不会独立于Google Apps脚本的服务器端环境而在客户端上运行服务器端,并且不受6分钟的限制。此外,他们还可以在您的Google脚本中调用功能。因此,一种方法是让客户端脚本调用Google脚本函数以检索必要的数据,在客户端脚本中进行所有重处理,然后将结果发送回服务器端脚本以更新表。

这是设置自定义侧边栏以使您入门的链接:https://developers.google.com/apps-script/guides/dialogs#custom_sidebars

最后,我决定选择缓存服务选项,这是代码,我正在测试它,以查看是否保留。

function getNewData() {
  //deleting triggers
  var triggers = ScriptApp.getProjectTriggers();    
  for (var i = 0; i < triggers.length; i++) {
    if (triggers[i].getHandlerFunction()=='getNewData'){
      ScriptApp.deleteTrigger(triggers[i]);
    }
  }  
  //max running time = 5.5 min
  var MAX_RUNNING_TIME = 330000;
  var startTime= (new Date()).getTime();
  //get cache
  var cache = CacheService.getUserCache();
  var downloaded =JSON.parse(cache.get('downloaded'));  
  var compared =JSON.parse(cache.get('compared'));
  //start
  if (downloaded==1 && compared!=1){
    //folder
    var Folder = DriveApp.getFoldersByName('theFolder').next();
    var FolderId = licitacionesFolder.getId();
    //call old_spreadsheet
    var searchFor ="fullText contains 'sheet_old' and '" + FolderId + "' in parents";  
    var files = DriveApp.searchFiles(searchFor); 
    var old_file = files.next();  
    var old_spreadsheet = SpreadsheetApp.openById(old_file.getId());
    var old_sheet = old_spreadsheet.getSheets()[0];
    var old_array = old_sheet.getDataRange().getValues();
    //call spreadsheet
    var searchFor ="fullText contains 'sheet' and '" + FolderId + "' in parents";  
    var files = DriveApp.searchFiles(searchFor); 
    var file = files.next();  
    var spreadsheet = SpreadsheetApp.openById(old_file.getId());
    var sheet = spreadsheet.getSheets()[0];
    var array = sheet.getDataRange().getValues();
    Logger.log(array.length+'::'+old_array.length); 
    // Column
    var searchString = 'NAME';
    var RC = getColumn(array,searchString);    
    var Row = RC.Row;
    var Column = RC.Column;
    var RC = getColumn(old_array,searchString);    
    var Row_old = RC.Row;
    var Column_old = RC.Column;    
    Logger.log(Row_old+':::'+Column_old+'n'+Row+':::'+Column);      
    //compare 
    var diff_index =JSON.parse(cache.get('diff_index'));
    var row_ind =JSON.parse(cache.get('row_ind'));  
    var Roww =JSON.parse(cache.get('Row'));  
    if (diff_index==null){var diff_index = [];}
    if (row_ind==null){var row_ind = 0;}
    if (Roww==null){var Roww = Row;}    
    Logger.log(row_ind+'n'+Roww);    
    for (var i=Roww;i<array.length;i++){  
      var currTime = (new Date()).getTime();
      if(currTime - startTime >= MAX_RUNNING_TIME){
        Logger.log((currTime - startTime)/(1000*60));
        Logger.log(i+'::'+row_ind);
        cache.putAll({'diff_index': JSON.stringify(diff_index),'row_ind': JSON.stringify(row_ind),'Row': JSON.stringify(i-1)},21600);   
        ScriptApp.newTrigger('getNewData').timeBased().after(2 * 60 * 1000).create();
        return;
      } else {
        Logger.log(i);
        var existe = ArrayLib.indexOf(old_array, Column_old, array[i][Column]);
        if (existe==-1){      
          Logger.log(row_ind+'!!!');
          diff_index[row_ind]=i;
          row_ind++;          
        }
      }
    }    
    cache.putAll({'diff_index': JSON.stringify(diff_index),'Row': JSON.stringify(Row),'compared': JSON.stringify(1)},21600);
  } else {
    Logger.log('file not downloaded yet or already compared');
  }
}
function getColumn(array,searchString){
  for (var i = 0; i < array.length; i++) {    
    for (var j = 0; j < array[i].length; j++) {    
      if (array[i][j] == searchString) {
        var Row = i+1;
        var Column = j;
        break;
      }      
    }
    if (Row != undefined){
      break;
    }
  }
  return {Row: Row, Column: Column};
}

最新更新