如何在shell脚本中比较和获取两个sQL文件中的不同行



我有两个转储SQL文件从同一表,我想得到不同的这些文件(插入,更新,删除在表的每一列)作为SQL文件导入数据库。我使用diff sql1 sq2 > diffsql.sql但这行不通。我怎样才能得到这些文件之间的区别?

我试图得到两个SQL文件的差异,并将它们导出为MySQL结构导入另一个DB

由于试图识别数据库中的差异与基本文件的差异并不完全相同(记录可能不在相同的序列中,但与确定差异无关),我认为我应该对逻辑进行初步尝试,以比较定义格式的两个csv (可以轻松地适应您在现实生活中使用的任何格式)。为了实现这一点,我去掉了所标识的每个输入行的最后一个字符,因为它们可以变化,而它们应该对比较结果没有影响。

如果您注意到,我还定义了两个变量,">COMPARISON_START_FIELD";和">COMPARISON_END_FIELD,它允许您定义与评估近似匹配相关的字段。";条件,显示可能发生更改/修改/篡改的地方。尽管我将其定义为连续范围的固定值,但可以想象,您可以修改这些引用,以使用在确定时应考虑的指定字段位置数组。

还要注意">${DBG}";值可以修改,以消除第一至第三阶段所示的输出。也可以将其更改为可编程控制的命令行参数标志。

显然,您可以修改语句以获得您选择的任何格式的输出。

#!/bin/sh
DBG=1
COMPARISON_START_FIELD=2
COMPARISON_END_FIELD=4
#QUESTION:  https://stackoverflow.com/questions/74803126/how-to-compare-and-get-diffrent-rows-at-two-sql-files-in-shell-script
#REFERENCE:  https://www.sample-videos.com/download-sample-sql.php   (for sample load files for 10/100/1000 etc. records)
FILE1="./test_92__input_1.txt"
cat >"${FILE1}" <<"EnDoFiNpUt"
(1, 'rogers63', 'david', 'john', 'Female', 'e6a33eee180b07e563d74fee8c2c66b8', 1),
(2, 'mike28', 'rogers', 'paul', 'Male', '2e7dc6b8a1598f4f75c3eaa47958ee2f', 1),
(3, 'rivera92', 'david', 'john', 'Male', '1c3a8e03f448d211904161a6f5849b68', 1),
(4, 'ross95', 'maria', 'sanders', 'Male', '62f0a68a4179c5cdd997189760cbcf18', 1),
(5, 'paul85', 'morris', 'miller', 'Female', '61bd060b07bddfecccea56a82b850ecf', 1),
(6, 'smith34', 'daniel', 'michael', 'Female', '7055b3d9f5cb2829c26cd7e0e601cde5', 1),
(8, 'daniel53', 'mark', 'mike', 'Male', '299cbf7171ad1b2967408ed200b4e26c', 1),
(9, 'brooks80', 'morgan', 'maria', 'Female', 'aa736a35dc15934d67c0a999dccff8f6', 1),
(10, 'morgan65', 'paul', 'miller', 'Female', 'a28dca31f5aa5792e1cefd1dfd098569', 1);
EnDoFiNpUt
FILE2="./test_92__input_2.txt"
cat >"${FILE2}" <<"EnDoFiNpUt"
(1, 'rogers63', 'david', 'john', 'Female', 'e6a33eee180b07e563d74fee8c2c66b8', 1),
(3, 'rivera92', 'daniella', 'john', 'Female', '1c3a8e03f448d211904161a6f5849b68', 1),
(4, 'ross95', 'maria', 'sanders', 'Male', '62f0a68a4179c5cdd997189760cbcf18', 1),
(5, 'paul85', 'morris', 'miller', 'Female', '61bd060b07bddfecccea56a82b850ecf', 1),
(7, 'james84', 'sanders', 'paul', 'Female', 'b7f72d6eb92b45458020748c8d1a3573', 1),
(8, 'daniel53', 'mark', 'mike', 'Male', '299cbf7171ad1b2967408ed200b4e26c', 1),
(9, 'brooks80', 'morgan', 'maria', 'Female', 'aa736a35dc15934d67c0a999dccff8f6', 1),
(10, 'morgan65', 'paul', 'miller', 'Female', 'a28dca31f5aa5792e1cefd1dfd098569', 1),
(11, 'scorpio17', 'regis', 'philbin', 'male', '5aa5792e1a28dca31fcefd1dfd098569', 1);
EnDoFiNpUt
ls -l ${FILE1} ${FILE2}
wc -l ${FILE1} ${FILE2}
###
### Method #2 - Comparison of arrays
###
awk 
-v dbg="${DBG}"                 
-v f1="${FILE1}"                
-v f2="${FILE2}"                
-v compStart="${COMPARISON_START_FIELD}"    
-v compEnd="${COMPARISON_END_FIELD}"        
'BEGIN{
print "nPHASE I - Loading FILE1 into array ..."
###
### Load up the first file into arrays
###
index1f=0 ;
index1d=0 ;
split( "", sql1full ) ;     # set to empty array
if( dbg == 1 ){ print f1 ; } ;
while( getline < f1 ){
index1f++ ;
i=index1f ;
sql1full[i,1]=substr( $0, 1, length($0)-1 ) ;
sql1full[i,2]=0 ;
if( dbg == 1 ){ printf("tt Line %s ...n", i ) ; } ;
} ;
printf("t File 1 captured as array (size= %s) ...n", index1f ) ;
index2f=0 ;
index2d=0 ;
split( "", sql2full ) ;     # set to empty array
split( "", components ) ;   # RESET to empty array
split( "", sql2details ) ;  # set to empty array
print "nPHASE II - Loading FILE2 into array "
print "    AND    First pair matching"
print "    AND    Identifying unique items in FILE2 ..."
indexI=0
split( "", matched ) ;      # set to empty array
}{
index2f++ ;
j=index2f ;
if( dbg == 1 ){ printf("t ================ j=%s ================n", j ) ; } ;
sql2full[j,1]=substr( $0, 1, length($0)-1 ) ;
sql2full[j,2]=0 ;
nom=1 ;
for( i=1 ; i <= index1f ; i++ ){
### Legend:
### 0 = not classified
### 1 = only copy
### 2 = duplicate (matched)
### 3 = partial match
if( sql1full[i,2] == 2 ){
if( dbg == 1 ){
printf("tt SKIPPED| sql1full[%s,1] already matched on previous pass ...n", i ) ;
} ;
}else{
if( sql2full[j,1] == sql1full[i,1] ){
nom=0
sql1full[i,2]=2 ;
sql2full[j,2]=2 ;
indexI++ ;
matched[indexI,1]=i ;
matched[indexI,2]=j ;
if( dbg == 1 ){
printf("t MATCHED| sql2full[%s,1] same as sql1full[%s,1] ...n", j, i ) ;
} ;
} ;
} ;
} ;
if( nom == 1 ){
if( dbg == 1 ){
printf("t UNIQUE| sql2full[%s,1] has no match ...n", j ) ;
} ;
sql2full[j,2]=1 ;
} ;
}END{
print "nPHASE III - Identifying unique items in FILE1 ..."
for( i=1 ; i <= index1f ; i++ ){
if( dbg == 1 ){ printf("t ================ i=%s ================n", i ) ; } ;
### Legend:
### 0 = not classified
### 1 = only copy
### 2 = duplicate (matched)
### 3 = partial match
if( sql1full[i,2] == 2 ){
if( dbg == 1 ){
printf("tt SKIPPED| sql1full[%s,1] already matched on previous pass ...n", i ) ;
} ;
}else{
nom=1 ;
for( j=1 ; j <= index2f ; j++ ){
if( sql2full[j,2] != 2 ){
if( sql2full[j,1] == sql1full[i,1] ){
nom=0 ;
sql1full[i,2]=2 ;
sql2full[j,2]=2 ;
indexI++ ;
matched[indexI,1]=i ;
matched[indexI,2]=j ;
if( dbg == 1 ){
printf("t MATCHED| sql1full[%s,1] same as sql2full[%s,1] ...n", i, j ) ;
} ;
} ;
} ;
} ;
if( nom == 1 ){
sql1full[i,2]=1 ;
if( dbg == 1 ){
printf("t UNIQUE| sql1full[%s,1] has no match ...n", i ) ;
} ;
} ;
} ;
} ;
print ""
print "nPHASE IV - Identifying Lines which are partial matches ..."
indexK=0 ;
split("", near) ;
for( i=1 ; i <= index1f ; i++ ){
if( sql1full[i,2] == 1 ){
if( dbg == 1 ){
printf("t Examining sql1full[%s,1] ...n", i ) ;
} ;
n=index( sql1full[i,1], "(" ) ;
rem=substr( sql1full[i,1], n+1 ) ;
n=index( rem, ")" ) ;
rem=substr( rem, 1, n-1 ) ;
num1=split( rem, sql1details ) ;
if( dbg == 1 ){
printf("tt num1 = %s ...n", num1 ) ;
} ;
for( j=1 ; j <= index2f ; j++ ){
if( sql2full[j,2] == 1 ){
if( dbg == 1 ){
printf("tt Examining sql2full[%s,1] ...n", j ) ;
} ;
n=index( sql2full[j,1], "(" ) ;
rem=substr( sql2full[j,1], n+1 ) ;
n=index( rem, ")" ) ;
rem=substr( rem, 1, n-1 ) ;
num2=split( rem, sql2details ) ;
if( dbg == 1 ){
printf("tt num2 = %s ...n", num2 ) ;
} ;
if( num1 == num2 ){
pm=0
for( f1=compStart ; f1 <= compEnd ; f1++ ){
if( sql1details[f1] == sql2details[f1] ){
pm=pm+1 ;
if( dbg == 1 ){
printf("ttt %s|  %s == %s ...n", f1, sql1details[f1], sql2details[f1] ) ;
} ;
}else{
if( dbg == 1 ){
printf("ttt %s|  %s != %s ...n", f1, sql1details[f1], sql2details[f1] ) ;
} ;
} ;
} ;
if( pm != 0 ){
indexK++ ;
sql1full[i,2]=3 ;
sql2full[j,2]=3 ;
near[indexK,1]=i ;
near[indexK,2]=j ;
if( dbg == 1 ){
printf("ttt PARTIAL| sql1full[%s,1] and sql2full[%s,1] share %s fields ...n", i, j, pm ) ;
} ;
} ;
}else{
if( dbg == 1 ){
printf("ttt NO_MATCH| sql1full[%s,1] and sql2full[%s,1] have different field counts ...n", i, j ) ;
} ;

} ;
} ;
} ;
} ;
} ;

print "nPHASE IV - Report SHARED for each file ..."
for( i=1 ; i <= indexI ; i++ ){
printf("SHARED 1|%03d|%sn", matched[i,1], sql1full[matched[i,1],1] ) ;
printf("SHARED 2|%03d|%sn", matched[i,2], sql2full[matched[i,2],1] ) ;
print ""
} ;
#for( i=1 ; i <= index1f ; i++ ){
#   if( sql1full[i,2] == 2 ){
#       printf("SHARED 1|%03d|%sn", i, sql1full[i,1] ) ;
#   } ;
#} ;
#print ""
#for( j=1 ; j <= index2f ; j++ ){
#   if( sql2full[j,2] == 2 ){
#       printf("SHARED 2|%03d|%sn", j, sql2full[j,1] ) ;
#   } ;
#} ;
print ""
print "nPHASE  V - Report items UNIQUE to each file ..."
for( i=1 ; i <= index1f ; i++ ){
if( sql1full[i,2] == 1 ){
printf("UNIQUE 1|%03d|%sn", i, sql1full[i,1] ) ;
} ;
} ;
print ""
for( j=1 ; j <= index2f ; j++ ){
if( sql2full[j,2] == 1 ){
printf("UNIQUE 2|%03d|%sn", j, sql2full[j,1] ) ;
} ;
} ;
print ""
print "nPHASE  VI - Report items which have shared data fields ..."
for( k=1 ; k <= indexK ; k++ ){
printf("PARTIAL 1|%03d|%sn", near[k,1], sql1full[near[k,1],1] ) ;
printf("PARTIAL 2|%03d|%sn", near[k,2], sql2full[near[k,2],1] ) ;
print ""
} ;
}' "${FILE2}"

exit

会话输出如下:

-rw-rw-r-- 1 ericthered ericthered 746 Dec 29 22:48 ./test_92__input_1.txt
-rw-rw-r-- 1 ericthered ericthered 755 Dec 29 22:48 ./test_92__input_2.txt
9 ./test_92__input_1.txt
9 ./test_92__input_2.txt
18 total
PHASE I - Loading FILE1 into array ...
./test_92__input_1.txt
Line 1 ...
Line 2 ...
Line 3 ...
Line 4 ...
Line 5 ...
Line 6 ...
Line 7 ...
Line 8 ...
Line 9 ...
File 1 captured as array (size= 9) ...
PHASE II - Loading FILE2 into array 
AND    First pair matching
AND    Identifying unique items in FILE2 ...
================ j=1 ================
MATCHED| sql2full[1,1] same as sql1full[1,1] ...
================ j=2 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
UNIQUE| sql2full[2,1] has no match ...
================ j=3 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
MATCHED| sql2full[3,1] same as sql1full[4,1] ...
================ j=4 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
SKIPPED| sql1full[4,1] already matched on previous pass ...
MATCHED| sql2full[4,1] same as sql1full[5,1] ...
================ j=5 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
SKIPPED| sql1full[4,1] already matched on previous pass ...
SKIPPED| sql1full[5,1] already matched on previous pass ...
UNIQUE| sql2full[5,1] has no match ...
================ j=6 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
SKIPPED| sql1full[4,1] already matched on previous pass ...
SKIPPED| sql1full[5,1] already matched on previous pass ...
MATCHED| sql2full[6,1] same as sql1full[7,1] ...
================ j=7 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
SKIPPED| sql1full[4,1] already matched on previous pass ...
SKIPPED| sql1full[5,1] already matched on previous pass ...
SKIPPED| sql1full[7,1] already matched on previous pass ...
MATCHED| sql2full[7,1] same as sql1full[8,1] ...
================ j=8 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
SKIPPED| sql1full[4,1] already matched on previous pass ...
SKIPPED| sql1full[5,1] already matched on previous pass ...
SKIPPED| sql1full[7,1] already matched on previous pass ...
SKIPPED| sql1full[8,1] already matched on previous pass ...
MATCHED| sql2full[8,1] same as sql1full[9,1] ...
================ j=9 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
SKIPPED| sql1full[4,1] already matched on previous pass ...
SKIPPED| sql1full[5,1] already matched on previous pass ...
SKIPPED| sql1full[7,1] already matched on previous pass ...
SKIPPED| sql1full[8,1] already matched on previous pass ...
SKIPPED| sql1full[9,1] already matched on previous pass ...
UNIQUE| sql2full[9,1] has no match ...
PHASE III - Identifying unique items in FILE1 ...
================ i=1 ================
SKIPPED| sql1full[1,1] already matched on previous pass ...
================ i=2 ================
UNIQUE| sql1full[2,1] has no match ...
================ i=3 ================
UNIQUE| sql1full[3,1] has no match ...
================ i=4 ================
SKIPPED| sql1full[4,1] already matched on previous pass ...
================ i=5 ================
SKIPPED| sql1full[5,1] already matched on previous pass ...
================ i=6 ================
UNIQUE| sql1full[6,1] has no match ...
================ i=7 ================
SKIPPED| sql1full[7,1] already matched on previous pass ...
================ i=8 ================
SKIPPED| sql1full[8,1] already matched on previous pass ...
================ i=9 ================
SKIPPED| sql1full[9,1] already matched on previous pass ...

PHASE IV - Identifying Lines which are partial matches ...
Examining sql1full[2,1] ...
num1 = 7 ...
Examining sql2full[2,1] ...
num2 = 7 ...
2|  'mike28', != 'rivera92', ...
3|  'rogers', != 'daniella', ...
4|  'paul', != 'john', ...
Examining sql2full[5,1] ...
num2 = 7 ...
2|  'mike28', != 'james84', ...
3|  'rogers', != 'sanders', ...
4|  'paul', == 'paul', ...
PARTIAL| sql1full[2,1] and sql2full[5,1] share 1 fields ...
Examining sql2full[9,1] ...
num2 = 7 ...
2|  'mike28', != 'scorpio17', ...
3|  'rogers', != 'regis', ...
4|  'paul', != 'philbin', ...
Examining sql1full[3,1] ...
num1 = 7 ...
Examining sql2full[2,1] ...
num2 = 7 ...
2|  'rivera92', == 'rivera92', ...
3|  'david', != 'daniella', ...
4|  'john', == 'john', ...
PARTIAL| sql1full[3,1] and sql2full[2,1] share 2 fields ...
Examining sql2full[9,1] ...
num2 = 7 ...
2|  'rivera92', != 'scorpio17', ...
3|  'david', != 'regis', ...
4|  'john', != 'philbin', ...
Examining sql1full[6,1] ...
num1 = 7 ...
Examining sql2full[9,1] ...
num2 = 7 ...
2|  'smith34', != 'scorpio17', ...
3|  'daniel', != 'regis', ...
4|  'michael', != 'philbin', ...
PHASE IV - Report of matched SHARED items for each file ...
SHARED 1|001|(1, 'rogers63', 'david', 'john', 'Female', 'e6a33eee180b07e563d74fee8c2c66b8', 1)
SHARED 2|001|(1, 'rogers63', 'david', 'john', 'Female', 'e6a33eee180b07e563d74fee8c2c66b8', 1)
SHARED 1|004|(4, 'ross95', 'maria', 'sanders', 'Male', '62f0a68a4179c5cdd997189760cbcf18', 1)
SHARED 2|003|(4, 'ross95', 'maria', 'sanders', 'Male', '62f0a68a4179c5cdd997189760cbcf18', 1)
SHARED 1|005|(5, 'paul85', 'morris', 'miller', 'Female', '61bd060b07bddfecccea56a82b850ecf', 1)
SHARED 2|004|(5, 'paul85', 'morris', 'miller', 'Female', '61bd060b07bddfecccea56a82b850ecf', 1)
SHARED 1|007|(8, 'daniel53', 'mark', 'mike', 'Male', '299cbf7171ad1b2967408ed200b4e26c', 1)
SHARED 2|006|(8, 'daniel53', 'mark', 'mike', 'Male', '299cbf7171ad1b2967408ed200b4e26c', 1)
SHARED 1|008|(9, 'brooks80', 'morgan', 'maria', 'Female', 'aa736a35dc15934d67c0a999dccff8f6', 1)
SHARED 2|007|(9, 'brooks80', 'morgan', 'maria', 'Female', 'aa736a35dc15934d67c0a999dccff8f6', 1)
SHARED 1|009|(10, 'morgan65', 'paul', 'miller', 'Female', 'a28dca31f5aa5792e1cefd1dfd098569', 1)
SHARED 2|008|(10, 'morgan65', 'paul', 'miller', 'Female', 'a28dca31f5aa5792e1cefd1dfd098569', 1)

PHASE  V - Report items UNIQUE to each file ...
UNIQUE 1|006|(6, 'smith34', 'daniel', 'michael', 'Female', '7055b3d9f5cb2829c26cd7e0e601cde5', 1)
UNIQUE 2|009|(11, 'scorpio17', 'regis', 'philbin', 'male', '5aa5792e1a28dca31fcefd1dfd098569', 1)

PHASE  VI - Report items which have selected SHARED FIELDS ...
PARTIAL 1|002|(2, 'mike28', 'rogers', 'paul', 'Male', '2e7dc6b8a1598f4f75c3eaa47958ee2f', 1)
PARTIAL 2|005|(7, 'james84', 'sanders', 'paul', 'Female', 'b7f72d6eb92b45458020748c8d1a3573', 1)
PARTIAL 1|003|(3, 'rivera92', 'david', 'john', 'Male', '1c3a8e03f448d211904161a6f5849b68', 1)
PARTIAL 2|002|(3, 'rivera92', 'daniella', 'john', 'Female', '1c3a8e03f448d211904161a6f5849b68', 1)

希望有帮助。: -)

VERSION 2

修改/扩展脚本逻辑。它去掉第一个索引字段,并丢弃最后一个"常量"。字段。还在awk命令行上添加选项,仅在存在差异的地方指定文件详细信息的输出模式,并在输出行上使用短格式前缀。

#!/bin/sh
DBG=1
COMPARISON_START_FIELD=2
COMPARISON_END_FIELD=4
DO_DIFF=0   ### Scenario of reporting each file in full but with short form prefix
DO_DIFF=1   ### Scenario of only reporting each file with differences but with short form prefix
#QUESTION:  https://stackoverflow.com/questions/74803126/how-to-compare-and-get-diffrent-rows-at-two-sql-files-in-shell-script
FILE1="./test_92__input_1.txt"
cat >"${FILE1}" <<"EnDoFiNpUt"
(1, 'rogers63', 'david', 'john', 'Female', 'e6a33eee180b07e563d74fee8c2c66b8', 1),
(2, 'mike28', 'rogers', 'paul', 'Male', '2e7dc6b8a1598f4f75c3eaa47958ee2f', 1),
(3, 'rivera92', 'david', 'john', 'Male', '1c3a8e03f448d211904161a6f5849b68', 1),
(4, 'ross95', 'maria', 'sanders', 'Male', '62f0a68a4179c5cdd997189760cbcf18', 1),
(5, 'paul85', 'morris', 'miller', 'Female', '61bd060b07bddfecccea56a82b850ecf', 1),
(6, 'smith34', 'daniel', 'michael', 'Female', '7055b3d9f5cb2829c26cd7e0e601cde5', 1),
(8, 'daniel53', 'mark', 'mike', 'Male', '299cbf7171ad1b2967408ed200b4e26c', 1),
(9, 'brooks80', 'morgan', 'maria', 'Female', 'aa736a35dc15934d67c0a999dccff8f6', 1),
(10, 'morgan65', 'paul', 'miller', 'Female', 'a28dca31f5aa5792e1cefd1dfd098569', 1);
EnDoFiNpUt
FILE2="./test_92__input_2.txt"
cat >"${FILE2}" <<"EnDoFiNpUt"
(1, 'rogers63', 'david', 'john', 'Female', 'e6a33eee180b07e563d74fee8c2c66b8', 1),
(3, 'morgan65', 'paul', 'miller', 'Female', 'a28dca31f5aa5792e1cefd1dfd098569', 1),
(4, 'brooks80', 'morgan', 'maria', 'Female', 'aa736a35dc15934d67c0a999dccff8f6', 1),
(5, 'paul85', 'morris', 'miller', 'Female', '61bd060b07bddfecccea56a82b850ecf', 1),
(7, 'james84', 'sanders', 'paul', 'Female', 'b7f72d6eb92b45458020748c8d1a3573', 1),
(8, 'daniel53', 'mark', 'mike', 'Male', '299cbf7171ad1b2967408ed200b4e26c', 1),
(9, 'rivera92', 'daniella', 'john', 'Female', '1c3a8e03f448d211904161a6f5849b68', 1),
(10, 'ross95', 'maria', 'sanders', 'Male', '62f0a68a4179c5cdd997189760cbcf18', 1),
(11, 'scorpio17', 'regis', 'philbin', 'male', '5aa5792e1a28dca31fcefd1dfd098569', 1);
EnDoFiNpUt
ls -l ${FILE1} ${FILE2}
wc -l ${FILE1} ${FILE2}
###
### Method #2 - Comparison of arrays
###
awk 
-v discardStrt=1                
-v discardTail=", 1)"               
-v dbg="${DBG}"                 
-v f1="${FILE1}"                
-v f2="${FILE2}"                
-v compStart="${COMPARISON_START_FIELD}"    
-v compEnd="${COMPARISON_END_FIELD}"        
-v doDiff="${DO_DIFF}"              
'BEGIN{
compStart=compStart-discardStrt ;
compEnd=compEnd-discardStrt ;
print "nPHASE I - Loading FILE1 into array ..."
###
### Load up the first file into arrays
###
index1f=0 ;
index1d=0 ;
split( "", sql1full ) ;     # set to empty array
if( dbg == 1 ){ print f1 ; } ;
while( getline < f1 ){
index1f++ ;
i=index1f ;
mem=1+discardStrt ;
pos=index($0, $mem ) ; 
ptl=index($0, discardTail )-pos ; 
#sql1full[i,1]=substr( $0, pos, length($0)-discardTail ) ;
sql1full[i,1]=substr( $0, pos, ptl ) ;
sql1full[i,2]=0 ;
if( dbg == 1 ){ printf("tt Line %s ...n", i ) ; } ;
print sql1full[i,1] ;
} ;
printf("t File 1 captured as array (size= %s) ...n", index1f ) ;
index2f=0 ;
index2d=0 ;
split( "", sql2full ) ;     # set to empty array
split( "", components ) ;   # RESET to empty array
split( "", sql2details ) ;  # set to empty array
print "nPHASE II - Loading FILE2 into array "
print "    AND    First pair matching"
print "    AND    Identifying unique items in FILE2 ..."
indexI=0
split( "", matched ) ;      # set to empty array
}{
index2f++ ;
j=index2f ;
if( dbg == 1 ){ printf("t ================ j=%s ================n", j ) ; } ;
mem=1+discardStrt ;
pos=index($0, $mem ) ; 
ptl=index($0, discardTail )-pos ; 
#sql2full[i,1]=substr( $0, pos, length($0)-discardTail ) ;
sql2full[j,1]=substr( $0, pos, ptl ) ;
sql2full[j,2]=0 ;
print sql2full[j,1] ;
nom=1 ;
for( i=1 ; i <= index1f ; i++ ){
### Legend:
### 0 = not classified
### 1 = only copy
### 2 = duplicate (matched)
### 3 = partial match
if( sql1full[i,2] == 2 ){
if( dbg == 1 ){
printf("tt SKIPPED| sql1full[%s,1] already matched on previous pass ...n", i ) ;
} ;
}else{
if( sql2full[j,1] == sql1full[i,1] ){
nom=0
sql1full[i,2]=2 ;
sql2full[j,2]=2 ;
indexI++ ;
matched[indexI,1]=i ;
matched[indexI,2]=j ;
if( dbg == 1 ){
printf("t MATCHED| sql2full[%s,1] same as sql1full[%s,1] ...n", j, i ) ;
} ;
} ;
} ;
} ;
if( nom == 1 ){
if( dbg == 1 ){
printf("t UNIQUE| sql2full[%s,1] has no match ...n", j ) ;
} ;
sql2full[j,2]=1 ;
} ;
}END{
print "nPHASE III - Identifying unique items in FILE1 ..."
for( i=1 ; i <= index1f ; i++ ){
if( dbg == 1 ){ printf("t ================ i=%s ================n", i ) ; } ;
### Legend:
### 0 = not classified
### 1 = only copy
### 2 = duplicate (matched)
### 3 = partial match
if( sql1full[i,2] == 2 ){
if( dbg == 1 ){
printf("tt SKIPPED| sql1full[%s,1] already matched on previous pass ...n", i ) ;
} ;
}else{
nom=1 ;
for( j=1 ; j <= index2f ; j++ ){
if( sql2full[j,2] != 2 ){
if( sql2full[j,1] == sql1full[i,1] ){
nom=0 ;
sql1full[i,2]=2 ;
sql2full[j,2]=2 ;
indexI++ ;
matched[indexI,1]=i ;
matched[indexI,2]=j ;
if( dbg == 1 ){
printf("t MATCHED| sql1full[%s,1] same as sql2full[%s,1] ...n", i, j ) ;
} ;
} ;
} ;
} ;
if( nom == 1 ){
sql1full[i,2]=1 ;
if( dbg == 1 ){
printf("t UNIQUE| sql1full[%s,1] has no match ...n", i ) ;
} ;
} ;
} ;
} ;
print ""
print "nPHASE IV - Identifying Lines which are partial matches ..."
indexK=0 ;
split("", near) ;
for( i=1 ; i <= index1f ; i++ ){
if( sql1full[i,2] == 1 ){
if( dbg == 1 ){
printf("t Examining sql1full[%s,1] ...n", i ) ;
} ;
num1=split( sql1full[i,1], sql1details ) ;
if( dbg == 1 ){
printf("tt num1 = %s ...n", num1 ) ;
} ;
for( j=1 ; j <= index2f ; j++ ){
if( sql2full[j,2] == 1 ){
if( dbg == 1 ){
printf("tt Examining sql2full[%s,1] ...n", j ) ;
} ;
num2=split( sql2full[j,1], sql2details ) ;
if( dbg == 1 ){
printf("tt num2 = %s ...n", num2 ) ;
} ;
if( num1 == num2 ){
pm=0
for( f1=compStart ; f1 <= compEnd ; f1++ ){
if( sql1details[f1] == sql2details[f1] ){
pm=pm+1 ;
if( dbg == 1 ){
printf("ttt %s|  %s == %s ...n", f1, sql1details[f1], sql2details[f1] ) ;
} ;
}else{
if( dbg == 1 ){
printf("ttt %s|  %s != %s ...n", f1, sql1details[f1], sql2details[f1] ) ;
} ;
} ;
} ;
if( pm != 0 ){
indexK++ ;
sql1full[i,2]=3 ;
sql2full[j,2]=3 ;
near[indexK,1]=i ;
near[indexK,2]=j ;
if( dbg == 1 ){
printf("ttt PARTIAL| sql1full[%s,1] and sql2full[%s,1] share %s fields ...n", i, j, pm ) ;
} ;
} ;
}else{
if( dbg == 1 ){
printf("ttt NO_MATCH| sql1full[%s,1] and sql2full[%s,1] have different field counts ...n", i, j ) ;
} ;

} ;
} ;
} ;
} ;
} ;

if( doDiff == 0 ){
print "nPHASE IV - Report of matched SHARED items for each file ..."
for( i=1 ; i <= indexI ; i++ ){
printf("SHARED 1|%03d|%sn", matched[i,1], sql1full[matched[i,1],1] ) ;
printf("SHARED 2|%03d|%sn", matched[i,2], sql2full[matched[i,2],1] ) ;
print ""
} ;
print ""
print "nPHASE  V - Report items UNIQUE to each file ..."
for( i=1 ; i <= index1f ; i++ ){
if( sql1full[i,2] == 1 ){
printf("UNIQUE 1|%03d|%sn", i, sql1full[i,1] ) ;
} ;
} ;
print ""
for( j=1 ; j <= index2f ; j++ ){
if( sql2full[j,2] == 1 ){
printf("UNIQUE 2|%03d|%sn", j, sql2full[j,1] ) ;
} ;
} ;
print ""
print "nPHASE  VI - Report items which have selected SHARED FIELDS ..."
for( k=1 ; k <= indexK ; k++ ){
printf("PARTIAL 1|%03d|%sn", near[k,1], sql1full[near[k,1],1] ) ;
printf("PARTIAL 2|%03d|%sn", near[k,2], sql2full[near[k,2],1] ) ;
print ""
} ;
} ;
if( doDiff == 1 ){
print "nPHASE VII - Distilled Report for File1 using short-form symbols ..."
for( i=1 ; i <= index1f ; i++ ){
switch (sql1full[i,2]) {
case "1" : {
### 1 = only copy
###M1- printf("%s|%03d|%sn", "+", i, sql1full[i,1] ) ;
printf("%s|%sn", "+", sql1full[i,1] ) ;
break ;
} ;
case "2" : {
### 2 = duplicate (matched)
if( doDiff == 1){
break ;
}else{
###M1- printf("%s|%03d|%sn", "=", i, sql1full[i,1] ) ;
printf("%s|%sn", "=", sql1full[i,1] ) ;
break ;
} ;
} ;
case "3" : {
for( k=1 ; k <= indexK ; k++ ){
#printf("PARTIAL 1|%03d|%sn", near[k,1], sql1full[near[k,1],1] ) ;
if( near[k,1] == i ){ matchid=k ; break } ;
} ;
### 3 = partial match
###M1- printf("%s|%03d|%sn", "~", i, sql1full[i,1] ) ;
printf("%s|%03d|%sn", "~", matchid, sql1full[i,1] ) ;
break ;
} ;
default : {
### 0 = not classified - There should be no instances of these
###M1- printf("%s|%03d|%sn", "ERROR", i, sql1full[i,1] ) ;
printf("%s|%03d|%sn", "ERROR", i, sql1full[i,1] ) ;
break ;
} ;
} ;
} ;
print ""
print "nPHASE VIII - Distilled Report for File2 using short-form symbols ..."
for( j=1 ; j <= index2f ; j++ ){
switch (sql2full[j,2]) {
case "1" : {
### 1 = only copy
###M1- printf("%s|%03d|%sn", "+", j, sql2full[j,1] ) ;
printf("%s|%sn", "+", sql2full[j,1] ) ;
break ;
} ;
case "2" : {
### 2 = duplicate (matched)
if( doDiff == 1){
break ;
}else{
###M1- printf("%s|%03d|%sn", "=", j, sql2full[j,1] ) ;
printf("%s|%sn", "=", sql2full[j,1] ) ;
break ;
} ;
} ;
case "3" : {
for( k=1 ; k <= indexK ; k++ ){
#printf("PARTIAL 2|%03d|%sn", near[k,2], sql2full[near[k,2],1] ) ;
if( near[k,2] == j ){ matchid=k ; break } ;
} ;
### 3 = partial match
###M1- printf("%s|%03d|%sn", "~", j, sql2full[j,1] ) ;
printf("%s|%03d|%sn", "~", matchid, sql2full[j,1] ) ;
break ;
} ;
default : {
### 0 = not classified - There should be no instances of these
###M1- printf("%s|%03d|%sn", "ERROR", j, sql2full[j,1] ) ;
printf("%s|%03d|%sn", "ERROR", j, sql2full[j,1] ) ;
break ;
} ;
} ;
} ;
} ;
}' "${FILE2}"

exit

会话输出类似于以前的版本(用于启动和调试),但每个版本的最终决定不同,如下所示:

PHASE VII - Distilled Report for File1 using short-form symbols ...
~|001|'mike28', 'rogers', 'paul', 'Male', '2e7dc6b8a1598f4f75c3eaa47958ee2f'
~|002|'rivera92', 'david', 'john', 'Male', '1c3a8e03f448d211904161a6f5849b68'
+|'smith34', 'daniel', 'michael', 'Female', '7055b3d9f5cb2829c26cd7e0e601cde5'

PHASE VIII - Distilled Report for File2 using short-form symbols ...
~|001|'james84', 'sanders', 'paul', 'Female', 'b7f72d6eb92b45458020748c8d1a3573'
~|002|'rivera92', 'daniella', 'john', 'Female', '1c3a8e03f448d211904161a6f5849b68'
+|'scorpio17', 'regis', 'philbin', 'male', '5aa5792e1a28dca31fcefd1dfd098569'

,~";标识部分匹配,该数字用于标识来自其他文件的接近匹配的对应行。">+";指示在其他文件中未找到的行。

如果您不想要接近匹配,您可以简单地从脚本中删除与此相关的任何逻辑,这将使您的逻辑仅报告每个文件唯一的行。

最新更新