我正在尝试将行与CSV文件中匹配的第一个单元格合并。以便根据匹配的字符串将以下单元格放置在其合法的列中。
我有一个包含以下内容的文件:
item,pieces,color,last order
"apples","4 pieces"
"apples","red color"
"apples","last ordered 2 hours ago"
"mangos","1 piece"
"mangos","last ordered 1 day ago"
"carrots","10 pieces"
"carrots","orange color"
然后应该合并为以下内容:
item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""
我使用的代码:
awk '{ printf "%s", $0; if (NR % 3 == 0) print ""; else printf "," }' file.csv
这种一次合并三行的方法只需要一点手动编辑,只要所有项目都有三条数据";片"颜色"";最后一个订单";。
但是,这是不起作用的,因为不同的项目有不同的数据集。
您可以尝试此awk
:
awk 'BEGIN {FS=OFS=","} NR == 1 {print; next} item != $1 {if (item != "") print item, pieces, color, order; item = $1; pieces = $2; color = order = """"; next} {if ($2 ~ /color/) color = $2; else order = $2} END {print item, pieces, color, order}' file
item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""
可读性更强的版本:
awk 'BEGIN {
FS = OFS = ","
}
NR == 1 {
print
next
}
item != $1 {
if (item != "")
print item, pieces, color, order
item = $1
pieces = $2
color = order = """"
next
}
{
if ($2 ~ /color/)
color = $2
else
order = $2
}
END {
print item, pieces, color, order
}' file
$ cat tst.awk
BEGIN { FS=OFS="," }
{ gsub(/"/,"") }
NR==1 {
print
sub(/s,/,",")
numTags = split($0,tags)
next
}
$1 != prev {
if ( prev != "" ) {
prt()
}
prev=$1
}
{
tag = tags[1]
tag2val[tag] = $1
for (tagNr=2; tagNr<=numTags; tagNr++) {
tag = tags[tagNr]
if ( index($2,tag) ) {
tag2val[tag] = $2
next
}
}
}
END { prt() }
function prt( tagNr,tag,val) {
for (tagNr=1; tagNr<=numTags; tagNr++) {
tag = tags[tagNr]
val = tag2val[tag]
printf ""%s"%s", val, (tagNr<numTags ? OFS : ORS)
}
delete tag2val
}
$ awk -f tst.awk file
item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""
检查一下:
awk -F, '{ if (f == $1) { for (c=0; c <length($1) + length(FS); c++) printf " "; print $2 FS $3 } else { print $0 } } { f = $1 }' yourfile.csv
只要我们的所有项都在输入文件中以行的形式分组在一起,以下实现就可以工作。实现缓存单个输出记录的字段和print
的字段,当它看到不同的项时(或者当脚本END
的(。
awk -F, -v OFS=, '
NR==1 { f1=$1; f2=$2; f3=$3; f4=$4 }
FNR==1 { next }
f1 != $1 {
print f1, f2, f3, f4
f1=$1
f2=f3=f4=""""
}
$2 ~ /piece/ { f2 = $2; next }
$2 ~ /color/ { f3 = $2; next }
$2 ~ /last order/ { f4 = $2; next }
END { print f1, f2, f3, f4 }
' file.csv
如果项目没有分组,我们必须缓存整个文件。。。在下文中,f
基本上将文件存储/表示为一个表。k
数组存储给定项的输出记录的行号,因此当我们print
时,我们的输出记录将按输入文件的顺序出现。
awk -F, -v OFS=, '
BEGIN { n = 0; split("", k); split("", f) }
NR==1 { f[++n,1]=$1; f[n,2]=$2; f[n,3]=$3; f4[n,4]=$4 }
FNR==1 { next }
!($1 in k) {
k[$1] = ++n
f[n,1]=$1
f[n,2]=f[n,3]=f[n,4]=""""
}
$2 ~ /piece/ { f[k[$1],2] = $2; next }
$2 ~ /color/ { f[k[$1],3] = $2; next }
$2 ~ /last order/ { f[k[$1],4] = $2; next }
END {
for (i=1; i<=n; ++i)
print f[i,1], f[i,2], f[i,3], f[i,4]
}
' file.csv
注:
两种实现中的NR/FNR之舞都是试图处理头行和多个输入文件(我们可能关心也可能不关心处理多个文件输入(。
后一种实现允许覆盖项的字段。。。如果我们不想要这种行为,我们就必须改变逻辑——对逻辑的改变将是关于在输入字段
$2
上进行正则表达式比较的行。