基于CSV文件中的单元格内容合并行



我正在尝试将行与CSV文件中匹配的第一个单元格合并。以便根据匹配的字符串将以下单元格放置在其合法的列中。

我有一个包含以下内容的文件:

item,pieces,color,last order
"apples","4 pieces"
"apples","red color"
"apples","last ordered 2 hours ago"
"mangos","1 piece"
"mangos","last ordered 1 day ago"
"carrots","10 pieces"
"carrots","orange color"

然后应该合并为以下内容:

item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""

我使用的代码:

awk '{ printf "%s", $0; if (NR % 3 == 0) print ""; else printf "," }' file.csv

这种一次合并三行的方法只需要一点手动编辑,只要所有项目都有三条数据";片"颜色""最后一个订单";。

但是,这是不起作用的,因为不同的项目有不同的数据集。

您可以尝试此awk:

awk 'BEGIN {FS=OFS=","} NR == 1 {print; next} item != $1 {if (item != "") print item, pieces, color, order; item = $1; pieces = $2; color = order = """"; next} {if ($2 ~ /color/) color = $2; else order = $2} END {print item, pieces, color, order}' file
item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""

可读性更强的版本:

awk 'BEGIN {
FS = OFS = ","
}
NR == 1 {
print
next
}
item != $1 {
if (item != "")
print item, pieces, color, order
item = $1
pieces = $2
color = order = """"
next
}
{
if ($2 ~ /color/)
color = $2
else
order = $2
}
END {
print item, pieces, color, order
}' file
$ cat tst.awk
BEGIN { FS=OFS="," }
{ gsub(/"/,"") }
NR==1 {
print
sub(/s,/,",")
numTags = split($0,tags)
next
}
$1 != prev {
if ( prev != "" ) {
prt()
}
prev=$1
}
{
tag = tags[1]
tag2val[tag] = $1
for (tagNr=2; tagNr<=numTags; tagNr++) {
tag = tags[tagNr]
if ( index($2,tag) ) {
tag2val[tag] = $2
next
}
}
}
END { prt() }
function prt(   tagNr,tag,val) {
for (tagNr=1; tagNr<=numTags; tagNr++) {
tag = tags[tagNr]
val = tag2val[tag]
printf ""%s"%s", val, (tagNr<numTags ? OFS : ORS)
}
delete tag2val
}

$ awk -f tst.awk file
item,pieces,color,last order
"apples","4 pieces","red color","last ordered 2 hours ago"
"mangos","1 piece","","last ordered 1 day ago"
"carrots","10 pieces","orange color",""

检查一下:

awk -F, '{ if (f == $1) { for (c=0; c <length($1) + length(FS); c++) printf " "; print $2 FS $3 } else { print $0 } } { f = $1 }' yourfile.csv

只要我们的所有项都在输入文件中以行的形式分组在一起,以下实现就可以工作。实现缓存单个输出记录的字段和print的字段,当它看到不同的项时(或者当脚本END的(。

awk -F, -v OFS=, '
NR==1 { f1=$1; f2=$2; f3=$3; f4=$4 }
FNR==1 { next }
f1 != $1 {
print f1, f2, f3, f4
f1=$1
f2=f3=f4=""""
}
$2 ~ /piece/      { f2 = $2; next }
$2 ~ /color/      { f3 = $2; next }
$2 ~ /last order/ { f4 = $2; next }
END { print f1, f2, f3, f4 }
' file.csv

如果项目没有分组,我们必须缓存整个文件。。。在下文中,f基本上将文件存储/表示为一个表。k数组存储给定项的输出记录的行号,因此当我们print时,我们的输出记录将按输入文件的顺序出现。

awk -F, -v OFS=, '
BEGIN { n = 0; split("", k); split("", f) }
NR==1 { f[++n,1]=$1; f[n,2]=$2; f[n,3]=$3; f4[n,4]=$4 }
FNR==1 { next }
!($1 in k) {
k[$1] = ++n
f[n,1]=$1
f[n,2]=f[n,3]=f[n,4]=""""
}
$2 ~ /piece/      { f[k[$1],2] = $2; next }
$2 ~ /color/      { f[k[$1],3] = $2; next }
$2 ~ /last order/ { f[k[$1],4] = $2; next }
END {
for (i=1; i<=n; ++i)
print f[i,1], f[i,2], f[i,3], f[i,4]
}
' file.csv

注:

  1. 两种实现中的NR/FNR之舞都是试图处理头行和多个输入文件(我们可能关心也可能不关心处理多个文件输入(。

  2. 后一种实现允许覆盖项的字段。。。如果我们不想要这种行为,我们就必须改变逻辑——对逻辑的改变将是关于在输入字段$2上进行正则表达式比较的行。

相关内容

  • 没有找到相关文章

最新更新