想知道如何根据$1&2个组合,从3美元、4美元、5美元、6美元和7美元的中出现1、2、3和4的次数
样本输入
Name,Date,XXX,YYY,ZZZ,AAA,BBB
ABC,19-10-2020,2,NA,4,3,NA
ABC,19-10-2020,NA,3,NA,NA,4
ABC,18-10-2020,1,NA,4,4,NA
ABC,18-10-2020,NA,3,NA,NA,4
CDE,19-10-2020,1,NA,4,3,NA
CDE,19-10-2020,NA,2,NA,NA,4
CDE,18-10-2020,3,3,4,3,3
CDE,18-10-2020,NA,3,NA,NA,4
FGH,18-10-2020,4,4,4,4,4
期望输出
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
我已经尝试过如下命令,但没有成功输出。请帮忙。
awk -F"," '{OFS=","; print $1,$2}' | awk -F"," 'BEGIN {count=0} {key=$0; a[key]++} END {for (i in a) print i,a[i]}'
您永远不需要多次调用awk。您只需将发生次数和输出相加,例如
awk -F, '
BEGIN {
OFS=","
print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
ones=twos=threes=fours=0
}
last && last != $1" "$2 {
print $1,$2,ones,twos,threes,fours
ones=twos=threes=fours=0
last = $1" "$2
}
FNR > 1 {
for (i=3; i<=NF; i++) {
$i=="1" && ones++
$i=="2" && twos++
$i=="3" && threes++
$i=="4" && fours++
}
last=$1" "$2
}
END {
print $1,$2,ones,twos,threes,fours
}
' file.csv
示例使用/输出
$ awk -F, '
> BEGIN {
> OFS=","
> print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
> ones=twos=threes=fours=0
> }
> last && last != $1" "$2 {
> print $1,$2,ones,twos,threes,fours
> ones=twos=threes=fours=0
> last = $1" "$2
> }
> FNR > 1 {
> for (i=3; i<=NF; i++) {
> $i=="1" && ones++
> $i=="2" && twos++
> $i=="3" && threes++
> $i=="4" && fours++
> }
> last=$1" "$2
> }
> END {
> print $1,$2,ones,twos,threes,fours
> }
> ' file.csv
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,18-10-2020,0,1,2,2
CDE,19-10-2020,1,0,1,3
CDE,18-10-2020,1,1,1,2
FGH,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
这个awk也应该工作:
awk 'BEGIN {
FS=OFS=","
}
NR > 1 {
k=$1 OFS $2
arr[k]
for (i=3; i<=NF; ++i)
++freq[k OFS $i]
}
END {
print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
for (i in arr)
print i, freq[i OFS 1]+0, freq[i OFS 2]+0,freq[i OFS 3]+0,freq[i OFS 4]+0
}' file.csv
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
您能尝试以下内容吗?这些内容是用GNUawk
中显示的示例编写和测试的。
awk '
BEGIN{
FS=OFS=","
print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
}
FNR>1{
till=""
delete arr
for(i=3;i<=NF;i++){
ind[$1 OFS $2]
if($i!="NA"){ arr[$i]++; max_till=(max_till>$i?max_till:$i) }
}
till=(NF-3)
for(j=1;j<=till;j++){
value[$1 OFS $2 OFS j]+=arr[j]
}
}
END{
for(k in ind){
printf("%s,",k)
for(i=1;i<=max_till;i++){
printf("%d%s",(value[k OFS i]?value[k OFS i]:0),i==max_till?ORS:OFS)
}
}
}' Input_file
输出如下。
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
$ cat tst.awk
BEGIN {
FS = OFS = ","
maxVal = 4
}
NR > 1 {
key = $1 OFS $2
keys[key]
for (i=3; i<=NF; i++) {
cnt[key,$i]++
}
}
END {
printf "Name%sDate%s", OFS, OFS
for (i=1; i<=maxVal; i++) {
printf "CountOF %d%s", i, (i<maxVal ? OFS : ORS)
}
for (key in keys) {
printf "%s%s", key, OFS
for (i=1; i<=maxVal; i++) {
printf "%d%s", cnt[key,i], (i<maxVal ? OFS : ORS)
}
}
}
$ awk -f tst.awk file
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
END中的CCD_ 2可以打乱输出行的顺序。如果这是一个问题,可以通过各种调整来解决。计算maxVal
而不是将其硬编码为4也是微不足道的。
另一个使用数组和分割函数的awk
$ awk -F, ' BEGIN {OFS="," } NR>1 { k=$1 OFS $2;$1=$2=""; a[k]=a[k] OFS $0 } END { for(i in a) { printf("%s",i); for(j=1;j<=4;j++) { n=split(a[i],t,j); printf(",%s",n-1) } print "" } } ' count_1234.txt
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
$
为便于阅读,拆分为多行。
awk -F, ' BEGIN {OFS="," }
NR>1 { k=$1 OFS $2;$1=$2=""; a[k]=a[k] OFS $0 }
END {
for(i in a)
{
printf("%s",i);
for(j=1;j<=4;j++)
{
n=split(a[i],t,j);
printf(",%s",n-1)
}
print ""
}
} '