awk 计算 1 的 2 的 3 的多列

  • 本文关键字:awk 计算 awk
  • 更新时间 :
  • 英文 :


想知道如何根据$1&2个组合,从3美元、4美元、5美元、6美元和7美元的中出现1、2、3和4的次数

样本输入

Name,Date,XXX,YYY,ZZZ,AAA,BBB
ABC,19-10-2020,2,NA,4,3,NA
ABC,19-10-2020,NA,3,NA,NA,4
ABC,18-10-2020,1,NA,4,4,NA
ABC,18-10-2020,NA,3,NA,NA,4
CDE,19-10-2020,1,NA,4,3,NA
CDE,19-10-2020,NA,2,NA,NA,4
CDE,18-10-2020,3,3,4,3,3
CDE,18-10-2020,NA,3,NA,NA,4
FGH,18-10-2020,4,4,4,4,4

期望输出

Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5

我已经尝试过如下命令,但没有成功输出。请帮忙。

awk -F"," '{OFS=","; print $1,$2}'  | awk -F"," 'BEGIN {count=0} {key=$0; a[key]++} END {for (i in a) print i,a[i]}'

您永远不需要多次调用awk。您只需将发生次数和输出相加,例如

awk -F, '
BEGIN { 
OFS=","
print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
ones=twos=threes=fours=0
}
last && last != $1" "$2 {
print $1,$2,ones,twos,threes,fours
ones=twos=threes=fours=0
last = $1" "$2
}
FNR > 1 {
for (i=3; i<=NF; i++) {
$i=="1" && ones++
$i=="2" && twos++
$i=="3" && threes++
$i=="4" && fours++
}
last=$1" "$2
}
END {
print $1,$2,ones,twos,threes,fours
}
' file.csv

示例使用/输出

$ awk -F, '
>     BEGIN {
>         OFS=","
>         print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
>         ones=twos=threes=fours=0
>     }
>     last && last != $1" "$2 {
>         print $1,$2,ones,twos,threes,fours
>         ones=twos=threes=fours=0
>         last = $1" "$2
>     }
>     FNR > 1 {
>         for (i=3; i<=NF; i++) {
>             $i=="1" && ones++
>             $i=="2" && twos++
>             $i=="3" && threes++
>             $i=="4" && fours++
>         }
>         last=$1" "$2
>     }
>     END {
>         print $1,$2,ones,twos,threes,fours
>     }
> ' file.csv
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,18-10-2020,0,1,2,2
CDE,19-10-2020,1,0,1,3
CDE,18-10-2020,1,1,1,2
FGH,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5

这个awk也应该工作:

awk 'BEGIN {
FS=OFS=","
}
NR > 1 {
k=$1 OFS $2
arr[k]
for (i=3; i<=NF; ++i)
++freq[k OFS $i]
}
END {
print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
for (i in arr)
print i, freq[i OFS 1]+0, freq[i OFS 2]+0,freq[i OFS 3]+0,freq[i OFS 4]+0
}' file.csv
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5

您能尝试以下内容吗?这些内容是用GNUawk中显示的示例编写和测试的。

awk '
BEGIN{
FS=OFS=","
print "Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4"
}
FNR>1{
till=""
delete arr
for(i=3;i<=NF;i++){
ind[$1 OFS $2]
if($i!="NA"){ arr[$i]++; max_till=(max_till>$i?max_till:$i) }
}
till=(NF-3)
for(j=1;j<=till;j++){
value[$1 OFS $2 OFS j]+=arr[j]
}
}
END{
for(k in ind){
printf("%s,",k)
for(i=1;i<=max_till;i++){
printf("%d%s",(value[k OFS i]?value[k OFS i]:0),i==max_till?ORS:OFS)
}
}
}' Input_file

输出如下。

Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
$ cat tst.awk
BEGIN {
FS = OFS = ","
maxVal = 4
}
NR > 1 {
key = $1 OFS $2
keys[key]
for (i=3; i<=NF; i++) {
cnt[key,$i]++
}
}
END {
printf "Name%sDate%s", OFS, OFS
for (i=1; i<=maxVal; i++) {
printf "CountOF %d%s", i, (i<maxVal ? OFS : ORS)
}
for (key in keys) {
printf "%s%s", key, OFS
for (i=1; i<=maxVal; i++) {
printf "%d%s", cnt[key,i], (i<maxVal ? OFS : ORS)
}
}
}

$ awk -f tst.awk file
Name,Date,CountOF 1,CountOF 2,CountOF 3,CountOF 4
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5

END中的CCD_ 2可以打乱输出行的顺序。如果这是一个问题,可以通过各种调整来解决。计算maxVal而不是将其硬编码为4也是微不足道的。

另一个使用数组和分割函数的awk

$ awk -F, ' BEGIN {OFS="," } NR>1 { k=$1 OFS $2;$1=$2=""; a[k]=a[k] OFS $0  } END { for(i in a) { printf("%s",i); for(j=1;j<=4;j++) { n=split(a[i],t,j); printf(",%s",n-1) } print "" } }  ' count_1234.txt
ABC,19-10-2020,0,1,2,2
ABC,18-10-2020,1,0,1,3
CDE,19-10-2020,1,1,1,2
CDE,18-10-2020,0,0,5,2
FGH,18-10-2020,0,0,0,5
$

为便于阅读,拆分为多行。

awk -F, ' BEGIN {OFS="," } 
NR>1 { k=$1 OFS $2;$1=$2=""; a[k]=a[k] OFS $0  } 
END { 
for(i in a) 
{    
printf("%s",i); 
for(j=1;j<=4;j++) 
{ 
n=split(a[i],t,j); 
printf(",%s",n-1) 
} 
print "" 
} 
}  '

最新更新