使用 Awk 根据第二个文件中列出的标题从文件中提取列

  • 本文关键字:提取 标题 文件 Awk bash awk
  • 更新时间 :
  • 英文 :


我正在尝试运行一段根据之前某个帖子中的代码修改而来的代码。我有一个文件 data.txt,其中第一行是标题。我想创建一个新文件,只保留标题与第二个文件(list.txt)中的条目相匹配的那些列。

data.txt

1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0

list.txt

3
5
7
9

所需的输出是

3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

我使用了下面的代码

echo "${DATAFILE:-data.txt}"
echo "${COLUMNFILE:-list.txt}"
awk {
     j=1
     while ((getline < COLUMNFILE) > 0) {
        col[j++] = $1
     }
     n=j-1;
     close(COLUMNFILE)
     for (i=1; i<=n; i++) s[col[i]]=i
   }
   NR==1 {
     for (f=1; f<=NF; f++)
       if ($f in s) c[s[$f]]=f
     next
   }
   { sep=","
     for (f=1; f<=n; f++) {
       printf("%c%s",sep,$c[f])
       sep=FS
     }
     print ""
 } 
 DATAFILE

我得到了下面的结果:它只是原样复制了 data.txt 中的各行,没有进行任何列选择,并且 list.txt 的条目被打印在输出的末尾。

1,2,3,4,5,6,7,8,9,10
1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
0,0,0,0,0,0,0,0,1.000000,0
3
3
5
5
7
7
9
9

任何帮助都非常感谢。

$ awk '
    BEGIN { FS = OFS = "," }

    # First file (list.txt): store the requested column numbers in input order.
    NR == FNR { want[++cnt] = $0; next }

    # Second file (data.txt): print the requested fields, comma-joined,
    # ending the row with ORS after the last one.
    {
        for (k = 1; k <= cnt; k++)
            printf "%s%s", $(want[k]), (k < cnt ? OFS : ORS)
    }
' list.txt data.txt
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

您可以将位置文件和数据文件同时传递给 awk,并在 awk 内部完成这些逻辑:

 # The last selected field is followed by a newline ("%s\n"), the others by a comma.
 awk -F"," 'FILENAME=="list.txt"{a[NR]=$1}FILENAME=="data.txt"{for(i=1; i<=length(a); i++){printf (i==length(a)?"%s\n":"%s,"),$a[i]}}' list.txt data.txt

在这里我们是:

  1. 用逗号分隔符(-F",")拆分传入的文件
  2. 如果 awk 变量 FILENAME 为 "list.txt"(FILENAME=="list.txt")
  3. 则以行号为索引,将该行的值存入数组 a(a[NR]=$1)
  4. 如果 awk 变量 FILENAME 为 "data.txt"(FILENAME=="data.txt")
  5. 则循环遍历数组中的每个元素(for(i=1; i<=length(a); i++))
  6. 并打印出该位置上的字段值($a[i])。如果该位置是最后一个(i==length(a)),则在其后加上换行符("%s\n")打印,否则在其后加上逗号("%s,")打印。

另一个选项是通过 -v(变量)标志传递您的位置,但这种方式无法很好地适应数量可变的位置:

# Each command substitution is quoted so an empty or whitespace-containing
# line in list.txt cannot be word-split; OFS keeps the output comma-separated.
awk -F"," -v OFS="," -v f1="$(awk 'NR==1' list.txt)" -v f2="$(awk 'NR==2' list.txt)" -v f3="$(awk 'NR==3' list.txt)" -v f4="$(awk 'NR==4' list.txt)" '{print $f1, $f2, $f3, $f4}' data.txt

awk 解决方案:

awk -F, '
    # pr(cols): print, comma-joined, every field of the current record whose
    # index is a key of cols. i and sep are extra parameters so that they stay
    # local to the function. A separator variable is used instead of testing
    # the accumulated string, so empty leading fields keep their commas.
    function pr(cols,    i, sep) {
        sep = ""
        for (i = 1; i <= NF; i++)
            if (i in cols) {
                printf "%s%s", sep, $i
                sep = ","
            }
        print ""
    }
    NR == FNR { a[$0]; next }   # list.txt: record the wanted column numbers
    { pr(a) }                   # data.txt: print only those columns
' list.txt data.txt

输出:

3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

一种非awk解决方案,用于比较和对比...

$ join -t, <(sort list) <(<file tr ',' '\n' | pr -10ts, | sort) |  # transpose the 10-column file, keep rows whose first field is in list
  sort -n |          # back to numeric order of the selected headers
  tr ',' '\n' |      # one value per line again
  pr -4ts,           # transpose back into 4 comma-separated columns

3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

您需要魔术数字 10 和 4,即原始文件的列数和提取出的列数(这些也可以自动获取)。需要多次排序,以便在数值顺序与字典顺序之间来回转换(join 要求输入按字典顺序排序)。

本质上的算法是transpose -join -transpose

这是一个 awk 脚本:它先把 list.txt 处理成字段列表,然后用该列表调用另一个 awk 来处理 data.txt

$ awk '
BEGIN { FS=OFS="," }          # set the delimiters for the list file
NR==FNR {                     # process the list file
    p=p (p==""?"":OFS) "$" $1 # make a field list ($3,$5,$7,$9)
    next
}
{                             # process the data or call the processor
    RS=""                     # for getline to return multilined output
    # \047 is the octal escape for a single quote, \" a literal double quote
    cmd="awk \047BEGIN{FS=OFS=\",\"}{print "p"}\047 " FILENAME   # build awk call
    cmd | getline res         # actual awk call and output to res
    print res                 # output res
    exit                      # exit after first record
}
' list data
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

awk解决方案,考虑到列名称可以是任何东西,而不仅仅是列的索引。

BEGIN { FS = OFS = "," }

# First file (list): remember every wanted header name.
NR == FNR { wanted[$0]++; next }

# Header line of the data file: mark the positions of the wanted columns
# and remember the right-most one, which ends each output row.
FNR == 1 {
    for (pos = 1; pos <= NF; pos++) {
        if ($pos in wanted) {
            last = pos
            keep[pos]++
        }
    }
}

# Every line of the data file (header included): emit the marked columns.
{
    for (pos = 1; pos <= NF; pos++) {
        if (!(pos in keep))
            continue
        printf "%s%s", $pos, (pos == last ? ORS : OFS)
    }
}

输入:

$ cat list.txt
C
E
G
I

$ cat data.txt
A,B,C,D,E,F,G,H,I,J
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0

给出结果:

$ awk 'BEGIN {FS=OFS=","} NR==FNR{l[$0]++;next} FNR==1{ for (i=1; i<=NF; i++) if ($i in l){max=i; c[i]++}}{for (j=1;j<=NF;j++) if(j in c) printf "%s%s",$j,(j==max ?ORS:OFS) }' list.txt data.txt
C,E,G,I
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000

最新更新