我正在尝试运行一段根据之前某个帖子修改而来的代码。我有一个文件 data.txt,其中第一行是标题行。我想创建一个新文件,只包含与第二个文件(list.txt)中的条目相匹配的那些列。
data.txt
1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
list.txt
3
5
7
9
所需的输出是
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
我使用了下面的代码
echo "${DATAFILE:-data.txt}"
echo "${COLUMNFILE:-list.txt}"
awk {
j=1
while ((getline < COLUMNFILE) > 0) {
col[j++] = $1
}
n=j-1;
close(COLUMNFILE)
for (i=1; i<=n; i++) s[col[i]]=i
}
NR==1 {
for (f=1; f<=NF; f++)
if ($f in s) c[s[$f]]=f
next
}
{ sep=","
for (f=1; f<=n; f++) {
printf("%c%s",sep,$c[f])
sep=FS
}
print ""
}
DATAFILE
我得到了下面的结果:它只是把 data.txt 中的每一行重复输出,没有做任何列选择,而 list.txt 的条目则被打印在输出的末尾
1,2,3,4,5,6,7,8,9,10
1,2,3,4,5,6,7,8,9,10
1.000000,0,0,0,0,0,0,0,0,0
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
0,0,0,0,0,0,0,0,1.000000,0
3
3
5
5
7
7
9
9
任何帮助都非常感谢。
$ awk '
BEGIN { FS=OFS="," }
NR==FNR { f[++nf]=$0; next }
{ for (i=1; i<=nf; i++) printf "%s%s", $(f[i]), (i<nf?OFS:ORS) }
' list.txt data.txt
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
您可以将位置文件和数据文件同时传给 awk,并在 awk 内部完成处理逻辑:
awk -F"," 'FILENAME=="list.txt"{a[NR]=$1}FILENAME=="data.txt"{for(i=1; i<=length(a); i++){printf (i==length(a)?"%s\n":"%s,"),$a[i]}}' list.txt data.txt
这段命令的工作方式如下:
- 用逗号作为分隔符(-F",")拆分输入文件的每一行
- 如果 awk 的 FILENAME 变量为 "list.txt"(FILENAME=="list.txt")
  - 则以行号作为索引,把该行的值存入数组(a[NR]=$1)
- 如果 awk 的 FILENAME 变量为 "data.txt"(FILENAME=="data.txt")
  - 则循环遍历数组中的每个元素(for(i=1; i<=length(a); i++))
  - 并打印出该位置上的字段值($a[i])。如果该位置是最后一个位置(i==length(a)),则在其后加上换行符打印("%s\n"),否则在其后加上逗号打印("%s,")。
另一个选项是通过 -v(变量)标志传入各个位置,但这种做法无法很好地适应位置数量可变的情况:
awk -F"," -v f1=$(awk 'NR==1' list.txt) -v f2=$(awk 'NR==2' list.txt) -v f3=$(awk 'NR==3' list.txt) -v f4=$(awk 'NR==4' list.txt) '{print $f1, $f2, $f3, $f4}' data.txt
awk 解决方案:
awk -F, 'function pr(a){ r=""; for(i=1;i<=NF;i++) if(i in a) r=(r!="")? r","$i:$i; print r }
NR==FNR{ a[$0]; next }{ pr(a) }' list.txt data.txt
输出:
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
一种非awk
解决方案,用于比较和对比...
$ join -t, <(sort list) <(<file tr ',' '\n' | pr -10ts, | sort) |
sort -n |
tr ',' '\n' |
pr -4ts,
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
您需要魔术数字 10 和 4,即原始文件的列数和要提取的列数(这两个数字也可以自动计算)。之所以需要多次排序,是为了在数值排序和字典序排序之间来回转换(join 要求输入按字典序排序)。
本质上,该算法是:转置(transpose)- 连接(join)- 转置(transpose)
下面这个 awk 脚本先把 list.txt 处理成一个字段列表,然后用该列表调用另一个 awk 来处理 data.txt:
$ awk '
BEGIN { FS=OFS="," } # set the delimiters for the list file
NR==FNR { # process the list file
p=p (p==""?"":OFS) "$" $1 # make a field list ($3,$5,$7,$9)
next
}
{ # process the data or call the processor
RS="" # for getline to return multilined output
cmd="awk \047BEGIN{FS=OFS=\",\"}{print "p"} \047 " FILENAME # build awk call
cmd | getline res # actual awk call and output to res
print res # output res
exit # exit after first record
}
' list data
3,5,7,9
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000
awk解决方案,考虑到列名称可以是任何东西,而不仅仅是列的索引。
BEGIN { FS=OFS="," }
NR==FNR { l[$0]++; next } # save headers from list
FNR==1{ for (i=1; i<=NF; i++)
if ($i in l){ max=i; c[i]++ }} # save column index in c;
# max index in max
{ for(j=1; j<=NF; j++) # loop over column indices
if(j in c) # if index in c
printf "%s%s", $j, (j==max ? ORS : OFS) # print column
}
输入:
$ cat list.txt
C
E
G
I
和
$ cat data.txt
A,B,C,D,E,F,G,H,I,J
1.000000,0,0,0,0,0,0,0,0,0
0,1.000000,0.031250,0,0,0,0.031250,0,0,0
0,0.031250,1.000000,0,0,0,0.062500,0,0,0
0,0,0,1.000000,0,0,0,0,0,0
0,0,0,0,1.000000,0,0,0,0,0
0,0,0,0,0,1.000000,0,0.062500,0,0
0,0.031250,0.062500,0,0,0,1.000000,0,0,0
0,0,0,0,0,0.062500,0,1.000000,0,0
0,0,0,0,0,0,0,0,1.000000,0
给出结果:
$ awk 'BEGIN {FS=OFS=","} NR==FNR{l[$0]++;next} FNR==1{ for (i=1; i<=NF; i++) if ($i in l){max=i; c[i]++}}{for (j=1;j<=NF;j++) if(j in c) printf "%s%s",$j,(j==max ?ORS:OFS) }' list.txt data.txt
C,E,G,I
0,0,0,0
0.031250,0,0.031250,0
1.000000,0,0.062500,0
0,0,0,0
0,1.000000,0,0
0,0,0,0
0.062500,0,1.000000,0
0,0,0,0
0,0,0,1.000000