我正在尝试将一个目录下的多个csv文件合并为一个csv。每个文件的所有标头都相同。但当我看到合并后的文件时,很难理解数据实际上来自哪个文件。我还为我的目的固定了我想要的列。是否有任何方法可以使用以下代码唯一地识别它们?导入csv从pathlib导入Path
p = Path(r'E:NeurogenMerging_test_data')
file_list = p.glob("*.csv")
desired_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene', 'Xref.refGene', 'cytoBand', 'cosmic70', 'avsnp147', 'ExAC_ALL', 'ExAC_AFR', 'ExAC_AMR', 'ExAC_EAS', 'ExAC_FIN', 'ExAC_NFE', 'ExAC_OTH', 'ExAC_SAS', 'CLINSIG', 'CLNDBN', 'CLNACC', 'CLNDSDB', 'CLNDSDBID', '1000g2015aug_all', 'SIFT_score', 'SIFT_pred', 'Polyphen2_HDIV_score', 'Polyphen2_HDIV_pred', 'Polyphen2_HVAR_score', 'Polyphen2_HVAR_pred', 'LRT_score', 'LRT_pred', 'MutationTaster_score', 'MutationTaster_pred', 'MutationAssessor_score', 'MutationAssessor_pred', 'FATHMM_score', 'FATHMM_pred', 'PROVEAN_score', 'PROVEAN_pred', 'VEST3_score', 'CADD_raw', 'CADD_phred', 'DANN_score', 'fathmm-MKL_coding_score', 'fathmm-MKL_coding_pred', 'MetaSVM_score', 'MetaSVM_pred', 'MetaLR_score', 'MetaLR_pred', 'integrated_fitCons_score', 'integrated_confidence_value', 'GERP++_RS', 'phyloP7way_vertebrate', 'phyloP20way_mammalian', 'phastCons7way_vertebrate', 'phastCons20way_mammalian', 'SiPhy_29way_logOdds', 'Otherinfo']
desired_rows = []
for csv_file in file_list:
with open(csv_file, 'r') as f:
reader = csv.DictReader(f)
for row in reader:
desired_rows.append({c: row[c] for c in desired_columns})
with open('merged.csv', 'w', newline='') as f:
writer = csv.DictWriter(f, fieldnames=desired_columns)
writer.writeheader()
writer.writerows(desired_rows)
由于您没有提供任何样本数据,我生成了一些任意文件来显示一般概念:
a.txt:
col_1;col_2;col_3
1;2;3
4;5;6
7;8;9
b.txt:
col_1;col_2;col_3
10;20;30
40;50;60
70;80;90
假设您想在列col_1
和col_3
上进行筛选,基于内置csv
模块的一种非常基本的方法可能如下所示:
import csv
from pathlib import Path
DIRECTORY = Path(__file__).parent
FILE_SUFFIX = '*.txt'
DESIRED_COLUMNS = ['col_1', 'col_3']
files = sorted(
DIRECTORY.glob(FILE_SUFFIX),
key=lambda x: x.name,
)
filtered = []
for f in files:
reader = csv.DictReader(f.open(), delimiter=';')
for row in reader:
d = {k: v for k, v in row.items() if k in DESIRED_COLUMNS}
d['from_file'] = f.name
filtered.append(d)
print(filtered)
# filtered is a list of dicts and can be written to file with csv.DictWriter
上面剪下的指纹:
[{'col_1': '1', 'col_3': '3', 'from_file': 'a.txt'}, {'col_1': '4', 'col_3': '6', 'from_file': 'a.txt'}, {'col_1': '7', 'col_3': '9', 'from_file': 'a.txt'}, {'col_1': '10', 'col_3': '30', 'from_file': 'b.txt'}, {'col_1': '40', 'col_3': '60', 'from_file': 'b.txt'}, {'col_1': '70', 'col_3': '90', 'from_file': 'b.txt'}]
一个更优雅的解决方案可以基于pandas:
import pandas as pd
from pathlib import Path
DIRECTORY = Path(__file__).parent
FILE_SUFFIX = '*.txt'
DESIRED_COLUMNS = ['col_1', 'col_3']
files = sorted(
DIRECTORY.glob(FILE_SUFFIX),
key=lambda x: x.name,
)
filtered = []
for f in files:
df = pd.read_csv(
f,
delimiter=';',
usecols=DESIRED_COLUMNS,
)
df['from_file'] = f.name
filtered.append(df)
# print(filtered)
concated = pd.concat(filtered, ignore_index=True)
print(concated)
# concated is a pandas.DataFrame. Use `concated.to_csv()` to write it to file
pandas
方法导致:
col_1 col_3 from_file
0 1 3 a.txt
1 4 6 a.txt
2 7 9 a.txt
3 10 30 b.txt
4 40 60 b.txt
5 70 90 b.txt