如何将钥匙值管道划界文件转换为带标头的完美CSV文件



嗨,我有一个格式的文件。

key1=abc||key2=ajdskj||name=ankush||contact=123444
key1=def||name=reddy||contact=456778
key1=aef||address=ashaskawe||name=john

如何使用Python将其转换为带有标头的任何划界文件。喜欢

key1||key2||name||contact||address
abc||ajdskj||ankush||123444||NULL
def||NULL||reddy||456778||NULL
aef||NULL||john||NULL||ashaskawe

您能告诉我,如果有更多的字段,该方法可能是什么。

正在尝试使用CSV阅读器和Pandas读取该文件,但我不知道如何将键和值分开。

感谢您的帮助

这是一种使用标准库中工具并维护列的顺序的方法。messy_data.txt文件包含原始数据,而cleaner_data.txt是保存清洁数据的地方:

from collections import defaultdict, OrderedDict
with open('messy_data.txt') as infile, open('cleaner_data.txt','w') as outfile:
    whole_data = [x.strip().split("||") for x in infile]
    headers = []
    for x in whole_data:
        for k in [y.split("=")[0] for y in x]:
            if k not in headers:
                headers.append(k)
    whole_data = [dict(y.split("=") for y in x) for x in whole_data]
    output = defaultdict(list)
    for header in headers:
        for d in whole_data:
            output[header].append(d.get(header,'NULL'))
    output = OrderedDict((x,output.get(x)) for x in headers)
    outfile.write("||".join(list(output.keys()))+"n")
    for row in zip(*output.values()):
        outfile.write("||".join(row)+"n")

这应该产生:

key1||key2||name||contact||address
abc||ajdskj||ankush||123444||NULL
def||NULL||reddy||456778||NULL
aef||NULL||john||NULL||ashaskawe

编辑:

更友好的脚本:

from collections import defaultdict, OrderedDict
with open('messy_data.txt') as infile, open('cleaner_data.txt','w') as outfile:
    whole_data = [x.strip().split("||") for x in infile]
    headers = []
    for x in whole_data:
        for k in [y.split("=")[0] for y in x]:
            if k not in headers:
                headers.append(k)
    #whole_data = [dict(y.split("=") for y in x) for x in whole_data]
    whole_data2 = []
    for x in whole_data:
        temp_list = [y.split("=") for y in x]
        try:
            temp_dict = dict(temp_list)
            whole_data2.append(temp_dict)
        except:
            print(temp_list)
            continue
    output = defaultdict(list)
    for header in headers:
        for d in whole_data2:
            output[header].append(d.get(header,'NULL'))
    output = OrderedDict((x,output.get(x)) for x in headers)
    print(output)
    outfile.write("||".join(list(output.keys()))+"n")
    for row in zip(*output.values()):
        outfile.write("||".join(row)+"n")

我希望这很有用。

熊猫解决方案:

阅读文件:

df=pd.read_csv('data.csv',delimiter='|',header=None)
dfu=df.unstack().dropna()
keys,values=np.array(dfu.apply(lambda s:str.split(s,'=')).tolist()).T

制作数据框:

data=dfu.to_frame()
data['keys']=keys
data['values']=values
final=data.reset_index().pivot(
index='level_1',columns='keys',values='values')
keys       address contact key1    key2    name
level_1                                        
0             None  123444  abc  ajdskj  ankush
1             None  456778  def    None   reddy
2        ashaskawe    None  aef    None    john

我不确定Pandas是否可以做到,但是我自己分开了很长的方法(这还不错)。

代码:

import pandas as pd
from io import StringIO
#Mimic opening a file with StringIO
in_file = StringIO(u"""
key1=abc||key2=ajdskj||name=ankush||contact=123444
key1=def||name=reddy||contact=456778
key1=aef||address=ashaskawe||name=john
""")
#First loop through the lines to find all the fields
#Also store each line info as a dict
all_line_dicts = []
all_fields = set()
for line in in_file.readlines():
    line_dict = dict(pair.split('=') for pair in line.strip().split('||'))
    all_line_dicts.append(line_dict)
    all_fields = all_fields.union(line_dict.keys())
#Now loop through the line dicts and fill each field
#Put 'NULL' if the field is not given
field_dicts = {field:[] for field in all_fields}
for line_dict in all_line_dicts:
    for field in field_dicts:
        field_dicts[field].append(line_dict[field] if field in line_dict else 'NULL')
#Convert to dataframe by pandas for simplicity
df = pd.DataFrame(field_dicts)
print df #<-- look at it
df.to_csv('test.csv',index = False) #<-- save it as a CSV

输出:

 address contact key1    key2    name
0       NULL  123444  abc  ajdskj  ankush
1       NULL  456778  def    NULL   reddy
2  ashaskawe    NULL  aef    NULL    john

一种简单的正则表达方式,易于维护:

 import re
 f = open('test.txt', 'r').readlines()
 print('key1', 'key2', 'name', 'contact', 'address', sep='||')
 for line in f:
     if re.search('key1=(w+)', line):
          k1 = re.search('key1=(w+)', line).group(1)
      else:
          k1 = 'NULL'
      if re.search('key2=(w+)',line):
          k2 = re.search('key2=(w+)',line).group(1)
      else:
          k2 = 'NULL'
      if re.search('address=(w+)',line):
          a = re.search('address=(w+)',line).group(1)
      else:
          a = 'NULL'
      if re.search('name=(w+)', line):
          n = re.search('name=(w+)', line).group(1)
      else:
          n = 'NULL'
      if re.search('contact=(w+)', line):
          c = re.search('contact=(w+)', line).group(1)
      else:
          c = 'NULL'
      print(k1, k2, n, c, a, sep=' || ')

输出:

 key1||key2||name||contact||address
 abc || ajdskj || ankush || 123444 || NULL
 def || NULL || reddy || 456778 || NULL
 aef || NULL || john || NULL || ashaskawe

最新更新