想要在单独的一列 (a) 中列出 1|、2|、3|(即由 ; 分隔的每一段的第一项),并在另一列 (d) 中给出该段所有相关数字的总和。
数据结构
1|1=89325|2=96682|3=81940 267947
2|1=17162|2=21282|3=23033; 61477
3|1=71761|2=73375|3=83581; 228717
代码
import re

import pandas as pd

a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
# Capture every "key=value" pair. The backslash in \d is required: without it
# the class [d.] only matches the literal characters "d" and ".", so nothing
# in the TAG string is found.
parsed_data = re.findall(r'([\d.]+)=([\d.]+)', a['TAG'])
# Each match holds two groups, so zip() pairs them with 'a' and 'b' only;
# the 'c' label is never used, which is why the result has just two columns.
a = ','.join(str(dict(zip(['a', 'b', 'c'], i))) for i in parsed_data)
# NOTE(review): eval() is tolerable here only because the string was built just
# above from regex captures limited to digits and dots; never eval text that
# comes from an untrusted source.
a = pd.DataFrame(eval(a))
a
代码输出
a b
0 1 89325
1 2 96682
2 3 81940
3 1 17162
4 2 21282
5 3 23033
6 1 71761
7 2 73375
8 3 83581
预期输出
a b c d
0 1 1 89325
1 1 2 96682
2 1 3 81940 267947
3 2 1 17162
4 2 2 21282
5 2 3 23033 61477
6 3 1 71761
7 3 2 73375
8 3 3 83581 228717
不使用正则表达式但生成预期输出的解决方案:
import pandas as pd
from itertools import chain
def parse_tag(tag):
    """Split a TAG string into one list of (row, key, value) tuples per row.

    Rows are separated by ';'. Inside a row the fields are separated by '|':
    the first field is the row number and every following field is a
    'key=value' pair of integers. Segments that carry no pairs (an empty
    segment produced by a leading ';', or a header such as '1@@@') are
    skipped without being parsed.

    Returns a list with one inner list per row, each holding
    (row_num, key, value) tuples of ints.
    Raises ValueError if a row number, key or value is not an integer.
    """
    groups = []
    for row in tag.split(';'):
        head, *pairs = row.split('|')
        if not pairs:
            continue
        # Parse the whole field, not just its first character, so that
        # multi-digit row numbers such as "10" are kept intact.
        row_num = int(head)
        groups.append(
            [(row_num, *(int(part) for part in pair.split('='))) for pair in pairs]
        )
    return groups


a = {'TAG': '1@@@;1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}
# Transform string into list of lists of tuples
list_of_lists = parse_tag(a['TAG'])
# Calculate the total for each list (the value is the last item of each tuple)
totals = [sum(values[-1] for values in t) for t in list_of_lists]
# Append the total to the last tuple in each list, so only the final line of
# every row group carries a fourth element (column 'd')
for index, elem in enumerate(list_of_lists):
    elem[-1] = (*elem[-1], totals[index])
# Flatten the list_of_lists structure
data = list(chain.from_iterable(list_of_lists))
# Create the dataframe; the 3-element tuples leave column 'd' empty
df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
UPDATE:将映射函数应用于使用正则表达式解析的输入的解决方案:
import re
from itertools import chain
import pandas as pd
a = {'TAG': ';1|1=89325|2=96682|3=81940;2|1=17162|2=21282|3=23033;3|1=71761|2=73375|3=83581'}


def list_of_tuples(tup):
    """Turn one regex match into the output rows for a single row group.

    tup is a flat sequence of numeric strings: the row number first, followed
    by alternating key and value items, e.g.
    ('1', '1', '89325', '2', '96682', '3', '81940').

    Returns a list of (row, key, value) int tuples; the last tuple gets the
    sum of all values appended as a fourth element. An input without any
    key/value pair yields an empty list.
    """
    if not tup:
        return []
    row_num = int(tup[0])
    # Odd positions hold the keys and even positions (from 2) the values.
    lot = [
        (row_num, int(key), int(value))
        for key, value in zip(tup[1::2], tup[2::2])
    ]
    if lot:
        lot[-1] = (*lot[-1], sum(row[-1] for row in lot))
    return lot


# One match per row group: ";<row>|<k>=<v>|<k>=<v>|<k>=<v>". The "|" separators
# must be escaped, otherwise they act as regex alternation, and \d needs its
# backslash to mean "digit". The pattern expects exactly three pairs per row.
regex = r";(\d+)\|(\d+)=(\d+)\|(\d+)=(\d+)\|(\d+)=(\d+)"
data = list(chain.from_iterable(map(list_of_tuples, re.findall(regex, a['TAG']))))
df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
UPDATE 2:一个更高效的映射函数,尽管代码更冗长。
def list_of_tuples(tup):
    """Build the output rows for one row group in a single pass over tup.

    tup is an iterable of numeric strings: the row number first, followed by
    alternating key and value items. Each key/value pair becomes a
    (row, key, value) tuple of ints, and the running total of the values is
    appended to the last tuple as a fourth element.

    Returns an empty list when tup is empty or holds no complete key/value
    pair. A trailing key without a value is ignored.
    """
    fields = iter(tup)
    first = next(fields, None)
    if first is None:
        return []
    a = int(first)

    values_total = 0
    tuples_list = []
    # zip(fields, fields) pulls two consecutive items per step from the same
    # iterator, i.e. it walks the remaining items as (key, value) pairs.
    for key, value in zip(fields, fields):
        b = int(key)
        c = int(value)
        values_total += c
        tuples_list.append((a, b, c))

    # Guard the index: with no pairs there is no last tuple to extend.
    if tuples_list:
        tuples_list[-1] = (*tuples_list[-1], values_total)
    return tuples_list