我需要帮助来制作下面的片段。我需要合并两个文件,并在匹配的行上执行计算
我有包含旧数据的oldFile.txt和包含更新数据集的newFile.txt。
我需要根据newFile.txt中的数据更新oldFile.txt,并计算百分比变化。任何想法都会很有帮助。提前感谢
from collections import defaultdict
num = 0
data=defaultdict(int)
with open("newFile.txt", encoding='utf8', errors='ignore') as f:
for line in f:
grp, pname, cnt, cat = line.split(maxsplit=3)
data[(pname.strip(),cat.replace('n','').strip(),grp,cat)]+=int(cnt)
sorteddata = sorted([[k[0],v,k[1],k[2]] for k,v in data.items()], key=lambda x:x[1], reverse=True)
for subl in sorteddata[:10]:
num += 1
line = " ".join(map(str, subl))
print ("{:>5} -> {:>}".format(str(num), line))
with open("oldFile.txt", 'a', encoding='utf8', errors='ignore') as l:
l.write(" ".join(map(str, subl)) + 'n')
oldFile.txt
#col1 #col2 #col3 #col4
1,396 c15e89f2149bcc0cbd5fb204 4 HUH_Token (HUH)
279 9e4d81c8fc15870b15aef8dc 3 BABY BNB (BBNB)
231 31b5c07636dab8f0909dbd2d 6 Buff Unicorn (BUFFUN...)
438 1c6bc8e962427deb4106ae06 8 Charge (Charge)
2,739 6ea059a29eccecee4e250414 2 MAXIMACASH (MAXCAS...)
newFile.txt#--在oldFile.txt 中找不到带有附加行的更新数据
#col1 #col2 #col3 #col4
8,739 6ea059a29eccecee4e250414 60 MAXIMACASH (MAXCAS...)
138 1c6bc8e962427deb4106ae06 50 Charge (Charge)
860 31b5c07636dab8f0909dbd2d 40 Buff Unicorn (BUFFUN...)
200 9e4d81c8fc15870b15aef8dc 30 BABY BNB (BBNB) #-- not found in the oldFile.txt
20 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA)
1,560 c15e89f2149bcc0cbd5fb204 20 HUH_Token (HUH)
需要改进:#--具有附加列(col5,col6(并根据(col3(值进行排序
#col1 #col2 #col3 #col4 #col5 (oldFile-newFile) #col6 (oldFile-newFile)
8,739 6ea059a29eccecee4e250414 62 MAXIMACASH (MAXCAS...) 2900.00 % (col3 2-60) 219.06 % (col1 2,739-8,739)
138 1c6bc8e962427deb4106ae06 58 Charge (Charge) 625.00 % (col3 8-50) -68.49 % (col1 438-138)
860 31b5c07636dab8f0909dbd2d 46 Buff Unicorn (BUFFUN...) 566.67 % (col3 6-40) 272.29 % (col1 231-860)
200 9e4d81c8fc15870b15aef8dc 33 BABY BNB (BBNB) 900.00 % (col3 3-30) -28.32 % (col1 279-200)
20 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) 0.00 % (col3 0-30) 20.00 % (col1 0-20)
1,560 c15e89f2149bcc0cbd5fb204 24 HUH_Token (HUH) 400.00 % (col3 4-20) 11.75 % (col1 1,396-1,560)
这里有一个示例代码,用于输出您需要的内容。我用下面的公式来计算pct的变化。percentage_change = 100*(new-old)/old
如果old为0,则将其更改为1以避免被零除错误。
import pandas as pd
def read_file(fn):
"""
Read file fn and convert data into a dict of dict.
data = {pname1: {grp: grp1, pname: pname1, cnt: cnt1, cat: cat1},
pname2: {gpr: grp2, ...} ...}
"""
data = {}
with open(fn, 'r') as f:
for lines in f:
line = lines.rstrip()
grp, pname, cnt, cat = line.split(maxsplit=3)
data.update({pname: {'grp': float(grp.replace(',', '')), 'pname': pname, 'cnt': int(cnt), 'cat': cat}})
return data
def process_data(oldfn, newfn):
"""
Read old and new files, update the old file based on new file.
Save output to text, and csv files.
"""
# Get old and new data in dict.
old = read_file(oldfn)
new = read_file(newfn)
# Update old data based on new data
u_data = {}
for ko, vo in old.items():
if ko in new:
n = new[ko]
# Update cnt.
old_cnt = vo['cnt']
new_cnt = n['cnt']
u_cnt = old_cnt + new_cnt
# cnt change, if old is zero we set it to 1 to avoid division by zero error.
tmp_old_cnt = 1 if old_cnt == 0 else old_cnt
cnt_change = 100 * (new_cnt - tmp_old_cnt) / tmp_old_cnt
# grp change
old_grp = vo['grp']
new_grp = n['grp']
grp_change = 100 * (new_grp - old_grp) / old_grp
u_data.update({ko: {'grp': n['grp'], 'pname': n['pname'], 'cnt': u_cnt, 'cat': n['cat'],
'cnt_change%': round(cnt_change, 2), 'grp_change%': round(grp_change, 2)}})
# add new data to u_data, that is not in old data
for kn, vn in new.items():
if kn not in old:
# Since this is new item its old cnt is zero, we set it to 1 to avoid division by zero error.
old_cnt = 1
new_cnt = vn['cnt']
cnt_change = 100 * (new_cnt - old_cnt) / old_cnt
# grp change is similar to cnt change
old_grp = 1
new_grp = vn['grp']
grp_change = 100 * (new_grp - old_grp) / old_grp
# Update new columns.
vn.update({'cnt_change%': round(cnt_change, 2), 'grp_change%': round(grp_change, 2)})
u_data.update({kn: vn})
# Create new data mydata list from u_data, and only extract the dict value.
mydata = []
for _, v in u_data.items():
mydata.append(v)
# Convert mydata into pandas dataframe to easier manage the data.
df = pd.DataFrame(mydata)
df = df.sort_values(by=['cnt'], ascending=False) # sort on cnt column
# Save to csv file.
df.to_csv('output.csv', index=False)
# Save to text file.
with open('output.txt', 'w') as w:
w.write(f'{df.to_string(index=False)}')
# Print in console.
print(df.to_string(index=False))
# Start
oldfn = 'F:/Tmp/oldFile.txt'
newfn = 'F:/Tmp/newFile.txt'
process_data(oldfn, newfn)
控制台输出:
grp pname cnt cat cnt_change% grp_change%
8739.0 6ea059a29eccecee4e250414 62 MAXIMACASH (MAXCAS...) 2900.00 219.06
138.0 1c6bc8e962427deb4106ae06 58 Charge (Charge) 525.00 -68.49
860.0 31b5c07636dab8f0909dbd2d 46 Buff Unicorn (BUFFUN...) 566.67 272.29
200.0 9e4d81c8fc15870b15aef8dc 33 BABY BNB (BBNB) 900.00 -28.32
20.0 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) 2900.00 1900.00
1560.0 c15e89f2149bcc0cbd5fb204 24 HUH_Token (HUH) 400.00 11.75
文本输出:
grp pname cnt cat cnt_change% grp_change%
8739.0 6ea059a29eccecee4e250414 62 MAXIMACASH (MAXCAS...) 2900.00 219.06
138.0 1c6bc8e962427deb4106ae06 58 Charge (Charge) 525.00 -68.49
860.0 31b5c07636dab8f0909dbd2d 46 Buff Unicorn (BUFFUN...) 566.67 272.29
200.0 9e4d81c8fc15870b15aef8dc 33 BABY BNB (BBNB) 900.00 -28.32
20.0 5esdsds2sd15870b15aef8dc 30 CharliesAngel (CA) 2900.00 1900.00
1560.0 c15e89f2149bcc0cbd5fb204 24 HUH_Token (HUH) 400.00 11.75
csv输出:
grp,pname,cnt,cat,cnt_change%,grp_change%
8739.0,6ea059a29eccecee4e250414,62,MAXIMACASH (MAXCAS...),2900.0,219.06
138.0,1c6bc8e962427deb4106ae06,58,Charge (Charge),525.0,-68.49
860.0,31b5c07636dab8f0909dbd2d,46,Buff Unicorn (BUFFUN...),566.67,272.29
200.0,9e4d81c8fc15870b15aef8dc,33,BABY BNB (BBNB),900.0,-28.32
20.0,5esdsds2sd15870b15aef8dc,30,CharliesAngel (CA),2900.0,1900.0
1560.0,c15e89f2149bcc0cbd5fb204,24,HUH_Token (HUH),400.0,11.75
只是为了提供一个基于convtools的替代解决方案,您可以在其中找到有用的部分:
from convtools import conversion as c
from convtools.contrib.tables import Table
# your percentage change calculation
def c_change(column_name):
return c.if_(
c.and_(
c.col(f"{column_name}_LEFT"),
c.col(f"{column_name}_RIGHT").is_not(None),
),
(
(c.col(f"{column_name}_RIGHT") - c.col(f"{column_name}_LEFT"))
/ c.col(f"{column_name}_LEFT")
* 100.0
).pipe(round, 2),
None,
)
prepare_columns = {
"COLUMN_0": c.col("COLUMN_0").as_type(float),
"COLUMN_2": c.col("COLUMN_2").as_type(float),
}
dialect = Table.csv_dialect(delimiter="t")
sorted_rows = sorted(
Table.from_csv("tmp1.csv", dialect=dialect)
.update(**prepare_columns)
.join(
Table.from_csv(
"tmp2.csv",
dialect=dialect,
).update(**prepare_columns),
on=["COLUMN_1", "COLUMN_3"],
how="full",
)
.update(
COLUMN_4=c_change("COLUMN_2"),
COLUMN_5=c_change("COLUMN_0"),
COLUMN_2=c.col("COLUMN_2_RIGHT"),
COLUMN_0=c.col("COLUMN_0_RIGHT"),
)
.take(
"COLUMN_0",
"COLUMN_1",
"COLUMN_2",
"COLUMN_3",
"COLUMN_4",
"COLUMN_5",
)
.into_iter_rows(tuple),
key=lambda row: row[2],
reverse=True,
)
Table.from_rows(sorted_rows).into_csv("tmp_result.csv", dialect=dialect)
结果:
COLUMN_0 COLUMN_1 COLUMN_2 COLUMN_3 COLUMN_4 COLUMN_5
8739.0 6ea059a29eccecee4e250414 60.0 MAXIMACASH (MAXCAS...) 2900.0 219.06
138.0 1c6bc8e962427deb4106ae06 50.0 Charge (Charge) 525.0 -68.49
860.0 31b5c07636dab8f0909dbd2d 40.0 Buff Unicorn (BUFFUN...) 566.67 272.29
200.0 9e4d81c8fc15870b15aef8dc 30.0 BABY BNB (BBNB) 900.0 -28.32
20.0 5esdsds2sd15870b15aef8dc 30.0 CharliesAngel (CA)
1560.0 c15e89f2149bcc0cbd5fb204 20.0 HUH_Token (HUH) 400.0 11.75