通过数据帧改进迭代



尝试在两个数据帧中找到一些内部,知道循环在pandas中不是解决方案,并且使用两个各有15k行的工作表。我怎样才能提高后面代码的速度?它不可能使用合并,因为在匹配条件行之后,需要从err中删除,以便不再匹配。16k行,耗时140分钟。

import pandas as pd
scr = pd.DataFrame({
'Product':['10101.A', '10101.A', '10101.A', '10147.A', '10147.A', '10147.A', '10147.A','10147.A'],
'Source Handling Unit':['7000000051481339', '7000000051481342', '7000000051722237','7000000051530150','7000000051530152', '7000000051530157', '7000000051546193', '7000000051761150'],
'Available Qty BUoM':[1,1,1,1,1,1,1,1],
'Confirmation Date':['10-5-2022', '10-5-2022', '9-5-2022', '6-5-2022', '6-5-2022', '6-5-2022', '6-5-2022', '11-5-2022']
})
err = pd.DataFrame({
'Posting Date':['4-5-2022','6-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','11-5-2022','13-5-2022','15-5-2022','16-5-2022','25-5-2022'],
'Product':['10101.A', '10147.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10101.A', '10147.A'],
'Reason':['L400', 'CCIV', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'UPLD', 'L400', 'L400'],
'Activity Area':['A970', 'D300', 'A990', 'A990', 'A990', 'A990', 'A990', 'A990','A990', 'A990', 'A990','A970','A970'],
'Difference Quantity':[1, 5, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1]
})
#Creating filter for scr
filt_scr_col = ['Product', 'Source Handling Unit', 'Available Qty BUoM', 'Confirmation Date']
#Applying filter
scr = scr[filt_scr_col]
#Creating filter for err
filt_post_col = ['Posting Date', 'Product', 'Reason', 'Activity Area', 'Difference Quantity']
#Applying filter
err = err[filt_post_col] 
#Replace empty characters with underscore 
scr.columns = scr.columns.str.replace(' ', '_')
err.columns = err.columns.str.replace(' ', '_')
#Creating filter to extract A450 rows from postings
filt = err['Activity_Area'] == 'A450'
#Assign A450 rows to new dataframe a450
a450 = err.loc[filt]
#.groupby 'Posting_Date', 'Product','Reason' but when I pass an argument as_index = False doent aggregate Products returns relevant object
#.query gets Difference_Quantity > 0 this evaluates and refers to exact row
a450 = (
a450.groupby(['Posting_Date', 'Product','Reason'], as_index = False, sort = False).sum()
.query('Difference_Quantity > 0')
)
#Creating filter to remove all < 0
filt = err['Difference_Quantity'] > 0
#Applying filter to dataframe
err = err.loc[filt]
#Removing column 'Activity_Area' from err don't need it and columns will match when appent with a450
err = err.drop(columns='Activity_Area')
#Concat err and a450
err = pd.concat([err, a450], ignore_index= True)
scr['Confirmation_Date'] = pd.to_datetime(scr['Confirmation_Date'], format = "%d-%m-%Y")  
err['Posting_Date'] = pd.to_datetime(err['Posting_Date'], format = "%d-%m-%Y")
scr = scr.sort_values(by='Product', ascending=True)
err = err.sort_values(by='Posting_Date', ascending=True)
scr['Reason'] = None
match = []
for i, row in scr.iterrows():
for e, erow in err.iterrows():
if (row['Product'] == erow['Product']) & (erow['Posting_Date'] >= row['Confirmation_Date']) & (erow['Difference_Quantity'] - row['Available_Qty_BUoM'] >= 0):
row['Reason'] = err['Reason'][e]
err['Difference_Quantity'][e]-= row['Available_Qty_BUoM']
row_to_dict = row.to_dict()
match.append(row_to_dict)
break
report = pd.DataFrame(match)  
report = report[['Product', 'Source_Handling_Unit', 'Available_Qty_BUoM', 'Confirmation_Date', 'Reason']]
#Pandas think that report['Source_Handling_Unit'] has floats and love to round them
report = report.astype({'Source_Handling_Unit': str})
report

设置:

scr = pd.DataFrame({'id' : ['10101.A', '10101.A', '10101.A'],'date' : ['10-5-2022', '10-5-2022', '9-5-2022'], 'qty': [1, 1, 1]})
err = pd.DataFrame({'id' : ['10101.A', '10101.A', '10101.A'], 'date' : ['4-5-2022', '13-5-2022', '16-5-2022'],'qty': [1, 1, 1], 'r':['a', 'b', 'c']})
scr['date'] = pd.to_datetime(scr['date'], format = "%d-%m-%Y")  
err['date'] = pd.to_datetime(err['date'], format = "%d-%m-%Y")

做:

df = scr.merge(err, on='id', suffixes=[None, '_er']).drop_duplicates()
report = df[df['date_er'].ge(df['date']) & df['qty_er'].sub(df['qty']).ge(0)]
print(report[['id', 'date', 'qty', 'r']])

输出:

id       date  qty  r
1  10101.A 2022-05-10    1  b
2  10101.A 2022-05-10    1  c
7  10101.A 2022-05-09    1  b
8  10101.A 2022-05-09    1  c

另一种方法是类似sql的方法,这可能在大型数据集上更有效:

from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
query = '''
select distinct scr.id, scr.date, scr.qty, r
from scr
join err
on scr.id == err.id
and err.date >= scr.date
and err.qty - scr.qty >= 0
'''
df = pysqldf(query)
df.date = pd.to_datetime(df.date)
print(df)
...
id       date  qty  r
0  10101.A 2022-05-10    1  b
1  10101.A 2022-05-10    1  c
2  10101.A 2022-05-09    1  b
3  10101.A 2022-05-09    1  c

如果我理解对了,下面的代码是快速的,没有循环:

import pandas as pd
scr = pd.DataFrame({'id' : ['1a', '1a', '1a', '1b'], 'date' : ['10-5-2022', '10-5-2022', '9-5-2022', '11-5-2022'], 'qty': [1, 1, 2, 2]})
err = pd.DataFrame({'id' : ['1a', '1a', '1a', '2s'], 'date' : ['4-5-2022', '13-5-2022', '16-5-2022', '12-3-2022'], 'qty': [1, 1, 3, 5], 'r':['a', 'b', 'c', 'a']})
scr['date'] = pd.to_datetime(scr['date'], format = "%d-%m-%Y")  
err['date'] = pd.to_datetime(err['date'], format = "%d-%m-%Y")
# merging the two dataframes
arr = scr.merge(err, on='id', how = 'left')
# filtering
arr = arr[(arr['date_x']<=arr['date_y']) & (arr['qty_x'] == arr['qty_y'])]
arr

最新更新