Handling lookup operations over millions of rows with Python



I am new to data processing. I need to write a Python program that searches for each record from samplefile1 in samplefile2. I am able to do this, but looping over the 200 rows of samplefile2 for every one of the 200 records in samplefile1 takes 180 seconds of total execution time.

I am looking for something more time-efficient so that I can complete this task in the shortest possible time.

My actual dataset sizes are: 9 million rows -> samplefile1 and 9 million rows -> samplefile2.

Here is my code using Pandas.

samplefile1 rows:

number='7777777777' subscriber-id="7777777777" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip+e2u"regexp="!^(.*)$!sip:+7777777777@ims.mnc001.mcc470.3gppnetwork.org;user=phone!"replacement=[]};
number='7777777778' subscriber-id="7777777778" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip+e2u"regexp="!^(.*)$!sip:+7777777778@ims.mnc001.mcc470.3gppnetwork.org;user=phone!"replacement=[]};
number='7777777779' subscriber-id="7777777779" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip+e2u"regexp="!^(.*)$!sip:+7777777779@ims.mnc001.mcc470.3gppnetwork.org;user=phone!"replacement=[]};
.........100 rows

samplefile2 rows:

number='7777777777' subscriber-id="7777777777" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip+e2u"regexp="!^(.*)$!sip:+7777777777@ims.mnc001.mcc470.3gppnetwork.org;user=phone!"replacement=[]};
number='7777777778' subscriber-id="7777777778" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip+e2u"regexp="!^(.*)$!sip:+7777777778@ims.mnc001.mcc470.3gppnetwork.org;user=phone!"replacement=[]};
number='7777777769' subscriber-id="7777777779" rrid=0 NAPTR {order=10 preference=50 flags="U"service="sip+e2u"regexp="!^(.*)$!sip:+7777777779@ims.mnc001.mcc470.3gppnetwork.org;user=phone!"replacement=[]};
........100 rows
import time
import pandas as pd

def timeit(func):
    """
    Decorator for measuring a function's running time.
    """
    def measure_time(*args, **kw):
        start_time = time.time()
        result = func(*args, **kw)
        print("Processing time of %s(): %.2f seconds."
              % (func.__qualname__, time.time() - start_time))
        return result
    return measure_time

@timeit
def func():
    df = pd.read_csv("sample_2.txt", names=["A1"], skiprows=0, sep=';')
    df.drop(df.filter(regex="Unname"), axis=1, inplace=True)
    finaldatafile1 = df.fillna("TrackRow")

    df1 = pd.read_csv("sample_1.txt", names=["A1"], skiprows=0, sep=';')
    df1.drop(df1.filter(regex="Unname"), axis=1, inplace=True)  # was df.filter: filter the frame being dropped from
    finaldatafile2 = df1.fillna("TrackRow")

    indexdf = df.index
    indexdf1 = df1.index

    # Outer loop: each string to be matched (small dataset)
    for i in range(len(indexdf)):  # range(0, len-1) skipped the last row
        lookup_value = finaldatafile1.iloc[[i]].to_string()
        # Inner loop: the lookup dataset (large dataset)
        for j in range(len(indexdf1)):
            match_value = finaldatafile2.iloc[[j]].to_string()
            if lookup_value == match_value:  # `i is j` compared loop indices by identity, not row contents
                print(f"Its a match on lookup table position {j} and for string {lookup_value}")
            else:
                print("no match found in complete dataset")

if __name__ == "__main__":
    func()

I don't think using Pandas helps here, since you are just comparing entire rows. An alternative is to load the first file as a set of lines, then enumerate the lines of the second file and test whether each one is in the set. That will be much faster:

@timeit
def func():
    with open('sample_1.txt') as f_sample1:
        data1 = set(f_sample1.read().splitlines())

    with open('sample_2.txt') as f_sample2:
        data2 = f_sample2.read().splitlines()

    for index, entry in enumerate(data2):
        if entry in data1:
            print(f"It's a match on lookup table position {index} and for string\n{entry}")
        else:
            print("no match found in complete dataset")
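Why this is faster: set membership is an average O(1) hash lookup, so the total work drops from O(n*m) row comparisons (roughly 81 trillion at 9 million rows each) to O(n + m). At that scale, printing a line for every entry would itself dominate the runtime. Here is a minimal sketch under those assumptions that streams the second file line by line and writes matches to a file instead of printing (the output file name matches.txt is made up; the @timeit decorator is the one defined above):

@timeit
def func():
    # Load the lookup file once into a set; each membership test hashes the line.
    with open('sample_1.txt') as f_sample1:
        data1 = set(f_sample1.read().splitlines())

    # Stream the second file instead of loading it all into memory, and
    # write matches to a file rather than printing millions of lines.
    with open('sample_2.txt') as f_sample2, open('matches.txt', 'w') as f_out:
        for index, entry in enumerate(f_sample2):
            if entry.rstrip('\n') in data1:
                f_out.write(f"match at lookup table position {index}: {entry}")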
