我正在尝试根据引用索引(coClean)匹配左地址(Address_x)和右地址(Address_y)(两者来自不同的表),该索引是我在 #Python #JupyterNotebook 中使用以下代码创建的:
import pandas as pd
# Load the two address tables that are matched against each other below.
df1 = pd.read_csv(filepath_or_buffer="/content/Addmatchdf1.csv")
df2 = pd.read_csv(filepath_or_buffer="/content/Addmatchdf2.csv")
import re
def cleanAddress(series):
    """Reduce every address in *series* to a digits-only join key.

    Each value is lower-cased, then all ASCII letters, whitespace and commas
    are removed, so ``"7 Pindara Bvd LANGWARRIN VIC 3910"`` becomes
    ``"73910"``.

    Args:
        series: A pandas Series of address strings.

    Returns:
        A pandas Series of the same length holding the cleaned keys.
    """
    # regex=True is required: since pandas 2.0 ``str.replace`` treats the
    # pattern as a literal string by default, which would remove nothing.
    # ``\s`` (not a bare ``s``) is what strips the spaces between tokens.
    return series.str.lower().str.replace(r"[a-z\s,]", "", regex=True)
# Build the join key on BOTH tables. The merge below needs a "coClean" column
# on each side; computing it twice on df1 leaves df2 without one and the
# merge fails with a KeyError.
# NOTE(review): assumes df2 also has an "Address" column, as the
# Address_x / Address_y columns in the merged output suggest -- confirm.
df1["coClean"] = cleanAddress(df1["Address"])
df2["coClean"] = cleanAddress(df2["Address"])

# Inner join: keep only rows whose key appears in both tables.
df = pd.merge(df1, df2, on=["coClean"], how="inner")
这将生成一个coClean作为引用索引。
Address_x | coClean | Address_y|
---|---|---|
7 Pindara Bvd LANGWARRIN VIC 3910 | 73910 | 7 Pindara Blv,LANGWARRIN,VIC 3910|
2a Manor St BACCHUS MARSH VIC 3340 | 23340 | 2a Manor Street,BACCHUS MARSH,VIC 3340|
38 Sommersby Rd POINT COOK VIC 3030 | 383030 | 38 Sommerrsby Road,POINT COOK,VIC 3030 |
17 Moira Avenue,Carnegie,Vic 3163 | 173163 | 17 Moira Avenue,Carnegie,Vic 3163|
17 Moira Avenue,Carnegie,Vic 3163 | 173163 | 17 Newman Avenue,Carnegie,Vic 3163|
17 Moira Avenue,Carnegie,Vic 3163 | 173163 | 17 Maroona Rd,Carnenegie Vic 3163|
import pandas as pd
# Sample data: each row pairs one Address_x spelling with the Address_y
# spelling stored alongside it.
df1 = pd.DataFrame(
    [
        ("7 Pindara Bvd LANGWARRIN VIC 3910", "7 Pindara Blv, Langwarrin, VIC 3910"),
        ("2a Manor St BACCHUS MARSH VIC 3340", "2a Manor Street, BACCHUS MARSH, VIC 3340"),
        ("38 Sommersby Rd POINT COOK VIC 3030", "38 Sommersby Road, Point Cook, VIC 3030"),
        ("17 Moira Avenue, Carnegie, Vic 3163", "17 Moira Avenue, Carnegie, Vic 3163"),
    ],
    columns=["Address_x", "Address_y"],
)
def cleanAddress(series, max_letters=4):
    """Build a match key from each address: all digits plus the first letters.

    Every numeric character is kept, in order. Alphabetic characters are kept
    only until ``max_letters`` of them have been seen; later letters are
    dropped. The key is lower-cased, so
    ``"7 Pindara Bvd LANGWARRIN VIC 3910"`` becomes ``"7pind3910"``.

    Args:
        series: An iterable of address strings (e.g. a pandas Series).
        max_letters: How many leading letters to keep in the key.

    Returns:
        A list with one key string per input address, in input order.
    """
    cocleans = []
    for address in series:
        kept = []
        letters_seen = 0
        # A single pass avoids restarting a second loop from a stale index,
        # which duplicated a trailing digit when the address had fewer than
        # ``max_letters`` letters and failed outright on an empty string.
        for char in address:
            if char.isnumeric():
                kept.append(char)
            elif char.isalpha() and letters_seen < max_letters:
                letters_seen += 1
                kept.append(char)
        cocleans.append("".join(kept).lower())
    return cocleans
# Store the key computed from each Address_x value in a "coClean" column of df1.
df1["coClean"]=cleanAddress(df1["Address_x"])