如何正确标准化地址类型



我正在尝试通过将缩写转换为完整单词(例如RD - Road(来标准化街道地址。我创建了许多行来解释不同的拼写,并遇到了一个替换代码覆盖另一个替换代码的问题

import pandas as pd 
mydata = {'Street_type': ['PL', 'pl', 'Pl', 'PLACE', 'place']}
mydata = pd.DataFrame(mydata)
mydata['Street_type'] = mydata['Street_type'].replace('PL','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('Pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('PLACE','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('place','Place',regex=True)

我得到了的不是Place,而是Placeace。避免此错误的最佳方法是什么?我是否编写 if-else 语句或任何函数?提前感谢!

在其他问题中,您有重叠的逻辑:在替换目标("旧"(字符串之前,您无法检查它是否是一个完整的单词。 例如,使用"PLACE"的输入类型,您可以触发第一次和第三次替换,生成PlaceACE,然后在达到所需条件之前PlaceaceACE

您需要仔细处理跟踪和排除逻辑,然后仅应用其中一个替换项。 您可以检查street_type的长度,并应用该长度所需的唯一过渡。

如果您尝试转换case语句,则需要遵循该逻辑模式,而不是您编码的连续应用程序。 您可以轻松查找如何在 Python 中模拟"case"语句。

还可以考虑使用翻译词典,例如

type_trans = {
"pl":    "Place",
"Pl":    "Place",
"PLACE": "Place",
...
}

那么你的改变很简单

mydata['Street_type'] = type_trans[mydata['Street_type']]

此外,您还可以列出元组中的所有变体,例如:

type_place = ("PL", "Pl", "pl", "PLACE", "place")
if mydata['Street_type'] in type_place
mydata['Street_type'] = "Place"

。但请务必针对整个街道类型列表正确概括这一点。

如果您在此处使用正确的正则表达式,则可以通过一次传递正确执行此操作,例如使用字边界 (\b(:

In [11]: places = ["PL", "pl", "Pl", "PLACE", "Place", "place"]
In [12]: mydata.Street_type
Out[12]:
0       PL
1       pl
2       Pl
3    PLACE
4    place
Name: Street_type, dtype: object
In [13]: mydata.Street_type.replace("(^|b)({})(b|$)".format("|".join(places)), "Place", regex=True)
Out[13]:
0    Place
1    Place
2    Place
3    Place
4    Place
Name: Street_type, dtype: object
#Needlemanwunch
def zeros(shape):
retval = []
for x in range(shape[0]):
retval.append([])
for y in range(shape[1]):
retval[-1].append(0)
return retval
match_award      = 10
mismatch_penalty = -3
gap_penalty      = -4 # both for opening and extanding
def match_score(alpha, beta):
if alpha == beta:
return match_award
elif alpha == '-' or beta == '-':
return gap_penalty
else:
return mismatch_penalty
def finalize(align1, align2):
align1 = align1[::-1]    #reverse sequence 1
align2 = align2[::-1]    #reverse sequence 2
i,j = 0,0
#calcuate identity, score and aligned sequeces
symbol = ''
found = 0
score = 0
identity = 0
for i in range(0,len(align1)):
# if two AAs are the same, then output the letter
if align1[i] == align2[i]:                
symbol = symbol + align1[i]
identity = identity + 1
score += match_score(align1[i], align2[i])
# if they are not identical and none of them is gap
elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-': 
score += match_score(align1[i], align2[i])
symbol += ' '
found = 0
#if one of them is a gap, output a space
elif align1[i] == '-' or align2[i] == '-':          
symbol += ' '
score += gap_penalty
identity = float(identity) / len(align1) * 100
print('Similarity =', "%3.3f" % identity, 'percent')
print('Score =', score)
#     print(align1)
#     print(symbol)
#     print(align2)

def needle(seq1, seq2):
m, n = len(seq1), len(seq2)  # length of two sequences
# Generate DP table and traceback path pointer matrix
score = zeros((m+1, n+1))      # the DP table
# Calculate DP table
for i in range(0, m + 1):
score[i][0] = gap_penalty * i
for j in range(0, n + 1):
score[0][j] = gap_penalty * j
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score[i - 1][j - 1] + match_score(seq1[i-1], seq2[j-1])
delete = score[i - 1][j] + gap_penalty
insert = score[i][j - 1] + gap_penalty
score[i][j] = max(match, delete, insert)
# Traceback and compute the alignment 
align1, align2 = '', ''
i,j = m,n # start from the bottom right cell
while i > 0 and j > 0: # end toching the top or the left edge
score_current = score[i][j]
score_diagonal = score[i-1][j-1]
score_up = score[i][j-1]
score_left = score[i-1][j]
if score_current == score_diagonal + match_score(seq1[i-1], seq2[j-1]):
align1 += seq1[i-1]
align2 += seq2[j-1]
i -= 1
j -= 1
elif score_current == score_left + gap_penalty:
align1 += seq1[i-1]
align2 += '-'
i -= 1
elif score_current == score_up + gap_penalty:
align1 += '-'
align2 += seq2[j-1]
j -= 1
# Finish tracing up to the top left cell
while i > 0:
align1 += seq1[i-1]
align2 += '-'
i -= 1
while j > 0:
align1 += '-'
align2 += seq2[j-1]
j -= 1
finalize(align1, align2)
needle('kizlerlo','killerpo' )
***********************************************************************************************************************
#import textdistance as txd
import numpy
txd.overlap('kizlerlo','kilerpo' )
txd.jaro('kizlerlo','killerpo' )
txd.cosine('kizlerlo','killerpo' )
#txd.needleman_wunsch('kizlerlo','killerpo' )
txd.jaro_winkler('kizlerlo','killerpo' )
#txd.smith_waterman('Loans and Accounts','Loans Accounts' )
#txd.levenshtein.normalized_similarity('Loans and Accounts','Loans Accounts' )
from scipy.spatial import distance
a = 'kizlerlo'
b = 'kilerpoo'
#txd.gotoh('Loans and Accounts','Loans Accounts' )
print(txd.needleman_wunsch.normalized_similarity('Loans and Accounts','Loans Accounts' ))
***************************************************************************************************************************
#Euclidean
import math
import numpy as np
def euclid(str1,str2):
dist=0.0
x=str1
y=str2
set1=set()
for a in range(0,len(x)):
set1.add(x[a])
for a in range(0,len(y)):
set1.add(y[a])
vec1=[None]*len(set1)
vec2=[None]*len(set1)
for counter,each_char in enumerate(set1):
vec1[counter]=x.count(each_char)
vec2[counter]=y.count(each_char)
dist=1/(1+math.sqrt(sum([(a - b) ** 2 for a, b in zip(vec1, vec2)])))
print(dist)
euclid('kizlerlo','killerpo')
***************************************************************************************************************************

from similarity.qgram import QGram
import affinegap
qgram = QGram(2)
#print(qgram.distance('kizlerlo', 'killerpo'))
affinegap.affineGapDistance('kizlerlokill' ,'erpozlerlzler')
***************************************************************************************************************************
#manhattan
def manhattan(str1,str2):
dist=0.0
x=str1
y=str2
set1=set()
for a in range(0,len(x)):
set1.add(x[a])
for a in range(0,len(y)):
set1.add(y[a])
vec1=[None]*len(set1)
vec2=[None]*len(set1)
for counter,each_char in enumerate(set1):
vec1[counter]=x.count(each_char)
vec2[counter]=y.count(each_char)
#dist= sum([np.abs(a - b) for a, b in zip(vec1, vec2)])
dist=1/(1+sum([np.abs(a - b) for a, b in zip(vec1, vec2)]))
print(dist)
manhattan('kizlerlo','killerpo')
import jellyfish
import json
from Levenshtein import distance,jaro_winkler,jaro,ratio,seqratio
def comp(a,b):
return jellyfish.jaro_winkler(a,b)*100 + distance(a,b) + jaro(a,b)*100
ip = {"CED":"WALMART INC_10958553"}
ala = {}
for index,row in df_ala.iterrows():
a = ip.get("CED")
b = row['NN_UID']
c = comp(a,b)
ala.update({row['N_UID'] : c})
ala_max = max(ala, key=ala.get)
ala_f = {"ALACRA" : ala_max}
ces_f = {"CESIUM" : "WALMART_10958553_CESIUM"}
dun_f = {"DUNS" : "WALMART_10958053_DUNS"}
ref_f = {"REF" : "WALMART INC_10958553_REF"}
cax_f = {"CAX" : "WALMART LTD_10958553_CAX"}
final_op = {**ala_f,**ces_f,**dun_f,**ref_f,**cax_f }
final_json = json.dumps(final_op)
print(final_json)
from flask import Flask,request, jsonify
app = Flask(__name__)
@app.route('/test',methods = ['GET','POST'])
def test():
if request.method == "GET":
return jsonify({"response":"Get request called"})
elif request.method == "POST":
req_Json = request.json
name = req_Json['name']
return jsonify({"response": "Hi" + name})
if __name__ == '__main__':
app.run(debug = True,port = 9090)
{
"name": "Mike"
}
import usaddress
import pandas as pd
import statistics 
#sa = dict(usaddress.parse('123 Main St. Suite  Chicago, IL' ))
adr = pd.read_excel('C:\VINAYAK\Address.xlsx')
adr.columns = ['Address']
strlen = []
scr = []
loop = adr['Address'].tolist()
for i in loop:
strlen.append(len(i))
x = statistics.median(strlen)
for i in loop:
sa = dict(usaddress.parse(i))
sa = list(sa.values())
a = 0
if len(i) > x :
a+= 5
if 'AddressNumber' in sa :  
a+= 23
if 'StreetName' in sa :  
#a = a + 20
a+= 17
if 'OccupancyType' in sa :  
a+= 6
if 'OccupancyIdentifier' in sa :  
a+= 12
if 'PlaceName' in sa :  
a+= 12
if 'StateName' in sa :  
a+= 13
if 'ZipCode' in sa :  
a+= 12
scr.append(a)
adr['Adr_Score'] = scr
adr.head()
#(pd.DataFrame([(key) for  key in sa.items()])).transpose()
#pd.DataFrame(dict([(value, key) for key, value in sa.items()])) 
#pd.DataFrame(dict([(value, key) for key, value in sa.items()]))
# df_ts = pd.DataFrame(columns = ['AddressNumber' , 'Age', 'City' , 'Country'])
# df_ts.append(sa, ignore_index=False, verify_integrity=False, sort=None)
# df_ts.head()
import pandas as pd
from zipfile import ZipFile
# core = []
# f = open('C:/Users/s.natarajakarayalar/1.txt','r')
# core.append(str(f.readlines()))
# print(core)
import os
import zipfile
import re
import nltk
import os
core = []
with zipfile.ZipFile('C:/Users/s.natarajakarayalar/TF.zip') as z:
a = 0
for filename in z.namelist():
#if a < 1:
#if not os.path.isdir(filename):
# read the file
with z.open(filename) as f:
#a = 2
x = f.readlines()
core = core + x
with open('C:/Users/s.natarajakarayalar/fins.txt', 'w') as f:
for item in core:
f.write("%sn" % item)
# for i in core:
#     if k < 5:
#         tkt = re.sub(r'.*CONTENT', '', i)
#         new_core.append(tkt)
#         k = k+1
# for item in core:
#     new_core.append(len(item.split()))
# print(sum(new_core))
# from nltk.tokenize import word_tokenize
# new_core = []
# stp = ['URL:https://','TITLE:b','META-KEYWORDS:','None','DOC ID:','CONTENT:b','URL:','TITLE:','META-CONTENT:']
# #new_core = [word for word in core if word not in stopwords]
# for i in core:
#     wk = word_tokenize(i)
#     for w in wk:
#         if w not in stp:
#             new_core.append(w)

最新更新