替换Python中的ASCII字符



我有一个数据帧(DataFrame),其中的URL包含一堆需要清理的ASCII转义字符(例如%2F、%20):

# Sample frame for the question: each state column holds either a
# percent-encoded URL or the integer 0 as a "no link" placeholder.
# Adjacent string literals inside the parentheses are concatenated by Python,
# so every URL is one string with no embedded line breaks.
df = pd.DataFrame({
    'Webpage': ['Webpage_A', 'Webpage_B', 'Webpage_C', 'Webpage_D'],
    'NSW and VIC': [
        # NOTE(review): '%Australia' in this URL looks like a dropped '2F'
        # (the other rows use '%2FAustralia'); kept as posted - confirm.
        ('https://contentspace.global.com/teams/Australia/NSW/Documents/Forms/AllItems.aspx?'
         'RootFolder=%2Fteams%Australia%2FNSW%2FDocuments%2FIn%20Scope%2FA%20I%20TOPPER%20GROUP&FolderCTID='
         '0x01200016BC4CE0C21A6645950C100F37A60ABD&View=%7B64F44840%2D04FE%2D4341%2D9FAC%2D902BB54E7F10%7D'),
        ('https://contentspace.global.com/teams/Australia/Victoria/Documents/Forms/AllItems.aspx?RootFolder'
         '=%2Fteams%2FAustralia%2FVictoria%2FDocuments%2FIn%20Scope&FolderCTID=0x0120006984C27BA03D394D9E2E95FB'
         '893593F9&View=%7B3276A351%2D18C1%2D4D32%2DADFF%2D54158B504FCC%7D'),
        0,
        0,
    ],
    'Other States': [
        0,
        0,
        ('https://contentspace.global.com/teams/Australia/WA/Documents/Forms/AllItems.aspx?'
         'RootFolder=%2Fteams%2FAustralia%2FWA%2FDocuments%2FIn%20Scope&FolderCTID=0x012000EDE8B08D50FC3741A5'
         '206CD23377AB75&View=%7B287FFF9E%2DD60C%2D4401%2D9ECD%2DC402524F1D4A%7D'),
        ('https://contentspace.global.com/teams/Australia/QLD/Documents/Forms/AllItems.aspx?RootFolder=%'
         '2Fteams%2FAustralia%2FQLD%2FDocuments%2FIn%20Scope%2FAACO%20GROUP&FolderCTID=0x012000E689A6C1960E8'
         '648A90E6EC3BD899B1A&View=%7B6176AC45%2DC34C%2D4F7C%2D9027%2DDAEAD1391BFC%7D'),
    ],
})

DataFrame的输出如下所示:

NSW and VIC                                     Other States                            Webpage
0   https://contentspace.global.com/teams/Australi...   0                                   Webpage_A
1   https://contentspace.global.com/teams/Australi...   0                                   Webpage_B
2   0                                     https://contentspace.global.com/teams/Australi... Webpage_C
3   0                                     https://contentspace.global.com/teams/Australi... Webpage_D

要映射的ASCII字符如下:

# Lookup table from a percent-escape ("%XX") to the character it should be
# replaced with: one column per escape, a single row holding the replacement.
# Keys %21-%7E cover printable ASCII; nan marks escapes left unmapped.
# NOTE(review): entries from %80 upward look like Windows-1252 code points -
# confirm before relying on them (URLs are normally UTF-8 percent-encoded).
df_ASCII = pd.DataFrame({'%21': ['!'],
'%22': ['"'],  # straight double quote (U+0022), not a typographic quote
'%23': ['#'],
'%24': ['$'],
'%25': ['%'],
'%26': ['&'],
'%27': ["'"],  # straight apostrophe (U+0027), not a typographic quote
'%28': ['('],
'%29': [')'],
'%2A': ['*'],
'%2B': ['+'],
'%2C': [','],
'%2D': ['-'],  # ASCII hyphen-minus (U+002D), not an en dash
'%2E': ['.'],
'%2F': ['/'],
'%30': ['0'],
'%31': ['1'],
'%32': ['2'],
'%33': ['3'],
'%34': ['4'],
'%35': ['5'],
'%36': ['6'],
'%37': ['7'],
'%38': ['8'],
'%39': ['9'],
'%3A': [':'],
'%3B': [';'],
'%3C': ['<'],
'%3D': ['='],
'%3E': ['>'],
'%3F': ['?'],
'%40': ['@'],
'%41': ['A'],
'%42': ['B'],
'%43': ['C'],
'%44': ['D'],
'%45': ['E'],
'%46': ['F'],
'%47': ['G'],
'%48': ['H'],
'%49': ['I'],
'%4A': ['J'],
'%4B': ['K'],
'%4C': ['L'],  # no trailing space: the replacement must be exactly one character
'%4D': ['M'],
'%4E': ['N'],
'%4F': ['O'],
'%50': ['P'],
'%51': ['Q'],
'%52': ['R'],
'%53': ['S'],
'%54': ['T'],
'%55': ['U'],
'%56': ['V'],
'%57': ['W'],
'%58': ['X'],
'%59': ['Y'],
'%5A': ['Z'],
'%5B': ['['],
'%5C': [nan],
'%5D': [']'],
'%5E': ['^'],
'%5F': ['_'],
'%60': ['`'],
'%61': ['a'],
'%62': ['b'],
'%63': ['c'],
'%64': ['d'],
'%65': ['e'],
'%66': ['f'],
'%67': ['g'],
'%68': ['h'],
'%69': ['i'],
'%6A': ['j'],
'%6B': ['k'],
'%6C': ['l'],
'%6D': ['m'],
'%6E': ['n'],
'%6F': ['o'],
'%70': ['p'],
'%71': ['q'],
'%72': ['r'],
'%73': ['s'],
'%74': ['t'],
'%75': ['u'],
'%76': ['v'],
'%77': ['w'],
'%78': ['x'],
'%79': ['y'],
'%7A': ['z'],
'%7B': ['{'],
'%7C': ['|'],
'%7D': ['}'],
'%7E': ['~'],
'%7F': [nan],
'%80': ['€'],
'%81': [nan],
'%82': ['‚'],
'%83': ['ƒ'],
'%84': ['„'],
'%85': ['…'],
'%86': ['†'],
'%87': ['‡'],
'%88': ['ˆ'],
'%89': ['‰'],
'%8A': ['Š'],
'%8B': ['‹'],
'%8C': ['Œ'],
'%8D': [nan],
'%8E': ['Ž'],
'%8F': [nan],
'%90': [nan],
'%91': ['‘'],
'%92': ['’'],
'%93': ['“'],
'%94': ['”'],
'%95': ['•'],
'%96': ['–'],
'%97': ['—'],
'%98': ['˜'],
'%99': ['™'],
'%9A': ['š'],
'%9B': ['›'],
'%9C': ['œ'],
'%9D': [nan],
'%9E': ['ž'],
'%9F': ['Ÿ'],
'%A0': [nan],
'%A1': ['¡'],
'%A2': ['¢'],
'%A3': ['£'],
'%A4': [nan],
'%A5': ['¥'],
'%A6': ['|'],
'%A7': ['§'],
'%A8': ['¨'],
'%A9': ['©'],
'%AA': ['ª'],
'%AB': ['«'],
'%AC': ['¬'],
'%AD': ['¯'],
'%AE': ['®'],
'%AF': ['¯'],
'%B0': ['°'],
'%B1': ['±'],
'%B2': ['²'],
'%B3': ['³'],
'%B4': ['´'],
'%B5': ['µ'],
'%B6': ['¶'],
'%B7': ['·'],
'%B8': ['¸'],
'%B9': ['¹'],
'%BA': ['º'],
'%BB': ['»'],
'%BC': ['¼'],
'%BD': ['½'],
'%BE': ['¾'],
'%BF': ['¿'],
'%C0': ['À'],
'%C1': ['Á'],
'%C2': [nan],
'%C3': ['Ã'],
'%C4': ['Ä'],
'%C5': ['Å'],
'%C6': ['Æ'],
'%C7': ['Ç'],
'%C8': ['È'],
'%C9': ['É'],
'%CA': [nan],
'%CB': ['Ë'],
'%CC': ['Ì'],
'%CD': ['Í'],
'%CE': ['Î'],
'%CF': ['Ï'],
'%D0': ['Ð'],
'%D1': ['Ñ'],
'%D2': ['Ò'],
'%D3': ['Ó'],
'%D4': ['Ô'],
'%D5': ['Õ'],
'%D6': ['Ö'],
'%D7': [nan],
'%D8': ['Ø'],
'%D9': ['Ù'],
'%DA': ['Ú'],
'%DB': ['Û'],
'%DC': ['Ü'],
'%DD': ['Ý'],
'%DE': ['Þ'],
'%DF': ['ß'],
'%E0': ['à'],
'%E1': ['á'],
'%E2': ['â'],
'%E3': ['ã'],
'%E4': ['ä'],
'%E5': ['å'],
'%E6': ['æ'],
'%E7': ['ç'],
'%E8': ['è'],
'%E9': ['é'],
'%EA': ['ê'],
'%EB': ['ë'],
'%EC': ['ì'],
'%ED': ['í'],
'%EE': ['î'],
'%EF': ['ï'],
'%F0': ['ð'],
'%F1': ['ñ'],
'%F2': ['ò'],
'%F3': ['ó'],
'%F4': ['ô'],
'%F5': ['õ'],
'%F6': ['ö'],
'%F7': ['÷'],
'%F8': ['ø'],
'%F9': ['ù'],
'%FA': ['ú'],
'%FB': ['û'],
'%FC': ['ü'],
'%FD': ['ý'],
'%FE': ['þ'],
'%FF': ['ÿ']})

我试着一个接一个地替换它们(如下所示),这完全没有问题,但我似乎无法使完整的ASCII映射发挥作用,即使我试图将其转换为列表或字典:

# Replace one escape ('%2F' -> '/') in each URL column.
cols = ['NSW and VIC', 'Other States']
for i in cols:
    # regex=False treats '%2F' as literal text rather than a pattern.
    # NOTE(review): non-string cells (the 0 placeholders) presumably come back
    # as NaN from the .str accessor - confirm if the zeros must be kept.
    df[i] = df[i].str.replace('%2F', '/', regex=False)

所以问题是:如何清理df中多列中的所有ASCII字符?

这里有一个简单得多的解决方案。您可以使用urllib.parse.unquote()解码url。

from urllib.parse import unquote
# Decode every percent-escape in both URL columns with urllib's unquote().
# Only str cells are decoded; any non-string placeholder (the integer 0 used
# here, but also NaN/None) is returned unchanged instead of raising TypeError.
df["NSW and VIC"] = df["NSW and VIC"].apply(lambda x: unquote(x) if isinstance(x, str) else x)
df['Other States'] = df["Other States"].apply(lambda x: unquote(x) if isinstance(x, str) else x)
df

输出:

>>> df['NSW and VIC'][1]
'https://contentspace.global.com/teams/Australia/Victoria/Documents/Forms/AllItems.aspx?RootFolder=/teams/Australia/Victoria/Documents/In Scope&FolderCTID=0x0120006984C27BA03D394D9E2E95FB893593F9&View={3276A351-18C1-4D32-ADFF-54158B504FCC}'

最新更新