从 URL 读取并解码 JSON



我需要的 JSON 中包含土耳其语字符。我使用的编程语言是 Python 3.5。为了从 URL 读取 JSON,我尝试了两种不同的方法,但分别遇到了不同的问题。首先,我用 urlopen 读取该 URL,代码如下:

# First attempt: download the response and run it through an HTML parser.
import pprint
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=6784eb9d038057a0821a7c905fd5f263&config[item_category]=Ym9yc2E=&config[item_title]=QUtCTks=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvYWtibmstYWtiYW5rLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=5a8cadfa04b533f95ae83f0b9e530091&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"
html =  urlopen(url)
#print(html.read())
# read() yields the raw response bytes; they are decoded as UTF-8 below.
data = html.read()
# The text is parsed as HTML markup, not as JSON, so the JSON-level \uXXXX
# escape sequences in the body are left untouched and show up verbatim in the
# printed output.
soup = BeautifulSoup(data.decode('utf-8'),"lxml")
print(soup)

但是,我得到的输出中含有未解码的字符,例如 \u00f6、\u0131。输出如下:

<html><body><p>mynetComment.render({"config":
{"service":"finanspano","moderation":"1","item_alias":"6784eb9d038057a0821a7c905fd5f263","item_category":"Ym9yc2E=","item_title":"QUtCTks=","item_url":"aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvYWtibmstYWtiYW5rLw==","profile":"0","share_email":"1","share_fb":"1","share_tw":"1","profile_pattern":"Iw==","pagination":"1","pagination_pattern":"aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=","comment_per_page":"5","page":"2","reply_count":"2","title":"yorumlar","hash":"5a8cadfa04b533f95ae83f0b9e530091"},"data":{"mynetUsername":null,"ordering":"desc","orderBy":"c.created","items":[{"id":"4037034","parent_id":"0","child":"0","item_id":"448","comment":"parau015fu00fctlerinizi taku0131n her ihtimale karu015fu0131","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-30 11:45:36","user":"sekmentx_2014","clike":"2","cdislike":"0","ip":"1372981766","clikeTotal":"2","ctotal":"2","timeDiff":"2541843","like":"+2","timePast":"4 hafta u00f6nce"},{"id":"4034275","parent_id":"0","child":"0","item_id":"448","comment":"au015fau011fu0131lardan almasu0131nu0131 bilene yukaru0131dan satmasu0131nu0131 bilene gu00fczel ortamlar oluu015fuyor","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 15:45:37","user":"sekmentx_2014","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2613842","like":"+1","timePast":"1 ay u00f6nce"},{"id":"4033970","parent_id":"0","child":"0","item_id":"448","comment":"kar cebe yaku0131u015fu0131r aku0131llu0131 olanlara","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 14:58:55","user":"sekmentx_2014","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2616644","like":"+1","timePast":"1 ay u00f6nce"},{"id":"4032505","parent_id":"0","child":"0","item_id":"448","comment":"en gu00fczeli satmak nazlana nazlana u00e7u0131ku0131yor ne dersiniz iu015flem hacimleri iyice 
du00fcu015ftu00fc","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-29 11:04:45","user":"erdal_1972_pknez","clike":"1","cdislike":"0","ip":"1372981766","clikeTotal":"1","ctotal":"1","timeDiff":"2630694","like":"+1","timePast":"1 ay u00f6nce"},{"id":"4023515","parent_id":"0","child":"0","item_id":"448","comment":"Akbank u00e7u00f6ken sistemi ile iyi bir zarar edecek bugu00fcn u00f6yle gu00f6ru00fcnu00fcyor. yazu0131klar olsun bu devirde bilgi iu015flem sistemin u00e7u00f6ku00fcyor yahu.","can_reply":"1","share":"1.0.0","status":"1","created":"2017-06-22 15:36:53","user":"ekin_yildirim_2015","clike":"1","cdislike":"0","ip":"3578451480","clikeTotal":"1","ctotal":"1","timeDiff":"3219166","like":"+1","timePast":"1 ay u00f6nce"}],"total":"908","totalPage":182}});</p></body></html>

其次,我使用了以下方法:

# Second attempt: feed the decoded response body straight into json.loads().
import urllib.request, json
url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=f89e64e27edc887b8ed3314fe8562eb2&config[item_category]=Ym9yc2E=&config[item_title]=R0FSQU4=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvZ2FyYW4tZ2FyYW50aS1iYW5rYXNpLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=e80cdd0e7a3dd9f4bbc393517386781c&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"
# json.loads() is handed the entire body. NOTE(review): judging by the sample
# output of the first attempt, the body presumably starts with
# `mynetComment.render(` rather than `{`, which would explain the
# "Expecting value ... (char 0)" error reported for this line.
data = json.loads(urllib.request.urlopen(url).read().decode('utf-8'))
print(data)

我收到以下错误:

Traceback (most recent call last):
  File "G:/Internship/quantsol-text/web-crawler/mynet_new/date_gaining.py", line 17, in <module>
    data = json.loads(urllib.request.urlopen(url).read().decode('utf-8'))
  File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\__init__.py", line 319, in loads
    return _default_decoder.decode(s)
  File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "C:\Users\nihadazimli\AppData\Local\Programs\Python\Python35\lib\json\decoder.py", line 357, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)

出现这个错误,是因为该网页返回的内容并不是纯 JSON:从上面的输出可以看到,JSON 被包裹在 mynetComment.render(...); 这样的函数调用里。
因此我们需要先从文本中把 JSON 部分提取出来。针对这个问题的一个变通办法是使用 re 模块,从文本中提取出 JSON 形式的内容。

import json
import re
import urllib.request

url = "http://finanspano.mynet.com/index/index/?config[service]=finanspano&config[moderation]=1&config[item_alias]=f89e64e27edc887b8ed3314fe8562eb2&config[item_category]=Ym9yc2E=&config[item_title]=R0FSQU4=&config[item_url]=aHR0cDovL2ZpbmFucy5teW5ldC5jb20vYm9yc2EvaGlzc2VsZXIvZ2FyYW4tZ2FyYW50aS1iYW5rYXNpLw==&config[profile]=0&config[share_email]=1&config[share_fb]=1&config[share_tw]=1&config[profile_pattern]=Iw==&config[pagination]=1&config[pagination_pattern]=aHR0cDovL2ZpbmFuc3Bhbm8ubXluZXQuY29tL2NsaWVudC5waHA/cGFnZT17UEFHRX0=&config[comment_per_page]=5&config[page]=2&config[reply_count]=2&config[title]=yorumlar&config[hash]=e80cdd0e7a3dd9f4bbc393517386781c&data[orderBy]=c.created&data[ordering]=desc&orderChanged=1"


def extract_json(text):
    """Parse the JSON object embedded in *text* and return it.

    The text may wrap the object in arbitrary non-JSON content, such as a
    JavaScript call like ``callback({...});``. Everything from the first
    ``{`` to the last ``}`` is handed to ``json.loads``, which also turns
    ``\\uXXXX`` escapes into the real characters.

    Raises:
        ValueError: if *text* contains no ``{...}`` span, or if the span is
            not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # Greedy match from the first '{' to the last '}'. DOTALL is required so
    # that '.' also crosses line breaks inside the payload.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match is None:
        raise ValueError("no JSON object found in the response text")
    return json.loads(match.group(0))


if __name__ == "__main__":
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8')
    json_data = extract_json(data)
    print(json_data)

这里的正则表达式本质上是提取从第一个左花括号 { 到最后一个右花括号 } 之间(含两端花括号)的全部内容。

最新更新