Parse <script type="text/javascript" twitter python



页面源码很长,我需要从下面这段脚本中解析出 screen_name:

<script type="text/javascript" charset="utf-8" nonce="YjJmNTAwODgtODBmMy00YzQ5LWJhODItMmQwNTk0Yjg4MTI1">window.__INITIAL_STATE__={"optimist":[],"urt":{},"toasts":[],"needs_phone_verification":false,"normal_followers_count":2,"notifications":false,"pinned_tweet_ids_str":[],"profile_image_url_https":"https://pbs.twimg.com/profile_images/1174197230003208192/qK5cqalJ_normal.jpg","profile_interstitial_type":"","protected":false,"featureSwitch":{"config":{"2fa_multikey_management_enabled":{"value":false},"screen_name":"Vickson25435099","always_use_https":true,"use_cookie_personalization":false,"sleep_time":{"enabled":false,"end_time":null,"start_time":null},"geo_enabled":false,"language":"en","discoverable_by_email":true,"discoverable_by_mobile_phone":true,"personalized_trends":true,"allow_media_tagging":"none","allow_contributor_request":"all","allow_ads_personalization":true,"allow_logged_out_device_personalization":true,"allow_location_history_personalization":true,"allow_sharing_data_for_third_party_personalization":false,"allow_dms_from":"following","allow_dm_groups_from":"following","translator_type":"none","country_code":"us","nsfw_user":false,"nsfw_admin":false,"ranked_timeline_setting":1,"ranked_timeline_eligible":null,"address_book_live_sync_enabled":false,"universal_quality_filtering_enabled":"enabled","dm_receipt_setting":"all_disabled","alt_text_compose_enabled":null,"mention_filter":"unfiltered","allow_authenticated_periscope_requests":true,"protect_password_reset":false,"require_password_login":false,"requires_login_verification":false,"dm_quality_filter":"enabled","autoplay_disabled":false,"settings_metadata":{}},"fetchStatus":"loaded"},"dataSaver":{"dataSaverMode":false},"transient":{"dtabBarInfo":{"hide":false},"loginPromptShown":false,"lastViewedDmInboxPath":"/messages","themeFocus":""}},"devices":{"browserPush":{"fetchStatus":"none","pushNotificationsPrompt":{"dismissed":false,"fetchStatus":"none"},"subscribed":false,"supported":null},"devices":{"data"
:{"emails":[],"phone_numbers":[]},"fetchStatus":"none"},"notificationSettings":{"push_settings":{"error":null,"fetchStatus":"none"},"push_settings_template":{"template":{"settings":[]}},"sms_settings":{"error":null,"fetchStatus":"none"},"sms_settings_template":{"template":{"settings":[]}},"checkin_time":null}},"audio":{"conversationLookup":{}},"hashflags":{"fetchStatus":"none","hashflags":{}},"friendships":{"pendingFollowers":{"acceptedIds":[],"ids":[],"fetchStatus":{"bottom":"none","top":"none"},"hydratedIds":[]}},"homeTimeline":{"useLatest":false,"fetchStatus":"none"},"multiAccount":{"fetchStatus":"none","users":[],"badgeCounts":{},"addAccountFetchStatus":"none"},"badgeCount":{"unreadDMCount":0},"ocf_location":{"startLocation":{}},"navigation":{},"teams":{"fetchStatus":"none","teams":{}},"cardState":{},"promotedContent":{}};window.__META_DATA__={"env":"prod","isLoggedIn":true,"isRTL":false,"hasMultiAccountCookie":false,"uaParserTags":["m2","rweb","msw"],"serverDate":1614578006755,"sha":"9921d3a6d626dc45b0f5a65681ef95c891d815cd"};window.__PREFETCH_DATA__={"items":[{"key":"dataUsageSettings","payload":{"dataSaverMode":false}}],"timestamp":1614578006700};</script>

我正在尝试这个方法

import requests
import json
from bs4 import BeautifulSoup
# Asker's first attempt: fetch the page, then try to decode the JSON assigned
# to window.__INITIAL_STATE__ and read "screen_name" from it.
# NOTE(review): the indentation of the for/if bodies is missing in this
# listing, so the exact nesting is not recoverable from the text alone.
x = requests.get('https://twitter.com/home')
b = BeautifulSoup(x.text, 'html.parser')
# The loop variable reuses the name `b`, shadowing the parsed document.
for b in b.find_all('script'):
# The split is applied to the whole response text (x.text), not to the
# current <script> element, so the loop variable is never actually used.
wis = x.text.split('window.__INITIAL_STATE__=')
# `data` is only assigned when the marker is present in the response.
if len(wis) > 1:
# Cuts the text at the first ';' that follows the marker, then decodes it.
data = json.loads(wis[1].split(';')[0])

# Looks "screen_name" up at the top level of the decoded object; in the sample
# payload shown in the question the key sits under "featureSwitch" -> "config",
# which is consistent with the KeyError reported below.
print(data["screen_name"])

结果:KeyError: 'screen_name'。下面这种方式也行不通:

import requests
import json

# Asker's second attempt: slice the JSON out of the raw response text without
# using an HTML parser.
x = requests.get('https://twitter.com/home')
# Index [0] keeps the text that comes BEFORE the marker (the start of the
# page), not the JSON that follows it.
html = x.text.split('window.__INITIAL_STATE__=')[0]
html = html.split(';</script>')[0]
# Because `html` holds the leading part of the page rather than JSON, decoding
# fails on the very first character, as the traceback below shows.
data = json.loads(html)
print(data['screen_name'])

结果

Traceback (most recent call last):
File "<string>", line 8, in <module>
File "/usr/lib/python3.8/json/__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "/usr/lib/python3.8/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/usr/lib/python3.8/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
> 

使用完整源代码更新

# Find the inline <script> that assigns window.__INITIAL_STATE__, decode the
# JSON object that follows the assignment, and print the screen name stored in
# it. `b` is the BeautifulSoup document built from the page source.
marker = 'window.__INITIAL_STATE__='
for script in b.find_all('script'):
    # Empty or external (<script src=...>) tags have no text child; indexing
    # script.contents[0] on them raises IndexError, so fall back to ''.
    text = script.string or ''
    if marker not in text:
        continue
    payload = text.split(marker, 1)[1].lstrip()
    # raw_decode parses exactly one JSON value and ignores whatever follows it
    # (";window.__META_DATA__=..." etc.), so no hand-picked end delimiter is
    # needed to cut the payload.
    data, _ = json.JSONDecoder().raw_decode(payload)
    # Raises KeyError when the page was fetched without a logged-in session,
    # because the user settings are then absent from the state.
    print(data["settings"]["remote"]["settings"]["screen_name"])
    break

不过这样是拿不到 screen_name 的:它只会出现在当前已登录用户的页面数据中,因此你必须在 requests 请求中带上有效的 cookie 才能获取到这些数据。

顺便说一句,在上面的示例中,脚本里包含多个变量(JSON);你需要的是位于 window.__INITIAL_STATE__= 与 ,"devices" 之间的那一段 JSON:

# Parse the page source in `html`, find the inline <script> that assigns
# window.__INITIAL_STATE__, decode the JSON object that follows the
# assignment, and print the screen name stored under featureSwitch.config.
b = BeautifulSoup(html, 'html.parser')
marker = 'window.__INITIAL_STATE__='
for script in b.find_all('script'):
    # Empty or external (<script src=...>) tags have no text child; indexing
    # script.contents[0] on them raises IndexError, so fall back to ''.
    text = script.string or ''
    if marker not in text:
        continue
    payload = text.split(marker, 1)[1].lstrip()
    # raw_decode stops at the end of the first complete JSON value, so the
    # text does not have to be cut at a hand-picked delimiter such as
    # ',"devices"' (which breaks when that key is nested inside the object).
    data, _ = json.JSONDecoder().raw_decode(payload)
    # Raises KeyError if the decoded state has no such entry (e.g. the page
    # was fetched without valid login cookies).
    print(data['featureSwitch']['config']['screen_name'])
    break

最新更新