使用 Python BeautifulSoup 解析 script 标签中的内容



我正在尝试解析 script 标签中的内容,以便从中提取某些数据。以下代码需要使用一个有效的 Xbox Live 帐户才能运行。

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re
# Log in to Xbox Live through a real browser, copy the resulting session
# cookies into a requests.Session, then fetch the profile page and search one
# of its <script> tags for "DisplayName".

# Placeholder credentials; replace with a real account before running.
email = 'email'
password = 'password'

# NOTE(review): the find_element_by_* helpers were removed in newer Selenium 4
# releases -- confirm the installed Selenium version still provides them.
driver = webdriver.Chrome()
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
# Fixed sleeps give each step of the login form time to load before the next
# element is looked up.
time.sleep(3)
driver.find_element_by_xpath(""" //*[@id="i0116"] """).send_keys(email)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="i0118"] """).send_keys(password)
time.sleep(5)
driver.find_element_by_xpath(""" //*[@id="idSIButton9"] """).click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)

headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
s = requests.Session()
s.headers.update(headers)
# Copy every browser cookie into the requests session so that later requests
# are sent with the logged-in browser's cookies. Both statements must sit
# inside the loop; otherwise only the last cookie would be transferred.
for cookie in driver.get_cookies():
    c = {cookie['name'] : cookie['value']}
    s.cookies.update(c)
#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')

soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')
# The 14th <script> tag is picked by position; this raises IndexError if the
# page contains fewer script tags.
text = str(soup.find_all('script')[13])
# This pattern matches only the literal token "DisplayName", so `value` is a
# list of 'DisplayName' strings -- it does not capture the text that follows.
value = re.findall(r'DisplayName', text)
print(value)

我想获取每个 "DisplayName" 后面的数据,但没能做到:我只得到了 "DisplayName" 本身,而不是它对应的值。如果想更直观地了解数据的样子,可以打印 "text" 变量并在其中搜索 "DisplayName"。提前感谢所有回复的人。

你只得到 "DisplayName" 而没有得到它的值,是因为你让 re 只搜索了这个确切的短语,并没有告诉它还要继续匹配后面的字符,以及匹配到哪里为止。下面的例子使用的是单引号,但代码同样可以调整为匹配双引号。我先让 re 匹配 DisplayName,再用 .* 匹配它后面的字符,直到单引号为止。最后,只需把结果中不需要的部分替换掉即可。

import re
def extract_display_names(text):
    """Return every single-quoted value that follows ``DisplayName=`` in *text*.

    Args:
        text: The string to search.

    Returns:
        A list with one entry per ``DisplayName='...'`` occurrence, in order
        of appearance; an empty list when there is no match.
    """
    # The non-greedy ``.*?`` stops at the first closing quote, so several
    # quoted values on one line are matched separately instead of being
    # swallowed into a single match.
    matches = re.findall(r"DisplayName='.*?'", text)
    # Strip the prefix and the closing quote from each match individually
    # rather than editing the printed representation of the whole list.
    return [m.replace("DisplayName='", '').replace("'", '') for m in matches]


url = "DisplayName='PoppaBear4'"
info = extract_display_names(url)
print(', '.join(info))

最新更新