from bs4 import BeautifulSoup
from lxml import etree
import requests
import re
# Page to scrape: the csimarket.com "at_glance" page requested with code=AA.
URL = "https://csimarket.com/stocks/at_glance.php?code=AA"

# Headers sent with the GET request. The User-Agent is written as two adjacent
# string literals (implicit concatenation) because a single-quoted string
# cannot span physical lines; the original line break made this a SyntaxError.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}

# Download the page and parse it with the built-in HTML parser.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")

# lxml tree built from the same markup; not used by the rest of this script.
dom = etree.HTML(str(soup))

# Exact-match lookup on the full href, including the "?s=100" query string.
# NOTE: find() returns None when no element has exactly this href, in which
# case the following line raises AttributeError.
raw_html = soup.find(href="../Industry/Industry_Data.php?s=100")

# Remove the nested <span> so its text is not part of the printed output.
span = raw_html.find("span")
span.decompose()
print(raw_html.text.strip())
代码运行良好,但是 raw_html = soup.find(href="../Industry/Industry_Data.php?s=100") 中的 href 在我浏览其他页面时会有所不同(例如这一部分会变成 ../Industry/Industry_Data.php?s=1000)。
我如何只按 "../Industry/Industry_Data.php" 这一部分进行搜索?
使用 CSS 选择器(css selectors)来选择元素,并检查 <span> 是否存在:
# Select every <a> whose href contains the industry-data path; the `*=`
# attribute selector is a substring match, so the query string may vary.
for a in soup.select('a[href*="../Industry/Industry_Data.php"]'):
    # a.span is None when the link has no nested <span>; only remove it when
    # present so its text is not included in the printed output.
    if a.span:
        a.span.decompose()
    print(a.text.strip())
示例
from bs4 import BeautifulSoup
import requests
# Page to scrape: the csimarket.com "at_glance" page requested with code=AA.
URL = "https://csimarket.com/stocks/at_glance.php?code=AA"

# Headers sent with the GET request. The User-Agent is written as two adjacent
# string literals (implicit concatenation) because a single-quoted string
# cannot span physical lines.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}

# Download the page and parse it with the built-in HTML parser.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")

# Select every <a> whose href contains the industry-data path; the `*=`
# attribute selector is a substring match, so the query string may vary.
for a in soup.select('a[href*="../Industry/Industry_Data.php"]'):
    # a.span is None when the link has no nested <span>; only remove it when
    # present so its text is not included in the printed output.
    if a.span:
        a.span.decompose()
    print(a.text.strip())
输出
Industries At a Glance
Basic Materials
Aluminum
Aluminum
Basic Materials