Web scraping challenge in Python



I am trying to web-scrape this link in Python. The ideal output is a dataframe with 4 columns: date, author, title and text. So far I have extracted the author, title and date like this:

from bs4 import BeautifulSoup
import requests

payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

req = requests.post(url, headers=headers, data=payload)
print(req)

soup = BeautifulSoup(req.content, "lxml")
data = []
for card in soup.select('.documentList tbody tr'):
    date = card.select_one('.item_date').get_text(strip=True)
    title = card.select_one('.title a').get_text()
    author = card.select_one('.authorlnk.dashed').get_text().strip()
    data.append({
        'date': date,
        'title': title,
        'author': author
    })
print(data)

Now I am struggling to extract the text behind each of the 10 links on the page. This is what I am trying:

data = []
for link in soup.select('.documentList tbody tr'):
    r = BeautifulSoup(requests.get(f"https://www.bis.org{link['href']}").content, features="lxml")
    data.append({
        'Text': ''.join([str(e) for e in r.select('p')])
    })

However, this does not give me any useful results.

Can anyone help me with this? Thanks!

You are close to your goal; you just need to make the request for the text inside the for loop (note the link lives on the row's a tag, not on the tr itself):

for card in soup.select('.documentList tbody tr'):
    r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content, "lxml")
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': f"https://www.bis.org{card.a.get('href')}",
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })

Full example:

from bs4 import BeautifulSoup
import pandas as pd
import requests

payload = 'from=&till=&objid=cbspeeches&page=&paging_length=10&sort_list=date_desc&theme=cbspeeches&ml=false&mlurl=&emptylisttext='
url = 'https://www.bis.org/doclist/cbspeeches.htm'
headers = {
    "content-type": "application/x-www-form-urlencoded",
    "X-Requested-With": "XMLHttpRequest"
}

req = requests.post(url, headers=headers, data=payload)
soup = BeautifulSoup(req.content, "lxml")

data = []
for card in soup.select('.documentList tbody tr'):
    # fetch the detail page linked from the row and parse it
    r = BeautifulSoup(requests.get(f"https://www.bis.org{card.a.get('href')}").content, "lxml")
    data.append({
        'date': card.select_one('.item_date').get_text(strip=True),
        'title': card.select_one('.title a').get_text(strip=True),
        'author': card.select_one('.authorlnk.dashed').get_text(strip=True),
        'url': f"https://www.bis.org{card.a.get('href')}",
        'text': r.select_one('#cmsContent').get_text('\n\n', strip=True)
    })

pd.DataFrame(data)
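If a detail page happens to have no #cmsContent element, r.select_one('#cmsContent') returns None and the .get_text() call raises an AttributeError. A defensive variant (a sketch of mine, not part of the original answer) that also reuses a single requests.Session for the ten sequential downloads:

import requests
from bs4 import BeautifulSoup

session = requests.Session()  # reuse one connection for all detail pages

def fetch_text(path):
    # Illustrative helper (hypothetical name): returns the speech text,
    # or '' when the page has no #cmsContent container.
    page = BeautifulSoup(session.get(f"https://www.bis.org{path}").content, "lxml")
    content = page.select_one('#cmsContent')
    return content.get_text('\n\n', strip=True) if content else ''

Inside the loop, 'text': fetch_text(card.a.get('href')) then replaces the direct .select_one() call.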

You can use the separator= parameter of .get_text() so that text taken from neighbouring tags does not run together.
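A quick standalone illustration of what separator= changes (this mini-snippet is my addition, not part of the original answer):

from bs4 import BeautifulSoup

# Without a separator the two cells run together; with one they stay apart.
row = BeautifulSoup("<td>08 Jul 2022</td><td>Brainard</td>", "html.parser")
print(row.get_text(strip=True))                 # 08 Jul 2022Brainard
print(row.get_text(strip=True, separator=" "))  # 08 Jul 2022 Brainard

Applied to the speeches list, posting the form payload for two result pages: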

import requests
import pandas as pd
from bs4 import BeautifulSoup

api_url = "https://www.bis.org/doclist/cbspeeches.htm"
payload = {
    "from": "",
    "till": "",
    "objid": "cbspeeches",
    "page": "1",
    "paging_length": "25",
    "sort_list": "date_desc",
    "theme": "cbspeeches",
    "ml": "false",
    "mlurl": "",
    "emptylisttext": "",
}

all_data = []
# iterate over result pages by writing the page number straight into the payload
for payload["page"] in range(1, 3):
    soup = BeautifulSoup(
        requests.post(api_url, data=payload).content, "html.parser"
    )
    for row in soup.select(".item"):
        date = row.select_one(".item_date").get_text(strip=True)
        author = row.select_one(".authorlnk").get_text(strip=True)
        title = row.a.get_text(strip=True)
        text = row.select_one(".info").get_text(strip=True, separator=" ")
        all_data.append((date, author, title, text))

df = pd.DataFrame(all_data, columns=["Date", "Author", "Title", "Text"])
print(df.head(5).to_markdown(index=False))

Prints:

(output truncated: a markdown table with columns Date, Author, Title and Text; the first five rows cover speeches by Brainard, Sam Woods, Pablo Hernández de Cos, Mário Centeno and Pan Gongsheng, dated 4-8 July 2022)
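Since the goal was a dataframe with a proper date column, one optional follow-up (my addition, assuming the site renders dates like "08 Jul 2022"):

import pandas as pd

# Parse "08 Jul 2022"-style strings; errors="coerce" turns anything
# unexpected into NaT instead of raising.
df["Date"] = pd.to_datetime(df["Date"], format="%d %b %Y", errors="coerce")
df = df.sort_values("Date", ascending=False)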