如何在web scraper中实现多线程



我目前正在做一个项目，需要从数百个页面中提取数据。然而我发现整个提取过程耗时太长，因为爬虫必须处理 800 多个页面。我读过一些关于多进程（multiprocessing）的文章，相信它可以加快速度，但我不知道如何将它集成到我现有的代码中。

from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time
# Sequentially scrape the FCA "warnings" search-results pages and export the
# collected rows to a CSV file named after today's date.
final_data = []
for i in range(1, 8271, 10):
    # `i` is substituted into the `start` query parameter of the search URL.
    url = (f'https://www.fca.org.uk/news/search-results?np_category=warnings&start={i}')
    req = requests.get(url)
    start = time.process_time()  # NOTE(review): not read anywhere in the visible script
    page_html = req.content
    page_soup = soup(page_html, "lxml")
    data = page_soup.find_all('li', class_='search-item')
    print(f'Processing {url}')
    for x in data:
        # Named `record` rather than `list` so the builtin is not shadowed.
        record = {}
        record['name'] = x.find('a', 'search-item__clickthrough').text.strip()
        try:
            record['published_date'] = x.find('span', 'meta-item published-date').text
        except AttributeError:
            # find() gave no match, so `.text` was accessed on None; store the
            # same 'None' placeholder string the script has always used.
            record['published_date'] = 'None'
        # No fallback here: an item without a modified-date span raises
        # AttributeError and stops the script.
        record['modified_date'] = x.find('span', 'meta-item modified-date').text
        final_data.append(record)

# One row per scraped search item, written with a BOM-prefixed UTF-8 encoding.
df = pd.DataFrame(final_data)
TodaysDate = time.strftime("%Y%m%d")
csvfilename = TodaysDate + "_FCA Macro.csv"
df.to_csv(csvfilename, encoding="utf-8-sig")
from bs4 import BeautifulSoup as soup
import requests
import pandas as pd
import time
import threading
final_data = []
def scrape(i):
    """Fetch one search-results page and append its items to ``final_data``.

    Args:
        i: Value substituted into the ``start`` query parameter of the
            search URL.

    Side effects:
        Prints a progress line and appends one dict per ``li.search-item``
        element to the module-level ``final_data`` list. The dict keys are
        ``name``, ``published_date`` and ``modified_date``.

    Raises:
        AttributeError: If an item has no clickthrough link or no
            modified-date span (``.text`` is then accessed on ``None``).
    """
    url = (f'https://www.fca.org.uk/news/search-results?np_category=warnings&start={i}')
    req = requests.get(url)
    page_html = req.content
    page_soup = soup(page_html, "lxml")
    data = page_soup.find_all('li', class_='search-item')
    print(f'Processing {url}')
    for x in data:
        # Named `record` rather than `list` so the builtin is not shadowed.
        record = {}
        record['name'] = x.find('a', 'search-item__clickthrough').text.strip()
        try:
            record['published_date'] = x.find('span', 'meta-item published-date').text
        except AttributeError:
            # find() gave no match, so `.text` was accessed on None; store the
            # same 'None' placeholder string the original code used.
            record['published_date'] = 'None'
        # No fallback here, matching the original: a missing modified-date
        # span propagates AttributeError to the caller.
        record['modified_date'] = x.find('span', 'meta-item modified-date').text
        final_data.append(record)

# Start one worker thread per results page, keeping a handle to each so the
# main thread can wait for all of them.
threads = []
for i in range(1, 8271, 10):
    worker = threading.Thread(target=scrape, args=(i,))
    worker.start()
    threads.append(worker)

# Block until every worker has finished. Without this the DataFrame below is
# built while final_data is still empty or only partly filled, because
# Thread.start() returns immediately.
for worker in threads:
    worker.join()

# Rows appear in whatever order the threads completed, not in page order.
df = pd.DataFrame(final_data)
TodaysDate = time.strftime("%Y%m%d")
csvfilename = TodaysDate + "_FCA Macro.csv"
df.to_csv(csvfilename, encoding="utf-8-sig")

更多文档请参阅：https://realpython.com/intro-to-python-threading/

只需使用 for 循环来创建新线程，并把 i 作为参数传给每个新创建的线程。需要调用 `.start()` 来启动线程。