Run a function for each list in a list of lists, faster than a for loop [web scraping]



I am building a scraping script. I have a .csv file with customer details, one customer per row, with the customer name in the first cell of each row. I want to scrape Google News and take the top 5 results for each customer name.

Right now I have a for loop that iterates over each row, takes the customer name, runs the scraping function and prints the result. The downside is that it is quite slow, because it has to finish one row before it can move on to the next.

I was wondering whether there is something faster: I would like to read the rows from the csv file into a list of lists and then run the scrape function on all of the lists in parallel.

import bs4, requests
import csv
import re

class Scraper():
    def __init__(self):
        pass

    def ScrapeWebStr(self, account):
        base_url = 'https://news.google.com/search?q={}%20when%3A3d&hl=en-US&gl=US&ceid=US%3Aen'
        request = requests.get(base_url.format(account))
        webcontent = bs4.BeautifulSoup(request.content, 'lxml')
        counter = 0
        global articles_str
        articles_str = ""
        # All the news in GN have jslog 93789. This iterates over all the news
        for i in webcontent.findAll('div', {'jslog': '93789'}):
            if counter == 5:
                break
            else:
                # take the article link for each news item
                for link in i.findAll('a', attrs={'href': re.compile("/articles/")}, limit=1):
                    # filter articles that contain a keyword
                    if any(keyword in i.select_one('h3').getText() for keyword in keyword_list):
                        articles_str = articles_str + str(i.select_one('h3').getText()) + '\n' + "https://news.google.com" + str(link.get('href')) + '\n' + ('-' * 80) + '\n'
                        counter += 1
                        if counter == 5:
                            break

    def fileOpen(self, file):
        data = open(file, encoding='utf-8')
        csv_rows = list(csv.reader(data))
        return csv_rows

#FILTERS
keyword_list = ['ACQUISITION', 'Acquisition', 'BALANCE', 'BAN', 'BOND', 'BRAND', 'Balance', 'Ban', 'Bond', 'Brand', 'CAPITAL', 'COSTS', 'CRISIS', 'CUSTOMERS', 'Capital', 'Costs', 'Crisis', 'Customers', 'DEBT', 'DEMAND', 'Debt', 'Demand', 'ECONOMY', 'Economy', 'FINANCE', 'FINANCIAL', 'FUND', 'Finance', 'Financial', 'Fund', 'GROWTH', 'Growth', 'INVESTOR', 'IPO', 'IPO', 'Investor', 'LAYOFF', 'Layoff', 'MARKET', 'MERGER', 'Market', 'Merger', 'NEW', 'New', 'PAY', 'PAYABLES', 'PROFIT', 'Pay', 'Payables', 'Profit', 'RATING', 'RECEIVABLES', 'REVENUES', 'Rating', 'Receivables', 'Revenues', 'SALES', 'SHARE', 'SHEET', 'SIZE', 'STOCK', 'SUE', 'Sales', 'Share', 'Sheet', 'Size', 'Stock', 'Sue', 'TREND', 'Trend', 'USAGE', 'Usage', 'acquisition', 'balance', 'ban', 'bond', 'brand', 'capital', 'costs', 'crisis', 'customers', 'debt', 'demand', 'economy', 'finance', 'financial', 'fund', 'growth', 'investor', 'ipo', 'layoff', 'market', 'merger', 'new', 'pay', 'payables', 'profit', 'rating', 'receivables', 'revenues', 'sales', 'share', 'sheet', 'size', 'stock', 'sue', 'trend', 'usage'] #List of keywords to filter
#UPDATE + MAIL VERSION
Execution = Scraper()
metadata = Execution.fileOpen("Excel for Scraping.csv")
for row in metadata:
    Execution.ScrapeWebStr(row[0])
    print(articles_str)

The for loop slows everything down, and I still have unused computing power. I suspect the solution involves multiprocessing, but I don't understand how to take item[0] from each list in the master list and run it through multiprocessing without a for loop. I don't want you to write the code for me, but any hints would be a great help!
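To show the rough shape of what I am picturing, here is an untested sketch of the idea; "customers.csv" and the scrape helper are just placeholders for my real file and function:

import csv
from multiprocessing import Pool

def scrape(account):
    # placeholder: would run the Google News scrape for one account
    # and return its articles_str instead of printing it
    ...

with open("customers.csv", encoding="utf-8") as f:
    rows = list(csv.reader(f))          # list of lists, one inner list per customer
accounts = [row[0] for row in rows]     # first cell of each row = customer name

if __name__ == "__main__":
    with Pool() as pool:                # one worker process per CPU core by default
        for result in pool.map(scrape, accounts):
            print(result)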

Thanks a lot.

I used httpx with asyncio to make the requests concurrently, fake_useragent to spoof the User-Agent, and simplified the CSS selectors in the code. There is a full working example on Repl.it.

Note that I have commented out Execution.fileOpen and used a list of search terms instead.

from bs4 import BeautifulSoup
import csv, time, asyncio, httpx
from fake_useragent import UserAgent

class Scraper():
    def __init__(self):
        pass

    async def ScrapeWebStr(self, account: str) -> str:
        async with httpx.AsyncClient() as client:
            ua = UserAgent()
            headers = {"User-Agent": ua.random}
            params = {
                "q": f"{account} when",
                "hl": "en-US",
                "gl": "US",
                "ceid": "US:en"
            }
            response = await client.get(
                'https://news.google.com/search',
                params=params,
                headers=headers)
            webcontent = BeautifulSoup(response.text, 'lxml')
            counter = 0
            articles_str = ""
            # All the news in GN have jslog 93789. This iterates over all the news
            for i in webcontent.select('div[jslog="93789"]'):
                if counter == 5:
                    break
                else:
                    for link in i.select('h3 a[href*="/articles/"]'):
                        article_text = link.getText()
                        # filter articles that contain a keyword
                        if any(keyword in article_text
                               for keyword in KEYWORD_LIST):
                            articles_str += f"""{article_text}
https://news.google.com{link.get('href')}
{'-' * 80}
"""
                            counter += 1
                            if counter == 5:
                                break
            return articles_str

    def fileOpen(self, file):
        with open(file, encoding='utf-8') as data:
            return list(csv.reader(data))

# List of keywords to filter
KEYWORD_LIST = [
'ACQUISITION', 'Acquisition', 'BALANCE', 'BAN', 'BOND', 'BRAND', 'Balance',
'Ban', 'Bond', 'Brand', 'CAPITAL', 'COSTS', 'CRISIS', 'CUSTOMERS',
'Capital', 'Costs', 'Crisis', 'Customers', 'DEBT', 'DEMAND', 'Debt',
'Demand', 'ECONOMY', 'Economy', 'FINANCE', 'FINANCIAL', 'FUND', 'Finance',
'Financial', 'Fund', 'GROWTH', 'Growth', 'INVESTOR', 'IPO', 'IPO',
'Investor', 'LAYOFF', 'Layoff', 'MARKET', 'MERGER', 'Market', 'Merger',
'NEW', 'New', 'PAY', 'PAYABLES', 'PROFIT', 'Pay', 'Payables', 'Profit',
'RATING', 'RECEIVABLES', 'REVENUES', 'Rating', 'Receivables', 'Revenues',
'SALES', 'SHARE', 'SHEET', 'SIZE', 'STOCK', 'SUE', 'Sales', 'Share',
'Sheet', 'Size', 'Stock', 'Sue', 'TREND', 'Trend', 'USAGE', 'Usage',
'acquisition', 'balance', 'ban', 'bond', 'brand', 'capital', 'costs',
'crisis', 'customers', 'debt', 'demand', 'economy', 'finance', 'financial',
'fund', 'growth', 'investor', 'ipo', 'layoff', 'market', 'merger', 'new',
'pay', 'payables', 'profit', 'rating', 'receivables', 'revenues', 'sales',
'share', 'sheet', 'size', 'stock', 'sue', 'trend', 'usage'
]

async def main():
    start_time = time.monotonic()
    print(f"Started main")

    async def scrape_and_print(row):
        start_time = time.monotonic()
        print(f"Searching for '{row}'")
        # Execution.ScrapeWebStr(row[0])
        articles_str = await Execution.ScrapeWebStr(row)
        print(
            f"Finished searching for '{row}' in {time.monotonic() - start_time}"
        )
        print(articles_str)

    # UPDATE + MAIL VERSION
    Execution = Scraper()
    # metadata = Execution.fileOpen("Excel for Scraping.csv")
    metadata = ['stackoverflow', 'google']
    await asyncio.gather(
        *[scrape_and_print(row) for row in metadata], return_exceptions=True)
    print(f"Finished main in {time.monotonic() - start_time}")

asyncio.run(main())

Output

The trailing dot in news.google.com. is fine. An FQDN may end with a dot (sources: 1, 2, 3).

Started main
Searching for 'stackoverflow'
Searching for 'google'
Finished searching for 'stackoverflow' in 5.42717178100429
Stack Overflow reports strong growth from COVID-19 workplace changes
https://news.google.com./articles/CBMiY2h0dHBzOi8vd3d3LnpkbmV0LmNvbS9hcnRpY2xlL3N0YWNrLW92ZXJmbG93LXJlcG9ydHMtc3Ryb25nLWdyb3d0aC1mcm9tLWNvdmlkLTE5LXdvcmtwbGFjZS1jaGFuZ2VzL9IBbmh0dHBzOi8vd3d3LnpkbmV0LmNvbS9nb29nbGUtYW1wL2FydGljbGUvc3RhY2stb3ZlcmZsb3ctcmVwb3J0cy1zdHJvbmctZ3Jvd3RoLWZyb20tY292aWQtMTktd29ya3BsYWNlLWNoYW5nZXMv?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Stack Overflow gets $85M Series E funding to expand SaaS product
https://news.google.com./articles/CBMiZWh0dHBzOi8vd3d3LnRlY2hyZXB1YmxpYy5jb20vYXJ0aWNsZS9zdGFjay1vdmVyZmxvdy1nZXRzLTg1bS1zZXJpZXMtZS1mdW5kaW5nLXRvLWV4cGFuZC1zYWFzLXByb2R1Y3Qv0gFwaHR0cHM6Ly93d3cudGVjaHJlcHVibGljLmNvbS9nb29nbGUtYW1wL2FydGljbGUvc3RhY2stb3ZlcmZsb3ctZ2V0cy04NW0tc2VyaWVzLWUtZnVuZGluZy10by1leHBhbmQtc2Fhcy1wcm9kdWN0Lw?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Stack Overflow expands its Teams service with new integrations
https://news.google.com./articles/CAIiELYwgKfey9CR8SXDOjhn0zMqFAgEKg0IACoGCAowlIEBMLEXMOc_?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
MongoDB: A Database For The New Era
https://news.google.com./articles/CAIiEC1Lr6ctqyb6pi2PkH_xsfAqFggEKg0IACoGCAowkqEGMJBZMLLouwY?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
EXCLUSIVE: Communities, brand and then product — CEO explains how Stack Overflow flipped the script on softwar
https://news.google.com./articles/CBMivQFodHRwczovL3d3dy5idXNpbmVzc2luc2lkZXIuaW4vdGVjaC9lbnRlcnByaXNlL25ld3Mvc3RhY2stb3ZlcmZsb3ctY2VvLXByYXNoYW50aC1jaGFuZHJhc2VrYXItZXhwbGFpbnMtaG93LXN0YWNrLW92ZXJmbG93LWZsaXBwZWQtdGhlLXNjcmlwdC1vbi1zb2Z0d2FyZS1kZXZlbG9wbWVudC9hcnRpY2xlc2hvdy83ODg1NDU2NC5jbXPSAcEBaHR0cHM6Ly93d3cuYnVzaW5lc3NpbnNpZGVyLmluL3RlY2gvZW50ZXJwcmlzZS9uZXdzL3N0YWNrLW92ZXJmbG93LWNlby1wcmFzaGFudGgtY2hhbmRyYXNla2FyLWV4cGxhaW5zLWhvdy1zdGFjay1vdmVyZmxvdy1mbGlwcGVkLXRoZS1zY3JpcHQtb24tc29mdHdhcmUtZGV2ZWxvcG1lbnQvYW1wX2FydGljbGVzaG93Lzc4ODU0NTY0LmNtcw?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------

Finished searching for 'google' in 2.2639846410020255
Google Home: 5 ways to fix the issue when Google Assistant doesn't understand you
https://news.google.com./articles/CAIiEEH9NNUgXleahne_qOb-cRcqFQgEKgwIACoFCAow4GowoAgwhuCMBg?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Pocket Casts is up for sale nearly three years after acquisition by public radio consortium
https://news.google.com./articles/CAIiEE0kkabL7el7Et9iUgNvaFoqGQgEKhAIACoHCAowyoD5CjD5z-ACMM_rvwU?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Google reportedly requires new Android TV devices support AV1 video decoding
https://news.google.com./articles/CBMiUWh0dHBzOi8vd3d3LnhkYS1kZXZlbG9wZXJzLmNvbS9nb29nbGUtcmVxdWlyZXMtbmV3LWFuZHJvaWQtdHYtYXYxLXZpZGVvLWRlY29kaW5nL9IBVWh0dHBzOi8vd3d3LnhkYS1kZXZlbG9wZXJzLmNvbS9nb29nbGUtcmVxdWlyZXMtbmV3LWFuZHJvaWQtdHYtYXYxLXZpZGVvLWRlY29kaW5nL2FtcC8?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Google previewing new Nest Hub alarms with more customization, tones, and ‘Sunrise’
https://news.google.com./articles/CAIiEPFJ-0ZIW0D3KmRfgtg-QLkqGQgEKhAIACoHCAowyoD5CjD5z-ACMM_rvwU?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------
Google 'throwing its weight around' by burying links to some commercial news sites, experts say
https://news.google.com./articles/CAIiEMds5Hwsm-sBaVKmnmg4yZ8qFggEKg4IACoGCAow3vI9MPeaCDDciw4?hl=en-US&gl=US&ceid=US%3Aen
--------------------------------------------------------------------------------

Finished main in 6.28738788398914

Alternatively, you can use SerpApi to access the data extracted from Google News. It has a free trial.

Disclaimer: I work at SerpApi.
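For example, with SerpApi's google-search-results Python package a Google News query looks roughly like this (a sketch only; the api_key value is a placeholder and the exact result fields may differ):

from serpapi import GoogleSearch  # pip install google-search-results

params = {
    "engine": "google",
    "q": "stackoverflow",
    "tbm": "nws",              # restrict results to news
    "api_key": "YOUR_API_KEY"  # placeholder
}
results = GoogleSearch(params).get_dict()
for item in results.get("news_results", [])[:5]:
    print(item.get("title"))
    print(item.get("link"))
    print("-" * 80)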
