Python抓取谷歌搜索结果



我正在尝试抓取谷歌搜索结果的所有数据——标题、URL和描述。但是,我无法获取搜索结果的描述,它返回了一个空字符串。

# check Chrome version: Menu (the three dots in the upper right corner) -> Help -> About Google Chrome
# download ChromeDriver according to the Chrome version (example version 79)
# download from https://sites.google.com/a/chromium.org/chromedriver/downloads
# place the chromedriver.exe file in the current working directory
# pip install selenium
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from bs4.element import Tag
import pandas as pd
import random

# Scrape the title, URL and description of every Google result for each
# keyword listed in keywords.csv and write all rows to final_dataset.csv.
from urllib.parse import quote_plus

keywords = pd.read_csv('keywords.csv', header=0, index_col=None)

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
rows = []
for i in keywords['keyword']:
    # Scraper that gives back: titles, links, descriptions
    # quote_plus keeps spaces, '&' or '#' inside a keyword from corrupting
    # the query string.
    google_url = (
        "https://www.google.com/search?gl=US&q=" + quote_plus(str(i))
        + "&num=" + str(10)
    )
    driver = webdriver.Chrome()
    try:
        driver.get(google_url)
        # Wait a random 15-49 seconds before reading the page source.
        time.sleep(random.randrange(15, 50))
        page_source = driver.page_source
    finally:
        # Always close the browser, otherwise one Chrome process per keyword
        # is left running.
        driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')
    for r in soup.find_all('div', attrs={'class': 'g'}):
        link = r.find('a', href=True)
        if link is None:
            # find() returns None (never '') when there is no anchor, so a
            # result without a link is skipped explicitly.
            continue

        title = r.find('h3')
        if isinstance(title, Tag):
            title = title.get_text()

        # If the page has no <span class="st"> inside this result, find()
        # returns None and the description column stays empty for this row.
        description = r.find('span', attrs={'class': 'st'})
        if isinstance(description, Tag):
            description = description.get_text()

        # Drop results whose title or description text is an empty string.
        if title != '' and description != '':
            rows.append({
                'keyword': i,
                'title': title,
                'url': link['href'],
                'description': description,
            })

df = pd.DataFrame(rows, columns=['keyword', 'title', 'url', 'description'])
df.to_csv(r'final_dataset.csv', index=False)

有人知道如何获取谷歌搜索结果中的描述吗？

使用以下代码获取描述节点。

# select_one() returns the first matching Tag (or None); select() would return
# a list, which an isinstance(description, Tag) check never matches.
description = r.select_one('.aCOpRe span:not(.f)')

此外，您可以使用requests代替selenium。完整示例可在在线IDE中查看。

from requests import Session
from bs4 import BeautifulSoup
from bs4.element import Tag
import pandas as pd
# Scrape the title, URL and description of every Google result for each
# keyword listed in keywords.csv and write all rows to final_dataset.csv.
keywords = pd.read_csv('keywords.csv', header=0, index_col=None)

# The request headers are the same for every keyword, so build them once.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 Edg/80.0.361.62"
}

# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0 and copied the whole frame
# on every call.
rows = []

# One session is shared by all keywords so the connection can be reused.
with Session() as session:
    for i in keywords['keyword']:
        # Scraper that gives back: titles, links, descriptions
        params = {"q": i, 'gl': 'US', 'num': 10}
        response = session.get(
            "https://google.com/search", params=params, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')

        for result in soup.find_all('div', attrs={'class': 'g'}):
            link = result.find('a', href=True)
            if link is None:
                # find() returns None (never '') when there is no anchor, so
                # a result without a link is skipped explicitly.
                continue

            title = result.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()

            # select_one() yields a single Tag or None. select() returns a
            # list, which never passed the isinstance check below and ended
            # up in the CSV as a list of tags instead of text.
            description = result.select_one('.aCOpRe span:not(.f)')
            if isinstance(description, Tag):
                description = description.get_text()

            # Drop results whose title or description text is an empty string.
            if title != '' and description != '':
                rows.append({
                    'keyword': i,
                    'title': title,
                    'url': link['href'],
                    'description': description,
                })

df = pd.DataFrame(rows, columns=['keyword', 'title', 'url', 'description'])
df.to_csv(r'final_dataset.csv', index=False)

或者,您可以通过SerpApi从谷歌搜索中提取数据。


免责声明:我在SerpApi工作

最新更新