How to extract downloadable links from a table of links, and scrape multiple pages



I want to extract all the .doc links that appear when I click a particular title in the table (the announcements, in this case).

With the code below, I can only extract the first-level titles, dates, and links, one page at a time:

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
import pandas as pd
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import requests

frame = []
# Run Chrome headless so the pages render without opening a browser window.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)

for page_number in range(1, 78):
    url = 'http://example.com/index{}.html'.format(page_number)
    driver.get(url)
    html = etree.HTML(driver.page_source)
    # Rows 2..last of the announcement table.
    extract_announcements_list = html.xpath('//table[@id="14681"]/tbody/tr/td/table[@width="90%"][position()>=2 and position() <= (last())]')
    for i in extract_announcements_list:
        date = i.xpath('./tbody/tr/td[3]/text()')
        title = i.xpath('./tbody/tr/td[2]/font/a/@title')
        link = i.xpath('./tbody/tr/td[2]/font/a/@href')
        real_link = 'http://example.com' + link[0]
        print(title, date, real_link)
        frame.append({
            'title': title,
            'link': real_link,
            'date': date,
            'content': doc_link,  # this is the doc_link I want to extract in the second level
        })

dfs = pd.DataFrame(frame)
dfs.to_csv('myscraper.csv', index=False, encoding='utf-8-sig')

I have spent hours looking for a solution. I would really appreciate it if someone could help me extract the second-level link to get the content of the .doc link ('content': doc_link), and show me how to scrape every page of the site.

Many thanks in advance!
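To make the goal concrete, here is a rough sketch of what I imagine for each announcement page, assuming the detail pages expose the .doc files as plain <a href> anchors; the get_doc_links helper and the .doc-extension check are just my guesses, not something I have working:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_doc_links(real_link):
    # Fetch the second-level (detail) page and collect every .doc href on it.
    soup = BeautifulSoup(requests.get(real_link).content, 'html.parser')
    doc_links = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        if href.lower().endswith('.doc'):  # assumption: files are linked with a plain .doc extension
            doc_links.append(urljoin(real_link, href))  # resolve relative paths against the detail page URL
    return doc_links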

UPDATE: Many thanks to @Ares Zephyr for sharing your code. Below are the changes I made to my code based on that suggestion, but it does not produce any results for the inner links.

from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import requests

frame = []
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=chrome_options)

for page_number in range(1, 2):
    url = 'http://example.com/index{}.html'.format(page_number)
    print('Downloading page %s...' % url)
    driver.get(url)
    html = etree.HTML(driver.page_source)
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page, "html.parser")
    extract_announcements_list = html.xpath('//table[@id="14681"]/tbody/tr/td/table[@width="90%"][position()>=2 and position() <= (last())]')
    for i in extract_announcements_list:
        date = i.xpath('./tbody/tr/td[3]/text()')
        title = i.xpath('./tbody/tr/td[2]/font/a/@title')
        link = i.xpath('./tbody/tr/td[2]/font/a/@href')
        real_link = 'http://example.com' + link[0]
        # Fetch the second-level (detail) page and look at every anchor on it.
        soup = BeautifulSoup(requests.get(real_link).content, 'html.parser')
        for doc_link in soup.findAll('a'):
            thelink = doc_link.get('href')
            frame.append({
                'title': title,
                'link': real_link,
                'date': date,
                'doclink': thelink,
            })

dfs = pd.DataFrame(frame)
dfs.to_csv('myscraper.csv', index=False, encoding='utf-8-sig')
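In case it helps to narrow things down, this is the variant of the inner loop I am experimenting with so that only .doc links get stored rather than every anchor on the detail page (a sketch only; the endswith('.doc') filter assumes the files are linked with a plain .doc extension):

        for doc_link in soup.findAll('a', href=True):
            thelink = doc_link.get('href')
            # Keep only hrefs that point at .doc files; skip navigation links etc.
            if not thelink.lower().endswith('.doc'):
                continue
            frame.append({
                'title': title,
                'link': real_link,
                'date': date,
                'doclink': thelink,
            })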

You need to indent this part of your code so that the append call runs for every scraped item. I believe this is also what @arundeep chohan is trying to highlight.

frame.append({
    'title': announcement_title,
    'link': real_link,
    'date': announcement_date,
    'content': doc_link,  # this is the doc_link I want to extract in the second level
})

The logic for finding the document files is as follows; please adapt it and use it. This is part of the code I use to download PDF files.

for link in soup.findAll('a'):
    theLink = link.get('href')
    name = link.string
    # Logic to find .pdf files
    if theLink[-4:] == ".pdf":
        fileExtension = ".pdf"
