如何使用python/panda从href获取href链接



我需要获取href中存在的href链接(我已经有了(,所以我需要点击该href链接并收集另一个href。我试过了,但从那个代码中只得到了第一个href,想点击那个并收集上一个中出现的href。那我怎么能这么做呢。我尝试过:

from bs4 import BeautifulSoup
import requests
url = 'https://www.iea.org/oilmarketreport/reports/'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
#soup.prettify()
#table = soup.find("table")
#print(table)
links = []
for href in soup.find_all(class_='omrlist'):
#print(href)
links.append(href.find('a').get('href'))
print(links) 

这里是如何循环以获得报告url

import requests
root_url = 'https://www.iea.org'
def getLinks(url):
all_links = []
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
for href in soup.find_all(class_='omrlist'):
all_links.append(root_url + href.find('a').get('href'))  # add prefix 'http://....'
return all_links
yearLinks = getLinks(root_url + '/oilmarketreport/reports/')
# get report URL
reportLinks = []
for url in yearLinks:
links = getLinks(url)
reportLinks.extend(links)
print(reportLinks)
for url in reportLinks:
if '.pdf' in url:
url = url.replace('../../..', '')
# do download pdf file
....
else:
# do extract pdf url from html and download it
....
....

现在你可以循环reportLinks来获得pdf url

最新更新