如何从网站上抓取PDF文件的URL ?



谁能帮我处理最后一个列表 EventLinks(即下面代码中的 EvidenceLink)?实际上,我想抓取以下代码中提到的 PDF 的 URL 以及其他数据。但是,我一直无法从这个页面获取到 URL:https://ibbi.gov.in/public-announcement?ann=&title=&date=

# Scrape the announcements table column by column with Selenium.
# `driver` and the accumulator lists (Name_, Date_, EventType_, EvidenceLink_)
# are not defined in this snippet; they are expected to exist already.
# NOTE(review): find_elements_by_xpath is the legacy Selenium locator API;
# recent Selenium releases expect driver.find_elements(By.XPATH, ...) instead.
# Confirm against the installed Selenium version.

# Every column is addressed relative to the same table rows.
ROW_XPATH = '/html/body/div[5]/div/div/div/div/div/div/div/div[2]/table/tbody/tr'

CompanyName = driver.find_elements_by_xpath(ROW_XPATH + '/td[4]')
Date = driver.find_elements_by_xpath(ROW_XPATH + '/td[2]')
EventType = driver.find_elements_by_xpath(ROW_XPATH + '/td[1]')
EvidenceLink = driver.find_elements_by_xpath(ROW_XPATH + '/td[7]/a')

for cell in CompanyName:
    company_name = cell.text
    print(company_name)
    Name_.append(company_name)

for cell in Date:
    date_text = cell.text
    print(date_text)
    Date_.append(date_text)

for cell in EventType:
    event_type = cell.text
    print(event_type)
    EventType_.append(event_type)

for link in EvidenceLink:
    # Storing the WebElement itself yields no URL. The PDF address is taken
    # from the anchor's 'onclick' handler, so slice the "https://...pdf"
    # span out of that attribute.
    onclick = link.get_attribute('onclick') or ''
    start = onclick.find('https://')
    end = onclick.find('.pdf', start) if start != -1 else -1
    if end != -1:
        pdf_url = onclick[start:end + 4]  # +4 keeps the ".pdf" suffix
    else:
        # No PDF address in the handler: fall back to a plain href, or ''.
        pdf_url = link.get_attribute('href') or ''
    print(pdf_url)
    EvidenceLink_.append(pdf_url)

URL的XPATH是-/html/body/div[5]/div/div/div/div/div/div/div/div/div/div/div[2]/table/tbody/tr[1]/td[7]/a

您需要从第 7 个 td 单元格内 a 标签的 'onclick' 属性中提取 PDF 的 URL:

import time

import requests as rq
from bs4 import BeautifulSoup as bs

# Collect every row of the paginated public-announcement table into
# `final_data` (one list per table row), replacing the 7th column with the
# PDF URL taken from the link's 'onclick' attribute.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
final_data = []
for curr_page in range(1, 300):  # requests pages 1 through 299 (range() excludes 300)
    url = "https://ibbi.gov.in/public-announcement?ann=&title=&date=&page=%s" % curr_page
    # SECURITY NOTE: verify=False turns off TLS certificate validation.
    # It is kept to preserve the original behaviour; remove it if the
    # site's certificate chain validates in your environment.
    resp = rq.get(url, headers=headers, verify=False)
    resp.raise_for_status()  # surface HTTP errors instead of parsing an error page
    soup = bs(resp.content, "lxml")

    # Stop paging cleanly when a page has no results table, rather than
    # crashing with IndexError/AttributeError.
    wrapper = soup.find("div", {"class": "table-responsive"})
    table = wrapper.find('tbody') if wrapper is not None else None
    if table is None:
        break

    data = []
    for row in table.find_all("tr"):
        row_data = []
        for icol, col in enumerate(row.find_all('td')):
            # Only column index 6 is inspected for a PDF link.
            link = col.find('a') if icol == 6 else None
            onclick = link.get('onclick', '') if link is not None else ''
            start = onclick.find('https://')
            end = onclick.find('.pdf', start) if start != -1 else -1
            if end != -1:
                row_data.append(onclick[start:end + 4])  # +4 keeps ".pdf"
            else:
                # Ordinary cell, or a link cell without a PDF address:
                # keep the visible text.
                row_data.append(col.text.strip())
        data.append(row_data)
    final_data.extend(data)
    time.sleep(2)  # pause between page requests

运行后得到如下输出:

[['Public Announcement of Corporate Insolvency Resolution Process',
'07-09-2021',
'21-09-2021',
'SUBHASHRI BIO-ENERGIES PRIVATE LIMITED',
'Indian Overseas Bank',
'Palanigounder Eswaramoorthy',
'https://ibbi.gov.in//uploads/announcement/9c534bb61bb51c02ec1b59df7c9f416b.pdf',
''],
['Public Announcement of Corporate Insolvency Resolution Process',
'06-09-2021',
'17-09-2021',
'VME PROPERTIES PRIVATE LIMITED',
'ALCHEMIST ASSET RECONSTRUCTION COMPANY LIMITED',
'Sapan Mohan Garg',
'https://ibbi.gov.in//uploads/announcement/a9fc3a2266d4138b6fc693696dd1f6f9.pdf',
''],
...]

相关内容

  • 没有找到相关文章

最新更新