从带有校验和的动态链接中抓取pdf



我一直在尝试从这样的页面上抓取pdf:https://www.oecd-ilibrary.org/science-and-technology/oecd-digital-economy-papers_20716826?page=4

我尝试过用 BeautifulSoup 解析页面来提取 pdf 链接,但没有成功。

如何抓取实际的pdf文档?

import trio
import httpx
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin, urlparse
# Listing page for the "OECD Digital Economy Papers" series; individual
# result pages are fetched by adding a ?page=N query parameter.
mainurl = 'https://www.oecd-ilibrary.org/science-and-technology/oecd-digital-economy-papers_20716826'

async def downloader(client, link, channel):
    """Fetch one PDF at *link* and save it in the working directory.

    The local file name is the last segment of the URL path.  *channel*
    is a clone of the job channel; closing it when this task ends lets
    the channel machinery track outstanding work.
    """
    target = urlparse(link).path.rsplit('/', 1)[-1]
    async with channel, await trio.open_file(target, 'wb') as out:
        response = await client.get(link)
        await out.write(response.content)
        print(f'Downloaded: {link}')

async def get_links(content):
    """Return a generator of absolute PDF URLs found in *content*.

    Parses only the results container (id="collectionsort") to keep the
    soup small, then resolves each ``a.action-pdf`` href against the
    listing URL.
    """
    strainer = SoupStrainer(id='collectionsort')
    soup = BeautifulSoup(content, 'lxml', parse_only=strainer)
    anchors = soup.select('a.action-pdf')
    return (urljoin(mainurl, anchor['href']) for anchor in anchors)

async def worker(channel):
    """Consume (client, page, nursery) jobs and spawn a download per PDF.

    Each job fetches one listing page, extracts its PDF links, and
    starts a ``downloader`` task for every link, handing each task its
    own clone of the channel.
    """
    async with channel:
        async for client, page, nurse in channel:
            response = await client.get(mainurl, params={'page': page})
            for pdf_url in await get_links(response.text):
                nurse.start_soon(downloader, client, pdf_url, channel.clone())

async def main():
    """Drive the crawl: five concurrent workers over listing pages 1-18."""
    async with httpx.AsyncClient(timeout=None) as client, trio.open_nursery() as nurse:
        tx, rx = trio.open_memory_channel(0)
        async with rx:
            # Five consumers, each holding its own clone of the receive end.
            for _ in range(5):
                nurse.start_soon(worker, rx.clone())
            async with tx:
                # The series spans 18 listing pages.
                for page in range(1, 19):
                    await tx.send([client, page, nurse])

if __name__ == "__main__":
    try:
        trio.run(main)
    except KeyboardInterrupt:
        # Exit cleanly on Ctrl-C instead of dumping a traceback.
        raise SystemExit('Bye!')

最新更新