Recursively collecting links from subdirectories of a web page with asyncio



I am trying to write a program that collects all of the links on a web page, including the links inside its subdirectories. I did this with the requests package, but it is slow when links have to be collected from many subdirectories. Here is my working code, which takes about 4 minutes to gather everything under https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/.

import requests
import re
from bs4 import BeautifulSoup
def get_html(base_url):
    req = requests.get(base_url)
    return req.text if (req.status_code == 200) else ''
def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")
    regex = r'(.nc$)|(/$)'
    links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
    return links
def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r'/$', link)]
    return sub_dirs 
def get_files(links):
    file_links = [link for link in links if re.search(r'.nc$', link)]
    return file_links
    
def main(base_url):
    files = []
    html_page = get_html(base_url)
    links = get_links(html_page)
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.append(base_files)
    for sub in sub_dirs:
        sub_files = main(sub)
        files.append(sub_files)
        
    return files
# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = main(base_url)

I believe the bottleneck in the code is the get_html() function, which takes a few seconds to fetch the HTML for each page.
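
A quick way to check where the time goes is to time a single page fetch; the snippet below is only an illustrative aside, not part of the original program:

# Illustrative check: time one request for the top-level directory listing.
import time
import requests

url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
start = time.perf_counter()
requests.get(url)
print(f"one GET took {time.perf_counter() - start:.2f} s")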

I think the code could be optimized with asynchronous functions, but I am having a hard time getting that to work. Here is my attempt at an async version of the code:

import aiohttp
import asyncio
import re
from bs4 import BeautifulSoup
 
async def get_html_async(base_url):
    async with aiohttp.ClientSession() as client:
        async with client.get(base_url) as resp:
            return await resp.text() if (resp.status == 200) else ''
        
def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")
    regex = r'(.nc$)|(/$)'
    links = [f"{base_url}{link.get('href')}" for link in soup.findAll('a', attrs={'href': re.compile(regex)})]
    return links
def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r'/$', link)]
    return sub_dirs 
def get_files(links):
    file_links = [link for link in links if re.search(r'.nc$', link)]
    return file_links
  
async def get_tasks(session):
    async with aiohttp.ClientSession() as client:
        async with client.get(url) as resp:
            return await resp.text() if (resp.status == 200) else ''
  
    
async def main(base_url):
    files = []
    html_page = await asyncio.gather(get_html_async(base_url))
    links = get_links(html_page[0])
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.append(base_files)
    for sub in sub_dirs:
        sub_files = await asyncio.gather(main(sub))
        files.append(sub_files)
        
    return files
# Run program
base_url = 'https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/'
files = asyncio.gather(main(base_url))

Any help would be greatly appreciated. Thanks.

By calling asyncio.gather() this way, you are still running the requests sequentially, just as before. asyncio.gather() takes multiple awaitables as arguments so that it can run them concurrently; calling asyncio.gather() with a single awaitable is pointless, because then you could simply await it.
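
To make the difference concrete, here is a toy example (not part of the original question or answer) in which asyncio.sleep() stands in for an HTTP request; the sequential version takes about three seconds, the gathered version about one:

# Toy illustration only: asyncio.sleep() plays the role of a slow HTTP request.
import asyncio
import time

async def fake_fetch(i):
    await asyncio.sleep(1)  # pretend this is one network request
    return i

async def sequential():
    # awaiting one coroutine at a time -> ~3 seconds in total
    return [await fake_fetch(i) for i in range(3)]

async def concurrent():
    # handing all coroutines to gather at once -> ~1 second in total
    return await asyncio.gather(*(fake_fetch(i) for i in range(3)))

for make_coro in (sequential, concurrent):
    start = time.perf_counter()
    asyncio.run(make_coro())
    print(f"{make_coro.__name__}: {time.perf_counter() - start:.1f} s")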

You get a significant speedup by creating all of the coroutines in main() without awaiting them and then passing them all to asyncio.gather():

# some minor fixes added
import asyncio
import re
from itertools import chain
import aiohttp
from bs4 import BeautifulSoup

async def get_html_async(base_url):
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(ssl=False) # I got ssl errors on my machine
    ) as client:
        async with client.get(base_url) as resp:
            return await resp.text() if (resp.status == 200) else ""

def get_links(html_page):
    soup = BeautifulSoup(html_page, "lxml")  # removed "html.parser"
    regex = r"(.nc$)|(/$)"
    links = [
        f"{base_url}{link.get('href')}"
        for link in soup.findAll("a", attrs={"href": re.compile(regex)})
    ]
    return links

def get_sub_dirs(links):
    sub_dirs = [link for link in links if re.search(r"/$", link)]
    return sub_dirs

def get_files(links):
    file_links = [link for link in links if re.search(r".nc$", link)]
    return file_links

async def main(base_url):
    files = []
    html_page = await get_html_async(base_url)
    links = get_links(html_page) # removed indexing 'html_page[0]'
    sub_dirs = get_sub_dirs(links)
    base_files = get_files(links)
    files.extend(base_files) # extend list to get "cleaner" output, keep using 'append' if your downstream code requires it
    coros = [main(sub) for sub in sub_dirs] # create all requests
    new_files = await asyncio.gather(*coros) #  run all requests concurrently
    files.extend(chain(*new_files)) # again, add to list as needed
    return files

# Run program
base_url = "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/"
files = asyncio.run(main(base_url)) # or simply 'await main(base_url)' in IPython
print(files)
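
Two further tweaks are worth considering on top of the code above; neither is part of the answer's code, and the crawl(), fetch() and run() names and the concurrency limit below are illustrative only. The aiohttp documentation recommends reusing a single ClientSession for many requests rather than opening a new one per request, and resolving each href against the page it was found on (with urllib.parse.urljoin) avoids prefixing every link with the top-level base_url, which matters if the directory listing uses relative hrefs. A rough sketch under those assumptions:

# A sketch only: one shared ClientSession, hrefs resolved per page, capped concurrency.
import asyncio
import re
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

async def fetch(client, sem, url):
    # the semaphore caps how many requests are in flight at the same time
    async with sem, client.get(url) as resp:
        return await resp.text() if resp.status == 200 else ""

async def crawl(client, sem, page_url):
    html = await fetch(client, sem, page_url)
    soup = BeautifulSoup(html, "lxml")
    # resolve every href against the page it was found on, so files inside
    # a subdirectory get that subdirectory's URL as their prefix
    links = [urljoin(page_url, a.get("href"))
             for a in soup.find_all("a", href=re.compile(r"(\.nc$)|(/$)"))]
    files = [link for link in links if link.endswith(".nc")]
    # only descend into directories below the current page (skips "Parent Directory")
    sub_dirs = [link for link in links
                if link.endswith("/") and link.startswith(page_url) and link != page_url]
    # recurse into all subdirectories concurrently, sharing the same session
    results = await asyncio.gather(*(crawl(client, sem, sub) for sub in sub_dirs))
    for sub_files in results:
        files.extend(sub_files)
    return files

async def run(base_url, max_concurrency=20):
    sem = asyncio.Semaphore(max_concurrency)
    async with aiohttp.ClientSession() as client:
        return await crawl(client, sem, base_url)

# files = asyncio.run(run(base_url))

The semaphore is optional; it simply keeps the crawler from opening hundreds of simultaneous connections to the server.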
