Download all files with the .nc extension from a page



I am trying to download all of the netCDF (.nc) files here: https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/2000/

import urllib3
from bs4 import BeautifulSoup
site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov//data//avhrr-land-normalized-difference-vegetation-index//access//'
html = site.request('GET', base_url + '//' + '2000')
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('.nc')

However, after running this code, list_urls is empty. How can I fix it?
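Note that find_all('.nc') looks for tags whose name is literally ".nc", which is why list_urls comes back empty; on that page the .nc names only appear as the link text and href of <a> tags in the directory listing. A minimal sketch that collects them from the hrefs instead (assuming the page is a plain directory index with <a href="..."> entries):

import urllib3
from bs4 import BeautifulSoup

base_url = 'https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/'
http = urllib3.PoolManager()
resp = http.request('GET', base_url + '2000/')
soup = BeautifulSoup(resp.data, "lxml")

# Keep only anchors whose href ends in ".nc"
nc_files = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.nc')]
print(len(nc_files), nc_files[:3])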

Here is what I did with soup.find_all(text=lambda t: ".nc" in t), along with a progress bar :)

import sys
import requests
import urllib3
import humanize
from bs4 import BeautifulSoup

site = urllib3.PoolManager()
base_url = 'https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/'
html = site.request('GET', base_url + '2000/')
soup = BeautifulSoup(html.data, "lxml")

# The directory listing shows each file name as link text, so match strings containing ".nc"
link_urls = soup.find_all(text=lambda t: ".nc" in t)

for link in link_urls:
    download_link = "{}2000/{}".format(base_url, link)
    r = requests.get(download_link, stream=True)
    total_length = r.headers.get('content-length')
    print("\nDownloading: {}\nTotal size: {}".format(
        download_link, humanize.naturalsize(total_length or 0)))
    with open(link, "wb") as f:
        print("Downloading %s" % link)
        if total_length is None:  # no Content-Length header
            f.write(r.content)
        else:
            dl = 0
            total_length = int(total_length)
            for data in r.iter_content(chunk_size=4096):
                dl += len(data)
                f.write(data)
                # Draw a simple 50-character progress bar
                done = int(50 * dl / total_length)
                sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done)))
                sys.stdout.flush()
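If an extra dependency is acceptable, tqdm can replace the hand-rolled progress bar. A self-contained sketch combining it with the href-based link extraction shown earlier (tqdm and the href filter are assumptions of mine, not part of the code above):

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm  # extra dependency, not used in the original code

base_url = 'https://www.ncei.noaa.gov/data/avhrr-land-normalized-difference-vegetation-index/access/2000/'
page = requests.get(base_url)
soup = BeautifulSoup(page.text, "lxml")

# Collect the .nc file names from the directory listing
nc_files = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.nc')]

for name in nc_files:
    r = requests.get(base_url + name, stream=True)
    total = int(r.headers.get('content-length', 0))
    # Stream each file to disk while tqdm tracks the bytes written
    with open(name, "wb") as f, tqdm(total=total, unit='B', unit_scale=True, desc=name) as bar:
        for chunk in r.iter_content(chunk_size=4096):
            f.write(chunk)
            bar.update(len(chunk))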
