我想从链接列表中下载。
# The six RFQ PDF links to download, one per line.
test_list = [
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/8/SPE1C122Q0058.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/8/SPE2DH22Q0028.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/9/SPE2DH22Q0029.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/3/SPE2DS22Q0023.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/1/SPE2DS22Q0031.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/3/SPE2DS22Q0033.PDF',
]
但是这个脚本会把同一个文件重复下载出许多副本。如何避免这种情况?我只想把列表中的六个 PDF 文件各下载一次。
# Chrome configuration. The commented-out lines are optional settings
# (sandbox flags, headless mode, download preferences) that are currently
# disabled.
options = webdriver.ChromeOptions()
# options.add_argument('--no-sandbox')
# # options.add_argument('--disable-dev-shm-usage')
# options.headless = True
# prefs = {"download.default_directory": zip_dir,
# "download.directory_upgrade": True,
# "download.manager.showWhenStarting": False,
# "download.manager.useWindow": False,
# "helperApps.alwaysAsk.force":False,
# "download.manager.showAlertOnComplete": False}
# options.add_experimental_option("prefs", prefs)

# NOTE(review): the positional driver path and find_element_by_id() are
# Selenium 3 style APIs; confirm the installed Selenium version supports them.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# Open each link exactly once, pausing a random 3-9 seconds before each
# request.
for URL in test_list:
    sleep(randint(3, 9))
    driver.get(URL)
    try:
        # Click the "butAgree" button when the page shows one.
        driver.find_element_by_id("butAgree").click()
    except Exception:
        # Best effort: a page without the button (or a failed click) is
        # ignored. Unlike a bare ``except``, this still lets
        # KeyboardInterrupt and SystemExit stop the script.
        pass
也尝试过:
# Second attempt: iterate over the URLs directly instead of by index.
for URL in test_list:
    # Random 3-9 second pause before each page load.
    sleep(randint(3, 9))
    driver.get(URL)
    try:
        # Click the "butAgree" button when the page shows one.
        driver.find_element_by_id("butAgree").click()
    except Exception:
        # Best effort: ignore a missing button or failed click, but do not
        # swallow KeyboardInterrupt / SystemExit as a bare ``except`` would.
        pass
你可以不用 Selenium,requests 加 BeautifulSoup 就足够了。诀窍是首先从基础 URL https://dibbs2.bsm.dla.mil/dodwarning.aspx
中检索验证密钥(__VIEWSTATE、__VIEWSTATEGENERATOR 和 __EVENTVALIDATION 这几个隐藏字段),然后使用这些密钥下载文件:
from bs4 import BeautifulSoup
import requests
import time
# PDF links to fetch; each entry is downloaded once.
test_list = [
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/8/SPE1C122Q0058.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/8/SPE2DH22Q0028.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/9/SPE2DH22Q0029.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/3/SPE2DS22Q0023.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/1/SPE2DS22Q0031.PDF',
    'https://dibbs2.bsm.dla.mil/Downloads/RFQ/3/SPE2DS22Q0033.PDF',
]
# Module-level HTTP session; get_file() uses it to fetch the warning page.
s = requests.Session()
def get_file(url, timeout=30):
    """Download one file from behind the DoD warning page and save it to disk.

    The warning page is fetched first so its hidden form fields can be read.
    Those fields are then posted back together with ``butAgree=OK`` and a
    ``goto`` query parameter holding the part of ``url`` after ``.mil``. The
    body of the final response is written to a file named after the last
    path segment of ``url`` (relative to the current working directory).

    Args:
        url: Full URL of the file to download; must contain ``.mil``.
        timeout: Seconds passed as ``timeout`` to both HTTP requests so a
            stalled connection cannot hang the script indefinitely.

    Raises:
        requests.HTTPError: If either request returns an error status, so an
            error page is never saved under the PDF's file name.
        TypeError: If one of the expected hidden inputs is missing from the
            warning page (``soup.find`` returns ``None``).
        IndexError: If ``url`` does not contain ``.mil``.
    """
    warning_url = 'https://dibbs2.bsm.dla.mil/dodwarning.aspx'

    pagereq = s.get(warning_url, timeout=timeout)
    pagereq.raise_for_status()
    soup = BeautifulSoup(pagereq.content, 'html.parser')

    # Hidden form fields that have to be echoed back with the form post.
    data = {
        field: soup.find("input", attrs={'id': field})['value']
        for field in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION')
    }
    data['butAgree'] = 'OK'

    headers = {
        'Origin': 'https://dibbs2.bsm.dla.mil',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }

    # ``goto`` carries the server-relative path of the requested file.
    params = {'goto': url.split('.mil', 1)[1]}

    # NOTE(review): this POST goes through ``requests.post`` rather than the
    # shared session ``s``, so cookies received by ``s`` are not sent with
    # it. Left unchanged; confirm whether that is intentional.
    response = requests.post(
        warning_url,
        headers=headers,
        params=params,
        data=data,
        timeout=timeout,
    )
    # Fail loudly instead of writing an HTML error page into the output file.
    response.raise_for_status()

    with open(url.rsplit('/', 1)[1], 'wb') as f:
        f.write(response.content)
# Fetch every listed file in turn, waiting one second after each download.
for pdf_url in test_list:
    get_file(pdf_url)
    time.sleep(1)