当我尝试运行下面的代码时出现了错误。我试着使用像 BeautifulSoup 这样的库来下载文件,但我这边似乎缺少了一些东西。谁能检查一下我做得对不对?我的目标是从网站上下载所有的 PDF 文件,并将其保存到指定的目录。
import os
import requests
import ssl
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Company-documents listing page that is scraped for download forms.
url = "https://sedar.com/DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202"

# Request headers sent with every call; the User-Agent identifies the client
# as Firefox 102 on Windows 10.
headers = {
    'Host': 'sedar.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    # Fetch-metadata headers describing a top-level, user-initiated navigation.
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'TE': 'trailers',
}
# Destination directory for the downloaded PDFs; it is created below when it
# does not exist yet.
# NOTE(review): the path separators look like they were lost when this script
# was pasted -- confirm the intended directory before running.
folder_location = r'C:Usersjay_patel1...test'
base_url = 'https://sedar.com'
# Seconds to wait for the server, so a stalled connection cannot hang the
# script forever (requests has no timeout by default).
request_timeout = 30

# exist_ok=True makes this a no-op when the folder is already there, and
# makedirs also creates any missing parent directories.
os.makedirs(folder_location, exist_ok=True)

# NOTE: an SSLError "CERTIFICATE_VERIFY_FAILED ... unable to get local issuer
# certificate" raised here means the issuing CA is not in the CA bundle that
# requests uses. Prefer supplying the right bundle (REQUESTS_CA_BUNDLE
# environment variable, or verify='<path to .pem>') over verify=False, which
# disables certificate checking altogether.
response = requests.get(url=url, headers=headers, timeout=request_timeout)
# Fail early on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
forms = soup.find_all('form')
print(len(forms))

# Cookie header used for the document requests only; it is loop-invariant, so
# it is set once here, after the listing page has been fetched without it.
# NOTE(review): these are hard-coded values, presumably copied from a browser
# session and likely to expire -- confirm they are still valid.
headers['Cookie'] = '__uzma=28481475-7c5c-4e62-ad08-ebabddb1fff0; __uzmb=1657722275; __uzme=5741; __uzmc=7155215141911; __uzmd=1657726129; TS015c16dc=016abe8a18b65821e9668faf87606307d873066e84546d78152ebf0e1734d6f553b800143820e548c5ead2bd19b0cd0d8c6d8b4fc0937d7b83d0619de26e621bf2db8d6741; __ssds=2; __ssuzjsr2=a9be0cd8e; __uzmaj2=5c5b0a58-f490-41e6-945d-6a4766be7d82; __uzmbj2=1657722277; __uzmcj2=158447041868; __uzmdj2=1657726118; JSESSIONID=0000Hh7sa7ec87A8vlJsaqRiHhX:1884ter20'

# Each form's action is treated as a document link; files are named after the
# form's position on the page (0.pdf, 1.pdf, ...).
for counter, form in enumerate(forms):
    action = form.get('action')
    if not action:
        # A form without an action has nothing to download.
        continue
    # urljoin copes with relative paths and absolute URLs, not only with
    # actions that start with '/'.
    doc_link = urljoin(base_url, action)
    filename = os.path.join(folder_location, str(counter) + '.pdf')
    print("Writing")
    print(doc_link)
    # Download before opening the file so that a failed request does not
    # leave an empty or bogus .pdf behind.
    try:
        doc_response = requests.get(url=doc_link, headers=headers, timeout=request_timeout)
        doc_response.raise_for_status()
    except requests.RequestException as exc:
        print("Skipping", doc_link, exc)
        continue
    with open(filename, 'wb') as f:
        f.write(doc_response.content)
我得到了下面的错误。我已经导入了 ssl 模块,但请求仍然失败并报错:
Traceback (most recent call last):
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3connectionpool.py", line 699, in urlopen
httplib_response = self._make_request(
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3connectionpool.py", line 382, in _make_request
self._validate_conn(conn)
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3connectionpool.py", line 1010, in _validate_conn
conn.connect()
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3connection.py", line 416, in connect
self.sock = ssl_wrap_socket(
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3utilssl_.py", line 449, in ssl_wrap_socket
ssl_sock = _ssl_wrap_socket_impl(
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3utilssl_.py", line 493, in _ssl_wrap_socket_impl
return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
File "C:Usersjay_patel1Anaconda3libssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "C:Usersjay_patel1Anaconda3libssl.py", line 1040, in _create
self.do_handshake()
File "C:Usersjay_patel1Anaconda3libssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:Usersjay_patel1Anaconda3libsite-packagesrequestsadapters.py", line 439, in send
resp = conn.urlopen(
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3connectionpool.py", line 755, in urlopen
retries = retries.increment(
File "C:Usersjay_patel1Anaconda3libsite-packagesurllib3utilretry.py", line 574, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
MaxRetryError: HTTPSConnectionPool(host='sedar.com', port=443): Max retries exceeded with url: /DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:Usersjay_patel1Downloadspdf download (1).py", line 36, in <module>
response = requests.get(url=url, headers=headers)
File "C:Usersjay_patel1Anaconda3libsite-packagesrequestsapi.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "C:Usersjay_patel1Anaconda3libsite-packagesrequestsapi.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:Usersjay_patel1Anaconda3libsite-packagesrequestssessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "C:Usersjay_patel1Anaconda3libsite-packagesrequestssessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "C:Usersjay_patel1Anaconda3libsite-packagesrequestsadapters.py", line 514, in send
raise SSLError(e, request=request)
SSLError: HTTPSConnectionPool(host='sedar.com', port=443): Max retries exceeded with url: /DisplayCompanyDocuments.do?lang=EN&issuerNo=00030202 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)')))
您的代码很可能无法完成SSL握手,您是否在公司网络上/在代理后面?
向 requests.get 添加 verify=False 可以帮你绕过这个问题,但这样一来该调用就不再安全了(不会校验服务器证书)。
# NOTE(review): the path separators look like they were lost when this snippet
# was pasted -- confirm the intended directory before running.
folder_location = r'C:Usersjay_patel1...test'
base_url = 'https://sedar.com'
# Create the download folder if it is missing; exist_ok=True makes this a
# no-op when it already exists.
os.makedirs(folder_location, exist_ok=True)
# WARNING: verify=False disables TLS certificate verification, so the server's
# identity is not checked. Use it only as a temporary workaround.
response = requests.get(url=url, headers=headers, verify=False)
理想情况下,您应该从相关网站下载 SSL 证书,并将证书文件的路径传递给 verify 参数。更多信息请参见:https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification
试试这个
# verify=False turns off TLS certificate verification for this request, so the
# call is no longer secure.
requests.get('https://url.com', verify=False)