Connection refused when scraping npiprofile.com



Using Jupyter Notebook, I am trying to build a program that goes to the website "https://npiprofile.com/clia/" and enters CLIA IDs taken from a CSV file, from the column named "CLIA Number". It should take the value in each cell of that column and search for it on the site. After that, a clickable link under NPI pops up, which I need the program to click. Once that is done, it needs to collect:

"npi"、"提供商名称"、"位置地址"、"分类"、"授权官方名称"、"授权官方头衔"one_answers"授权官方电话">

The code runs, but it never opens the website to pull the data the way it did before. I have roughly 20,000 CLIA numbers that I have to extract.

#import all files
import pandas as pd
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('CLIA POL')
clia_numbers = df['CLIA Number']
base_url = 'https://npiprofile.com/clia/'
# Function to extract data from the NPI page
def extract_data(npi_soup):
    data = {}
    data['NPI'] = npi_soup.find('span', {'class': 'npi'}).text.strip()
    data['Provider Name'] = npi_soup.find('h1', {'class': 'name'}).text.strip()
    data['Location Address'] = npi_soup.find('div', {'class': 'location'}).find('p', {'class': 'address'}).text.strip()
    data['Classification'] = npi_soup.find('div', {'class': 'taxonomy'}).find('span', {'class': 'classification'}).text.strip()
    authorized_official = npi_soup.find('div', {'class': 'authorized-official'})
    data['Authorized Official Name'] = authorized_official.find('span', {'class': 'name'}).text.strip()
    data['Authorized Official Title'] = authorized_official.find('span', {'class': 'title'}).text.strip()
    data['Authorized Official Phone'] = authorized_official.find('span', {'class': 'phone'}).text.strip()
    return data

# Function to get the NPI page from the CLIA page
def get_npi_page(clia_number):
    url = base_url + clia_number
    response = requests.get(url, timeout=30)  # Increase the timeout duration
    soup = BeautifulSoup(response.content, 'html.parser')
    npi_link = soup.find('a', {'class': 'npi'})
    if npi_link:
        npi_url = npi_link['href']
        npi_response = requests.get(npi_url, timeout=30)  # Increase the timeout duration
        npi_soup = BeautifulSoup(npi_response.content, 'html.parser')
        return npi_soup
    else:
        return None

# Create an empty DataFrame to store the extracted data
data_columns = ['NPI', 'Provider Name', 'Location Address', 'Classification', 'Authorized Official Name', 'Authorized Official Title', 'Authorized Official Phone']
extracted_data = pd.DataFrame(columns=data_columns)
# Iterate through the CLIA numbers, get the NPI page, and extract the data
for clia_number in clia_numbers:
    npi_soup = get_npi_page(clia_number)
    if npi_soup:
        data = extract_data(npi_soup)
        extracted_data = extracted_data.append(data, ignore_index=True)
# Save the extracted data to a new CSV file
extracted_data.to_csv('extracted_data.csv', index=False)

The error message I get is as follows:

TimeoutError                              Traceback (most recent call last)
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    702             # Make the request on the httplib connection object.
--> 703             httplib_response = self._make_request(
    704                 conn,
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    385         try:
--> 386             self._validate_conn(conn)
    387         except (SocketTimeout, BaseSSLError) as e:
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
   1041         if not getattr(conn, "sock", None):  # AppEngine might not have  `.sock`
-> 1042             conn.connect()
   1043 
~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    413 
--> 414         self.sock = ssl_wrap_socket(
    415             sock=conn,
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
    448     if send_sni:
--> 449         ssl_sock = _ssl_wrap_socket_impl(
    450             sock, context, tls_in_tls, server_hostname=server_hostname
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
    492     if server_hostname:
--> 493         return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
    494     else:
~\anaconda3\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
    500         # ctx._wrap_socket()
--> 501         return self.sslsocket_class._create(
    502             sock=sock,
~\anaconda3\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
   1040                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041                     self.do_handshake()
   1042             except (OSError, ValueError):
~\anaconda3\lib\ssl.py in do_handshake(self, block)
   1309                 self.settimeout(None)
-> 1310             self._sslobj.do_handshake()
   1311         finally:
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:

ProtocolError                             Traceback (most recent call last)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    488             if not chunked:
--> 489                 resp = conn.urlopen(
    490                     method=request.method,
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    786 
--> 787             retries = retries.increment(
    788                 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
~\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    549             if read is False or not self._is_method_retryable(method):
--> 550                 raise six.reraise(type(error), error, _stacktrace)
    551             elif read is not None:
~\anaconda3\lib\site-packages\urllib3\packages\six.py in reraise(tp, value, tb)
    768             if value.__traceback__ is not tb:
--> 769                 raise value.with_traceback(tb)
    770             raise value
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    702             # Make the request on the httplib connection object.
--> 703             httplib_response = self._make_request(
    704                 conn,
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    385         try:
--> 386             self._validate_conn(conn)
    387         except (SocketTimeout, BaseSSLError) as e:
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
   1041         if not getattr(conn, "sock", None):  # AppEngine might not have  `.sock`
-> 1042             conn.connect()
   1043 
~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    413 
--> 414         self.sock = ssl_wrap_socket(
    415             sock=conn,
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
    448     if send_sni:
--> 449         ssl_sock = _ssl_wrap_socket_impl(
    450             sock, context, tls_in_tls, server_hostname=server_hostname
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
    492     if server_hostname:
--> 493         return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
    494     else:
~\anaconda3\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
    500         # ctx._wrap_socket()
--> 501         return self.sslsocket_class._create(
    502             sock=sock,
~\anaconda3\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
   1040                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041                     self.do_handshake()
   1042             except (OSError, ValueError):
~\anaconda3\lib\ssl.py in do_handshake(self, block)
   1309                 self.settimeout(None)
-> 1310             self._sslobj.do_handshake()
   1311         finally:
ProtocolError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_137201785300690.py in <module>
     39 # Iterate through the CLIA numbers, get the NPI page, and extract the data
     40 for clia_number in clia_numbers:
---> 41     npi_soup = get_npi_page(clia_number)
     42 
     43     if npi_soup:
~\AppData\Local\Temp\ipykernel_137201785300690.py in get_npi_page(clia_number)
     19     url = base_url + clia_number
     20 
---> 21     response = requests.get(url)
     22     soup = BeautifulSoup(response.content, 'html.parser')
     23 
~\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     71     """
     72 
---> 73     return request("get", url, params=params, **kwargs)
     74 
     75 
~\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     57     # cases, and look like a memory leak in others.
     58     with sessions.Session() as session:
---> 59         return session.request(method=method, url=url, **kwargs)
     60 
     61 
~\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    585         }
    586         send_kwargs.update(settings)
--> 587         resp = self.send(prep, **send_kwargs)
    588 
    589         return resp
~\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    699 
    700         # Send the request
--> 701         r = adapter.send(request, **kwargs)
    702 
    703         # Total elapsed time of the request (approximately)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    545 
    546         except (ProtocolError, OSError) as err:
--> 547             raise ConnectionError(err, request=request)
    548 
    549         except MaxRetryError as e:
ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))

I tried increasing the wait time, but it never fully worked.

Try setting a User-Agent header to get a proper response from the server:

import requests
import pandas as pd
url = "https://npiprofile.com/clia"
clia_numbers = ["52D2072838", "24D1040592"] # <-- read clia numbers from the CSV file
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0"
}
all_dfs = []
for n in clia_numbers:
    r = requests.get(url, params={"sSearch": n}, headers=headers).text
    # here handle exceptions as necessary:
    all_dfs.append(pd.read_html(r)[0])
final_df = pd.concat(all_dfs)
print(final_df)

This prints:

NPI        CLIA                              Name          Type                     Taxonomy                                    Address           Phone
0  1487296307  52D2072838  EXACT SCIENCES LABORATORIES, LLC  Organization  Clinical Medical Laboratory        650 FORWARD DRIVE MADISON, WI 53711  (608) 284-5700
0  1114993136  24D1040592    MAYO CLINIC HOSPITAL-ROCHESTER  Organization                Clinic/Center  1705 SE BROADWAY AVE ALBERT LEA, MN 56007  (507) 373-2384
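
Once that works for a couple of test numbers, here is a minimal sketch of how you might scale it to the full CSV. It assumes the file and column names from the question ('CLIA POL' and 'CLIA Number'); the try/except block and the one-second pause between requests are my additions rather than anything the site documents, so tune them as needed:

import time
import pandas as pd
import requests

url = "https://npiprofile.com/clia"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0"
}

# File and column names assumed from the question; adjust if yours differ
clia_numbers = pd.read_csv('CLIA POL')['CLIA Number'].astype(str)

all_dfs = []
failed = []
for n in clia_numbers:
    try:
        r = requests.get(url, params={"sSearch": n}, headers=headers, timeout=30)
        r.raise_for_status()
        all_dfs.append(pd.read_html(r.text)[0])
    except (requests.RequestException, ValueError) as exc:
        # ValueError comes from read_html when no result table is returned
        failed.append((n, str(exc)))
    time.sleep(1)  # pause between requests so 20,000 calls don't hammer the server

final_df = pd.concat(all_dfs, ignore_index=True)
final_df.to_csv('extracted_data.csv', index=False)
print(f"Extracted {len(final_df)} rows; {len(failed)} CLIA numbers failed")

Keeping the failed list around lets you re-run only the numbers that errored out instead of repeating the whole 20,000-row job.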