使用jupyter-notebook我正在尝试构建一个程序,将进入这个网站"https://npiprofile.com/clia/"并在列名"CLIA number"下输入csv文件中的CLIA ID。它将获取本列单元格中的数据,然后在网站中进行搜索。在此之后,NPI下的一个可点击的链接将弹出,我需要的程序点击。完成此操作后,它需要收集:
"npi"、"提供商名称"、"位置地址"、"分类"、"授权官方名称"、"授权官方头衔"one_answers"授权官方电话">
代码运行,但从未像以前那样打开网站以提取数据。我有大约20,000个clia号码,我必须提取
# Imports: pandas for CSV handling, requests + BeautifulSoup for scraping.
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the CLIA numbers from the input CSV.
# NOTE(review): the filename 'CLIA POL' has no .csv extension — confirm it
# matches the actual file on disk.
df = pd.read_csv('CLIA POL')

# Coerce to clean strings: CSV columns can come in as NaN/float/int, and the
# URL concatenation below would fail (TypeError) on anything but str.
clia_numbers = df['CLIA Number'].dropna().astype(str).str.strip()

base_url = 'https://npiprofile.com/clia/'
# Function to extract data from the NPI page
# Function to extract data from the NPI page
def extract_data(npi_soup):
    """Extract provider details from a parsed NPI profile page.

    Parameters
    ----------
    npi_soup : BeautifulSoup
        Parsed HTML of an NPI detail page.

    Returns
    -------
    dict
        Field name -> stripped text. Missing fields are None instead of
        raising AttributeError (the original crashed on the first page
        where any expected tag was absent).
    """
    def _text(parent, tag, css_class):
        # Safely pull stripped text; None when the parent or tag is missing.
        node = parent.find(tag, {'class': css_class}) if parent else None
        return node.text.strip() if node else None

    data = {
        'NPI': _text(npi_soup, 'span', 'npi'),
        'Provider Name': _text(npi_soup, 'h1', 'name'),
        'Location Address': _text(
            npi_soup.find('div', {'class': 'location'}), 'p', 'address'),
        'Classification': _text(
            npi_soup.find('div', {'class': 'taxonomy'}), 'span', 'classification'),
    }
    # Authorized-official sub-fields all live under one container div.
    official = npi_soup.find('div', {'class': 'authorized-official'})
    data['Authorized Official Name'] = _text(official, 'span', 'name')
    data['Authorized Official Title'] = _text(official, 'span', 'title')
    data['Authorized Official Phone'] = _text(official, 'span', 'phone')
    return data
# Function to get the NPI page from the CLIA page
def get_npi_page(clia_number):
url = base_url + clia_number
response = requests.get(url, timeout=30) # Increase the timeout duration
soup = BeautifulSoup(response.content, 'html.parser')
npi_link = soup.find('a', {'class': 'npi'})
if npi_link:
npi_url = npi_link['href']
npi_response = requests.get(npi_url, timeout=30) # Increase the timeout duration
npi_soup = BeautifulSoup(npi_response.content, 'html.parser')
return npi_soup
else:
return None
# Columns of the output CSV, in the order they should appear.
data_columns = ['NPI', 'Provider Name', 'Location Address', 'Classification',
                'Authorized Official Name', 'Authorized Official Title',
                'Authorized Official Phone']

# Collect rows in a plain list: DataFrame.append() was removed in pandas 2.0,
# and even where available it copied the whole frame on every call (O(n^2) —
# painful over ~20,000 CLIA numbers). Build the DataFrame once at the end.
rows = []

# Iterate through the CLIA numbers, get the NPI page, and extract the data
for clia_number in clia_numbers:
    npi_soup = get_npi_page(clia_number)
    if npi_soup:
        rows.append(extract_data(npi_soup))

extracted_data = pd.DataFrame(rows, columns=data_columns)

# Save the extracted data to a new CSV file
extracted_data.to_csv('extracted_data.csv', index=False)
我得到的错误信息如下:
TimeoutError Traceback (most recent call last)
~anaconda3libsite-packagesurllib3connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
~anaconda3libsite-packagesurllib3connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
385 try:
--> 386 self._validate_conn(conn)
387 except (SocketTimeout, BaseSSLError) as e:
~anaconda3libsite-packagesurllib3connectionpool.py in _validate_conn(self, conn)
1041 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1042 conn.connect()
1043
~anaconda3libsite-packagesurllib3connection.py in connect(self)
413
--> 414 self.sock = ssl_wrap_socket(
415 sock=conn,
~anaconda3libsite-packagesurllib3utilssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
448 if send_sni:
--> 449 ssl_sock = _ssl_wrap_socket_impl(
450 sock, context, tls_in_tls, server_hostname=server_hostname
~anaconda3libsite-packagesurllib3utilssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
492 if server_hostname:
--> 493 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
494 else:
~anaconda3libssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
500 # ctx._wrap_socket()
--> 501 return self.sslsocket_class._create(
502 sock=sock,
~anaconda3libssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1040 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041 self.do_handshake()
1042 except (OSError, ValueError):
~anaconda3libssl.py in do_handshake(self, block)
1309 self.settimeout(None)
-> 1310 self._sslobj.do_handshake()
1311 finally:
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~anaconda3libsite-packagesrequestsadapters.py in send(self, request, stream, timeout, verify, cert, proxies)
488 if not chunked:
--> 489 resp = conn.urlopen(
490 method=request.method,
~anaconda3libsite-packagesurllib3connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
786
--> 787 retries = retries.increment(
788 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
~anaconda3libsite-packagesurllib3utilretry.py in increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
~anaconda3libsite-packagesurllib3packagessix.py in reraise(tp, value, tb)
768 if value.__traceback__ is not tb:
--> 769 raise value.with_traceback(tb)
770 raise value
~anaconda3libsite-packagesurllib3connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
~anaconda3libsite-packagesurllib3connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
385 try:
--> 386 self._validate_conn(conn)
387 except (SocketTimeout, BaseSSLError) as e:
~anaconda3libsite-packagesurllib3connectionpool.py in _validate_conn(self, conn)
1041 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1042 conn.connect()
1043
~anaconda3libsite-packagesurllib3connection.py in connect(self)
413
--> 414 self.sock = ssl_wrap_socket(
415 sock=conn,
~anaconda3libsite-packagesurllib3utilssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
448 if send_sni:
--> 449 ssl_sock = _ssl_wrap_socket_impl(
450 sock, context, tls_in_tls, server_hostname=server_hostname
~anaconda3libsite-packagesurllib3utilssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
492 if server_hostname:
--> 493 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
494 else:
~anaconda3libssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
500 # ctx._wrap_socket()
--> 501 return self.sslsocket_class._create(
502 sock=sock,
~anaconda3libssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1040 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041 self.do_handshake()
1042 except (OSError, ValueError):
~anaconda3libssl.py in do_handshake(self, block)
1309 self.settimeout(None)
-> 1310 self._sslobj.do_handshake()
1311 finally:
ProtocolError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
~AppDataLocalTempipykernel_137201785300690.py in <module>
39 # Iterate through the CLIA numbers, get the NPI page, and extract the data
40 for clia_number in clia_numbers:
---> 41 npi_soup = get_npi_page(clia_number)
     42
43 if npi_soup:
~AppDataLocalTempipykernel_137201785300690.py in get_npi_page(clia_number)
19 url = base_url + clia_number
20
---> 21 response = requests.get(url)
22 soup = BeautifulSoup(response.content, 'html.parser')
23
~anaconda3libsite-packagesrequestsapi.py in get(url, params, **kwargs)
71 """
72
---> 73 return request("get", url, params=params, **kwargs)
74
75
~anaconda3libsite-packagesrequestsapi.py in request(method, url, **kwargs)
57 # cases, and look like a memory leak in others.
58 with sessions.Session() as session:
---> 59 return session.request(method=method, url=url, **kwargs)
60
61
~anaconda3libsite-packagesrequestssessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
585 }
586 send_kwargs.update(settings)
--> 587 resp = self.send(prep, **send_kwargs)
588
589 return resp
~anaconda3libsite-packagesrequestssessions.py in send(self, request, **kwargs)
699
700 # Send the request
--> 701 r = adapter.send(request, **kwargs)
702
703 # Total elapsed time of the request (approximately)
~anaconda3libsite-packagesrequestsadapters.py in send(self, request, stream, timeout, verify, cert, proxies)
545
546 except (ProtocolError, OSError) as err:
--> 547 raise ConnectionError(err, request=request)
548
549 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
我试着增加等待时间,但它从来没有完全工作。
尝试设置User-Agent标头以从服务器获得正确的响应:
import requests
import pandas as pd
from io import StringIO

url = "https://npiprofile.com/clia"
clia_numbers = ["52D2072838", "24D1040592"]  # <-- read clia numbers from the CSV file

# A browser-like User-Agent is required: without it the server never responds
# and requests eventually dies with a connection timeout (WinError 10060).
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/111.0"
}

all_dfs = []
for n in clia_numbers:
    try:
        r = requests.get(url, params={"sSearch": n}, headers=headers, timeout=30)
        r.raise_for_status()
        # StringIO wrapper: pandas deprecated passing literal HTML to read_html.
        tables = pd.read_html(StringIO(r.text))
    except (requests.RequestException, ValueError):
        # ValueError: read_html found no tables for this CLIA number.
        continue
    all_dfs.append(tables[0])

# Guard the empty case: pd.concat([]) raises ValueError.
final_df = pd.concat(all_dfs) if all_dfs else pd.DataFrame()
print(final_df)
打印:
NPI CLIA Name Type Taxonomy Address Phone
0 1487296307 52D2072838 EXACT SCIENCES LABORATORIES, LLC Organization Clinical Medical Laboratory 650 FORWARD DRIVE MADISON, WI 53711 (608) 284-5700
0 1114993136 24D1040592 MAYO CLINIC HOSPITAL-ROCHESTER Organization Clinic/Center 1705 SE BROADWAY AVE ALBERT LEA, MN 56007 (507) 373-2384