Scraping table data from a paginated web page where the URL doesn't change but the table data does



Website: nafdac.gov.ng/our-services/registered-products

The code below runs, but it takes 7 hours to get through just 200 of the 5,802 pages. I would really appreciate it if someone could help me find a faster way to work through this site.

# pip install webdriver-manager --user
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as ec
import pandas as pd
import time

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('https://www.nafdac.gov.ng/our-services/registered-products/')
container2 = []
wait_time_out = 20
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException,)

for _ in range(0, 5802+1):
    rows = WebDriverWait(driver, wait_time_out,
                         ignored_exceptions=ignored_exceptions).until(
        ec.presence_of_all_elements_located((By.XPATH, '//*[@id="table_1"]/tbody/tr')))
    for row in rows:
        time.sleep(10)
        container2.append([table_data.text for table_data in row.find_elements(By.TAG_NAME, 'td')])
    WebDriverWait(driver, wait_time_out,
                  ignored_exceptions=ignored_exceptions).until(
        ec.presence_of_element_located((By.XPATH, '//*[@id="table_1_next"]'))).click()
    time.sleep(10)
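A side note on why the Selenium version is so slow: the time.sleep(10) inside the row loop alone costs roughly 10 rows × 10 s per page, plus the 10 s delay after each click, so about 110 s per page, which lines up with ~7 hours for 200 pages. Even staying with Selenium you could drop the fixed sleeps and instead wait for the old rows to go stale after clicking next; a minimal sketch, assuming the driver setup and element IDs from the code above:

for _ in range(0, 5802+1):
    rows = WebDriverWait(driver, wait_time_out,
                         ignored_exceptions=ignored_exceptions).until(
        ec.presence_of_all_elements_located((By.XPATH, '//*[@id="table_1"]/tbody/tr')))
    # Collect every cell in one pass; no per-row sleep is needed
    for row in rows:
        container2.append([td.text for td in row.find_elements(By.TAG_NAME, 'td')])
    # Click "next", then block only until the old rows have been replaced
    first_row = rows[0]
    driver.find_element(By.ID, 'table_1_next').click()
    WebDriverWait(driver, wait_time_out).until(ec.staleness_of(first_row))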

But there is no need for a browser at all: the table data is retrieved via an AJAX call, so you can get the data directly from the source. That got me all of the data in about 50 seconds:

import requests
import pandas as pd
from bs4 import BeautifulSoup

# Grab the wdtNonce token from the first hidden input on the page
url = 'https://www.nafdac.gov.ng/our-services/registered-products/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
wdtNonce = soup.find_all('input', {'type': 'hidden'})[0]['value']

# URL of the AJAX endpoint the table is populated from
url = 'https://www.nafdac.gov.ng/wp-admin/admin-ajax.php?action=get_wdtable&table_id=1'
# Headers for the request
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Mobile Safari/537.36'}
# Form data needed for the POST query
payload = {
payload = {
'draw': '1',
'columns[0][data]': '0',
'columns[0][name]': 'ID',
'columns[0][searchable]': 'true',
'columns[0][orderable]': 'true',
'columns[0][search][value]': '',
'columns[0][search][regex]': 'false',
'columns[1][data]': '1',
'columns[1][name]': 'product_group',
'columns[1][searchable]': 'true',
'columns[1][orderable]': 'true',
'columns[1][search][value]': '',
'columns[1][search][regex]': 'false',
'columns[2][data]': '2',
'columns[2][name]': 'product_name',
'columns[2][searchable]': 'true',
'columns[2][orderable]': 'true',
'columns[2][search][value]': '',
'columns[2][search][regex]': 'false',
'columns[3][data]': '3',
'columns[3][name]': 'presentation',
'columns[3][searchable]': 'true',
'columns[3][orderable]': 'true',
'columns[3][search][value]': '',
'columns[3][search][regex]': 'false',
'columns[4][data]': '4',
'columns[4][name]': 'active_ingredent',
'columns[4][searchable]': 'true',
'columns[4][orderable]': 'true',
'columns[4][search][value]': '',
'columns[4][search][regex]': 'false',
'columns[5][data]': '5',
'columns[5][name]': 'applicant_name',
'columns[5][searchable]': 'true',
'columns[5][orderable]': 'true',
'columns[5][search][value]': '',
'columns[5][search][regex]': 'false',
'columns[6][data]': '6',
'columns[6][name]': 'country',
'columns[6][searchable]': 'true',
'columns[6][orderable]': 'true',
'columns[6][search][value]': '',
'columns[6][search][regex]': 'false',
'columns[7][data]': '7',
'columns[7][name]': 'manufacturer',
'columns[7][searchable]': 'true',
'columns[7][orderable]': 'true',
'columns[7][search][value]': '',
'columns[7][search][regex]': 'false',
'columns[8][data]': '8',
'columns[8][name]': 'date_approved',
'columns[8][searchable]': 'true',
'columns[8][orderable]': 'true',
'columns[8][search][value]': '',
'columns[8][search][regex]': 'false',
'columns[9][data]': '9',
'columns[9][name]': 'expiry_date',
'columns[9][searchable]': 'true',
'columns[9][orderable]': 'true',
'columns[9][search][value]': '',
'columns[9][search][regex]': 'false',
'columns[10][data]': '10',
'columns[10][name]': 'registration_number',
'columns[10][searchable]': 'true',
'columns[10][orderable]': 'true',
'columns[10][search][value]': '',
'columns[10][search][regex]': 'false',
'order[0][column]': '0',
'order[0][dir]': 'asc',
'start': '0',
'length': '10000',
'search[value]': '',
'search[regex]': 'false',
'wdtNonce': wdtNonce}

# Iterating through the form data above to pull out column names
cols = []
for k, v in payload.items():
    if 'name' in k:
        cols.append(v)

# initialize a list of rows
rows = []
start = 0
while True:

    # Update the start value of the form data to go through the "pages"
    payload.update({'start': str(start)})

    # Return the json from the ajax POST
    jsonData = requests.post(url, headers=headers, data=payload).json()

    # Add the list of data into the list of "rows"
    rows += jsonData['data']

    print('Gathered rows: %d - %d' % (start+1, start+len(jsonData['data'])))

    # If the data is less than 10000 items, we know we are on the last "page",
    # so we'll break the loop
    if len(jsonData['data']) < 10000:
        print('Done!')
        break

    # Update the start variable so that on the next loop it updates the
    # start parameter in the form data to then get the next "page"
    start = len(rows)

# Create the table from the final list of rows
df = pd.DataFrame(rows, columns=cols)
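As an aside, the repetitive columns[...] entries above could be generated in a loop rather than written out by hand; an equivalent sketch using the same field names (note 'active_ingredent' is spelled the way the site spells it):

col_names = ['ID', 'product_group', 'product_name', 'presentation',
             'active_ingredent', 'applicant_name', 'country',
             'manufacturer', 'date_approved', 'expiry_date',
             'registration_number']
payload = {'draw': '1',
           'order[0][column]': '0', 'order[0][dir]': 'asc',
           'start': '0', 'length': '10000',
           'search[value]': '', 'search[regex]': 'false',
           'wdtNonce': wdtNonce}
# Add the six per-column fields the DataTables endpoint expects
for i, name in enumerate(col_names):
    payload[f'columns[{i}][data]'] = str(i)
    payload[f'columns[{i}][name]'] = name
    payload[f'columns[{i}][searchable]'] = 'true'
    payload[f'columns[{i}][orderable]'] = 'true'
    payload[f'columns[{i}][search][value]'] = ''
    payload[f'columns[{i}][search][regex]'] = 'false'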

Output:

print(df)
           ID product_group  ... expiry_date registration_number
0           1                ...  30/10/2022             03-0740
1           3   ANIMAL FEED  ...  30/07/2023             A9-0735
2           4   ANIMAL FEED  ...  30/07/2023             A9-0744
3           5   ANIMAL FEED  ...  27/06/2023             A9-0721
4           6   ANIMAL FEED  ...  27/06/2023             A9-0722
...       ...           ...  ...         ...                 ...
58011  58.013                ...                          Apr-65
58012  58.014                ...                         A4-2582
58013  58.015                ...                         A4-0851
58014  58.016                ...                         A4-6613
58015  58.017                ...                         A4-3601

[58016 rows x 11 columns]
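To keep the result, the frame can be written straight to disk; a one-line example (the filename is just a placeholder):

df.to_csv('nafdac_registered_products.csv', index=False)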
