使用python从.aspx网站进行网页抓取



我正在尝试从这个网站抓取一些数据: https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx

我用自己的方法能够获取前 11 页,但由于某种原因,它在第 11 页之后就停止了。我已经阅读了其他与 .aspx 相关的帖子,但没有看到任何适用于我的情况的内容。

我是新手,所以我的代码有点冗长,但它在某种程度上完成了工作。我已经尝试过调整请求头(headers)和一堆其他东西,但仍然无法越过第 11 页。我完全想不通原因。

我相当确定问题出在 __VIEWSTATE 和 __VIEWSTATEGENERATOR 这两个表单参数上。我不确定如何为要循环访问的每个页面获取这些值。我几乎对所有页面使用相同的值。出于某种原因,这种方法在第 11 页之前(含第 11 页)一直有效,之后就失败了。这很奇怪,因为看起来每个页面都有不同的 __VIEWSTATE 值。

提前谢谢。

import pandas as pd
import re
import pandas as pd
import numpy as np
import urllib
from requests import Session
from bs4 import BeautifulSoup
import time
import requests

# Collect two fields from every 7-field table row on result pages 1-20 of
# the Washington WARN search page: field 3 goes into `no_emps`, field 6
# into `date`.
#
# Paging on this site is a form postback. Each POST carries the hidden
# state fields (__VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION) that
# were read from a previously received page.

# Postback arguments for the pages to loop over: 'Page$1' .. 'Page$20'
page_list = ['Page${}'.format(number) for number in range(1, 21)]
wa_url = 'https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx'

# Matches a single '@' separator at the very start or very end of a row.
_EDGE_SEPARATOR = re.compile(r'^@|@$')


def _hidden_field(page_soup, name):
    """Return the value of the <input> called *name* in *page_soup*.

    Looking the field up by name does not depend on how many inputs precede
    it in the document. Raises ValueError when the field is missing, which
    means the response was not the expected search page.
    """
    tag = page_soup.find('input', {'name': name})
    if tag is None:
        raise ValueError('hidden form field {!r} not found'.format(name))
    return tag.get('value', '')


def _row_fields(row):
    """Split the text of one <tr> into its '@'-separated fields."""
    # Join the text nodes with '@', drop newlines, then trim one separator
    # from each end so the split does not produce empty edge fields.
    text = row.get_text('@').replace('\n', '')
    return _EDGE_SEPARATOR.sub('', text).split('@')


# One session for the whole run, so the initial GET and every later POST
# share the same cookies and the same user-agent header.
session = requests.Session()
session.headers.update({
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
})
val_get = session.get(wa_url)
val_get.raise_for_status()
soup = BeautifulSoup(val_get.content, "html.parser")

# Hidden state needed for the first POST request
view_state = _hidden_field(soup, '__VIEWSTATE')
view_generator = _hidden_field(soup, '__VIEWSTATEGENERATOR')
evnt_validation = _hidden_field(soup, '__EVENTVALIDATION')

no_emps = []  # one list per page
date = []     # one list per page

# Looping through pages of WARN database
for page in page_list:
    # Form fields only. HTTP headers do not belong in the POST body; the
    # ones that matter are configured on the session above.
    data = {
        "__EVENTTARGET": "ucPSW$gvMain",
        "__EVENTARGUMENT": page,
        "__LASTFOCUS": "",
        "__VIEWSTATE": view_state,
        "__VIEWSTATEGENERATOR": view_generator,
        "__VIEWSTATEENCRYPTED": "",
        "__EVENTVALIDATION": evnt_validation,
    }

    # Getting data from each page
    get_warn_data = session.post(wa_url, data=data)
    get_warn_data.raise_for_status()
    soup = BeautifulSoup(get_warn_data.content, "html.parser")

    # Keep only rows that split into exactly seven fields
    work = [_row_fields(row) for row in soup.find_all('tr')]
    work = [fields for fields in work if len(fields) == 7]

    no_emps.append([fields[3] for fields in work])
    date.append([fields[6] for fields in work])

    # Carry the hidden state of the page just received into the next POST.
    # As in the original script, this is skipped for the 'Page$1' response.
    if page != 'Page$1':
        view_state = _hidden_field(soup, '__VIEWSTATE')
        view_generator = _hidden_field(soup, '__VIEWSTATEGENERATOR')
        evnt_validation = _hidden_field(soup, '__EVENTVALIDATION')

# Wrapping up results into flat lists (one level of nesting: page -> value)
WA_WARN_no_emps = [value for page_values in no_emps for value in page_values]
WA_WARN_date = [value for page_values in date for value in page_values]

更新:按照 Andrej 在评论中的建议,我在 for 循环的末尾添加了 if 语句,用当前页面的内容更新这些参数值;这一改动修复了代码,现在可以获取 page_list 中的所有页面。

您可以使用此示例从站点获取所有页面(总共 67 页)。它动态获取所有 <input> 的值,因此能得到正确的 __VIEWSTATE 等参数:

import requests
from bs4 import BeautifulSoup

# Address of the search page. The initial GET yields the first page of
# results, which is parsed into `soup` for the code below.
url = 'https://fortress.wa.gov/esd/file/warn/Public/SearchWARN.aspx'
soup = BeautifulSoup(
    requests.get(url).content,
    'html.parser',
)
def get_data(soup, page_num):
    """Build the form payload that requests page *page_num* of the grid.

    Every named ``<input>`` found in *soup* is copied into the payload with
    its current ``value`` (an empty string when it has none). The values
    therefore always come from the page that was just received rather than
    from hard-coded constants.

    Args:
        soup: Parsed document of the most recently fetched page. It must
            offer ``select('input')`` returning items that support
            ``.get(attribute, default)``.
        page_num: Page number to request; sent as ``Page$<page_num>``.

    Returns:
        dict mapping form field names to the values to POST.
    """
    data = {}
    for field in soup.select('input'):
        name = field.get('name')
        if name is None:
            # An input without a name has no key to be posted under.
            continue
        data[name] = field.get('value', '')

    # The search button is left out of the payload, presumably so the
    # request is handled as a paging event rather than a search click.
    # Its absence from the page is not an error.
    data.pop('ucPSW$btnSearchCompany', None)

    data['__EVENTTARGET'] = 'ucPSW$gvMain'
    data['__EVENTARGUMENT'] = 'Page${}'.format(page_num)
    data['__LASTFOCUS'] = ''
    return data
# Print every result page in turn, starting from the page already in `soup`.
page = 1
while True:
    print(f'Page {page}...')

    # Rows of the results grid that contain cells but no nested table.
    rows = soup.select('#ucPSW_gvMain > tr:not(:has(table)):has(td)')

    total = 1  # stays 1 when the page has no rows, which ends the loop below
    for total, row in enumerate(rows, 1):
        cells = [cell.get_text(strip=True) for cell in row.select('td')]
        print('{:<3}{:<50}{:<25}{:<15}{:<15}{:<15}{:<15}{:<15}'.format(total, *cells))

    # Stop unless the row count is a multiple of 15.
    if total % 15 != 0:
        break

    page += 1
    response = requests.post(url, get_data(soup, page))
    soup = BeautifulSoup(response.content, 'html.parser')

输出:

Page 1...
1  Safran Cabin Materials, LLC                       Marysville and Newport   6/23/2020      85             Layoff         Permanent      6/24/2020      
2  Swissport Fueling                                 SeaTac                   5/8/2020       69             Layoff         Permanent      6/19/2020      
3  Swissport USA, Inc                                SeaTac                   5/22/2020      62             Layoff         Permanent      6/19/2020      
4  Swissport USA, Inc                                SeaTac                   3/20/2020      167            Layoff         Temporary      6/19/2020      
5  Tool Gauge and Machine Works                      Tacoma                   6/17/2020      59             Layoff         Permanent      6/18/2020      
6  Hyatt Corporation Motif Seattle                   Seattle                  3/14/2020      91             Layoff         Temporary      6/18/2020      
7  Jacobsen Daniel's Enterprise, Inc                 Tacoma                   6/12/2020      1              Layoff         Permanent      6/18/2020      
8  Benchmark Stevenson, LLC d/b/a Skamania Lodge     Stevenson                3/18/2020      185            Layoff         Temporary      6/17/2020      
9  Seattle Art Museum                                Seattle                  7/5/2020       76             Layoff         Temporary      6/16/2020      
10 Chihuly Garden & Glass                            Seattle                  3/21/2020      97             Layoff         Temporary      6/16/2020      
11 Seattle Center                                    Seattle                  3/21/2020      182            Layoff         Temporary      6/16/2020      
12 Sekisui Aerospace                                 Renton and Sumner        6/12/2020      111            Layoff         Permanent      6/15/2020      
13 Pioneer Human Services                            Seattle                  8/14/2020      59             Layoff         Permanent      6/15/2020      
14 Crista Senior Living                              Shoreline                8/16/2020      156            Closure        Permanent      6/15/2020      
15 Hyatt Corporation / Hyatt Regency Bellevue        Bellevue                 3/15/2020      223            Layoff         Temporary      6/15/2020      
Page 2...
1  Toray Composite Materials America, Inc            Tacoma                   8/8/2020       146            Layoff         Permanent      6/12/2020      
2  Embassy Suites Seattle Bellevue                   Seattle                  6/1/2020       57             Layoff         Temporary      6/12/2020      
3  Triumph Aerospace Structures                      Spokane                  6/15/2020      12             Layoff         Permanent      6/11/2020      
4  Hyatt Corporation / Hyatt Regency Lake Washington Renton                   6/30/2020      129            Layoff         Temporary      6/9/2020       
5  Lamb Weston, Inc                                  Connell, WA              6/15/2020      360            Layoff         Temporary      6/8/2020       
6  Lamb Weston, Inc                                  Warden                   6/15/2020      300            Layoff         Temporary      6/8/2020       
... and so on.

最新更新