我是数据抓取的新手。最近我尝试用 Python 的 selenium 库从 wunderground.com 抓取数据,但发现 selenium 的 WebDriver 有时无法正常加载出网页内容。我猜这个问题可能与该网站使用的 JavaScript 有关,但不确定具体是哪一部分出了问题。有人知道该怎么解决吗?提前谢谢。
下面是正确显示的示例:正确显示的示例
下面是有问题的示例:有问题的示例
我的代码如下,只是一个非常简单的 selenium 调用:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ChromeOptions
from selenium.webdriver import ActionChains
import time
# Page to scrape: the daily history view for station KSAN on 2021-2-1.
url = "https://www.wunderground.com/history/daily/us/ca/san-diego/KSAN/date/2021-2-1"

# Configure the Chrome options object handed to the webdriver below.
option = webdriver.ChromeOptions()

# Experimental options, applied in the same order as before.
for _opt_name, _opt_value in (
    ('excludeSwitches', ['enable-automation']),
    ('useAutomationExtension', False),
    # JavaScript content setting: value 1 enables it, value 2 disables it.
    ("prefs", {'profile.managed_default_content_settings.javascript': 1}),
):
    option.add_experimental_option(_opt_name, _opt_value)

# Command-line switches, appended in the same order as before.
for _switch in (
    '--disable-gpu',
    "--disable-blink-features",
    "--disable-blink-features=AutomationControlled",
    "--enable-javascript",
):
    option.add_argument(_switch)

# Start the browser with these options and open the target page.
driver = webdriver.Chrome(options=option)
driver.get(url)

# Fixed pause to give the page time to finish loading.
time.sleep(5)
该页面会向以下地址发送 HTTP GET 请求:https://api.weather.com/v1/location/KSAN:9:US/observations/historical.json?apiKey=e1f10a1e78da46f5b10a1e78da96f525&units=e&startDate=20210201
这个请求的响应是一个很大的 JSON,其中包含您要找的数据,因此可以直接请求这个接口来获取数据。(下面只是其中的一部分)
{
"metadata": {
"language": "en-US",
"transaction_id": "1631220781880:2112944028",
"version": "1",
"location_id": "KSAN:9:US",
"units": "e",
"expire_time_gmt": 1631224381,
"status_code": 200
},
"observations": [
{
"key": "KSAN",
"class": "observation",
"expire_time_gmt": 1612176660,
"obs_id": "KSAN",
"obs_name": "San Diego",
"valid_time_gmt": 1612169460,
"day_ind": "N",
"temp": 59,
"wx_icon": 27,
"icon_extd": 2700,
"wx_phrase": "Mostly Cloudy",
"pressure_tend": 2,
"pressure_desc": "Falling",
"dewPt": 45,
"heat_index": 59,
"rh": 60,
"pressure": 30.04,
"vis": 10,
"wc": 59,
"wdir": null,
"wdir_cardinal": "CALM",
"gust": null,
"wspd": 0,
"max_temp": null,
"min_temp": null,
"precip_total": null,
"precip_hrly": 0,
"snow_hrly": null,
"uv_desc": "Low",
"feels_like": 59,
"uv_index": 0,
"qualifier": null,
"qualifier_svrty": null,
"blunt_phrase": null,
"terse_phrase": null,
"clds": "BKN",
"water_temp": null,
"primary_wave_period": null,
"primary_wave_height": null,
"primary_swell_period": null,
"primary_swell_height": null,
"primary_swell_direction": null,
"secondary_swell_period": null,
"secondary_swell_height": null,
"secondary_swell_direction": null
},
{
"key": "KSAN",
"class": "observation",
"expire_time_gmt": 1612180260,
"obs_id": "KSAN",
"obs_name": "San Diego",
"valid_time_gmt": 1612173060,
"day_ind": "N",
"temp": 59,
"wx_icon": 27,
"icon_extd": 2700,
"wx_phrase": "Mostly Cloudy",
"pressure_tend": null,
"pressure_desc": null,
"dewPt": 47,
"heat_index": 59,
"rh": 64,
"pressure": 30.04,
"vis": 10,
"wc": 59,
"wdir": 260,
"wdir_cardinal": "W",
"gust": null,
"wspd": 5,
"max_temp": null,
"min_temp": null,
"precip_total": null,
"precip_hrly": 0,
"snow_hrly": null,
"uv_desc": "Low",
"feels_like": 59,
"uv_index": 0,
"qualifier": null,
"qualifier_svrty": null,
"blunt_phrase": null,
"terse_phrase": null,
"clds": "BKN",
"water_temp": null,
"primary_wave_period": null,
"primary_wave_height": null,
"primary_swell_period": null,
"primary_swell_height": null,
"primary_swell_direction": null,
"secondary_swell_period": null,
"secondary_swell_height": null,
"secondary_swell_direction": null
} ]