我想用 Selenium 读取 URL,但运行时报错:URL must be a string。
请问有什么办法可以把 URL 作为字符串传入?欢迎提出任何解决方案。
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer
# Read one URL per line from data.csv, then visit each one with Selenium.
with open("data.csv") as file:
    start_urls = [line.strip() for line in file]

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

# Pass the configured options to Chrome (they were built but never used before),
# and call driver.get() once per URL.  driver.get() expects a single string,
# not a list -- passing the whole list is what raised "URL must be a string".
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
for url in start_urls:
    driver.get(url)
@JeJe 评论:你需要遍历 start_urls 列表。start_urls 是一个字符串列表,
而 driver.get 只接受单个字符串形式的 URL——不要像下面这样:
# driver.get(start_urls)  # NOT like this -- driver.get() takes ONE url string
for sUrl in start_urls:
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
或者,如果你想跟踪进度,可以这样写:
# Visit every URL, printing a progress line before each page load.
suLen = len(start_urls)
si = 0
for sUrl in start_urls:
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
    si += 1
顺便说一句,你不需要重复 with open ... 之后的所有代码——
只要已经得到 start_urls 就足够了:
# Read one URL per line, configure Chrome once, and visit each page in turn.
with open("data.csv") as file:
    start_urls = [line.strip() for line in file]

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")

# Pass the configured options to Chrome -- they were built but never used.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###
不过,只有当你的 csv 只有一列且没有标题行时,才能直接使用
start_urls=[line.strip() for line in file]。
你也可以把代码重构成函数,分别处理流程中的每一步:
def read_file(has_header: bool = True, filename: str = "data.csv") -> list[str]:
    """Read URLs from *filename*, one per line.

    Parameters
    ----------
    has_header : bool
        When True (default), the first line is treated as a CSV header
        row and dropped from the result.
    filename : str
        Path of the file to read.  Defaults to ``"data.csv"`` so existing
        callers keep working; the previous hard-coded path is now a
        parameter.

    Returns
    -------
    list[str]
        The stripped lines of the file, minus the header when requested.
    """
    with open(filename) as file:
        start_urls = [line.strip() for line in file]
    # Drop the header row only when the caller says there is one.
    return start_urls[1:] if has_header else start_urls
def scrap_urls():
    """Open each URL listed in data.csv in a Chrome browser session.

    Reads the URL list via ``read_file`` (skipping the header row),
    starts one Chrome instance, visits every URL, and always shuts
    the browser down afterwards.
    """
    urls = read_file(has_header=True)
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    # Pass the configured options to Chrome -- previously they were
    # created but never handed to the driver.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    try:
        for url in urls:
            driver.get(url)
            ### SCRAPE AS NEEDED ###
    finally:
        # Always release the browser process, even if a page load fails;
        # the original code leaked the Chrome instance on every run.
        driver.quit()


if __name__ == '__main__':
    scrap_urls()