Reading URLs from a CSV file



I want to read the URLs, but it gives me the error "URL must be a string". Is there any possible solution? Please suggest how I can pass the URL as a string.

from selenium import webdriver           
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import requests
from csv import writer

with open("data.csv") as file:
    start_urls = [line.strip() for line in file]
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get(start_urls)

@JeJe commented:

You need to loop through the start_urls list.

start_urls is a list of strings, but driver.get expects a single URL as a string - something like:

# driver.get(start_urls) # NOT like this
for sUrl in start_urls:
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###

Or, if you want to keep track of progress, something like:

suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###

By the way, you don't need to keep everything under with open... - only building start_urls needs the open file; everything else can come after the block:

with open("data.csv") as file:
    start_urls = [line.strip() for line in file]

options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

suLen = len(start_urls)
for si, sUrl in enumerate(start_urls):
    print(f'[{si} of {suLen}] reading {sUrl}')
    driver.get(sUrl)
    ### SCRAPE AS NEEDED ###

However, you can only use start_urls = [line.strip() for line in file] if your CSV has a single column and no header row.
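
If your CSV does have a header row or more than one column, here is a minimal sketch using Python's csv module; the column name "url" is an assumption - adjust the key to match your actual header:

import csv

with open("data.csv", newline="") as file:
    reader = csv.DictReader(file)
    # "url" is an assumed column name - change it to match your header
    start_urls = [row["url"].strip() for row in reader]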

You could also rewrite your code with functions that handle each step separately:


def read_file(has_header: bool = True):
    with open("data.csv") as file:
        start_urls = [line.strip() for line in file]
    # assumes there is a header row to skip
    if has_header:
        return start_urls[1:]
    else:
        return start_urls

def scrap_urls():
    urls = read_file(has_header=True)
    options = webdriver.ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    for url in urls:
        driver.get(url)

if __name__ == '__main__':
    scrap_urls()
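
Since your original code already imports writer from csv, here is a minimal sketch of how scrap_urls could save one row per page; driver.title is only a placeholder for whatever you actually scrape, and results.csv is an assumed output file name:

from csv import writer
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def scrap_urls():
    urls = read_file(has_header=True)   # helper defined above
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    with open("results.csv", "w", newline="") as out:
        csv_writer = writer(out)
        csv_writer.writerow(["url", "title"])            # header row
        for url in urls:
            driver.get(url)
            # replace driver.title with the fields you really need
            csv_writer.writerow([url, driver.title])
    driver.quit()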
