我有多个URL要抓取,存储在一个csv文件中,其中每一行都是一个单独的URL,我使用此代码来运行它
def start_requests(self):
    """Yield a scrapy.Request for every non-empty URL listed in the CSV file.

    The CSV has one URL per row in its only (first) column, so the row must
    be indexed with 0 — indexing column 9 raised
    ``IndexError: list index out of range``.
    """
    urls = []  # renamed from `list`, which shadowed the builtin
    # Open in text mode: str.split(',') fails on the bytes that 'rb' yields.
    with open('csvfile') as f:
        for line in f:
            # strip the trailing newline so the URL is usable as-is
            fields = line.strip().split(',')
            urls.append(fields[0])
    # NOTE(review): this drops the first row — presumably a header; the
    # sample file shown has no header, so confirm this is intended.
    urls.pop(0)
    for url in urls:
        if url != "":
            yield scrapy.Request(url=url, callback=self.parse)
运行时它抛出了如下错误:`IndexError: list index out of range`。
有人能帮我找出并纠正这个错误吗?或者建议另一种读取csv文件中URL的方法?
编辑:csv文件如下所示:
http://example.org/page1
http://example.org/page2
有9个这样的行
您可以直接用标准库的 csv 模块把文件逐行读取出来,无需上面那些额外的处理代码——也就是说不再需要 `split`、`pop` 和 `append`。
工作示例
import csv
import scrapy
from scrapy.crawler import CrawlerProcess
class QuotesSpider(scrapy.Spider):
    """Spider that crawls every URL listed in websites.csv (one URL per row)."""

    name = "quotes"

    def start_requests(self):
        """Read URLs from the CSV and yield a request for each non-empty one."""
        with open('websites.csv') as csv_file:
            for row in csv.reader(csv_file):
                # Supposing that the data is in the first column
                url = row[0]
                if url == "":
                    continue
                # We need to check this has the http prefix or we get a Missing scheme error
                if not url.startswith(('http://', 'https://')):
                    url = 'https://' + url
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Do my data extraction
        print("test")
if __name__ == "__main__":
    # Run the spider standalone, identifying as a desktop Chrome browser.
    settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    }
    process = CrawlerProcess(settings)
    process.crawl(QuotesSpider)
    process.start()