如何将多个网页爬网数据输出到CSV文件中使用Python与Scrapy



我在下面有以下代码,它从网站上抓取所有可用的页面。这是完美地"抓取"有效页面,因为当我使用打印功能时 - 我可以看到"项目"列表中的数据,但是当我尝试使用".csv"作为目标文件转储统计信息时,我没有看到任何输出。(在命令提示符下使用此命令:"刮擦爬行 craig -o test.csv -t csv'),..请帮我将数据输出到"csv"文件中。

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
URL = "http://example.com/subpage/%d"

class MySpider(BaseSpider):
  name = "craig"
  allowed_domains = ["xyz.com"]
  #for u in URL:
  start_urls = [URL % 1]
  def __init__(self):
        self.page_number = 1
  def parse(self, response):
      hxs = HtmlXPathSelector(response)
      titles = hxs.select("//div[@class='thumb']")
      if not titles:
            raise CloseSpider('No more pages')
      items = []
      for titles in titles:
          item = CraigslistSampleItem()
          item ["title"] = titles.select("a/@title").extract()
          item ["url"] = titles.select("a/@href").extract()
          items.append(item)
      yield items

      self.page_number += 1
      yield Request(URL % self.page_number)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
URL = "http://example.com/subpage/%d"

class MySpider(BaseSpider):
  name = "craig"
  allowed_domains = ["xyz.com"]
  def start_requests(self):
      for i in range(10):
          yield Request(URL % i, callback=self.parse)
  def parse(self, response):
      titles = response.xpath("//div[@class='thumb']")
      if not titles:
            raise CloseSpider('No more pages')
      for title in titles:
          item = CraigslistSampleItem()
          item ["title"] = title.xpath("./a/@title").extract()
          item ["url"] = title.xpath("./a/@href").extract()
          yield item

最新更新