我在下面有以下代码，它从网站上抓取所有可用的页面。它能正常抓取有效页面——当我使用打印功能时，可以在"项目"列表中看到数据；但是当我尝试把抓取结果导出为 ".csv" 文件时，却没有任何输出（我在命令提示符下使用的命令是：`scrapy crawl craig -o test.csv -t csv`）。请帮我把数据输出到 "csv" 文件中。
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
# URL template for paginated listing pages; %d is filled with the page number.
URL = "http://example.com/subpage/%d"
class MySpider(BaseSpider):
    """Crawl paginated listing pages, emitting one item per thumbnail link.

    Follows pages sequentially (page 1, 2, 3, ...) until a page with no
    ``div.thumb`` elements is reached, then stops the crawl.
    """
    name = "craig"
    allowed_domains = ["xyz.com"]
    start_urls = [URL % 1]

    def __init__(self):
        # Next page number to request after the current one is parsed.
        self.page_number = 1

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[@class='thumb']")
        if not titles:
            # An empty page means we ran past the last page of results.
            raise CloseSpider('No more pages')
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/@title").extract()
            item["url"] = title.select("a/@href").extract()
            # BUG FIX: yield each Item individually. The original code
            # appended items to a list and did ``yield items`` once, which
            # hands the feed exporter a plain Python list instead of Item
            # objects — so ``scrapy crawl craig -o test.csv -t csv``
            # produced an empty file.
            yield item
        self.page_number += 1
        yield Request(URL % self.page_number)
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.exceptions import CloseSpider
from scrapy.http import Request
from test.items import CraigslistSampleItem
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
# URL template for paginated listing pages; %d is filled with the page number.
URL = "http://example.com/subpage/%d"
class MySpider(BaseSpider):
    """Scrape a fixed window of listing pages, yielding one item per thumbnail.

    Pages 0 through 9 are scheduled up front via ``start_requests``; the crawl
    is aborted early if a page contains no ``div.thumb`` elements.
    """

    name = "craig"
    allowed_domains = ["xyz.com"]

    def start_requests(self):
        # Schedule all candidate pages up front rather than chaining requests.
        for page in range(10):
            yield Request(URL % page, callback=self.parse)

    def parse(self, response):
        thumbs = response.xpath("//div[@class='thumb']")
        if not thumbs:
            # No thumbnails on this page — treat it as the end of the data.
            raise CloseSpider('No more pages')
        for thumb in thumbs:
            entry = CraigslistSampleItem()
            entry["title"] = thumb.xpath("./a/@title").extract()
            entry["url"] = thumb.xpath("./a/@href").extract()
            yield entry