I have created a Scrapy crawl spider with Rule objects and a LinkExtractor to scrape Amazon's bestselling products, but Scrapy sometimes returns None as a result even though I know my XPath expressions are correct. The strange part is that Scrapy only returns None occasionally, not all the time. Here is my spider.py code:
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..items import AmzbestsellerItem  # assuming the default Scrapy project layout


class AmzcrawlSpider(CrawlSpider):
    name = 'amzcrawl'
    allowed_domains = ['amazon.com']
    start_urls = ['https://www.amazon.com/best-sellers-books-Amazon/zgbs/books/ref=zg_bs_unv_b_1_1_1/']

    rules = (
        # THIS RULE IS FOR THE FIRST PAGE OF BESTSELLERS
        Rule(LinkExtractor(restrict_xpaths='//span[@class="zg_selected"]/../following-sibling::ul/li/a'),
             callback='parse_category', follow=True),
        # THIS RULE IS FOR THE SECOND PAGE OF BESTSELLERS
        Rule(LinkExtractor(restrict_xpaths='//ul[@class="a-pagination"]/li[@class="a-last"]/a'),
             callback='parse_category', follow=True),
    )

    def parse_category(self, response):
        item = AmzbestsellerItem()
        item['dir_level_1'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/li[@class="zg_browseUp"]/a/text())').get()
        item['dir_level_2'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/li[@class="zg_browseUp"]/a/text())').get()
        item['dir_level_3'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/ul/li[@class="zg_browseUp"]/a/text())').get()
        item['dir_level_4'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/ul/ul/li[@class="zg_browseUp"]/a/text())').get()
        item['dir_level_5'] = response.xpath('normalize-space(//ul[@id="zg_browseRoot"]/ul/ul/ul/ul/ul/li[@class="zg_browseUp"]/a/text())').get()
        # NAME OF CURRENT BESTSELLER CATEGORY PAGE
        item['category_name'] = response.xpath('normalize-space(//span[@class="zg_selected"]/text())').get()
        # URL OF CURRENT BESTSELLER CATEGORY PAGE
        item['category_url'] = response.url.split('/ref')[0]
        # FOLLOW EACH INDIVIDUAL PRODUCT PAGE TO GET ITS DETAILS
        book_containers = response.xpath('//ol[@id="zg-ordered-list" and @class="a-ordered-list a-vertical"]/li')
        for book in book_containers:
            book_href = book.xpath('./span[@class="a-list-item"]/div[@class="a-section a-spacing-none aok-relative"]/span[@class="aok-inline-block zg-item"]/a[@class="a-link-normal"]/@href').get()
            book_url = response.urljoin(book_href)
            item['book_url'] = book_url.split('/ref')[0]
            yield Request(book_url, callback=self.parse_book, meta={'item': item}, dont_filter=True)

    # GETTING INDIVIDUAL BOOK DETAILS. THIS IS WHERE PROBLEMS ARISE: I CANNOT GET ALL
    # THE DETAILS EVEN THOUGH THE XPATH EXPRESSIONS ARE CORRECT.
    def parse_book(self, response):
        item = response.meta['item']
        item['book_referer'] = (response.request.headers.get('Referer') or b'').decode('utf-8')
        item['title'] = response.xpath('normalize-space(//span[@id="productTitle"])').get()
        item['edition'] = response.xpath('normalize-space(//h1[@id="title" and @class="a-spacing-none a-text-normal"]/span[@id="productSubtitle" and @class="a-size-large a-color-secondary"]/text())').get()
        item['author'] = response.xpath('//span[@class="author notFaded"]//a[@class="a-link-normal contributorNameID"]/text() | //span[@class="author notFaded"]/a[@class="a-link-normal"]/text()').getall()
        item['rating_num'] = response.xpath('//div[@id="averageCustomerReviews"]//span[@id="acrCustomerReviewText" and @class="a-size-base"]/text()').get()
        item['img_url'] = response.xpath('//div[@id="main-image-container"]//img/@src').get()
        item['publisher'] = response.xpath('//div[@id="detailBullets_feature_div"]//span[contains(text(),"Publisher")]/following-sibling::span/text()').get()
        item['language'] = response.xpath('//div[@id="detailBullets_feature_div"]//span[contains(text(),"Language")]/following-sibling::span/text()').get()
        item['isbn10'] = response.xpath('//div[@id="detailBullets_feature_div"]//span[contains(text(),"ISBN-10")]/following-sibling::span/text()').get()
        item['isbn13'] = response.xpath('//div[@id="detailBullets_feature_div"]//span[contains(text(),"ISBN-13")]/following-sibling::span/text()').get()
        item['asin'] = response.xpath('//div[@id="detailBullets_feature_div"]//span[contains(text(),"ASIN")]/following-sibling::span/text()').get()
        item['kindle_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Kindle")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['etextbook_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"eTextbook")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['paperback_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Paperback")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['hardcover_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Hardcover")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        item['spiral_price'] = response.xpath('//span[@class="a-size-large mediaTab_title" and contains(text(),"Spiral-bound")]/../following-sibling::div/span[@class="a-size-base mediaTab_subtitle"]/text()').get()
        yield item
I am not sure whether the problem is in my spider.py file or my pipelines.py file. Here is the code from my pipelines.py file:
from itemadapter import ItemAdapter
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import CsvItemExporter


class AmzbestsellerPipeline:
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, delimiter=";")
        self.exporter.fields_to_export = [
            'dir_level_1', 'dir_level_2', 'dir_level_3', 'dir_level_4', 'dir_level_5',
            'category_name', 'category_url', 'cat_page_num', 'cat_referer', 'book_url',
            'book_referer', 'title', 'edition', 'author', 'rating_num', 'img_url',
            'publisher', 'language', 'isbn10', 'isbn13', 'asin',
            'kindle_price', 'etextbook_price', 'paperback_price', 'hardcover_price', 'spiral_price',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        if adapter['book_url'] in self.ids_seen:
            raise DropItem(f"Duplicate item found: {item!r}")
        else:
            #self.ids_seen.add(adapter['cat_page_num'])
            self.ids_seen.add(adapter['book_url'])
            return item
I have tested your code, and the error you seem to be getting is 503 Service Unavailable. This is a typical error when scraping Amazon, and the simplest solution is to set a USER_AGENT such as "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36" and enable cookies.
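In settings.py that amounts to something like the following (a minimal sketch: USER_AGENT and COOKIES_ENABLED are standard Scrapy settings, but the DOWNLOAD_DELAY value is my own suggestion rather than something your code requires):

    # settings.py
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.163 Safari/537.36')
    COOKIES_ENABLED = True  # Scrapy's default; make sure it has not been disabled
    DOWNLOAD_DELAY = 1      # optional suggestion: throttling also helps avoid 503s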
As for the fields that come back as None, remember to always write your XPath against the page source, not against the already-rendered page you see in the browser. In your case, many of the fields you query are not present in the page source, so they show up as None.
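A quick way to verify which case you are in is to test a marker string against response.text, the raw HTML Scrapy actually received, inside the callback. This is only a diagnostic sketch; 'detailBullets_feature_div' is one of the ids from your own XPaths, and any id you expect in the source works the same way:

    def parse_book(self, response):
        # If this id is absent from the raw HTML Scrapy downloaded, every XPath
        # that depends on it can only return None: the section was injected by
        # JavaScript after page load and never reached the spider.
        if 'detailBullets_feature_div' not in response.text:
            self.logger.warning('detail bullets missing from source: %s', response.url)
        ...

You can run the same checks interactively with scrapy shell, which queries exactly the HTML your spider receives rather than the DOM your browser renders.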