I'm using Scrapy, crawling one page at a time, and everything works as expected. The trouble starts when I want to extract a link from page 1 and then scrape page 2 via the link extracted from page 1.
I have item variables, and I want one of them to hold the value of an element on the second page: the time the ad was posted.
Below are my code and the stack trace. I know I'm going about this the wrong way, but after slogging through the docs I can't get anything returned. Any help would be greatly appreciated.
Spider file:
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import time
import datetime

from stack.items import StackItem


class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["donedeal.ie"]
    start_urls = [
        "http://www.donedeal.ie/find/all/for-sale/Ireland/?sort=AGE+DESC&source=ALL&start=0",
    ]

    def parse(self, response):
        adverts = Selector(response).xpath('//div[@class="listing-info"]')
        for advert in adverts:
            ts = time.time()
            item = StackItem()
            item['county'] = advert.xpath('span[@class="county"]/text()').extract()[0]
            item['section'] = advert.xpath('span[@class="section"]/a/text()').extract()[0]
            item['price'] = advert.xpath('div[@class="price shadow-default rnd-corners-left"]/a/text()').extract()[0]
            pricelink = advert.xpath('div[@class="price shadow-default rnd-corners-left"]/a/@href').extract()[0]
            request = Request(pricelink, callback=self.parse_page2)
            item['other'] = request
            item['title'] = advert.xpath('div[@class="title"]/a/span/text()').extract()[0]
            item['timestamp'] = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            yield item

    def parse_page2(self, response):
        newitem = response.xpath('//*[@id="adage"]/span').extract()
        return newitem
Traceback (for one iteration; the same error repeats for every item):
2015-03-26 12:40:36+0000 [stack] ERROR: Error processing {'county': u'Dublin',
 'other': <GET http://cars.donedeal.ie/cars-for-sale/audi-a4-2-0-tdi-120hp-se/8853726?offset=30>,
 'price': u'\u20ac12,950',
 'section': u'Cars',
 'timestamp': '2015-03-26 12:40:36',
 'title': u'Audi A4 2.0 TDi 120hp SE'}
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
    return process_chain(self.methods[methodname], obj, *args)
  File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
    d.callback(input)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
    self._startRunCallbacks(result)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
    self._runCallbacks()
--- <exception caught here> ---
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/usr/local/lib/python2.7/dist-packages/scrapy_mongodb.py", line 222, in process_item
    return self.insert_item(item, spider)
  File "/usr/local/lib/python2.7/dist-packages/scrapy_mongodb.py", line 251, in insert_item
    self.collection.insert(item, continue_on_error=True)
  File "/usr/local/lib/python2.7/dist-packages/pymongo/collection.py", line 409, in insert
    gen(), check_keys, self.uuid_subtype, client)
bson.errors.InvalidDocument: Cannot encode object: <GET http://cars.donedeal.ie/cars-for-sale/audi-a4-2-0-tdi-120hp-se/8853726?offset=30>
2015-03-26 12:40:36+0000 [stack] INFO: Closing spider (finished)
2015-03-26 12:40:36+0000 [stack] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 273,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 19464,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 3, 26, 12, 40, 36, 72670),
'log_count/DEBUG': 3,
'log_count/ERROR': 30,
'log_count/INFO': 8,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2015, 3, 26, 12, 40, 35, 266551)}
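The problem is the line item['other'] = request: a scrapy Request object ends up inside the item, and when the scrapy_mongodb pipeline hands the item to pymongo, BSON refuses to encode it, because BSON only handles plain Python types (dicts, lists, strings, numbers, datetimes and so on). A minimal sketch of the same failure outside Scrapy (assuming pymongo's bundled bson module, the one raising InvalidDocument above):

from bson import BSON
from scrapy.http import Request

# BSON encodes plain Python values fine, but chokes on arbitrary
# objects such as a scrapy Request:
BSON.encode({'price': u'\u20ac12,950'})                     # works
BSON.encode({'other': Request('http://www.donedeal.ie/')})  # raises bson.errors.InvalidDocument

The fix is to keep the Request out of the item entirely: build the item in parse(), attach it to the follow-up request via request.meta, yield the request instead of the item, then complete and return the item in parse_page2(). Reworked spider: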
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
import time
import datetime

from stack.items import StackItem


class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["donedeal.ie"]
    start_urls = [
        "http://www.donedeal.ie/find/all/for-sale/Ireland/?sort=AGE+DESC&source=ALL&start=0",
    ]

    def parse(self, response):
        adverts = Selector(response).xpath('//div[@class="listing-info"]')
        for advert in adverts:
            ts = time.time()
            item = StackItem()
            item['county'] = advert.xpath('span[@class="county"]/text()').extract()[0]
            item['section'] = advert.xpath('span[@class="section"]/a/text()').extract()[0]
            item['price'] = advert.xpath('div[@class="price shadow-default rnd-corners-left"]/a/text()').extract()[0]
            pricelink = advert.xpath('div[@class="price shadow-default rnd-corners-left"]/a/@href').extract()[0]
            item['title'] = advert.xpath('div[@class="title"]/a/span/text()').extract()[0]
            item['timestamp'] = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            # Attach the partially filled item to the follow-up request
            # instead of storing the request inside the item.
            request = Request(pricelink, callback=self.parse_page2)
            request.meta['item'] = item
            yield request

    def parse_page2(self, response):
        # Retrieve the item carried over from parse() and complete it;
        # returning it here sends the finished item on to the pipelines.
        item = response.meta['item']
        item['something'] = response.xpath('//*[@id="adage"]/span').extract()
        return item
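With this flow, parse() yields only requests and parse_page2() returns the completed item, so the only things that ever reach the MongoDB pipeline are plain items that BSON can encode. One trade-off worth noting: if a detail page is never fetched (error, redirect off-domain), its item is dropped along with the request. Run it as before from the project directory:

scrapy crawl stack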