我正在尝试抓取这个网站。我正在使用Scrapy的Request
,但它不起作用,代码显示出异常行为。以下是我的代码:
# -*- coding: utf-8 -*-
from scrapy.spiders import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request,Response
import re
import csv
import time
from selenium import webdriver
class ColdWellSpider(BaseSpider):
name = "cwspider"
allowed_domains = ["coldwellbankerhomes.com"]
#start_urls = [''.join(row).strip() for row in csv.reader(open("remaining_links.csv"))]
#start_urls = ['https://www.coldwellbankerhomes.com/fl/boynton-beach/5451-verona-drive-unit-d/pid_9266204/']
start_urls = ['https://www.coldwellbankerhomes.com/fl/miami-dade-county/kvc-17_1,17_3,17_2,17_8/incl-22/']
def parse(self,response):
#browser = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true', '--load-images=false'])
browser = webdriver.Firefox()
browser.maximize_window()
browser.get(response.url)
time.sleep(5)
#to extract all the links from a page and send request to those links
self.getlink(response)
#for clicking the load more button in the page
while True:
try:
browser.find_element_by_class_name('search-results-load-more').find_element_by_tag_name('a').click()
time.sleep(3)
self.getlink(browser)
except:
break
def getlink(self,response):
print 'hhelo'
c = open('data_getlink.csv', 'a')
d = csv.writer(c, lineterminator='n')
print 'hello2'
listclass = response.xpath('//div[@class="list-items"]/div[contains(@id,"snapshot")]')
for l in listclass:
link = 'http://www.coldwellbankerhomes.com/'+''.join(l.xpath('./h2/a/@href').extract())
d.writerow([link])
yield Request(url = str(link),callback=self.parse_link)
#callback function of Request
def parse_link(self,response):
b = open('data_parselink.csv', 'a')
a = csv.writer(b, lineterminator='n')
a.writerow([response.url])
问题出在yield Request(url = str(link),callback=self.parse_link)
上。当我删除这行代码时,getlink
函数被完美地调用,链接被写入data_getlink.csv
文件。但是,如果代码中存在上述行,则不会调用整个getlink函数,因此也不会调用回调函数。任何帮助都是非常有用的
问题出在yield
语句上。
当存在yield
语句时,getlink
函数将成为迭代器函数,因此在请求第一个迭代项之前不会执行其主体。
要解决此问题,请通过以下方式调用getlink
函数:
for i in self.getlink(browser): # actually, browser or response here?
yield i
或蟒蛇3:
yield from self.getlink(browser)