在我的零碎代码中,FormRequest请求获取下一页,我正在重定向到主页,但在浏览器中,页面将转到下一页。我想我已经给出了所有的表格数据。
from scrapy.selector import Selector
from scrapy.http import Request, FormRequest
from scrapy.contrib.spiders import CrawlSpider
from scrapy.shell import inspect_response
class SampleSpider(CrawlSpider):
name = 'samplespider'
start_urls = ['http://jobs.hiltonworldwide.com/en/jobs/job-search-results']
def parse(self, response):
sel = Selector(response)
inspect_response(response)
eventTarget = 'phmain_0$phmaincontent_0$phjobsearchresults_0$next_page'
VIEWSTATE = sel.xpath("//input[@name='__VIEWSTATE']/@value").extract()[0]
EVENTVALIDATION = sel.xpath("//input[@name='__EVENTVALIDATION']/@value").extract()[0]
hdnIPAddress = sel.xpath("//input[@name='phheader_0$hdnIPAddress']/@value").extract()[0]
hdnPageCount = sel.xpath("//input[@name='phmain_0$phmaincontent_0$phjobsearchresults_0$hdnPageCount']/@value").extract()[0]
hdnPageIndex = sel.xpath("//input[@name='phmain_0$phmaincontent_0$phjobsearchresults_0$hdnPageIndex']/@value").extract()[0]
form_data = {
'__EVENTTARGET': eventTarget,
'__EVENTARGUMENT': "",
'__LASTFOCUS': "",
'__VIEWSTATE': VIEWSTATE,
'__EVENTVALIDATION': EVENTVALIDATION,
'phheader_0$hdnIPAddress': hdnIPAddress,
'phmain_0$phbannerinfo_0$phcountryinfo_0$ddlCountry':"Worldwide",
'phmain_0$phmaincontent_0$phjobsearchresults_0$albLanguage': "91351",
'phmain_0$phmaincontent_0$phjobsearchresults_0$LoginEmail': "",
'phmain_0$phmaincontent_0$phjobsearchresults_0$hdnPageCount': hdnPageCount,
'phmain_0$phmaincontent_0$phjobsearchresults_0$hdnPageIndex': hdnPageIndex,
'phmain_0$phmaincontent_0$phjobsearch_0$ddlCity': "-1",
'phmain_0$phmaincontent_0$phjobsearch_0$albBrands': "-1",
'phmain_0$phmaincontent_0$phjobsearch_0$albTalentAreas': "-1",
}
yield FormRequest(
'http://jobs.hiltonworldwide.com/en/jobs/job-search-results',
formdata=form_data,
callback=self.parse
)
我有什么遗漏或做错的吗?正确对进行分页的解决方案是什么
当我试图重现问题时,帮助我的是指定User-Agent
标头:
yield FormRequest(
'http://jobs.hiltonworldwide.com/en/jobs/job-search-results',
formdata=form_data,
callback=self.parse,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36'}
)