我正在寻找一种最佳方式:先通过 aspx 页面登录网站,然后跳转到该网站内的另一个页面开始抓取。抓取部分应该没有问题,因为我已经在一个模拟页面上测试过;但我以前没有做过身份验证,而且我发现登录请求被网站重定向到了搜索 URL。我唯一能想到的解释是:表单提交触发的是搜索框,而不是登录表单?
网站是 howdidido.co.uk,我的爬虫(spider)代码如下:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from pgcmh.items import PgcmhItem
from scrapy.http import Request
from scrapy.http import FormRequest
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders.init import InitSpider
class LoginSpider(BaseSpider):
    """Log in to howdidido.co.uk, then scrape the club handicap list.

    Request flow:

    1. ``start_requests`` fetches ``login_page``.
    2. ``login`` submits the login form found on that page.
    3. ``check_login_response`` verifies the login and, on success, requests
       every URL in ``start_urls``.
    4. ``parse`` turns each row of the handicap table into a ``PgcmhItem``.
    """

    name = 'pgcmh'
    allowed_domains = ["howdidido.co.uk"]
    login_page = 'http://howdidido.co.uk/Widgets.aspx'
    start_urls = ["http://howdidido.co.uk/ClubDetails.aspx?section=55&pagesection=handicaplist&cid=74"]

    def start_requests(self):
        """Begin the crawl at the login page rather than at ``start_urls``."""
        return self.init_request()

    def init_request(self):
        """Return the initial request list: a single GET of the login page."""
        return [Request(url=self.login_page, callback=self.login)]

    def login(self, response):
        """Fill in and submit the login form contained in ``response``.

        The credential keys are the fully qualified ASP.NET control names.
        The generic ``username``/``password`` keys used previously presumably
        did not match any input on the page, so the login fields were never
        populated.
        """
        # NOTE(review): from_response() submits the first <form> by default.
        # If the page places a search form before the login form, pass
        # formname=/formnumber= to target the right one -- confirm against
        # the live page.
        return FormRequest.from_response(
            response,
            formdata={
                'ctl00$WrappedContent$txtUser': 'dummyemail',
                'ctl00$WrappedContent$txtPass': 'dummypass',
            },
            callback=self.check_login_response,
        )

    def check_login_response(self, response):
        """Yield the scraping requests if the login succeeded, else log it.

        Success is detected by the presence of a "Logout" marker in the page
        returned after submitting the form.
        """
        # response.body is a byte string; a bytes literal keeps the membership
        # test valid on both Python 2 and Python 3.
        if b"Logout" not in response.body:
            self.log("Could not log in...")
            return
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def make_requests_from_url(self, url):
        """Build the request for one start URL, handled by ``parse``.

        ``dont_filter=True`` exempts the request from duplicate filtering so
        an authenticated fetch is always issued.
        """
        return Request(url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield one ``PgcmhItem`` per ``<tr>`` of ``table.basictable``.

        Each field holds the list returned by ``extract()`` for the matching
        ``<td>`` (cells 1-4: name, handicap, exact, category).
        """
        sel = Selector(response)
        for tablerow in sel.css('table.basictable tr'):
            item = PgcmhItem()
            item["name"] = tablerow.xpath('td[1]').extract()
            item["handicap"] = tablerow.xpath('td[2]').extract()
            item["exact"] = tablerow.xpath('td[3]').extract()
            item["category"] = tablerow.xpath('td[4]').extract()
            yield item