> I am trying to go into multiple URLs of a website and, from the response, grab another set of URLs on the page for the next parser. From that page I then need the URL of the next page, but I want to try getting it by parsing the page string and passing the result along as the next page. The scraper crawls, but it returns no output from the final parser, not even when I load the item.

Note: I know I could simply use an if statement on the href to get the next page. However, I want to try something different in case I ever face a situation where I have to do it this way.

Here is my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()

class ZooplasSpider(scrapy.Spider):
    name = 'zooplas'
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    def start_request(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback = self.parse, )

    def parse(self, response):
        container = response.xpath("//ul[@class='list-inline list-unstyled']//li")
        for links in container:
            urls = links.xpath(".//a/@href").get()
            yield response.follow(
                urls, callback = self.parse_places
            )

    def parse_places(self, response):
        container = response.xpath("//ul[@class='listing-results clearfix js-gtm-list']//li")
        for links in container:
            urls = links.xpath('(//div[@class="listing-results-right clearfix"]//a)[position() mod 3=1]//@href').get()
            yield response.follow(
                urls, callback = self.parse_listings
            )

        if response.xpath("//div[@id='content']//div//h1//text()").extract_first():
            page_on = response.xpath("//div[@id='content']//div//h1//text()").extract_first()
            name_of_page = page_on.split()[-1]
        else:
            pass

        if response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first():
            url_link = response.xpath("(//div[@class='paginate bg-muted'])//a[last()-1]//href").extract_first()
            url_link = url_link.split('/')
            last_page = url_link[-1].split('=')[-1]
        else:
            pass

        all_pages = []
        for index, n in enumerate(url_link):
            for page_name, page_num in zip(name_of_page, last_page):
                if index == 5:
                    url_link[index] = page_name
                    testit = '/'.join(url_link)
                    equal_split = testit.split('=')
                    for another_i, n2 in enumerate(equal_split):
                        if another_i == 3:
                            for range_val in range(1, page_num + 1):
                                equal_split[another_i] = str(2)
                                all_pages.append('='.join(equal_split))

        for urls in all_pages:
            yield response.follow(
                urls, callback = self.parse.places
            )

    def parse_listings(self, response):
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default.output_processor = TakeFirst()
        loader.add_xpath("//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        yield loader.load_item()

process = CrawlerProcess(
    settings = {
        'FEED_URI': 'zoopla.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ZooplasSpider)
process.start()
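For illustration, the simple "if statement on the href" approach I mentioned in the note would look something like this inside parse_places. This is only a sketch: it reuses the paginate div from my code above, and it assumes the last a element in that div really is the next-page link:

# sketch: follow the next-page link directly instead of rebuilding the URL
next_page = response.xpath("//div[@class='paginate bg-muted']//a[last()]/@href").get()
if next_page:
    yield response.follow(next_page, callback=self.parse_places)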
I know the URL-building approach works, because I have already tried it on a single URL with the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany','France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j=0
for index, n in enumerate(a):
for l_stuff, p_stuff in zip(list_of_stuff,pages_of_stuff):
if index == 5:
a[index] = l_stuff
testit='/'.join(a)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, p_stuff+1):
equal_split[another_i] = str(range_val)
print('='.join(equal_split))
This is the same logic as the one used above, only with different variable names. It outputs the following links, which are valid:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
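The same set of page URLs could also be built without relying on fixed split indices by editing the query string directly. Here is a minimal sketch using only the standard library; the country list, page counts and the pn parameter are taken from the snippet above:

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany', 'France']
pages_of_stuff = [5, 7, 6]

scheme, netloc, path, query, fragment = urlsplit(url)
params = parse_qs(query)  # {'new_homes': ['include'], 'include_sold': ['false'], 'pn': ['16']}

for country, last_page in zip(list_of_stuff, pages_of_stuff):
    segments = path.split('/')   # ['', 'overseas', 'property', 'ireland', '']
    segments[3] = country        # swap in the country name
    new_path = '/'.join(segments)
    for page in range(1, last_page + 1):
        params['pn'] = [str(page)]  # overwrite the page-number parameter
        print(urlunsplit((scheme, netloc, new_path, urlencode(params, doseq=True), fragment)))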
Your use case is a good fit for a Scrapy CrawlSpider. You can write rules for how to extract the links to the properties and how to extract the links to the next pages. I have changed your code to use the CrawlSpider class, and I have changed your feed export to use the recommended FEEDS setting; FEED_URI and FEED_FORMAT are deprecated in newer versions of Scrapy.

Read more about CrawlSpider in the docs.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader

class ZooplasItem(scrapy.Item):
    stuff = Field()
    country = Field()

class ZooplasSpider(CrawlSpider):
    name = 'zooplas'
    allowed_domains = ['zoopla.co.uk']
    start_urls = ['https://www.zoopla.co.uk/overseas/']

    rules = (
        Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True),  # follow the countries links
        Rule(LinkExtractor(restrict_css='div.paginate'), follow=True),  # follow pagination links
        Rule(LinkExtractor(restrict_xpaths="//a[contains(@class,'listing-result')]"), callback='parse_item', follow=True),  # follow the link to actual property listing
    )

    def parse_item(self, response):
        # here you are on the details page for each property
        loader = ItemLoader(ZooplasItem(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("stuff", "//article[@class='dp-sidebar-wrapper__summary']//h1//text()")
        loader.add_xpath("country", "//li[@class='ui-breadcrumbs__item'][3]/a/text()")
        yield loader.load_item()

if __name__ == '__main__':
    process = CrawlerProcess(
        settings={
            'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
            'FEEDS': {
                'zoopla.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(ZooplasSpider)
    process.start()
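For reference, this is how the deprecated settings from your original script map onto the FEEDS dictionary; if I recall correctly FEEDS was introduced in Scrapy 2.1, and both forms write the same zoopla.jl JSON-lines file:

# deprecated style, as in the original script
settings = {
    'FEED_URI': 'zoopla.jl',
    'FEED_FORMAT': 'jsonlines',
}

# recommended style: one FEEDS dict keyed by the output URI
settings = {
    'FEEDS': {
        'zoopla.jl': {'format': 'jsonlines'},
    }
}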