如何删除不返回值的按钮?



我试图从网站https://tonaton.com/en/ads/ghana/electronics刮。有一个"下一个"按钮,我想点击并抓取内容。问题是该按钮的xpath或css选择器在scrapy shell和splash中都不返回任何值,我被卡住了。我没法进去取我需要的东西。请帮忙好吗?这就是我所能做到的,但我没有得到正确的结果。

# -*- coding: utf-8 -*-

进口scrapy进口scrapy_selenium从scrapy_selenium导入SeleniumRequest

类VisionSpider (scrapy.Spider):Name = 'vision'

def start_requests(self):
yield SeleniumRequest(
url= 'https://tonaton.com',
wait_time=3,
screenshot=True,
callback=self.parse
)

def parse(self, response): 
businesses = response.xpath(
"//a[@class='link--1t8hM gtm-home-category-link-click']")
for business in businesses:
link = business.xpath(".//@href").get()
category = business.xpath(".//div[2]/p/text()").get()
yield response.follow(url=link, callback=self.parse_business, meta={'business_category': category})

def parse_business(self, response):

category = response.request.meta['business_category']
rows = response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']")
for row in rows:
new_link = row.xpath(".//@href").get()
yield response.follow(url=new_link, callback=self.next_parse, meta={'business_category': category})
next_page = response.xpath("//div[@class = 'action-button--1O8tU']")
if next_page:
button = next_page.click()
yield SeleniumRequest(
url=button,
wait_time=3,
callback=self.parse
)

def next_parse(self, response):
category = response.request.meta['business_category']
lines = response.xpath("//a[@class='member-link--IzDly gtm-visit-shop']")
for line in lines:
next_link = line.xpath(".//@href").get()
yield response.follow(url=next_link, callback=self.another_parse, meta={'business_category': category})
def another_parse(self, response):
category = response.request.meta['business_category']
button = response.xpath("//button[@class = 'contact-section--1qlvP gtm-show-number']").click()

yield response.follow(url=button, callback=self.new_parse, meta={'business_category': category})

def new_parse(self, response):
category = response.request.meta['business_category']
times = response.xpath("//div[@class='info-container--3pMhK']")
for time in times:
name = time.xpath(".//div/span/text()").get()
location = time.xpath(".//div/div/div/span/text()").get()
phone = time.xpath(".//div[3]/div/button/div[2]/div/text()").get()
yield {
'business_category': category,
'business_name': name,
'phone': phone,
'location': location
}

我已经试过了,但是分页仍然不工作。此外,当我单击调用按钮进行抓取时,需要相当长的时间才能返回所需的输出。有没有办法让它快一点?

class VisionSpider(scrapy.Spider):
name = 'vision'
main_domains = ['tonaton.com']
start_urls =['https://tonaton.com']
def parse(self, response):   
businesses = response.xpath("//a[@class='link--1t8hM gtm-home-category-link-click'][1]")
for business in businesses:
link = business.xpath(".//@href").get()
category = business.xpath(".//div[2]/p/text()").get()
yield response.follow(url=link, callback=self.parse_business, meta={'business_category': category})

def parse_business(self, response):
category = response.request.meta['business_category']
rows = response.xpath("//a[@class='card-link--3ssYv gtm-ad-item']")
for row in rows:
new_link = row.xpath(".//@href").get()
if new_link:
yield response.follow(url=new_link, callback=self.new_parse, meta={'business_category': category, 'newlink':new_link})
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_path = which("chromedriver")
driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
driver.get(response.url)
driver.maximize_window
next_page = wait(driver, 300).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//div[@class='icon--3D09z extra-small--_AIuZ arrow-right--17oRn']"))) 
if  next_page:
next_page.click()
yield SeleniumRequest(callback=self.parse_business)

driver.close()

def new_parse(self, response):
category = response.request.meta['business_category']
chrome_options = Options()
chrome_options.add_argument("--headless")
# options=chrome_options
chrome_path = which("chromedriver")  
driver = webdriver.Chrome(options=chrome_options, executable_path=chrome_path)
driver.get(response.url)
driver.maximize_window
category = response.request.meta['business_category']
call_button = wait(driver, 500).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='call-button--3uvWj']")))
call_button.click()

html = driver.page_source
resp = Selector(text=html)
driver.close()
contacts = resp.xpath("//div[@class='call-button--3uvWj']/div[1]")
for contact in contacts:
phone = contact.xpath(".//text()").get()
times = resp.xpath("//div[@class='details-section--2ggRy']")
for time in times:
name = time.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/text()").get()
if name is None:
name =time.xpath(".//div[2]/div/div[2]/div/div/div/div/div/div/div/div/div/text()").get()
location = time.xpath(".//div/div/div/span/a/span/text()[1]").get()
region = time.xpath(".//div/div/div/span/a[2]/span/text()").get()
yield {
'business_category': category,
'business_name': name,
'phone': phone,
'region':region,
'location': location
}

最新更新