使用 Python Scrapy 从网站提取第一个出现的电子邮件地址



代码没有按预期工作。我希望它抓取一个网站的所有子页面,并提取第一个出现的电子邮件地址。不幸的是,它只对第一个网站有效,后续的网站都无法正常抓取。更多信息请参见下面的代码。

import time
from urllib.parse import urlparse

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver.common.by import By
class MySpider(CrawlSpider):
    """Crawl one site and stop as soon as a ``mailto:`` link is found.

    ``allowed_domains`` and ``start_urls`` are intentionally empty here; the
    caller is expected to supply them before the crawl starts.
    """

    name = 'myspider'
    allowed_domains = []  # will be set dynamically
    start_urls = []  # will be set dynamically
    rules = (
        # Follow every extracted link and hand each response to parse_item.
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_start_url(self, response, **kwargs):
        """Check the start page itself for an email.

        CrawlSpider sends responses for ``start_urls`` to this hook rather
        than to the rule callback, so without this override a mailto link on
        the landing page would never be inspected.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Yield the first ``mailto:`` href on the page, then stop the spider.

        Yields:
            dict: ``{'email': <href>}`` where the href still carries its
            ``mailto:`` prefix.

        Raises:
            CloseSpider: after an email has been yielded, to end the crawl.
        """
        email = response.xpath('//a[contains(@href, "mailto:")][1]/@href').get()
        if email:
            yield {'email': email}
            # Requires `from scrapy.exceptions import CloseSpider` at file top.
            raise CloseSpider('Email found, spider stopped')

# Phase 1 (Selenium): walk the Houzz listing, open each professional's profile
# and collect the URL of their external website.
# Phase 2 (Scrapy): crawl every collected site in ONE CrawlerProcess run.
#
# The Twisted reactor behind CrawlerProcess cannot be restarted, so calling
# process.start() once per site only works for the first site. All crawls are
# therefore scheduled first and the reactor is started exactly once.
driver = webdriver.Chrome()
site_urls = []
try:
    driver.get('https://www.houzz.com/professionals/kitchen-and-bath/probr0-bo~t_11790?fi=15')
    time.sleep(7)
    original_handle = driver.current_window_handle
    profile_links = driver.find_elements(By.XPATH, '//a[@class="hz-pro-ctl"]')
    addresses = driver.find_elements(By.XPATH, '//span[@class="hz-pro-search-result__location-info__text"]')
    # The address elements are not used, but zipping keeps the original
    # iteration count (limited to the shorter of the two lists).
    for profile_link, _address in zip(profile_links, addresses):
        profile_link.click()
        time.sleep(5)
        # The profile opens in a second tab.
        driver.switch_to.window(driver.window_handles[1])
        driver.execute_script("window.scrollTo(0, 2000);")
        time.sleep(10)
        time.sleep(2)
        print('Success')
        time.sleep(10)
        # find_elements returns an empty list (instead of raising) when the
        # profile has no website link, so such profiles are simply skipped.
        website_links = driver.find_elements(By.XPATH, '//a[@class="sc-62xgu6-0 cZBXc sc-mwxddt-0 kCqoeY hui-link"]')
        if website_links:
            website_links[0].click()
            time.sleep(10)
            # The external website opens in a third tab.
            driver.switch_to.window(driver.window_handles[2])
            site_urls.append(driver.current_url)
            driver.close()
            driver.switch_to.window(driver.window_handles[1])
        # Close the profile tab and return to the listing.
        driver.close()
        driver.switch_to.window(original_handle)
finally:
    # Always release the browser, even if a step above fails.
    driver.quit()

# Schedule one spider instance per site. Passing start_urls/allowed_domains
# as keyword arguments sets them per instance instead of mutating the class.
process = CrawlerProcess()
for site_url in site_urls:
    host = urlparse(site_url).hostname
    if not host:
        # Not a crawlable http(s) URL (e.g. a blank tab); nothing to crawl.
        continue
    process.crawl(MySpider, start_urls=[site_url], allowed_domains=[host])
process.start()

实际上根本不需要使用 Selenium。你可以只用 Scrapy 访问每个页面。

例如:

import scrapy
class MySpider(scrapy.Spider):
    """Scrape one email per professional from a Houzz listing page.

    Flow: listing page -> each profile page -> the profile's external
    website -> first ``mailto:`` link on that website's landing page.
    """

    name = "myspider"
    start_urls = ['https://www.houzz.com/professionals/kitchen-and-bath/probr0-bo~t_11790?fi=15']

    def parse(self, response):
        """Request every professional profile linked from the listing."""
        for link in response.xpath('//a[@class="hz-pro-ctl"]/@href').getall():
            # response.follow resolves relative hrefs against the page URL.
            yield response.follow(link, callback=self.parse_page)

    def parse_page(self, response):
        """Request the professional's external website, if the profile has one."""
        page_link = response.xpath('//a[@class="sc-62xgu6-0 cZBXc sc-mwxddt-0 kCqoeY hui-link"]/@href').get()
        # Profiles without a website link yield no request instead of
        # failing on a None URL.
        if page_link:
            yield response.follow(page_link, callback=self.parse_email)

    def parse_email(self, response):
        """Yield the first ``mailto:`` href found on the page, if any.

        The yielded value keeps its ``mailto:`` prefix.
        """
        email = response.xpath('//a[contains(@href, "mailto:")][1]/@href').get()
        if email:
            yield {'email': email}

部分输出

{'email': 'mailto:Info@consumersmail.com'}
{'email': 'mailto:echomes@gmail.com'}
{'email': 'mailto:info@GilmerKitchens.com'}
{'email': 'mailto:info@denverdesigngroup.com'}
{'email': 'mailto:Tracy@Homelovely.com'}
{'email': 'mailto:info@vkbkitchenandbath.com'}

最新更新