我正在尝试编写一个电子邮件爬虫(email scraper),但遇到了问题。这是我的代码:
import re
from urllib.parse import urlparse, urlunparse

import pandas as pd
import requests
import scrapy
from email_validator import validate_email, EmailNotValidError
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
# Seed URLs handed to the spider as start_urls in get_info().
# NOTE: this entry has no scheme (http:// or https://); Scrapy rejects such
# URLs with "ValueError: Missing scheme in request url".
lista_star = ['vitalebarberiscanonico.it']
class MailSpider(scrapy.Spider):
    """Spider that visits every link on its start pages and collects e-mail addresses.

    Each start page is handled by ``parse``, which schedules one request per
    extracted link (plus the page itself). Those responses are handled by
    ``parse_link``, which scans the HTML for address-like strings and appends
    them to ``data``.
    """

    name = 'email'

    # Class-level on purpose: get_info() reads MailSpider.data after the crawl,
    # so results must live on the class rather than on a spider instance.
    data = []

    # Substrings that cause a URL to be skipped in parse_link. Defaults to
    # "reject nothing"; can be overridden per crawl via a spider argument,
    # e.g. process.crawl(MailSpider, start_urls=..., reject=['facebook']).
    reject = ()

    # word chars, '@', word chars, a literal dot, word chars. Raw string so the
    # backslashes reach the regex engine; compiled once for all responses.
    _EMAIL_RE = re.compile(r'\w+@\w+\.\w+')

    def parse(self, response):
        """Yield a ``parse_link`` request for every link on the page and for the page itself."""
        links = [link.url for link in LxmlLinkExtractor(allow=()).extract_links(response)]
        links.append(response.url)
        for link in links:
            yield scrapy.Request(url=link, callback=self.parse_link)

    def parse_link(self, response):
        """Record every e-mail address found in the response body.

        Responses whose URL contains any of the ``reject`` substrings are
        ignored. Each match is stored in ``data`` as
        ``{'email': <address>, 'link': <page url>}``.
        """
        url = str(response.url)
        if any(word in url for word in self.reject):
            return
        for email in self._EMAIL_RE.findall(response.text):
            self.data.append({'email': email, 'link': url})
def get_info():
    """Crawl the URLs in ``lista_star`` and return the unique e-mail addresses found.

    Returns:
        pandas.DataFrame: columns ``email`` and ``link``, one row per distinct
        address (the first page it was seen on), indexed from 0. The frame is
        empty, but still has both columns, when nothing was scraped.
    """
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(MailSpider, start_urls=lista_star)
    process.start()  # blocks until the crawl has finished

    # Explicit columns: without them an empty result has no 'email' column and
    # drop_duplicates(subset='email') raises KeyError.
    df = pd.DataFrame(MailSpider.data, columns=['email', 'link'])
    return df.drop_duplicates(subset='email').reset_index(drop=True)
# Run the crawl as soon as the script is executed and keep the resulting
# DataFrame of de-duplicated e-mail addresses.
df = get_info()
运行后我得到了以下错误:`ERROR: Error while obtaining start requests` 以及 `ValueError: Missing scheme in request url: vitalebarberiscanonico.it`(即请求 URL 中缺少协议方案 scheme)。
所以我试了:
for link in links:
parsed_url = urlparse(link)
if not parsed_url.scheme:
link = urlunparse(parsed_url._replace(scheme='http'))
elif parsed_url.scheme not in ['http', 'https']:
continue
try:
yield scrapy.Request(url=link, callback=self.parse_link)
except:
link = link.replace('http', 'https')
yield scrapy.Request(url=link, callback=self.parse_link)
但还是不行。
问题在于您的起始 URL(即传给 start_urls 的列表)中缺少协议方案(scheme),而不在于您尝试添加的那段 URL 解析代码。您可以直接把列表中的链接字符串改成以 http:// 或 https:// 开头:
# Start URL written with an explicit https:// scheme.
lista_star = ['https://vitalebarberiscanonico.it/']