Improvement request - Python script

I need some help with a Python script I have. It searches for email addresses starting from a given URL and writes the results to a CSV file. I would like help adding a few lines so the CSV gets another column showing the URL on which each email address was found. Thank you.

import re
import requests
import requests.exceptions
from urllib.parse import urlsplit, urljoin
from lxml import html
import sys
import csv

class EmailCrawler:

    processed_urls = set()
    unprocessed_urls = set()
    emails = set()

    def __init__(self, website: str):
        self.website = website
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.','_')+'.csv'
        # We will use this list to skip urls that end with one of these extensions. This saves a lot of bandwidth and speeds up the crawling process.
        # For example: www.example.com/image.png --> this url is useless to us; we cannot possibly parse emails from images or other binary files.
        self.garbage_extensions = ['.aif','.webp','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd','.gif','.webp']
        self.email_count = 0

    def crawl(self):
        """
        It will continue crawling until the unprocessed urls list is empty
        """
        url = self.unprocessed_urls.pop()
        print("CRAWL : {}".format(url))
        self.parse_url(url)

        if len(self.unprocessed_urls) != 0:
            self.crawl()
        else:
            print('End of crawling for {} '.format(self.website))
            print('Total urls visited {}'.format(len(self.processed_urls)))
            print('Total Emails found {}'.format(self.email_count))
            print('Dumping processed urls to {}'.format(self.base_url.replace('.','_')+'.txt'))

There is absolutely no need for recursion in your crawl method; it can be as simple as:

class EmailCrawler:
    ...
    def crawl(self):
        for url in self.unprocessed_urls:
            print("CRAWL : {}".format(url))
            self.parse_url(url)

Your parse_emails method can return the set of emails found in a given piece of text:

from typing import Set

class EmailCrawler:
    ...
    def parse_emails(self, text: str) -> Set[str]:
        emails = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+.[a-zA-Z0-9-.]+', text, re.I))

        # Use a set comprehension to filter out matches that are really file names
        filtered_emails = {email for email in emails if not email.endswith(('jpg', 'jpeg', 'png', 'webp', 'gif'))}
        return filtered_emails
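
For instance, a quick sanity check of parse_emails could look like this (a minimal sketch; it assumes the original __init__ from the question is kept, and the sample text and addresses are invented for illustration):

# Hypothetical usage, not part of the original code
crawler = EmailCrawler('https://example.com/')
sample = 'Contact info@example.com or see the banner at banner@example.com.png'
print(crawler.parse_emails(sample))
# {'info@example.com'}  -- the .png match is filtered out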

Refactor parse_url to return the emails that parse_emails finds (recursion makes more sense here):

from typing import Dict, Set

class EmailCrawler:
    ...
    def parse_url(self, current_url: str) -> Dict[str, Set[str]]:
        # Remember visited pages, otherwise cyclic links cause infinite recursion
        if current_url in self.processed_urls:
            return {}
        self.processed_urls.add(current_url)

        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        urls = tree.xpath('//a/@href')
        urls = [urljoin(self.website, url) for url in urls]
        urls = [url for url in urls if self.base_url == urlsplit(url).netloc]
        urls = list(set(urls))

        # Skip urls pointing to files we cannot parse emails from
        children_urls = []
        for url in urls:
            if any(url.endswith(extension) or url.endswith(extension + '/')
                   for extension in self.garbage_extensions):
                continue
            children_urls.append(url)

        email_mapping = {}
        # Add to the result the child URLs and their emails
        for child_url in children_urls:
            email_mapping.update(self.parse_url(child_url))
        # Add to the result the parent URL and its emails
        email_mapping[current_url] = self.parse_emails(response.text)
        return email_mapping
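
The mapping that comes back pairs every visited page with the set of addresses found on it. For a hypothetical site it might look like this (the URLs and addresses are purely illustrative):

{
    'https://example.com/': {'info@example.com'},
    'https://example.com/about': {'team@example.com', 'jobs@example.com'},
    'https://example.com/contact': set(),
}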

Back in the crawl method, write the results to a CSV file:

class EmailCrawler:
    ...
    def crawl(self):
        for url in self.unprocessed_urls:
            print("CRAWL : {}".format(url))
            email_mapping = self.parse_url(url)
            for url, emails in email_mapping.items():
                for email in emails:
                    ...
                    # Write to your CSV file the email and its url
                    # print(','.join((url, email)), file=your_csv_file)
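
To tie this back to the original question (the extra column showing where each address was found), here is one possible way to fill in those last lines. It is only a sketch: the 'URL'/'Email' header names, the use of self.outputfile, and the self.email_count update are my choices rather than part of the original code.

class EmailCrawler:
    ...
    def crawl(self):
        # Open the output file once and write a header row with the extra URL column
        with open(self.outputfile, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['URL', 'Email'])
            for url in self.unprocessed_urls:
                print("CRAWL : {}".format(url))
                email_mapping = self.parse_url(url)
                for page_url, emails in email_mapping.items():
                    for email in emails:
                        # One row per (page, email) pair
                        writer.writerow([page_url, email])
                        self.email_count += 1

Using csv.writer also takes care of quoting, which is safer than joining the fields with commas by hand.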
