How to connect my Python code with Django



I'm fairly new to Django and Python. I'm building a crawler that searches for email addresses, and I already have the crawler written in Python. Now I want to hook it up to a search bar in Django so that, when the search button is clicked, the view picks up the submitted URL and the Python code starts running. I don't know how to go about this.

So far I have tried putting the code in views.py, wiring it up in urls.py, and pointing the button at that URL, but it doesn't work.

Here is the crawler code:

import re
import requests
import requests.exceptions
from urllib.parse import urlsplit, urljoin
from lxml import html
import sys
import csv

class EmailCrawler:

    processed_urls = set()
    unprocessed_urls = set()
    emails = set()

    def __init__(self, website: str):
        self.website = website
        self.unprocessed_urls.add(website)
        self.headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/78.0.3904.70 Chrome/78.0.3904.70 Safari/537.36',
        }
        self.base_url = urlsplit(self.website).netloc
        self.outputfile = self.base_url.replace('.', '_') + '.csv'
        # We will use this list to skip urls that end with one of these extensions.
        # This saves a lot of bandwidth and speeds up the crawl, e.g.
        # www.example.com/image.png is useless to us -- we cannot parse emails out of images or other binary files.
        self.garbage_extensions = ['.aif','.cda','.mid','.midi','.mp3','.mpa','.ogg','.wav','.wma','.wpl','.7z','.arj','.deb','.pkg','.rar','.rpm','.tar.gz','.z','.zip','.bin','.dmg','.iso','.toast','.vcd','.csv','.dat','.db','.dbf','.log','.mdb','.sav','.sql','.tar','.apk','.bat','.bin','.cgi','.pl','.exe','.gadget','.jar','.py','.wsf','.fnt','.fon','.otf','.ttf','.ai','.bmp','.gif','.ico','.jpeg','.jpg','.png','.ps','.psd','.svg','.tif','.tiff','.asp','.cer','.cfm','.cgi','.pl','.part','.py','.rss','.key','.odp','.pps','.ppt','.pptx','.c','.class','.cpp','.cs','.h','.java','.sh','.swift','.vb','.ods','.xlr','.xls','.xlsx','.bak','.cab','.cfg','.cpl','.cur','.dll','.dmp','.drv','.icns','.ico','.ini','.lnk','.msi','.sys','.tmp','.3g2','.3gp','.avi','.flv','.h264','.m4v','.mkv','.mov','.mp4','.mpg','.mpeg','.rm','.swf','.vob','.wmv','.doc','.docx','.odt','.pdf','.rtf','.tex','.txt','.wks','.wps','.wpd']
        self.email_count = 0

    def crawl(self):
        """
        Keeps crawling until the unprocessed urls set is empty.
        """
        url = self.unprocessed_urls.pop()
        print("CRAWL : {}".format(url))
        self.parse_url(url)

        if len(self.unprocessed_urls) != 0:
            self.crawl()
        else:
            print('End of crawling for {} '.format(self.website))
            print('Total urls visited {}'.format(len(self.processed_urls)))
            print('Total Emails found {}'.format(self.email_count))
            print('Dumping processed urls to {}'.format(self.base_url.replace('.', '_') + '.txt'))
            with open(self.base_url.replace('.', '_') + '.txt', 'w') as f:
                f.write('\n'.join(self.processed_urls))

    def parse_url(self, current_url: str):
        """
        Loads and parses a given url: finds all the urls on the page,
        filters them and adds them to the unprocessed url set.
        Finally it scrapes any emails found on the page and updates the email list.
        INPUT:
            current_url: URL to parse
        RETURN:
            None
        """
        # TODO: retry a url up to 5 times in case it fails to load, and skip it after that
        response = requests.get(current_url, headers=self.headers)
        tree = html.fromstring(response.content)
        urls = tree.xpath('//a/@href')  # getting all urls in the page

        # Convert relative links to full urls,
        # e.g. /about.html --> https://www.website.com/about.html
        urls = [urljoin(self.website, url) for url in urls]
        # Only keep urls under our domain, i.e. filter out urls that point outside the main website.
        urls = [url for url in urls if self.base_url == urlsplit(url).netloc]

        # removing duplicates
        urls = list(set(urls))

        # Filter out urls that point to files such as images, videos and the
        # other types listed in garbage_extensions.
        parsed_url = []
        for url in urls:
            skip = False
            for extension in self.garbage_extensions:
                if url.endswith(extension) or url.endswith(extension + '/'):
                    skip = True
                    break
            if not skip:
                parsed_url.append(url)

        # Finally, filter out urls that are already queued or already visited.
        for url in parsed_url:
            if url not in self.processed_urls and url not in self.unprocessed_urls:
                self.unprocessed_urls.add(url)

        # parsing emails
        self.parse_emails(response.text)
        # adding the current url to the processed list
        self.processed_urls.add(current_url)

    def parse_emails(self, text: str):
        """
        Scans the given text for email addresses and writes them to the csv file.
        Input:
            text: text to parse emails from
        Returns:
            bool: True if at least one email was found on the page
        """
        # parsing emails and then saving them to csv
        emails = set(re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text, re.I))
        # TODO: sometimes a string like "gFJS3amhZEg_z39D5EErVg@2x.png" is accepted as an email by the regex above,
        # so for now we skip anything that ends with jpeg, png or jpg.
        for email in emails:
            skip_email = False
            for checker in ['jpg', 'jpeg', 'png']:
                if email.endswith(checker):
                    skip_email = True
                    break
            if not skip_email:
                if email not in self.emails:
                    with open(self.outputfile, 'a', newline='') as csvf:
                        csv_writer = csv.writer(csvf)
                        csv_writer.writerow([email])
                    self.email_count += 1
                    self.emails.add(email)
                    print(' {} Email found {}'.format(self.email_count, email))
        return len(emails) != 0


print('WELCOME TO EMAIL CRAWLER')
try:
    website = sys.argv[1]
except IndexError:
    website = input("Please enter a website to crawl for emails: ")
crawl = EmailCrawler(website)
crawl.crawl()

Here is the HTML code:

<html lang="en" dir="ltr">
  <head>
    <meta charset="utf-8">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" integrity="sha384-Gn5384xqQ1aoWXA+058RXPxPg6fy4IWvTNh0E263XmFcJlSAwiGgFAW/dAiS6JXm" crossorigin="anonymous">
    <title>Email Tracker</title>
    <link rel="stylesheet" type="text/css" href="{% static '/css/main.css' %}">
    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.1.0/css/all.css" integrity="sha384-lKuwvrZot6UHsBSfcMvOkWwlCMgc0TaWr+30HWe3a4ltaBwTZhyTEggF5tJv8tbt" crossorigin="anonymous">
  </head>
  <!-- Search form -->
  <div class="container">
    <br/>
    <div class="row justify-content-center">
      <div class="col-12 col-md-10 col-lg-8">
        <form class="card card-sm" action="Search/">
          <div class="card-body row no-gutters align-items-center">
            <div class="col-auto">
              <i class="fas fa-search h4 text-body"></i>
            </div>
            <!--end of col-->
            <div class="col">
              <input class="form-control form-control-lg form-control-borderless" type="search" placeholder="Search topics or keywords">
            </div>
            <!--end of col-->
            <div class="col-auto">
              <button class="btn btn-lg btn-success" type="submit" onclick="">Search</button>
            </div>
            <!--end of col-->
          </div>
        </form>
      </div>
      <!--end of col-->
    </div>

  </div>
</html>

Any help would be greatly appreciated!

Your HTML says that whenever you submit the form, the browser requests the URL given in the form's action attribute, which in your case is "Search/". You haven't written a URL pattern or a view for that path. Add a search view, map it in urls.py, and call the methods of your EmailCrawler class from inside that view, and it will work, as sketched below.
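A minimal sketch of how that could look, assuming the crawler code above is saved as email_crawler.py inside your Django app, the view is called search, and the search input is named q. All of these names, including the search.html template, are assumptions, so adjust them to your project layout:

# urls.py -- map the form's target URL to a view (the 'search' name is an assumption)
from django.urls import path
from . import views

urlpatterns = [
    path('search/', views.search, name='search'),
]

# views.py -- a hypothetical view that runs the crawler on the submitted value
from django.shortcuts import render
from .email_crawler import EmailCrawler  # assumes the crawler code is saved as email_crawler.py

def search(request):
    context = {}
    website = request.GET.get('q', '').strip()  # 'q' must match the input's name attribute
    if website:
        crawler = EmailCrawler(website)
        crawler.crawl()
        # expose the collected addresses to the (assumed) search.html template
        context['website'] = website
        context['emails'] = sorted(crawler.emails)
    return render(request, 'search.html', context)

On the template side, give the input a name attribute that matches what the view reads (for example name="q") and point the form at the view, e.g. action="{% url 'search' %}" method="get"; without a name attribute the browser never sends the search text, so request.GET will be empty. Also note that the crawl runs synchronously inside the request, so a large site will keep the page loading until it finishes; moving the crawl into a background task is a common next step.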
