Web scraping with Python BeautifulSoup: getting the elements of every page on a website



I am just starting to learn Python programming. What I want to do is build a website scraper that gets all the links from a site and then returns the elements of each page. The code I started from comes from https://www.thepythoncode.com/article/extract-all-website-links-python, which works really well for getting all the links from a site. Since I am only interested in internal links, I added some extra code to try to pull elements (title, h1, and a few other bits I haven't added yet) into it. The problem I am running into is that I think one of the hrefs returns an email address, and the code then tries to extract elements from it, which obviously errors out. I have tried to stop it from picking up emails (I also thought the is_valid function would handle that), but I am clearly missing something. Any help would be much appreciated.

import re
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW
internal_urls = set()
external_urls = set()
title_urls = set()
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    # is_internal_link == True:
    title_check = soup.find_all('title')
    if title_check != " " or title_check != None:
        get_title(url)
        get_heading_tags(url)
    for a_tag in soup.findAll("a"):
        # is_internal_link = False
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                #print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        if re.search('@', href) == True:
            continue
        urls.add(href)
        internal_urls.add(href)
    return urls

# number of urls visited so far will be stored here
total_urls_visited = 0

def get_title(url):   # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    #print("Title of the website is : ")
    for title in soup.find_all('title'):
        if title == "" and title == None:
            continue
        title_text = title.get_text()
        title_urls.add(title_text)
        print(title_text)
        print((len(title_text)))

def get_heading_tags(url):
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    heading_tags = ['h1', 'h2', 'h3']
    i = 0
    for tags in soup.find_all(heading_tags):
        if tags == " " or tags == None:
            continue
        tags_text = tags.get_text()
        letters_in_tags = len(tags_text) - tags_text.count(" ")
        i += 1
        print(f'{tags.name} {i} -> {tags_text} -> Length ->{letters_in_tags} ')

def crawl(url, max_urls=80):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
    max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if re.search('@', link) != True:
            if total_urls_visited > max_urls:
                break
            crawl(link, max_urls=max_urls)

if __name__ == "__main__":
    crawl("https://website.com/") #put website here. 
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

You are only checking whether an @ appears in the link to decide whether it is an email, and that check isn't correct either: re.search() returns a Match object or None, never the literal True, so `re.search('@', href) == True` is always False. Also note that a regular link can contain an @ as well.
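A quick way to see this (a small illustration, not from the original answer):

import re

match = re.search('@', 'someemail@example.com')
print(match)           # a re.Match object, never the literal True
print(match == True)   # False, so `re.search('@', href) == True` can never pass
print(bool(match))     # True -- truthiness is how you would test for a match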

Basically, an email inside an <a> tag will have this form:

<a href="mailto:someemail@example.com"></a>

So, to distinguish emails from links, you can use the check below.

for link in links:
    if not link.startswith('mailto:'):
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

This will ignore all the emails and only crawl the links.
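If you want to be a bit stricter, here is an alternative sketch (not part of the accepted fix; the is_crawlable helper name is made up for illustration) that filters on the parsed scheme instead of a string prefix, which also skips things like tel: or javascript: hrefs:

from urllib.parse import urlparse

def is_crawlable(link):
    # hypothetical helper: keep only http(s) URLs, dropping mailto:, tel:, javascript:, etc.
    return urlparse(link).scheme in ("http", "https")

for link in links:
    if is_crawlable(link):
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)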
