Missing URL scheme on some sites when using BeautifulSoup and Requests



I am trying to scrape a web page to get its articles, but the links don't start with http:, so I get a requests.exceptions.MissingSchema: Invalid URL error.

I know I have to do something like 'http:' + href, but I don't understand where to put it.
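For reference, this is a minimal reproduction of the error; the relative path below is hypothetical:

import requests

# hrefs scraped from the cover page are relative, e.g. "/politica/...",
# so requests cannot build a valid URL from them on its own:
requests.get("/politica/una-nota-nid123456")
# -> requests.exceptions.MissingSchema: Invalid URL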

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
url = "https://www.lanacion.com.ar/"
# Request
r1 = requests.get(url)
r1.status_code
# We'll save in coverpage the cover page content
coverpage = r1.content
# Soup creation
soup1 = BeautifulSoup(coverpage, 'html5lib')
# News identification
coverpage_news = soup1.find_all('h2', class_='com-title')
len(coverpage_news)
coverpage_news[2]
number_of_articles = 2
# Empty lists for content, links and titles
news_contents = []
list_links = []
list_titles = []
for n in np.arange(0, number_of_articles):

    # only news articles (there are also albums and other things)
    #if "inenglish" not in coverpage_news[n].find('a')['href']:
    #    continue

    # Getting the link of the article
    link = coverpage_news[n].find('a')["href"]
    list_links.append(link)

    # Getting the title
    title = coverpage_news[n].find('a').get_text()
    list_titles.append(title)

    # Reading the content (it is divided in paragraphs)
    article = requests.get(link)
    article_content = article.content
    soup_article = BeautifulSoup(article_content, 'html5lib')
    body = soup_article.find_all('h2', class_='title')
    x = body[0].find_all('p')

    # Unifying the paragraphs
    list_paragraphs = []
    for p in np.arange(0, len(x)):
        paragraph = x[p].get_text()
        list_paragraphs.append(paragraph)
    final_article = " ".join(list_paragraphs)

    news_contents.append(final_article)

Thanks a lot!

Prefix the link with the site's base URL:

# Getting the link of the article
link = coverpage_news[n].find('a')["href"]
link = 'https://www.lanacion.com.ar' + link
list_links.append(link)
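Note that if some hrefs on the cover page are already absolute, hard-coding the prefix would double it. A slightly more robust sketch, assuming nothing about the href format, uses urllib.parse.urljoin, which resolves relative paths against the base URL and leaves absolute URLs untouched:

from urllib.parse import urljoin

base_url = "https://www.lanacion.com.ar/"

# Getting the link of the article
link = coverpage_news[n].find('a')["href"]
# urljoin handles both "/politica/..." and "https://..." hrefs
link = urljoin(base_url, link)
list_links.append(link)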
