I want to create a separate text file for each heading's link and write that heading's article into it using BeautifulSoup



I want the article/paragraphs for each heading written to a different text file, whose name matches that heading's serial number in the other file. I think I made some mistakes: the headings no longer print correctly, although the paragraphs do. I suspect the order of the code I wrote is wrong.
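To make the goal concrete, my reading of the intended output layout is roughly:

head.txt  ->  1<heading A>    <page URL>    ---------> 2018/01/01
              2<heading B>    <page URL>    ---------> 2018/01/01
1.txt     ->  full article text for heading A
2.txt     ->  full article text for heading B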

import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

s_date = '2018/01/01'
e_date = '2018/01/02'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days=1)
date_list = list()
while d1 <= d2:
    date_list.append(d1.strftime('%Y/%m/%d'))
    d1 += delta
print(date_list)

count = 0
with open('head.txt', 'w') as f:
    for d in date_list:
        URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
        result = requests.get(URL)
        src = result.text
        soup = BeautifulSoup(src, 'lxml')
        containers = soup.find_all('div', class_='list-content')
        key_words = ['Road', 'crash', 'dead', 'accidents']
        # key_word = input('Enter the desired word to search the news: ')
        for c in containers:
            headings = c.h5.a.text
            if any(key_word in headings for key_word in key_words):
                print(headings)
                f.write(str(count) + headings + '    ' + URL + '    ' + '---------> ' + d + '\n')
                count += 1
                for articles in containers:
                    ar = articles.h5.a.attrs['href']
                    article = requests.get('https://www.thedailystar.net{}'.format(ar))
                    p_src = article.text
                    p_soup = BeautifulSoup(p_src, 'lxml')
                    p_content = p_soup.find_all('div', class_='field-body view-mode-full')
                    for p in p_content:
                        paragraph = p.text
                        # print(paragraph)

This should be correct (if I've understood your task correctly):

import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

s_date = '2018/01/01'
e_date = '2018/01/02'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days=1)
date_list = list()
while d1 <= d2:
    date_list.append(d1.strftime('%Y/%m/%d'))
    d1 += delta
print(date_list)

count = 0
key_words = ['Road', 'crash', 'dead', 'accidents']
with open('head.txt', 'w') as f:
    for d in date_list:
        URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
        result = requests.get(URL)
        src = result.text
        soup = BeautifulSoup(src, 'lxml')
        containers = soup.find_all('div', class_='list-content')

        for c in containers:
            headings = c.h5.a.text
            if any(key_word in headings for key_word in key_words):
                print(headings)
                count += 1
                f.write(str(count) + headings + '    ' + URL + '    ' + '---------> ' + d + '\n')
                if 'href' in c.h5.a.attrs:
                    ar = c.h5.a.attrs['href']
                    # open the per-article file in write mode; its name matches
                    # the serial number just written to head.txt
                    with open(str(count) + '.txt', 'w') as temp_f:
                        article = requests.get('https://www.thedailystar.net{}'.format(ar))
                        p_src = article.text
                        p_soup = BeautifulSoup(p_src, 'lxml')
                        p_content = p_soup.find_all('div', class_='field-body view-mode-full')
                        for p in p_content:
                            # strip embedded ad blocks before extracting the text
                            for tag_wrapper in p.find_all('div', class_='dfp-tag-wrapper'):
                                tag_wrapper.decompose()
                            paragraph = p.text
                            # print(paragraph)
                            temp_f.write(paragraph)

You missed the open call that creates a separate file for the article content. You were also looping over the containers object twice, which is incorrect.
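To see why the double loop matters: for every matching heading, the inner for articles in containers loop in your code downloads every article on the page again, so n matching headings trigger roughly n * len(containers) fetches instead of n. A toy illustration, with a plain list standing in for containers:

containers = ['a', 'b', 'c']
fetches = 0
for c in containers:              # outer pass over the page
    for articles in containers:   # inner pass repeats ALL items again
        fetches += 1              # one simulated download
print(fetches)  # 9 for 3 items, instead of the 3 you actually want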

I added an if statement to check whether 'href' is in c.h5.a.attrs, because an error is raised when href is not there; it is worth handling such cases in a safer way.
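For reference, a minimal sketch of the two safe-access patterns on a BeautifulSoup tag; the HTML snippet here is made up for illustration:

from bs4 import BeautifulSoup

# hypothetical markup: one link with an href, one without
snippet = '<h5><a href="/news/1">With link</a></h5><h5><a>No link</a></h5>'
soup = BeautifulSoup(snippet, 'lxml')

for a in soup.find_all('a'):
    # a['href'] would raise KeyError on the second tag; either of these avoids that
    if 'href' in a.attrs:           # explicit membership check
        print(a.attrs['href'])
    print(a.attrs.get('href'))      # returns None when the attribute is absent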

So I solved it myself. Here is what I did.

import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta

s_date = '2018/01/01'
e_date = '2018/01/02'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days=1)
date_list = list()
while d1 <= d2:
    date_list.append(d1.strftime('%Y/%m/%d'))
    d1 += delta
print(date_list)

count = 1
key_words = ['Road', 'crash', 'dead', 'accidents']
with open('head.txt', 'w') as f:
    for d in date_list:
        URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
        result = requests.get(URL)
        src = result.text
        soup = BeautifulSoup(src, 'lxml')
        containers = soup.find_all('div', class_='list-content')
        for c in containers:
            headings = c.h5.a.text
            if any(key_word in headings for key_word in key_words):
                print(headings)
                ar = c.h5.a.attrs.get('href')
                p_link = 'https://www.thedailystar.net{}'.format(ar)
                article = requests.get(p_link)
                f.write(str(count) + headings + '    ' + p_link + '    ' + '---------> ' + d + '\n')
                # the article file gets the same serial number as the head.txt entry
                with open(str(count) + '.txt', 'w') as temp_f:
                    p_src = article.text
                    p_soup = BeautifulSoup(p_src, 'lxml')
                    p_content = p_soup.find_all('div', class_='field-body view-mode-full')
                    for p in p_content:
                        paragraph = p.text
                        # print(paragraph)
                        temp_f.write(paragraph)
                count += 1
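One possible hardening if you run this over longer date ranges: the requests.get calls above have no timeout or status check, so a slow or failed response can hang the script or silently hand an error page to the parser. A small sketch of a guarded fetch helper (the 10-second timeout is my own assumption, not part of the solution above):

import requests

def fetch(url):
    response = requests.get(url, timeout=10)  # assumed timeout; tune as needed
    response.raise_for_status()               # raise on HTTP errors instead of parsing them
    return response.text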
