我希望每个标题的文章/段落都写在不同的文本文件上,与另一个文件中标题序列号的名称相匹配。我想我犯了一些错误,现在标题也没有打印正确,但段落打印正确。我认为我写的代码顺序有问题。
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/02'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
count = 0
with open('head.txt', 'w') as f:
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
containers = soup.find_all('div',class_ = 'list-content')
key_words = ['Road', 'crash', 'dead', 'accidents']
# key_word = input('Enter the desired word to search the news: ')
for c in containers:
headings = c.h5.a.text
if any(key_word in headings for key_word in key_words):
print(headings)
f.write(str(count) + headings + ' ' + URL + ' ' + '---------> ' + d + 'n')
count += 1
for articles in containers:
ar = articles.h5.a.attrs['href']
article = requests.get('https://www.thedailystar.net{}'.format(ar))
p_src = article.text
p_soup = BeautifulSoup(p_src, 'lxml')
p_content = p_soup.find_all('div', class_ = 'field-body view-mode-full')
for p in p_content:
paragraph = p.text
# print(paragraph)
应该是正确的(如果我正确理解你的任务(:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/02'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
count = 0
key_words = ['Road', 'crash', 'dead', 'accidents']
with open('head.txt', 'w') as f:
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
containers = soup.find_all('div',class_ = 'list-content')
for c in containers:
headings = c.h5.a.text
if any(key_word in headings for key_word in key_words):
print(headings)
f.write(str(count) + headings + ' ' + URL + ' ' + '---------> ' + d + 'n')
count += 1
if 'href' in c.h5.a.attrs:
ar = c.h5.a.attrs['href']
with open(str(count) + '.txt') as temp_f:
article = requests.get('https://www.thedailystar.net{}'.format(ar))
p_src = article.text
p_soup = BeautifulSoup(p_src, 'lxml')
p_content = p_soup.find_all('div', class_ = 'field-body view-mode-full')
for p in p_content:
[tag_wrapper.decompose() for tag_wrapper in p.find_all('div', class_ = 'dfp-tag-wrapper')]
paragraph = p.text
# print(paragraph)
temp_f.write(paragraph)
您错过了为文章内容创建单独文件的open
。另外,您在containers
对象上循环了两次,这是不正确的。
我添加了一个if
语句来检查href
在articles.h5.a.attrs
中的位置,因为如果href
不在那里,就会出现错误,值得以更安全的模式处理这些时刻。
所以我自己解决了。以下是我所做的。
import requests
from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime, timedelta
s_date = '2018/01/01'
e_date = '2018/01/02'
d1 = datetime.strptime(s_date, '%Y/%m/%d')
d2 = datetime.strptime(e_date, '%Y/%m/%d')
delta = timedelta(days = 1)
date_list = list()
while d1 <= d2:
date_list.append(d1.strftime('%Y/%m/%d'))
d1 += delta
print(date_list)
count = 1
key_words = ['Road', 'crash', 'dead', 'accidents']
with open('head.txt', 'w') as f:
for d in date_list:
URL = 'https://www.thedailystar.net/newspaper?date={}'.format(d)
result = requests.get(URL)
src = result.text
soup = BeautifulSoup(src, 'lxml')
containers = soup.find_all('div',class_ = 'list-content')
for c in containers:
headings = c.h5.a.text
if any(key_word in headings for key_word in key_words):
print(headings)
ar = c.h5.a.attrs.get('href')
p_link = 'https://www.thedailystar.net{}'.format(ar)
article = requests.get(p_link)
f.write(str(count) + headings + ' ' + p_link + ' ' + '---------> ' + d + 'n')
count += 1
with open(str(count) + '.txt', 'w') as temp_f:
# ar = c.h5.a.attrs.get('href')
# article = requests.get('https://www.thedailystar.net{}'.format(ar))
p_src = article.text
p_soup = BeautifulSoup(p_src, 'lxml')
p_content = p_soup.find_all('div', class_ = 'field-body view-mode-full')
for p in p_content:
paragraph = p.text
# print(paragraph)
temp_f.write(paragraph)