你好,我是一个 Python 新手。很抱歉,我不太清楚问题出在哪里,只能问一个比较具体的问题。
我正在尝试从韩国新闻网站抓取新闻文章。当我运行此代码时:
import sys
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import quote
# Fragments of the donga.com search URL: the "?p=" page parameter, the
# "&query=" keyword parameter, and the remaining filter/sort parameters.
target_url_b4_pn="http://news.donga.com/search?p="
target_url_b4_keyword='&query='
target_url_rest="&check_news1&more=1&sorting1&search_date1&v1=&v2=&range=1"
def get_text(URL, output_file):
    """Fetch one article page and write its text into output_file."""
    # NOTE(review): urlopen returns a response object that is never closed
    # here; the answer below also suggests calling .read() on it.
    source_code_from_URL=urllib.request.urlopen(URL)
    soup=BeautifulSoup(source_code_from_URL, 'lxml', from_encoding='UTF-8')
    # NOTE(review): per the answer, 'div.article' matches nothing on this
    # site — 'div.article_txt' is the selector that matches the paragraphs.
    content_of_article=soup.select('div.article')
    for item in content_of_article:
        # str() of a list of text nodes writes Python list syntax verbatim.
        string_item=str(item.find_all(text=True))
        output_file.write(string_item)
def get_link_from_news_title(page_num, URL, output_file):
    """Scrape page_num search-result pages, passing each article link to get_text."""
    for i in range(page_num):
        # Result pages are offset by 15 articles: p=1, 16, 31, ...
        current_page_num=1+i*15
        # Insert the page number right after the first '=' (the 'p=' parameter).
        position=URL.index('=')
        URL_with_page_num=URL[:position+1]+str(current_page_num)+URL[position+1:]
        # NOTE(review): response object is never closed (same as get_text).
        source_code_from_URL=urllib.request.urlopen(URL_with_page_num)
        soup=BeautifulSoup(source_code_from_URL, 'lxml',from_encoding='UTF-8')
        # Each result title is a <p class="tit"> whose <a> holds the article URL.
        for title in soup.find_all('p','tit'):
            title_link=title.select('a')
            article_URL=title_link[0]['href']
            get_text(article_URL, output_file)
def main():
    """Search donga.com for the keyword and dump article text to output.txt."""
    keyword="노무현"
    page_num=1
    output_file_name="output.txt"
    # Percent-encode the Korean keyword so it is safe inside the query string.
    target_url=target_url_b4_pn+target_url_b4_keyword+quote(keyword)+target_url_rest
    # open(name, mode, buffering, encoding): UTF-8 output, default buffering.
    output_file=open(output_file_name, "w", -1, "utf-8")
    get_link_from_news_title(page_num, target_url, output_file)
    output_file.close()
if __name__=='__main__':
    main()
    # BUG (explained in the answer below): target_url is a local variable of
    # main(), so this line raises NameError; the print belongs inside main().
    print(target_url)
    print(11111)
jupyter 笔记本未响应输入,甚至不会在底部输出任何简单命令的结果(不打印任何东西)。
认为代码在以某种方式冻结它,请告诉我它可能出错了哪里?
它没有响应的图片
-
get_text
函数的第一行,urllib.request.urlopen(URL)
表示您打开URL,但是就像打开文件一样,您必须read
。因此,在此之后添加read()
。
urllib.request.urlopen(URL).read()
否则 BeautifulSoup 将无法识别。
- 在您的 CSS 选择器
soup.select('div.article')
中,页面中没有这样的元素,我想您想要的是soup.select('div.article_txt')
,它与文章的段落匹配。
- 您的
print(target_url)
应该进入您的main
功能,对于仅在main
中定义的target_url
。
代码
import sys
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import quote
# Fragments of the donga.com search URL: the "?p=" page parameter, the
# "&query=" keyword parameter, and the remaining filter/sort parameters.
target_url_b4_pn="http://news.donga.com/search?p="
target_url_b4_keyword='&query='
target_url_rest="&check_news1&more=1&sorting1&search_date1&v1=&v2=&range=1"
def get_text(URL, output_file):
    """Fetch one article page and append its paragraph text to output_file.

    URL         -- absolute URL of a news.donga.com article
    output_file -- writable text file handle
    """
    # Read the body inside a context manager so the HTTP connection is always
    # closed; this also performs the .read() the answer text recommends but
    # the original code omitted.
    with urllib.request.urlopen(URL) as response:
        source_code_from_URL = response.read()
    soup = BeautifulSoup(source_code_from_URL, 'lxml', from_encoding='UTF-8')
    # 'div.article_txt' matches the article paragraphs on this site.
    content_of_article = soup.select('div.article_txt')
    for item in content_of_article:
        string_item = item.find_all(text=True)
        # write string to file
        output_file.write(" ".join(string_item))
def get_link_from_news_title(page_num, URL, output_file):
    """Walk page_num search-result pages and scrape every linked article.

    page_num    -- number of result pages to visit
    URL         -- search URL whose first '=' belongs to the 'p=' parameter
    output_file -- writable text file handle, passed through to get_text
    """
    for i in range(page_num):
        # Result pages are offset by 15 articles: p=1, 16, 31, ...
        current_page_num = 1 + i * 15
        # Insert the page number right after the first '=' (the 'p=' parameter).
        position = URL.index('=')
        URL_with_page_num = URL[:position + 1] + str(current_page_num) + URL[position + 1:]
        # Context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(URL_with_page_num) as response:
            source_code_from_URL = response.read()
        soup = BeautifulSoup(source_code_from_URL, 'lxml', from_encoding='UTF-8')
        # Each result title is a <p class="tit"> whose <a> holds the article URL.
        for title in soup.find_all('p', 'tit'):
            title_link = title.select('a')
            if not title_link:
                # Skip malformed entries instead of raising IndexError.
                continue
            article_URL = title_link[0]['href']
            get_text(article_URL, output_file)
def main():
    """Search donga.com for the keyword and dump article text to output.txt."""
    keyword = "노무현"
    page_num = 1
    output_file_name = "output.txt"
    # Percent-encode the Korean keyword so it is safe inside the query string.
    target_url = target_url_b4_pn + target_url_b4_keyword + quote(keyword) + target_url_rest
    # target_url is local to main(), so print it here (not at module level).
    print(target_url)
    # 'with' guarantees the file is closed even if scraping raises;
    # open(name, mode, buffering, encoding): UTF-8 output, default buffering.
    with open(output_file_name, "w", -1, "utf-8") as output_file:
        get_link_from_news_title(page_num, target_url, output_file)
if __name__=='__main__':
    main()
    # Marker printed after main() returns, presumably so the notebook shows
    # the script ran to completion.
    print(11111)