如何使用Python 2.7和BeautifulSoup正确解析俄语文本



我正在尝试解析一个俄语网站(http://games4you.ucoz.ua/news/)上的所有帖子。我使用的是 Python 2.7.9 和 BeautifulSoup 4,并在 PyCharm 中工作。我已经尝试了很多方法,但得到的仍然是下面这样的转义序列,而不是俄语文本:'\u0421\u0442\u0440\u0430\u0442\u0435\u0433\u0456\u0457'

这是我的代码:

# Parsing information from games4you.ucoz.ua
# -*- coding: utf-8 -*-
import re
import csv
import urllib
from bs4 import BeautifulSoup
BASE_URL = "http://games4you.ucoz.ua/news/"
def get_html(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded bytes read from the response.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
def get_page_count(html):
    """Return the page count shown in the pagination block of a listing page.

    Args:
        html: Raw HTML of a news-listing page.

    Returns:
        The text of the second-to-last link inside ``div.catPages1``,
        converted to int.

    Raises:
        AttributeError: If the page has no ``div.catPages1`` element.
        IndexError: If that element holds fewer than two links.
        ValueError: If the link text is not an integer.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find('div', class_='catPages1')
    # NOTE(review): presumably the last link is a "next page" control and
    # the one before it carries the highest page number - confirm on the site.
    return int(pagination.find_all('a')[-2].text)
def save(games, path):
    """Dump the ``str()`` form of *games* into the file at *path*.

    NOTE(review): under Python 2, ``str()`` of a list uses ``repr()`` for
    every element, so non-ASCII unicode values end up in the file as
    backslash-u escape sequences instead of readable characters.

    Args:
        games: The collected game records.
        path: Destination file name; an existing file is truncated.
    """
    # Disabled CSV-based variant, kept for reference:
    # with open(path, 'w') as csvfile:
    #     writer = csv.writer(csvfile)
    #
    #     writer.writerow(('Title', 'Category', 'Date', 'Time'))
    #
    #     writer.writerows(
    #         (game['title'], ', '.join(game['category']), game['date'], game['time']) for game in games
    #     )
    with open(path, 'w+') as out_file:
        dumped = str(games)
        out_file.write(dumped.encode("UTF-8"))

def parse(html):
    """Collect the game posts found on one listing page.

    Args:
        html: Raw HTML of a news-listing page.

    Returns:
        A list with one dict per successfully read post, holding the keys
        ``title``, ``category``, ``description``, ``date`` and ``time``.
        The values are the unicode strings handed back by BeautifulSoup.
        An empty list is returned when the page has no ``div#allEntries``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    # The <div> that contains all posts on the page.
    all_entries = soup.find('div', id='allEntries')
    if all_entries is None:
        return []
    games = []
    # Every table with class "eBlock" represents one post.
    for table in all_entries.find_all('table', class_='eBlock'):
        try:
            game_post_body = table.find('div', class_='eMessage')
            game_details = table.find('div', class_='eDetails')
            game = {
                'title': table.tr.td.a.text,
                'category': game_details.a.text,
                # Keep the text before the first '....' and close it with
                # a single period.
                'description': game_post_body.p.text.split('....')[0] + '.',
                'date': game_details.span.text,
                'time': game_details.span['title'],
            }
        except (AttributeError, KeyError):
            # A missing tag is None (AttributeError on access) and a
            # missing "title" attribute raises KeyError; skip such posts
            # instead of aborting the whole run.
            print('Some error')
            continue
        games.append(game)
    return games
def main():
    """Scrape every listing page and store the collected games in games.txt."""
    total_pages = get_page_count(get_html(BASE_URL))
    print('Total found %d pages...' % total_pages)
    games = []
    for page in range(1, total_pages + 1):
        # Multiply before dividing: with Python 2 integer division
        # ``page / total_pages * 100`` is 0 for every page but the last.
        percent = page * 100 // total_pages
        print('Parsing %d%% (%d/%d)' % (percent, page, total_pages))
        games.extend(parse(get_html(BASE_URL + "?page%d" % page)))
    print('Saving...')
    save(games, 'games.txt')
# NOTE(review): this call runs the whole scrape whenever the file is
# executed or imported; an ``if __name__ == '__main__':`` guard would
# restrict it to script execution.
main()

在 Python 2 中

>>> import HTMLParser
>>> s = 'Ell &#233;s la v&#237;ctima que expia els nostres pecats, i no tan sols els nostres, sin&#243; els del m&#243;n sencer.'
>>> print HTMLParser.HTMLParser().unescape(s)
Ell és la víctima que expia els nostres pecats, i no tan sols els nostres, sinó els del món sencer.

在 Python3 中

>>> import html
>>> html.unescape(s)  

您的示例

'Стратегії'

要以"正常"的 UTF-8 方式写入(或读取)文件,请使用:

 import codecs
 f = codecs.open(filename, 'w', 'utf-8')

希望这有帮助

是的,我解决了!看来是我在文本的解码/编码以及字符集的使用上搞混了。我需要做的只是把从 BeautifulSoup 获得的数据由 Unicode 编码为 UTF-8,如下所示:

    game_title = game_title.encode("utf-8")
    game_category = game_category.encode("utf-8")
    game_description = game_description.encode("utf-8")
    post_date = post_date.encode("utf-8")
    post_time = post_time.encode("utf-8")

不需要其他任何改动。下面是最终对我有效的代码:

# Parsing information from games4you.ucoz.ua
import csv
import urllib
from bs4 import BeautifulSoup
BASE_URL = "http://games4you.ucoz.ua/news/"
def get_html(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded bytes read from the response.
    """
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
def get_page_count(html):
    """Return the page count shown in the pagination block of a listing page.

    Args:
        html: Raw HTML of a news-listing page.

    Returns:
        The text of the second-to-last link inside ``div.catPages1``,
        converted to int.

    Raises:
        AttributeError: If the page has no ``div.catPages1`` element.
        IndexError: If that element holds fewer than two links.
        ValueError: If the link text is not an integer.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    pagination = soup.find('div', class_='catPages1')
    # NOTE(review): presumably the last link is a "next page" control and
    # the one before it carries the highest page number - confirm on the site.
    return int(pagination.find_all('a')[-2].text)
def save(games, path):
    """Write the collected games to *path* as a CSV file.

    Args:
        games: Iterable of dicts; the keys ``title``, ``category``,
            ``date`` and ``time`` of each one are written as a row.  The
            values should already be UTF-8 byte strings, because the
            Python 2 csv module does not encode unicode itself.
        path: Destination file name; an existing file is overwritten.
    """
    # The Python 2 csv module expects a file opened in binary mode.  In
    # text mode Windows turns the writer's "\r\n" line endings into
    # "\r\r\n", which shows up as a blank line after every row.
    with open(path, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('Title', 'Category', 'Date', 'Time'))
        writer.writerows(
            (game['title'], game['category'], game['date'], game['time'])
            for game in games
        )
def parse(html):
    """Collect the game posts found on one listing page.

    Args:
        html: Raw HTML of a news-listing page.

    Returns:
        A list with one dict per successfully read post, holding the keys
        ``title``, ``category``, ``description``, ``date`` and ``time``.
        Every value is a UTF-8 encoded byte string.  An empty list is
        returned when the page has no ``div#allEntries``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    # The <div> that contains all posts on the page.
    all_entries = soup.find('div', id='allEntries')
    if all_entries is None:
        return []
    games = []
    # Every table with class "eBlock" represents one post.
    for table in all_entries.find_all('table', class_='eBlock'):
        try:
            game_post_body = table.find('div', class_='eMessage')
            game_details = table.find('div', class_='eDetails')
            fields = {
                'title': table.tr.td.a.text,
                'category': game_details.a.text,
                # Keep the text before the first '....' and close it with
                # a single period.
                'description': game_post_body.p.text.split('....')[0] + '.',
                'date': game_details.span.text,
                'time': game_details.span['title'],
            }
        except (AttributeError, KeyError):
            # A missing tag is None (AttributeError on access) and a
            # missing "title" attribute raises KeyError; skip such posts
            # instead of aborting the whole run.
            print('Some error')
            continue
        # Convert all data from unicode to UTF-8 byte strings.
        games.append({
            key: value.encode("utf-8") for key, value in fields.items()
        })
    return games
def main():
    """Scrape every listing page and store the collected games in games.csv."""
    total_pages = get_page_count(get_html(BASE_URL))
    print('Total found %d pages...' % total_pages)
    games = []
    for page in range(1, total_pages + 1):
        # Multiply before dividing: with Python 2 integer division
        # ``page / total_pages * 100`` is 0 for every page but the last.
        percent = page * 100 // total_pages
        print('Parsing %d%% (%d/%d)' % (percent, page, total_pages))
        games.extend(parse(get_html(BASE_URL + "?page%d" % page)))
    print('Saving...')
    save(games, 'games.csv')
# NOTE(review): this call runs the whole scrape whenever the file is
# executed or imported; an ``if __name__ == '__main__':`` guard would
# restrict it to script execution.
main()

最新更新