Python中带有文件的Unicode解码错误

我在解码时遇到了这个问题。我在其他帖子中找到了如何使用u'string'.encode处理简单字符串的方法，但我找不到让它适用于文件的办法。

任何帮助都将不胜感激！

这是代码。

# NOTE: in Python 2, reading from a file opened with plain open() yields a
# byte string (str), not a unicode object.
text = file.read()
# NOTE: replace() returns a new string; the result is not assigned, so
# `text` is left unchanged by this line.
text.replace(txt.encode('utf-8'), novo_txt.encode('utf-8'))
file.seek(0)  # rewind
# NOTE: calling .encode() on a byte string first decodes it implicitly to
# unicode, and that implicit decode can fail on non-ASCII data.
file.write(text.encode('utf-8'))

如果有帮助的话，下面是整个代码。

#!/usr/bin/env python
# coding: utf-8
"""
 Script that helps translate the method names of some code from
 Portuguese to English.
"""
from multiprocessing import Pool
from mock import MagicMock
from goslate import Goslate
import fnmatch
import logging
import os
import re
import urllib2
# Number of worker processes; with 1 the files are handled one after another
# in the main process instead of through a Pool.
_MAX_PEERS = 1
# Remove the log file left by a previous run, if any.
try:
    os.remove('traducoes.log')
except OSError:
    pass  # os.remove failed (e.g. no such file); carry on regardless
# Module logger writing INFO-level (and above) records to traducoes.log.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.FileHandler('traducoes.log')
logger.addHandler(handler)

def fileWalker(ext, dirname, names):
    """
    Directory-walk visitor that collects the matching file paths.

    ``ext[0]`` is the file-name suffix to look for and ``ext[1]`` is the
    list receiving the results. Every entry of ``names`` that matches the
    glob ``"*" + ext[0]`` is joined onto ``dirname`` and appended to it.
    """
    for matched in fnmatch.filter(names, "*" + ext[0]):
        ext[1].append(os.path.join(dirname, matched))

def encontre_text(file):
    """
    Find the words in the file that have an underscore inside them.

    ``file`` is a readable file object. Its whole content is read from the
    current position; byte content is decoded as UTF-8, text content (for
    instance from ``io.open()``) is used as is.

    Returns a list with one entry per occurrence, in file order. A word
    qualifies when at least one word character follows an underscore, so
    ``_name`` and ``a_b`` match while ``name_`` does not.
    """
    content = file.read()
    if isinstance(content, bytes):
        content = content.decode('utf-8')
    # Word characters, with an underscore right before this point, followed
    # by at least one more word character.
    return re.findall(r"\w+(?<=_)\w+", content)

def traduza_palavra(txt):
    """
    Translate the word/phrase to English.

    Underscores are turned into spaces before translating (unless the text
    starts with an underscore) and spaces are turned back into underscores
    afterwards. An empty ``txt`` is returned unchanged without touching the
    network.

    Exits the program with status -1 when the connectivity probe fails.
    """
    if not txt:
        return txt  # nothing to translate
    try:
        # Probe Google first; close the response so the connection is not leaked.
        urllib2.urlopen('http://google.com', timeout=2).close()
    except urllib2.URLError:
        print("No network connection ")
        raise SystemExit(-1)
    if not txt.startswith('_'):
        txt = txt.replace('_', ' ')
    # Restore the accent so the word is recognised as Portuguese.
    txt = txt.replace(u'media', u'média')
    gs = Goslate()
    txt = gs.translate(txt, 'en', 'pt-br')  # force Brazilian Portuguese as the source language
    # NOTE(review): purpose of this substitution is not evident from the code;
    # presumably it undoes an unwanted rewrite of " br " -- confirm.
    txt = txt.replace(' en ', ' br ')
    return txt.replace(' ', '_')

def subistitua(file, txt, novo_txt):
    """
    Rewrite the file with every occurrence of ``txt`` replaced by ``novo_txt``.

    ``file`` must be open for both reading and writing and be seekable.
    The whole file is processed regardless of the current position. Byte
    content is decoded as UTF-8 for the replacement and encoded back when
    written; text content (e.g. from ``io.open()``) is written as is.
    ``txt`` and ``novo_txt`` are expected to be text (unicode) values.
    """
    # Earlier reads may have left the position at EOF; start from the top.
    file.seek(0)
    content = file.read()
    binary = isinstance(content, bytes)
    if binary:
        content = content.decode('utf-8')
    # replace() returns a new string, so the result has to be kept.
    content = content.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(content.encode('utf-8') if binary else content)
    # Drop the stale tail when the new content is shorter than the old one.
    file.truncate()

def magica(File):
    """
    Translate the underscore-containing words of one file in place.

    This is the unit of work handed to the process pool: each call handles
    one path from the list of files. Every distinct word found by
    ``encontre_text`` is translated with ``traduza_palavra``; when the
    translation differs, the change is logged and applied with
    ``subistitua``.
    """
    if _MAX_PEERS == 1:  # not viable with several workers
        logger.info('\n---- File %s', File)
    with open(File, "r+") as file:
        seen = set()
        for txt in encontre_text(file):
            # A word may occur many times; translate it only once.
            if txt in seen:
                continue
            seen.add(txt)
            novo_txt = traduza_palavra(txt)
            if txt == novo_txt:
                continue  # nothing to rewrite
            logger.info('%s -> %s [%s]', txt, novo_txt, File)
            subistitua(file, txt, novo_txt)
    print(File.ljust(70) + '[OK]'.rjust(5))
if __name__ == '__main__':
    # Bail out early when there is no network: translation needs it.
    try:
        urllib2.urlopen('http://www.google.com.br', timeout=1).close()
    except urllib2.URLError:
        print("No network connection ")
        raise SystemExit(-1)
    root = './app'
    ex = ".py"
    files = []
    # fileWalker fills `files` with every path under `root` matching `ex`.
    os.path.walk(root, fileWalker, [ex, files])
    print('%d files found to be translated' % len(files))
    # Initialised up front so the `finally` clause can always report a status.
    translated = False
    try:
        if _MAX_PEERS > 1:
            _pool = Pool(processes=_MAX_PEERS)
            try:
                result = _pool.map_async(magica, files)
                result.wait()
                translated = result.successful()
            finally:
                # Release the worker processes whatever happened.
                _pool.close()
                _pool.join()
        else:
            for f in files:
                magica(f)
            translated = True
    except AssertionError as e:
        print(e)
    finally:
        if translated:
            print('Translated all files')
        else:
            print('Some files were not translated')

谢谢大家的帮助！

在Python 2中，从文件中读取得到的是普通（字节）字符串对象，而不是unicode对象。不需要对它们调用.encode()；事实上，这样做只会先触发一次到Unicode的自动解码，而这一步可能会失败。

经验法则：使用unicode三明治。无论何时读取数据，都要在该阶段解码为unicode。在整个代码中使用unicode值。无论何时写入数据，都要在该点进行编码。您可以使用io.open()打开为您自动编码和解码的文件对象。

这也意味着您可以在任何地方使用unicode字面量，无论是正则表达式还是普通的字符串字面量。所以使用：

def encontre_text(file):
    text = file.read()  # assume `io.open()` was used
    return re.findall(ur"w+(?<=_)w+", text)  # use a unicode pattern

和

def subistitua(file, txt, novo_txt):
    """
    Rewrite the file with every occurrence of ``txt`` replaced by ``novo_txt``.

    ``file`` must be a seekable text file open for reading and writing;
    the whole file is processed regardless of the current position.
    """
    file.seek(0)  # earlier reads may have left the position at EOF
    text = file.read()  # assume `io.open()` was used
    text = text.replace(txt, novo_txt)
    file.seek(0)  # rewind
    file.write(text)
    file.truncate()  # drop the stale tail when the new text is shorter

因为程序中的所有字符串值已经是unicode，并且

# Both arguments are unicode literals, so no .decode() call is needed.
txt = txt.replace(u'media', u'média')

因为u'..'形式的unicode字符串字面量不再需要解码。

相关内容

最新更新

热门标签：