来自 Python 中 txt 文件的单词和短语频率



我正在进行一些文本分析。基本上,我正在尝试获取某个文件夹中每个文件的总字数(基于单词列表(和总短语计数(基于短语列表(。到目前为止,我有以下内容。但是我不断收到错误'str' object has no attribute 'words'.我尝试编写的代码是其他几个代码的组合,所以我不知道哪个部分造成了问题。任何帮助将不胜感激。

import csv
import glob
import re
import string
import sys
import time
target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}
def main():
f_out = open(output_file, 'w')
wr = csv.writer(f_out, lineterminator='n')
wr.writerow(output_fields)
file_list = glob.glob(target_files)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc_len = len(doc)
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
output_data[1] = doc_len
wr.writerow(output_data)
def get_data(doc):
vdictionary = {}
_odata = [0] * 4
tokens = re.findall('w(?:[-w]*w)?', doc)
for token in tokens:
if token not in vdictionary:
vdictionary[token] = 1
if token.words: _odata[2] += 1
for w1, w2 in zip(phrases, phrases[1:]):
phrase = w1 + " " + w2
if phrase.phrases: _odata[3] += 1
return _odata
if __name__ == '__main__':
print('n' + time.strftime('%c') + 'nUncertainty.pyn')
main()
print('n' + time.strftime('%c') + 'nNormal termination.')

错误是一致的if token.words: _odata[2] += 1很可能是token不是某些具有支持属性的数据结构的类型字典

for token in tokens:
print(token) # print token here to see the what is the value of token
if token not in vdictionary:
vdictionary[token] = 1
if token.words: _odata[2] += 1

所以我自己解决了这个问题。这是代码。

import csv
import glob
import re
import string
import sys
import time
target_files = r'C:/Users/Mansoor/Documents/Files/*.*'
output_file = r'C:/Users/Mansoor/Documents/Parser.csv'
output_fields = ['file name,', 'file size,', 'words,', 'phrases,']
words = {'uncertainty', 'downturn', 'shock'}
phrases = {'economic downturn', 'political uncertainty'}
def main():
f_out = open(output_file, 'w')
wr = csv.writer(f_out, lineterminator='n')
wr.writerow(output_fields)
file_list = glob.glob(target_files)
for file in file_list:
print(file)
with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
doc = f_in.read()
doc_len = len(doc)
doc = doc.lower()
output_data = get_data(doc)
output_data[0] = file
output_data[1] = doc_len
wr.writerow(output_data)
def get_data(doc):
_odata = [0] * 4
tokens = re.findall('w(?:[-w]*w)?', doc)
for token in tokens:
if token in words:
_odata[2] += 1
for w1, w2 in zip(tokens, tokens[1:]):
phrase = w1 + " " + w2
if phrase in phrases:
_odata[3] += 1
return _odata
if __name__ == '__main__':
print('n' + time.strftime('%c') + 'nUncertainty.pyn')
main()
print('n' + time.strftime('%c') + 'nNormal termination.')

最新更新