How do I find the words that are unique to one text file? If a word is used frequently in the other files, it should be dropped.
This is based on http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html
I need a script that loops through all the text files in a folder and outputs the results in JSON format.
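For example, the output I have in mind looks roughly like this (the file names and scores below are made up just to show the shape):

import json

# Hypothetical output shape: one entry per file, mapping each word that is
# distinctive for that file to a normalized score.
example = {
    "1.txt": {"zarathustra": 1.0, "abyss": 0.6},
    "2.txt": {"monad": 1.0, "theodicy": 0.4},
}
print(json.dumps(example, indent=2))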
My code so far:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os
def get_raw_data():
    # Read 1.txt through 95.txt and yield each file's name and contents
    for x in range(95):
        file_name = str(x + 1) + ".txt"
        with codecs.open(file_name, "rU", "utf-8") as myfile:
            yield file_name, myfile.read()
class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)  # reduce to a base form; None if WordNet doesn't know it
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words
def process_text(counts, vectorizer, text, file_name, index):
    # Raw count of every vocabulary word in document `index`
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    # Keep only words that occur more than four times
    result = {w: c for w, c in result.items() if c > 4}
    # Scale counts so the most frequent word has weight 1.0
    normalizing_factor = max(result.values())
    result = {w: c / normalizing_factor for w, c in result.items()}
    return result
def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english',
                                 min_df=(n - 1) / n,
                                 tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print(counts)
    for i, (file_name, text) in enumerate(data):
        print(file_name)
        result = process_text(counts, vectorizer, text, file_name, i)
        print(result)

if __name__ == '__main__':
    main()
It looks like you have a bunch of files named 1.txt, 2.txt, ..., 95.txt, and you want to find the words that occur in only one file. I would just collect all the words, count how many files each one occurs in, and print out the singletons.
from collections import Counter
import re

fileids = [str(n + 1) + ".txt" for n in range(95)]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:  # Add encoding if really needed
        text = fp.read().lower()
    words = re.split(r"\W+", text)  # Keep letters, drop the rest
    filecounts.update(set(words))   # set(): count each word once per file

singletons = [word for word in filecounts if filecounts[word] == 1]
print(" ".join(singletons))
Done. You don't need scikit-learn, you don't need NLTK, you don't need a pile of IR algorithms. You could use the list of singletons in an IR algorithm, but that's a different story.
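Since the question asked for JSON output, here is a minimal sketch of one way to extend this, under the same assumptions (files named 1.txt through 95.txt; the output file name unique_words.json is just a placeholder). It maps each file to the words that occur in no other file:

import json
import re
from collections import Counter

fileids = [str(n + 1) + ".txt" for n in range(95)]

# First pass: remember each file's word set and count how many files
# each word occurs in.
filewords = {}
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower()))
    words.discard("")  # re.split can leave an empty string at the edges
    filewords[fname] = words
    filecounts.update(words)

# Second pass: per file, keep only the words seen in that file alone.
unique = {fname: sorted(w for w in words if filecounts[w] == 1)
          for fname, words in filewords.items()}

with open("unique_words.json", "w") as out:
    json.dump(unique, out, indent=2)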
def parseText(oFile, myWord):
    # oFile: open text file to test
    # myWord: word we are looking for
    aWords = []
    # Get all lines into a list
    aLines = oFile.readlines()
    # Test each line to see if the word is found
    for sLine in aLines:
        # Split the line on whitespace, which returns a list of words
        aLine = sLine.split()
        # Iterate over the words and test whether they match our word
        for sWord in aLine:
            # If it matches, append it to our list
            if sWord == myWord:
                aWords.append(sWord)
    return aWords

# Prompt the user for the word to search for
myWord = raw_input('What word to search for: ')
# Call the function on the file to test
with open('1.txt') as oFile:  # e.g. one of the 95 files from the question
    aWords = parseText(oFile, myWord)
# Check whether the list has at least one element
if len(aWords) < 1:
    print 'Word not found in file'
else:
    print str(len(aWords)) + ' instances of our word found in file'
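As an aside, since str.split already returns a list, the two nested loops can be collapsed with list.count; a minimal variant of the same idea (countWord is just an illustrative name):

def countWord(oFile, myWord):
    # Sum the occurrences of myWord over all lines of the file
    return sum(sLine.split().count(myWord) for sLine in oFile)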