How to find the words unique to each text file in a set of text files using Python



How do I find only the words that are unique to a single text file? If a word is also used frequently in the other files, it should be dropped.

This is with reference to http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html

I need a script that loops over all the text files in a folder and outputs the results in JSON format.
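Roughly, the shape I am after is something like this sketch, where the texts/ folder name, the unique_words.json output name, and the extract_unique_words placeholder are only illustrative stand-ins:

import glob
import json
import os

def extract_unique_words(text):
    # Placeholder: the per-file unique-word logic would go here
    return sorted(set(text.lower().split()))

results = {}
for path in sorted(glob.glob(os.path.join("texts", "*.txt"))):
    with open(path, encoding="utf-8") as fp:
        results[os.path.basename(path)] = extract_unique_words(fp.read())

# Write one JSON object mapping each file name to its list of words
with open("unique_words.json", "w", encoding="utf-8") as out:
    json.dump(results, out, indent=2, ensure_ascii=False)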

My code so far:

from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os

def get_raw_data():
    # Yield a (file_name, text) pair for each of the input files
    for x in range(95):
        file_name = str(x + 1) + ".txt"
        with codecs.open(file_name, "rU", "utf-8") as myfile:
            yield file_name, myfile.read()

class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}
    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

def process_text(counts, vectorizer, text, file_name, index):
    # Map every vocabulary word to its count in this document
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    # Drop rare words, then normalize the remaining counts to the range 0..1
    result = {w: c for w, c in result.items() if c > 4}
    normalizing_factor = max(result.values())
    result = {w: c / normalizing_factor
              for w, c in result.items()}
    return result

def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n,tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print (counts)
    # data already holds (file_name, text) pairs, so iterate over it directly
    # instead of re-reading every file from disk
    for i, (file_name, text) in enumerate(data):
        print(file_name)
        result = process_text(counts, vectorizer, text, file_name, i)
        print(result)
if __name__ == '__main__':
    main()

It looks like you have a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find the words that occur in only one file. I would just collect all the words, count how many files each one appears in, and print out the singletons.

from collections import Counter
import re
fileids = [ str(n+1)+".txt" for n in range(95) ]
filecounts = Counter()
for fname in fileids:
    with open(fname) as fp:    # Add encoding if really needed
        text = fp.read().lower()
        words = re.split(r"\W+", text)  # Keep letters, drop the rest
        filecounts.update(set(words))
singletons = [word for word in filecounts if filecounts[word] == 1]
print(" ".join(singletons))

Done. You don't need scikit-learn, you don't need NLTK, and you don't need a bunch of IR algorithms. You could feed the list of singletons into an IR algorithm, but that's another story.
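If you also want the per-file grouping written out as JSON, as the question asks, one possible extension of the same Counter idea is sketched below; the unique_words.json output name is just an example:

from collections import Counter
import json
import re

fileids = [str(n + 1) + ".txt" for n in range(95)]
filecounts = Counter()   # in how many files does each word occur?
filewords = {}           # the set of words seen in each file
for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower())) - {""}
    filewords[fname] = words
    filecounts.update(words)

# For every file, keep only the words that appear in no other file
unique_per_file = {fname: sorted(w for w in words if filecounts[w] == 1)
                   for fname, words in filewords.items()}

with open("unique_words.json", "w") as out:
    json.dump(unique_per_file, out, indent=2)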

def parseText(oFile, myWord):
    # oFile: open text file to test
    # myWord: word we are looking for
    # Get all lines into a list
    aLines = oFile.readlines()
    # Test each line to see if the word is found
    for sLine in aLines:
        # Split the line on whitespace, which returns a list of words
        aLine = sLine.split()
        # Iterate over the words and test whether they match our word
        for sWord in aLine:
            # If it matches, append it to our list
            if sWord == myWord:
                aWords.append(sWord)

# Create an empty list to store all instances of the word that we may find
aWords = []
# Prompt the user for the word to search for
myWord = input('What word to search for: ')
# Call the function on one of the files (e.g. 1.txt from the question)
with open('1.txt') as oFile:
    parseText(oFile, myWord)
# Check whether the list has at least one element
if len(aWords) < 1:
    print('Word not found in file')
else:
    print(str(len(aWords)) + ' instances of our word found in file')
