我最近开发了一个Python程序,它可以从某个文档中的术语中生成倒排索引。我现在想创建职位发布,例如
to, 993427:
⟨ 1, 6: ⟨7, 18, 33, 72, 86, 231⟩;
2, 5: ⟨1, 17, 74, 222, 255⟩; 4, 5: ⟨8, 16, 190, 429, 433⟩; 5, 2: ⟨363, 367⟩;
7, 3: ⟨13, 23, 191⟩; …⟩
我知道如上所述的代码不完整,我只是在尝试实现功能。
from pprint import pprint as pp
from collections import Counter
import pprint
import re
import sys
import string
import fileinput
try:
reduce
except:
from functools import reduce
try:
raw_input
except:
raw_input = input
def readIn(fileglob): #Reads in multiple files and strips punctation/uppercase.
texts, words = {}, set()
for txtfile in (fileglob):
with open(txtfile, 'r') as splitWords:
txt = splitWords.read().lower().split()
txt = str(txt)
txt = re.findall(r'w+', txt)
words |= set(txt)
texts[txtfile.split('\')[-1]] = txt
return texts, words
def search(indexes): # Inverted index, based off the book and the web.
return reduce(set.intersection,
(index[word] for word in indexes),
set(texts.keys()))
def getWordBins(posOfWords):
cnt = Counter()
for word in posOfWords:
cnt[posOfWords] += 1
return cnt
def main(fileList, topWords):
tempArray = []
for x in range(1,len(fileList)):
tempArray.append(fileList[x])
texts, words = readIn(tempArray)
index = {word:set(txt
for txt, wrds in texts.items() if word in wrds)
for word in words}
test =({k + " " + str(len(v)) + " " + str(sorted(v)) for k,v in index.items()})
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
#Writes out the information requested to a ".idx" file.
doc = open("document.idx", "w")
doc.write("# INPUT DOCUMENT REFERENCE LEGENDn")
for fileNumber in range(1, len(fileList)):
doc.write(str(fileNumber) + "t" + fileList[fileNumber] + "n")
doc.write("# INVERTED INDEX RESULTSn")
tempTest = []
for x in test:
tempTest.append(x.split(" "))
for x in tempTest:
tempStr = ""
for y in x:
tempStr += y + "t"
doc.write(tempStr + "n")
doc.close
main(sys.argv, sys.argv)
这就是我目前所拥有的,唯一的新功能是getWordBins函数和循环:
txt = readIn(fileList)
posWord = getWordBins(txt)
for key, value in posWord.most_common(topWords):
print key, value
现在,当我尝试运行代码时发生的情况是这样的:
Traceback (most recent call last):
File "Intro3.py", line 82, in <module>
main(sys.argv, sys.argv)
File "Intro3.py", line 60, in main
posWord = getWordBins(txt)
File "Intro3.py", line 41, in getWordBins
cnt[posOfWords] += 1
TypeError: unhashable type: 'dict'
任何关于这个令人不安的错误的任何指导都很高兴收到。它不是字典,那么为什么会出现错误呢?谢谢你的时间!
你在做什么的地方:
cnt[posOfWords] += 1
我想你可能的意思是:
cnt[word] += 1
你的 readin 函数还返回一个字典和一个集合,所以你的 txt 变量是 ( dict
, set
) 的元组
所以你的问题归结为试图使用一个包含字典的元组作为键(我怀疑这是你的意图)。它对cnt[word] += 1
不起作用,因为这仍然会尝试使用字典作为密钥。您可能需要这样做:
txt, _ = readIn(fileList)
然后这可能会起作用:
cnt[word] += 1