我为一个倒排索引(inverted index)编写了如下代码。但我对它不太满意,想知道如何让它更紧凑、更符合 Python 风格(Pythonic)。
class invertedIndex(object):
    """Inverted index mapping each term to the documents that contain it.

    Attributes:
        docs: the documents passed to the constructor, kept as given.
        termList: the distinct terms, in order of first appearance.
        docLists: docLists[k] is the ascending, duplicate-free list of
            document indices in which termList[k] occurs.
    """

    def __init__(self, docs):
        """Index ``docs``, an iterable of strings split on single spaces."""
        self.docs, self.termList, self.docLists = docs, [], []
        # term -> position shared by termList and docLists. A dict lookup
        # replaces the linear termList.index() scan done for every term.
        self._positions = {}
        for index, doc in enumerate(docs):
            for term in doc.split(" "):
                position = self._positions.get(term)
                if position is None:
                    self._positions[term] = len(self.termList)
                    self.termList.append(term)
                    self.docLists.append([index])
                    continue
                postings = self.docLists[position]
                # Documents are visited in ascending order, so a repeat of
                # the current index can only be the last entry.
                if postings[-1] != index:
                    postings.append(index)

    def search(self, term):
        """Return the list of document indices for ``term``.

        Returns the string "No results" when the term was never indexed.
        """
        try:
            position = self._positions[term]
        except (KeyError, TypeError):
            # KeyError: unknown term. TypeError: an unhashable term, which
            # can never have been indexed. Nothing else is swallowed.
            return "No results"
        return self.docLists[position]
# Sample corpus: one string per document. A document's position in this
# list is the index reported by search().
docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
]
i = invertedIndex(docs)
# search() is an instance method, so it has to be called on the index built
# above (i); calling it on the invertedIndex class itself raises TypeError.
print(i.search("sales"))
将文档索引存储在 Python 的集合(set)中,并用字典把每个术语映射到它对应的"文档索引集合":
from collections import defaultdict
class invertedIndex(object):
    """Inverted index mapping each term to the set of documents containing it.

    Attributes:
        docSets: defaultdict(set) from term to the set of indices of the
            documents in which the term occurs.
    """

    def __init__(self, docs):
        """Index ``docs``, an iterable of strings split on whitespace."""
        self.docSets = defaultdict(set)
        for index, doc in enumerate(docs):
            for term in doc.split():
                self.docSets[term].add(index)

    def search(self, term):
        """Return the set of document indices for ``term``.

        An unknown term yields an empty set.
        """
        # .get() bypasses the defaultdict factory, so querying an unknown
        # term no longer inserts an empty entry and grows the index.
        return self.docSets.get(term, set())
# Sample corpus; each document is identified by its position in this list.
docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
]
i = invertedIndex(docs)
# Every document contains "sales"; Python 2 shows: set([0, 1, 2, 3])
print(i.search("sales"))
`set` 的工作方式有点像列表,但它是无序的,并且不能包含重复的条目。
`defaultdict` 基本上是一个 `dict`,当访问的键不存在时,它会用默认类型创建一个值(在这里是一个空集合)。
这个解决方案几乎与 @Peter Gibson 的解决方案相同。在这个版本中,索引本身就是数据,不需要再委托给一个 docSets 对象,这使得代码更短、更清晰。
这段代码还保留了文档的原始顺序……这算是个 bug,我更喜欢 Peter 的 `set()` 实现。
还要注意,引用不存在的术语(如 `ix['garbage']`)会隐式地修改索引。如果唯一的 API 是 `search`,这没有问题,但这种情况值得注意。
class InvertedIndex(dict):
    """Dict subclass mapping each term to the list of documents containing it.

    The mapping itself is the index: keys are terms, values are ascending
    lists of document indices. Subscripting with an unknown term
    (``ix[term]``) stores and returns an empty list; ``search`` and ``in``
    do not add keys.

    Attributes:
        docs: the documents passed to the constructor, kept as given.
    """

    def __init__(self, docs):
        """Index ``docs``, an iterable of strings split on single spaces."""
        super(InvertedIndex, self).__init__()
        self.docs = docs
        for doc_index, doc in enumerate(docs):
            for term in doc.split(" "):
                postings = self[term]
                # Documents are visited in ascending order, so a term
                # repeated inside one document would otherwise be appended
                # once per occurrence; checking the tail keeps each index once.
                if not postings or postings[-1] != doc_index:
                    postings.append(doc_index)

    def __missing__(self, term):
        """Store and return an empty list for an unknown term."""
        # operate like defaultdict(list)
        return self.setdefault(term, [])

    def search(self, term):
        """Return the document indices for ``term``, or 'No results'.

        'No results' is returned both for unknown terms and for terms whose
        list is empty (for example after ``ix[term]`` created the key).
        """
        return self.get(term) or 'No results'
# Sample corpus; the final single-word document exercises a term that occurs
# in exactly one place.
docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
    'beer',
]
ix = InvertedIndex(docs)
# __dict__ holds only the instance attributes set in __init__; the term
# postings are stored as the dict's own items.
print(ix.__dict__)
print('')
# Single-argument print(...) produces the same text under Python 2 and 3.
print('sales: %s' % (ix.search("sales"),))
print('whiskey: %s' % (ix.search('whiskey'),))
print('beer: %s' % (ix.search('beer'),))
# The header starts with a newline escape; the backslash had been lost,
# which made the program print a literal leading "n".
print('\nTEST OF KEY SETTING')
# Subscripting an unknown term creates the key with an empty list ...
print(ix['garbage'])
# ... so the key now exists ...
print('garbage' in ix)
# ... but search() still reports no results because the list is empty.
print(ix.search('garbage'))
输出:
{'docs': ['new home sales top forecasts june june june', 'home sales rise in july june', 'increase in home sales in july', 'july new home sales rise', 'beer']}
sales: [0, 1, 2, 3]
whiskey: No results
beer: [4]
TEST OF KEY SETTING
[]
True
No results
class InvertedIndex(dict):
    """Dict subclass mapping each term to the list of documents containing it.

    The mapping itself is the index: keys are terms, values are ascending
    lists of document indices. Subscripting with an unknown term
    (``ix[term]``) stores and returns an empty list; ``search`` and ``in``
    do not add keys.

    Attributes:
        docs: the documents passed to the constructor, kept as given.
    """

    def __init__(self, docs):
        """Index ``docs``, an iterable of strings split on single spaces."""
        super(InvertedIndex, self).__init__()
        self.docs = docs
        for doc_index, doc in enumerate(docs):
            for term in doc.split(" "):
                postings = self[term]
                # Documents are visited in ascending order, so a term
                # repeated inside one document would otherwise be appended
                # once per occurrence; checking the tail keeps each index once.
                if not postings or postings[-1] != doc_index:
                    postings.append(doc_index)

    def __missing__(self, term):
        """Store and return an empty list for an unknown term."""
        # operate like defaultdict(list)
        return self.setdefault(term, [])

    def search(self, term):
        """Return the document indices for ``term``, or 'No results'.

        'No results' is returned both for unknown terms and for terms whose
        list is empty (for example after ``ix[term]`` created the key).
        """
        return self.get(term) or 'No results'
# Sample corpus; the final single-word document exercises a term that occurs
# in exactly one place.
docs = [
    "new home sales top forecasts june june june",
    "home sales rise in july june",
    "increase in home sales in july",
    "july new home sales rise",
    'beer',
]
ix = InvertedIndex(docs)
# __dict__ holds only the instance attributes set in __init__; the term
# postings are stored as the dict's own items.
print(ix.__dict__)
print('')
# Single-argument print(...) produces the same text under Python 2 and 3.
print('sales: %s' % (ix.search("sales"),))
print('whiskey: %s' % (ix.search('whiskey'),))
print('beer: %s' % (ix.search('beer'),))
# The header starts with a newline escape; the backslash had been lost,
# which made the program print a literal leading "n".
print('\nTEST OF KEY SETTING')
# Subscripting an unknown term creates the key with an empty list ...
print(ix['garbage'])
# ... so the key now exists ...
print('garbage' in ix)
# ... but search() still reports no results because the list is empty.
print(ix.search('garbage'))
{'docs': ['new home sales top forecasts june june june', 'home sales rise in july june', 'increase in home sales in july', 'july new home sales rise', 'beer']}
sales: [0, 1, 2, 3]
whiskey: No results
beer: [4]
TEST OF KEY SETTING
[]
True
No results