AttributeError thrown saying 'list' object has no attribute 'lower'



I am working through an example where the training data and training labels are lists, but when I fit the model on them it throws an error. I think the problem is in the text preprocessing class.

Below is the code in which I create a pipeline, but it raises either "expected string or bytes-like object" or, at other times, "'list' object has no attribute 'lower'".

import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.base import BaseEstimator, TransformerMixin

class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords=None, punct=None,
                 lower=False, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('preprocess', NLTKPreprocessor()),
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(smooth_idf=True, use_idf=True)),
                     ('clf', SGDClassifier(loss='log', penalty='l2',
                                           alpha=1e-3, random_state=42)),
                     ])

# ---show only the best performance results: accuracy, other metrics, and confusion matrix
text_clf.fit(X_train, y_train)
y_pred = text_clf.predict(X_test)
print('Accuracy Score (%):', accuracy_score(y_test, y_pred) * 100)
print(metrics.classification_report(y_test, y_pred,
                                    target_names=docs_data.target_names))
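
For reference, the "'list' object has no attribute 'lower'" message typically comes from CountVectorizer: with its default lowercase=True it calls .lower() on each document, so passing it lists of tokens instead of strings reproduces the error. A minimal sketch:

from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer().fit(['a plain string works'])  # OK: each document is a string
try:
    CountVectorizer().fit([['a', 'token', 'list']])  # documents are token lists
except AttributeError as err:
    print(err)  # 'list' object has no attribute 'lower'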

You get the error because of this method:

def transform(self, X):
    return [
        list(self.tokenize(doc)) for doc in X
    ]

X is a DataFrame, and iterating over a DataFrame yields its column labels rather than its rows. So on the first iteration you get 0, and sent_tokenize then fails because it expects a string.
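
You can see this with a minimal sketch (the toy DataFrame below is purely illustrative):

import pandas as pd

df = pd.DataFrame([['first doc.'], ['second doc.']])
print(list(df))         # [0] -- iterating a DataFrame yields its column labels
print(list(df.values))  # rows as arrays of strings, which is what we actually want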

I am not sure about the whole pipeline, but you can fix it in one step like this. Note that it only works for a single column:

def transform(self, X):
    return [
        list(self.tokenize(doc[0])) for doc in X.values
    ]

The above works with matching input such as:

import pandas as pd

X_train = pd.DataFrame([['asfas saf asf. dwqdwqwd '],
                        ['asdasdasd32d23  wedw ed wed. dwqdwq. ']])
y_train = [[1], [2]]
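
As a quick check of the patched class (a sketch assuming the relevant NLTK corpora have been downloaded):

import nltk

for pkg in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(pkg, quiet=True)

# With the patched transform swapped into NLTKPreprocessor:
print(NLTKPreprocessor().transform(X_train))  # one token list per DataFrame row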

def fit(self, X, y=None):
    return self

Wild guess (I am not very familiar with extending sklearn classes): could the problem be here? It seems you are overriding the parent method so that it does not fit anything and just returns the class instance.
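
For what it is worth, returning self from fit is the standard scikit-learn contract for a stateless transformer, so that part should be fine on its own. A minimal sketch with a hypothetical UppercaseTransformer:

from sklearn.base import BaseEstimator, TransformerMixin

class UppercaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Nothing to learn for a stateless transformer; returning self
        # keeps the estimator usable inside a Pipeline.
        return self

    def transform(self, X):
        # Expects an iterable of strings.
        return [doc.upper() for doc in X]

print(UppercaseTransformer().fit_transform(['a b', 'c d']))  # ['A B', 'C D']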
