Why isn't the text being lemmatized, judging by the feature results?



I am using the following custom tokenizer:

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        # lemmatize every token produced by word_tokenize
        result = [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
        # print(result)
        return result

After a few preprocessing steps:

descript_data = descript_data.replace(np.nan, '', regex=True)
descript_data = descript_data.str.replace(r'\d+', ' ', regex=True)            # drop digits
descript_data = descript_data.str.replace(r'(\b\w{1,2}\b)', ' ', regex=True)  # drop 1-2 letter words
descript_data = descript_data.str.replace(r'[^\w\s]', ' ', regex=True)        # drop punctuation
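For reference, on a throwaway string these replace calls should behave roughly like this (a minimal sketch with made-up data, not my actual column):

import numpy as np
import pandas as pd

# toy Series standing in for the real descript_data column (hypothetical data)
sample = pd.Series(["3 dogs ran quickly!!", np.nan, "an ok singer, 2 songs"])

sample = sample.replace(np.nan, '', regex=True)
sample = sample.str.replace(r'\d+', ' ', regex=True)            # strip digits
sample = sample.str.replace(r'(\b\w{1,2}\b)', ' ', regex=True)  # strip 1-2 letter tokens
sample = sample.str.replace(r'[^\w\s]', ' ', regex=True)        # strip punctuation

print(sample.tolist())
# digits, one/two-letter words and punctuation are replaced by spaces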

I then ran the following:

vect = TfidfVectorizer(strip_accents='ascii',
                       stop_words='english',
                       lowercase=True,
                       max_df=0.8,
                       min_df=10,
                       analyzer='word',
                       tokenizer=LemmaTokenizer())
final = vect.fit_transform(descript_data)
print(vect.get_feature_names())

where descript_data is the column of text data. In the results I still get the original word forms together with the words ending in "s", "ly", and so on. How can I fix this?

The code works fine with the example sentences below, so the problem is probably in your preprocessing steps.

#Invoke libraries
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        result = [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
        #print(result)
        return result

#build example sentences with plural and singular words to check lemmatization
d = ["this is a good song by this singer",
     "there are other songs that are better from other singers"]

#note that the max_df and min_df are commented as the sample size is too small
vect = TfidfVectorizer(strip_accents='ascii',
                       stop_words='english',
                       lowercase=True,
                       #max_df=0.8,
                       #min_df=10,
                       analyzer='word',
                       tokenizer=LemmaTokenizer())
final = vect.fit_transform(d)
print(vect.get_feature_names())
#output. Note that there is no "singers" or "songs"
[u'better', u'good', u'singer', u'song']
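
One more thing worth checking: WordNetLemmatizer.lemmatize() uses pos='n' by default, so it only reduces noun inflections ("songs" → "song"); verb and adjective forms are left untouched unless you pass a part-of-speech tag, and "-ly" adverbs are already their own dictionary form, so stemming rather than lemmatization is what strips suffixes like that. If missing POS tags turn out to be part of the issue, a rough sketch of a POS-aware tokenizer could look like the following; it assumes the averaged_perceptron_tagger data for nltk.pos_tag has been downloaded, and PosLemmaTokenizer is just an illustrative name.

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

class PosLemmaTokenizer(object):
    #same idea as LemmaTokenizer, but feed lemmatize() a POS tag per token
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def _wordnet_pos(self, treebank_tag):
        #map Penn Treebank tags to the WordNet constants lemmatize() accepts
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        if treebank_tag.startswith('V'):
            return wordnet.VERB
        if treebank_tag.startswith('R'):
            return wordnet.ADV
        return wordnet.NOUN  #lemmatize()'s own default
    def __call__(self, articles):
        return [self.wnl.lemmatize(t, self._wordnet_pos(tag))
                for t, tag in pos_tag(word_tokenize(articles))]

#drop-in replacement: pass tokenizer=PosLemmaTokenizer() to the TfidfVectorizer above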
