使用以下自定义标记器
class LemmaTokenizer(object):
    """Callable tokenizer: word-tokenizes a text and WordNet-lemmatizes each token.

    Intended to be passed as ``tokenizer=LemmaTokenizer()`` to a sklearn
    vectorizer, which calls the instance once per document.
    """

    def __init__(self):
        # Single lemmatizer instance reused across all documents.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas
经过一些预处理步骤后,
# Clean the text column: fill NaN with '', strip digit runs, drop 1-2 letter
# words, and replace punctuation (any char that is not word/whitespace) with
# spaces.
# FIX: the patterns must be raw strings with backslashes. As written before,
# 'd+' deleted literal letters "d" (so "good" -> "goo "), '(bw{1,2}b)' matched
# literal b/w sequences, and '[^ws]' removed every character except w and s —
# mangling words so the lemmatizer could no longer recognize them.
descript_data = descript_data.replace(np.nan, '', regex=True)
descript_data = descript_data.str.replace(r'\d+', ' ', regex=True)
descript_data = descript_data.str.replace(r'\b\w{1,2}\b', ' ', regex=True)
# regex=True is explicit: pandas >= 2.0 treats the pattern literally by default.
descript_data = descript_data.str.replace(r'[^\w\s]', ' ', regex=True)
我运行了以下内容:
# Build TF-IDF features over the cleaned text, using the lemmatizing tokenizer.
vect = TfidfVectorizer(strip_accents='ascii',
                       stop_words='english',
                       lowercase=True,
                       max_df=0.8,   # drop terms appearing in >80% of docs
                       min_df=10,    # drop terms appearing in <10 docs
                       analyzer='word',
                       tokenizer=LemmaTokenizer())
final = vect.fit_transform(descript_data)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
print(vect.get_feature_names_out())
其中 descript_data 是文本数据的列。在结果中,仍然同时出现原始单词和带有 "s"、"ly" 等后缀、未被词形还原的单词。我该怎么修?
用以下示例语句测试时,这段代码本身是没有问题的;问题很可能出在你的预处理步骤中。
#Invoke libraries
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
class LemmaTokenizer(object):
    """Tokenizer for TfidfVectorizer: splits a document into words and
    lemmatizes each one with WordNet."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        # Lemmatize every token produced by NLTK's word tokenizer.
        return [self.wnl.lemmatize(word) for word in word_tokenize(articles)]
# Build example sentences with plural and singular words to check that
# lemmatization collapses them to a single feature.
d = ["this is a good song by this singer",
     "there are other songs that are better from other singers"]

# max_df and min_df are omitted here because the sample corpus is too small
# for document-frequency pruning to make sense.
vect = TfidfVectorizer(strip_accents='ascii',
                       stop_words='english',
                       lowercase=True,
                       analyzer='word',
                       tokenizer=LemmaTokenizer())
final = vect.fit_transform(d)
# get_feature_names() was removed in scikit-learn 1.2; use the *_out variant.
print(vect.get_feature_names_out())
#output. Note that there is no "singers" or "songs"
[u'better', u'good', u'singer', u'song']