我有一个简单的数据帧
# Sample DataFrame: two English-language article titles.
rows = [
    ("eng", "BlackBerry sells legacy patents of mobile devices"),
    ("eng", "Amazon to shut down publishing house Westland Books"),
]
sdf0 = spark.createDataFrame(rows, ["lang", "title"])
lang | title |
---|---|
eng | BlackBerry sells legacy patents of mobile devices |
eng | Amazon to shut down publishing house Westland Books |
找到了——您的代码中至少有 3 个错误。在新会话中用 IDE 打开,您就会看到所有错误。
tokenize
def tokenize(self):
    # BUG (intentional, shown as the broken version — the fix follows below):
    # `sdf` on the last line is an undefined free variable; it should be `self.sdf`.
    tokenizer = Tokenizer(inputCol="low_title", outputCol="tokens")
    self.sdf = tokenizer.transform(sdf)
# should be
def tokenize(self):
    """Tokenize the `low_title` column into a new `tokens` column."""
    # Note: transform self.sdf (not a bare `sdf`) so the result is kept on the instance.
    self.sdf = Tokenizer(inputCol="low_title", outputCol="tokens").transform(self.sdf)
stop_words
def stop_words(self):
    # BUG 1 (intentional, shown as the broken version — the fix follows below):
    # the dict is bound as `available_lang`, but the loop reads `available_langs`
    # (trailing -s) — NameError at runtime.
    available_lang = {"eng": "en"}
    stopwords_iso = {}
    for lang in available_langs:
        stopwords_iso[lang] = stopwordsiso.stopwords(available_langs[lang])
    stopwords = {k: list(v) for k, v in stopwords_iso.items()}
    self.sdf = reduce(
        lambda a, b: a.unionAll(b),
        (
            StopWordsRemover(
                inputCol="tokens", outputCol="filtered_words", stopWords=value
            ).transform(sdf.where(F.col("lang") == key))  # BUG 2: bare `sdf` — should be `self.sdf`
            for key, value in stopwords.items()
        ),
    )
# should be
def stop_words(self):
    """Filter per-language stopwords out of the `tokens` column into `filtered_words`."""
    # Map the frame's language codes to the ISO codes stopwordsiso expects
    # (note the trailing -s on the name — it was the original bug).
    available_langs = {"eng": "en"}
    # One stopword list per language code present in the frame.
    stopwords = {
        lang: list(stopwordsiso.stopwords(iso))
        for lang, iso in available_langs.items()
    }
    # Run a language-specific remover over each language's slice of self.sdf
    # (not a bare module-level `sdf`), then stitch the slices back together.
    parts = [
        StopWordsRemover(
            inputCol="tokens", outputCol="filtered_words", stopWords=words
        ).transform(self.sdf.where(F.col("lang") == lang))
        for lang, words in stopwords.items()
    ]
    self.sdf = reduce(lambda left, right: left.unionAll(right), parts)