如何在OOP中使用PySpark重写代码



我有一个简单的数据帧

# Minimal demo DataFrame: one language-code column and one title column.
demo_rows = [
    ("eng", "BlackBerry sells legacy patents of mobile devices"),
    ("eng", "Amazon to shut down publishing house Westland Books"),
]
sdf0 = spark.createDataFrame(demo_rows, ["lang", "title"])
lang | title
---- | -------------------------------------------------
eng  | BlackBerry sells legacy patents of mobile devices
eng  | Amazon to shut down publishing house Westland Books

找到了——您的代码中至少有3个错误。在新的会话中用IDE打开这段代码,就能看到所有这些错误


  1. tokenize

# Buggy version: `transform(sdf)` references a module-level name `sdf`
# instead of the instance attribute, so the method ignores the object's
# own DataFrame (and raises NameError if no global `sdf` exists).
def tokenize(self):
    tokenizer = Tokenizer(inputCol="low_title", outputCol="tokens")
    self.sdf = tokenizer.transform(sdf)
# should be
# Corrected version: tokenize the instance's own DataFrame in place.
def tokenize(self):
    tokenizer = Tokenizer(inputCol="low_title", outputCol="tokens")
    self.sdf = tokenizer.transform(self.sdf)  # self missing
  2. stop_words
# Buggy version: (1) the dict is bound to `available_lang` but the loop
# iterates over the undefined name `available_langs` (NameError);
# (2) `transform(sdf...)` reads a module-level `sdf` instead of self.sdf.
def stop_words(self):
    available_lang = {"eng": "en"}
    stopwords_iso = {}
    for lang in available_langs:
        stopwords_iso[lang] = stopwordsiso.stopwords(available_langs[lang])
    stopwords = {k: list(v) for k, v in stopwords_iso.items()}
    # Filter each language's rows separately, remove that language's
    # stop words, then union the per-language results back together.
    self.sdf = reduce(
        lambda a, b: a.unionAll(b),
        (
            StopWordsRemover(
                inputCol="tokens", outputCol="filtered_words", stopWords=value
            ).transform(sdf.where(F.col("lang") == key))
            for key, value in stopwords.items()
        ),
    )
# should be
# Corrected version: consistent variable name and instance attribute access.
def stop_words(self):
    available_langs = {"eng": "en"}  # final -s missing
    stopwords_iso = {}
    for lang in available_langs:
        stopwords_iso[lang] = stopwordsiso.stopwords(available_langs[lang])
    stopwords = {k: list(v) for k, v in stopwords_iso.items()}
    self.sdf = reduce(
        lambda a, b: a.unionAll(b),
        (
            StopWordsRemover(
                inputCol="tokens", outputCol="filtered_words", stopWords=value
            ).transform(self.sdf.where(F.col("lang") == key))  # self missing
            for key, value in stopwords.items()
        ),
    )

最新更新