Writing a function to remove stopwords from a dataframe (without nltk)



I cannot use nltk because of download restrictions at work, so I want to create a function that removes (Dutch) stopwords. I have a text file of Dutch stopwords that I want to read in and use to find the stopwords in a pandas dataframe. I saved the stopword list as a .txt file, but I get duplicates. Can someone help me with this? I wrote the function below.

import pandas as pd
import numpy as np
import re

# normalise accented characters before matching
dictionary = {'í': 'i', 'á': 'a', 'ö': 'o', 'ë': 'e'}
pd.set_option('display.max_colwidth', -1)
df = pd.read_csv('Map1.csv', error_bad_lines=False, encoding='latin1')
df.replace(dictionary, regex=True, inplace=True)

# I want to remove the stopwords from df['omschrijving skill']
stopwords = ['de', 'Een', 'van', 'ik', 'te', 'dat', 'die', 'in', 'een',
             'hij', 'het', 'niet', 'zijn', 'is', 'was', 'of', 'aan']
querywords = query.split()  # 'query' is never defined here, so this raises a NameError
resultwords = [word for word in querywords if word.lower() not in stopwords]
result = ' '.join(resultwords)
print(result)

You could write it like this:

# you could get the full list from
# https://raw.githubusercontent.com/stopwords-iso/stopwords-nl/master/stopwords-nl.txt
stopwords_to_remove = ['aan',
                       'aangaande',
                       'aangezien',
                       'achte',
                       'achter',
                       'achterna']

text = "Nick likes achter to play football, aangezien however he is achter not too fond of tennis."
# nltk's word_tokenize(text) would work here too, but a plain split avoids nltk entirely
text_tokens = text.split(' ')
tokens_without_sw = [word for word in text_tokens if word not in stopwords_to_remove]
print(tokens_without_sw)

['Nick', 'likes', 'to', 'play', 'football,', 'however', 'he', 'is', 'not', 'too', 'fond', 'of', 'tennis.']
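
Since you mention reading the stopwords from a text file (and getting duplicates), here is a minimal sketch, assuming the file has one word per line and is saved locally as stopwords-nl.txt (the filename is an assumption; adjust it to your path). Reading into a set drops the duplicates and also makes the membership test fast:

# a sketch, assuming one stopword per line; 'stopwords-nl.txt' is a hypothetical local path
with open('stopwords-nl.txt', encoding='utf-8') as f:
    # a set removes duplicate entries and speeds up 'word in stopwords_to_remove'
    stopwords_to_remove = {line.strip().lower() for line in f if line.strip()}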

Applying the above to a dataframe:

import pandas as pd
import string

# you could get the full list from
# https://raw.githubusercontent.com/stopwords-iso/stopwords-nl/master/stopwords-nl.txt
stopwords_to_remove = ['aan',
                       'aangaande',
                       'aangezien',
                       'achte',
                       'achter',
                       'achterna']

df = pd.DataFrame(['aangezien however he is achter not too ', 'achter to play football'])

def a_tokenizer(x):
    # remove punctuation
    x = x.translate(str.maketrans('', '', string.punctuation))
    # lower-case and split into tokens
    text_tokens = [word.lower() for word in str(x).split(' ')]
    # drop the stopwords
    tokens_without_sw = [word for word in text_tokens if word not in stopwords_to_remove]
    return tokens_without_sw

df[0].apply(a_tokenizer)

0    [however, he, is, not, too, ]
1             [to, play, football]
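
To apply this to the column from your question and keep the cleaned text as a string, something like the following should work (the column name 'omschrijving skill' is taken from your comment and is an assumption about your actual data):

# hypothetical: assumes your dataframe has the 'omschrijving skill' column from the question
df['omschrijving skill'] = df['omschrijving skill'].apply(lambda x: ' '.join(a_tokenizer(x)))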
