How to remove \n from the output when using sent_tokenize in nltk



I am using the sentence tokenizer, but how can I remove the unwanted \n from the output?

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import PyPDF2 as p2
pdf_file = open("Muhammad_CV.pdf", 'rb')
pdf_read = p2.PdfFileReader(pdf_file)
count = pdf_read.numPages
for i in range(count):
    page = pdf_read.getPage(i)
    text = page.extractText()                               #Extract text
    tokenized = sent_tokenize(text)                 #Token
    all_words = []
    for w in tokenized:
        all_words.append(w.lower())                  #Lower case
# ///////////////// Stop Words ///////////////////////////
    stop_words = set(stopwords.words('english'))
    filtered = []
    for w in all_words:
        if w not in stop_words:
            filtered.append(w)
    print(filtered)

The output I get:

{'the specialization includes:\n \n\n \nintroduction\n \nto\n \ndata\n \nscience\n \n\n \nbig\n \ndata\n \n&\n \ncloud\n \ncomputing\n \n\n \ndata\n \nmining\n \n\n \nmachine\n \nlearn\ning'}

The desired output:

{'the specialization includes: introduction to data science big data cloud\n computing data mining machine learning'}
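For reference, one way to get from the first string to the second is to split on whitespace and re-join with single spaces, which drops the stray \n characters. A minimal sketch (the raw string and variable names are illustrative, not taken from the question's code):

raw = 'the specialization includes:\n \n\n \nintroduction\n \nto\n \ndata\n \nscience'
# str.split() with no argument splits on any run of whitespace, including \n
cleaned = ' '.join(raw.split())
print(cleaned)    # 'the specialization includes: introduction to data science'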
You can remove the \n from each tokenized sentence with str.replace(), for example:

import nltk

text = '''\n Apple has quietly  hired Dr. Rajiv B. Kumar, a pediatric endocrinologist \n. He will continue working at the hospital part time \n '''
tokenized_sent_before_remove_n = nltk.sent_tokenize(text)
#o/p
['\n Apple has quietly  hired Dr. Rajiv B. Kumar, a pediatric endocrinologist \n.',
 'He will continue working at the hospital part time']

tokenized_sent_after_remove_n = [x.replace('\n', '') for x in tokenized_sent_before_remove_n]
#o/p
[' Apple has quietly  hired Dr. Rajiv B. Kumar, a pediatric endocrinologist .',
 'He will continue working at the hospital part time']
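Note that replace('\n', '') deletes the newlines outright; if a \n happens to sit between two words (as in the PDF text above, e.g. 'data\nscience'), the words end up glued together. Replacing '\n' with a space, or collapsing whitespace as in the earlier sketch, avoids that.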

You just need to call the string's strip() method to remove the surrounding whitespace.

Here is an example (also using list comprehensions, since that's the Pythonic way):

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import PyPDF2 as p2
pdf_file = open("Muhammad_CV.pdf", 'rb')
pdf_read = p2.PdfFileReader(pdf_file)
count = pdf_read.numPages
for i in range(count):
    page = pdf_read.getPage(i)
    text = page.extractText()                                   # Extract text
    tokenized = sent_tokenize(text)                             # Split into sentences
    all_words = [w.strip().lower() for w in tokenized]          # Strip whitespace, lower-case
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in all_words if w not in stop_words]    # Drop stop words
    print(filtered)
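Keep in mind that strip() only removes whitespace at the start and end of each sentence; any \n in the middle of a sentence is left alone. If those need to go as well, one option is to normalize the whitespace inside each sentence. A rough sketch of that variant, reusing the tokenized list from the loop above:

import re

def normalize_whitespace(sentence):
    # Collapse every run of whitespace (including \n) into a single space,
    # then drop leading/trailing spaces
    return re.sub(r'\s+', ' ', sentence).strip()

all_words = [normalize_whitespace(w).lower() for w in tokenized]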

Edit: corrected trim to strip :(
