如何优化代码,动态获取converted_data而不是固定大小6


def preprocess_pattern_data(converted_data):
corpus_1 = []
corpus_2 = []
corpus_3 = []
corpus_4 = []
corpus_5 = [] 
corpus_6 = []
c_1 = []
c_2 = []
c_3 = []
c_4 = []
c_5 = []
c_6 = []
doc = converted_data[0].split(",")
for i in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[i])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus_1.append(review)
for i in corpus_1:
if i == '':
corpus_1.remove('')
else:
c_1.append(i)
sentence_1 = ' '.join(word for word in c_1) 
doc = converted_data[1].split(",")
for j in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[j])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus_2.append(review)
for i in corpus_2:
if i == '':
corpus_2.remove('')
else:
c_2.append(i)
sentence_2 = ' '.join(word for word in c_2) 
doc = converted_data[2].split(",")
for k in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[k])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus_3.append(review)
for i in corpus_3:
if i == '':
corpus_3.remove('')
else:
c_3.append(i)
sentence_3 = ' '.join(word for word in c_3) 
doc = converted_data[3].split(",")
for l in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[l])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus_4.append(review)
for i in corpus_4:
if i == '':
corpus_4.remove('')
else:
c_4.append(i)
sentence_4 = ' '.join(word for word in c_4) 
doc = converted_data[4].split(",")
for m in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[m])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus_5.append(review)
for i in corpus_5:
if i == '':
corpus_5.remove('')
else:
c_5.append(i)
sentence_5 = ' '.join(word for word in c_5) 
doc = converted_data[5].split(",")
for n in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[n])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus_6.append(review)
for i in corpus_6:
if i == '':
corpus_6.remove('')
else:
c_6.append(i)
sentence_6 = ' '.join(word for word in c_6) 
sentences = [sentence_1, sentence_2, sentence_3, sentence_4, sentence_5, sentence_6]
return sentences

请帮助我优化代码,我希望它是动态的,以获取转换后的文本长度并在代码中循环。在这里,我取了六个列表,并使用静态6列表附加了我的代码,但我希望它是动态的,并从converted_text循环中取转换文本的长度,并将其附加到句子中。

我试过用这个代码,但问题是我创建的所有六个句子都在一个列表中连接在一起,所以如果我做句子[0],我只得到第一个单词,而不是第一个完整的句子。

def preprocess_pattern_data(converted_data):
corpus = []
c = []
sentences = []

for i in range(len(converted_data)):

doc = converted_data[i].split(',')

for j in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[j])
review = review.split()
review = [word for word in review if not word in custom_stopwords]
review = ''.join(review)
corpus.append(review)
for k in corpus:
if k == '':
corpus.remove('')
else:
c.append(k)
sent = ' '.join(word for word in c)
sentences.append(sent)

return sentences
pattern_sentences = preprocess_pattern_data(converted_data)
print(pattern_sentences)

也许是这样?目前还不清楚你想做什么,但这应该会让你走上正轨。

你的脚本中有几个问题:

  • "单词不在";而不是";不在">
  • 也许你忘了在每次cicle重置变量(c,语料库(

这是代码:

import re
custom_stopwords = ['and', 'or']
converted_data = ['one, two, three', 'four,five,six']
def preprocess_pattern_data(converted_data):
sentences = []
for i in range(len(converted_data)):
sent = ''
corpus = []
c = []
doc = converted_data[i].split(',')
for j in range(len(doc)):
review = re.sub('[^a-zA-Z]', ' ', doc[j])
review = review.split()
review = [word for word in review if word not in custom_stopwords]
review = ''.join(review)
corpus.append(review)
for k in corpus:
if k == '':
corpus.remove('')
else:
c.append(k)
sent = ' '.join(word for word in c)
sentences.append(sent)
return sentences
pattern_sentences = preprocess_pattern_data(converted_data)
print(pattern_sentences)

此输出:

['one two three', 'four five six']

最新更新