Using scikit-learn to learn/predict texts (stored in an xlsx) from user input



I have an xlsx file that contains predefined texts in a single column. The user will enter one or more words, and the output should be the texts that contain that word or those words.

import numpy as np
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_df(path):
    df = pd.read_excel(path)
    print(df.shape)
    return df

def splitDataFrameList(df, target_column, separator):
    # Explode each row into one row per fragment of target_column split on separator.
    def splitListToRows(row, row_accumulator, target_column, separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
    new_df = pd.DataFrame(new_rows)
    return new_df

class Autocompleter:
    def __init__(self):
        pass

    def import_json(self, json_filename):
        # Name kept from the original code; it actually loads an Excel file.
        print("load Excel file...")
        df = load_df(json_filename)
        return df

    def process_data(self, new_df):
        print("split sentences on punctuation...")
        for sep in ['. ', ', ', '? ', '! ', '; ']:
            new_df = splitDataFrameList(new_df, 'UserSays', sep)

        print("UserSays cleaning using simple regex...")
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: " ".join(x.split()))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.strip("."))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: " ".join(x.split()))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.strip("-"))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: " ".join(x.split()))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' i ', ' I '))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' ?', '?'))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' !', '!'))
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.replace(' .', '.'))
        # Append a '?' to question-like sentences that lack one.
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x + "?" if re.search(r'^(Wh|How).+([^?])$', x) else x)
        new_df['UserSays'] = new_df['UserSays'].apply(lambda x: x.capitalize())

        print("calculate nb words of sentences...")
        new_df['nb_words'] = new_df['UserSays'].apply(lambda x: len(str(x).split(' ')))
        new_df = new_df[new_df['nb_words'] > 2]

        print("count occurrence of sentences...")
        new_df['Counts'] = new_df.groupby(['UserSays'])['UserSays'].transform('count')

        print("remove duplicates (keep last)...")
        new_df = new_df.drop_duplicates(subset=['UserSays'], keep='last')
        new_df = new_df.reset_index(drop=True)
        print(new_df.shape)
        return new_df
    def calc_matrice(self, df):
        # Vectorize the sentences with TF-IDF over word 1- to 3-grams, then
        # persist the fitted model and the matrix for later reuse.
        model_tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0)
        tfidf_matrice = model_tf.fit_transform(df['UserSays'])
        pickle.dump(model_tf, open("model.pkl", 'wb'))
        pickle.dump(tfidf_matrice, open("train.pkl", 'wb'))
        print("tfidf_matrice ", tfidf_matrice.shape)
        return model_tf, tfidf_matrice
    def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice):
        prefix_string = str(prefix_string)
        new_df = data.reset_index(drop=True)
        weights = new_df['Counts'].apply(lambda x: 1 + np.log1p(x)).values
        # transform the query string using the fitted tfidf model
        tfidf_matrice_spelling = model_tf.transform([prefix_string])
        # cosine similarity between every stored sentence and the query
        cosine_similarite = cosine_similarity(tfidf_matrice, tfidf_matrice_spelling)
        # sort by similarity, from 1 down to 0, and keep the top 10
        similarity_scores = list(enumerate(cosine_similarite))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        similarity_scores = similarity_scores[0:10]
        similarity_indices = [i[0] for i in similarity_scores]
        # boost candidates that occurred frequently in the original data
        for i in range(len(similarity_scores)):
            similarity_scores[i][1][0] = similarity_scores[i][1][0] * weights[similarity_indices][i]
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        similarity_indices_w = [i[0] for i in similarity_scores]
        final_result = new_df.loc[similarity_indices_w]['UserSays'].tolist()
        return final_result
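
For context, a minimal driver for this class would look something like the sketch below (the file name user_says.xlsx and the query "sccm" are hypothetical placeholders; the xlsx is assumed to have a single UserSays column):

# Minimal usage sketch -- file name and query are placeholders
autocompleter = Autocompleter()
df = autocompleter.import_json("user_says.xlsx")   # single-column xlsx with 'UserSays'
df = autocompleter.process_data(df)
model_tf, tfidf_matrice = autocompleter.calc_matrice(df)
print(autocompleter.generate_completions("sccm", df, model_tf, tfidf_matrice))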

If I type nothing at all as input, it gives me this output:

['How to access outlook on open network?', 'Email access outside ril network', 'Log in outlook away from office']

This is not desired. And if only one text matches, it gives the following output:

input - sccm
['What is sccm', 'How to access outlook on open network?', 'Email access outside ril network']

I want the output to work in such a way that if the entered word or words are not present in the xlsx file, the output should not return anything at all.

I think your code is returning values with a similarity score of 0. You can change the line in the generate_completions function to keep only the values whose similarity score is greater than zero:

similarity_scores = [i for i in similarity_scores if i[1] > 0]
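
In context, the filter goes right after the top-10 slice. Here is a sketch of the modified tail of generate_completions (note that each i[1] is a one-element NumPy array, so i[1] > 0 evaluates truthily per candidate):

        similarity_scores = list(enumerate(cosine_similarite))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        similarity_scores = similarity_scores[0:10]
        # drop candidates that share no vocabulary with the query
        similarity_scores = [i for i in similarity_scores if i[1] > 0]
        similarity_indices = [i[0] for i in similarity_scores]
        for i in range(len(similarity_scores)):
            similarity_scores[i][1][0] = similarity_scores[i][1][0] * weights[similarity_indices][i]
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        similarity_indices_w = [i[0] for i in similarity_scores]
        final_result = new_df.loc[similarity_indices_w]['UserSays'].tolist()
        return final_result

With this change, an empty query (whose TF-IDF vector is all zeros) has similarity 0 with every sentence and generate_completions returns an empty list, and a query like "sccm" returns only the sentences that actually contain it.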
