我目前有以下代码，用于计算搜索条件与候选人字典之间的相似度。对 4000 名候选人计算一次结果大约需要 13 秒。我做了一些研究，认为可以通过使用 nlp.pipe() 来改进性能，但我仍然不清楚具体应该怎么做。请指教。下面是我的 Python 代码。
import os
import sys
from flask import Flask, request, jsonify
import spacy
# Flask application object; the route decorators in this file register on it.
app = Flask(__name__)

# Load the spaCy pipeline once at import time so every request reuses it.
nlp = spacy.load("en_core_web_lg")

# Stop-word set shipped with the loaded pipeline's language defaults.
all_stopwords = nlp.Defaults.stop_words
@app.route("/")
def index():
    """Answer requests to the site root with a fixed plain-text message."""
    message = "Page does not exist"
    return message
@app.route('/calculate-matches', methods=['POST'])
def calculate_matches():
    """Score every candidate against the search query and return them ranked.

    Expects a JSON request body containing a ``candidates`` list. Each
    candidate is a JSON object whose ``bio``, ``education`` and
    ``experience`` text fields are joined into one document. A missing or
    null field is treated as an empty string.

    Each candidate object gets a ``match_score`` key holding the spaCy
    similarity between the query and the candidate text, multiplied by 100.

    Returns:
        A JSON array of the candidates sorted by ``match_score`` from high
        to low, or a JSON error object with HTTP status 400 when the body
        is not JSON or has no ``candidates`` list.
    """
    # silent=True yields None instead of raising on a missing/invalid body,
    # so malformed requests get a clear 400 rather than an unhandled error.
    payload = request.get_json(silent=True)
    candidates = payload.get('candidates') if isinstance(payload, dict) else None
    if not isinstance(candidates, list):
        return jsonify({'error': "Request body must be JSON with a 'candidates' list"}), 400

    # Reduce the query to its content words: nouns, proper nouns, pronouns,
    # plus anything that is not a stop word.
    query = nlp('Looking for someone with experience in building vue frontend applications')
    keywords = [
        token.text
        for token in query
        if token.pos_ in ('NOUN', 'PROPN', 'PRON') or not token.is_stop
    ]
    cur_search = nlp(' '.join(keywords))

    texts = [
        ' '.join(str(member.get(field) or '') for field in ('bio', 'education', 'experience'))
        for member in candidates
    ]

    # nlp.pipe processes the texts in batches, which is much cheaper than
    # calling nlp() once per candidate. Only document vectors are needed for
    # similarity, so the parser and NER components are skipped here.
    member_docs = nlp.pipe(texts, disable=['parser', 'ner'])
    for member, member_bio in zip(candidates, member_docs):
        # Calculate similarity, scaled by 100.
        member['match_score'] = cur_search.similarity(member_bio) * 100

    # Sort candidates by match_score from high to low.
    results = sorted(candidates, key=lambda k: k['match_score'], reverse=True)
    return jsonify(results)
if __name__ == "__main__":
    # Put this script's own directory at the front of the import path
    # (unless it is already listed) before starting the server.
    currentdir = os.path.dirname(os.path.realpath(__file__))
    if currentdir not in sys.path:
        sys.path.insert(0, currentdir)

    # Bind to all interfaces on port 5000.
    app.run(host="0.0.0.0", port=5000)
您可以利用线性代数，以广播（broadcasting）的方式一次性计算这些相似度：
import numpy as np
def cosine_similarity(v, A):
    """Return the index of the row of ``A`` most cosine-similar to ``v``.

    Despite its name, this returns the position of the best match, not the
    similarity values themselves.

    Args:
        v: Query vector (array-like of numbers).
        A: 2-D array-like with one candidate vector per row.

    Returns:
        The integer index (as returned by ``np.argmax``) of the row of ``A``
        with the highest cosine similarity to ``v``.

    Raises:
        ValueError: If ``A`` has no rows (raised by ``np.argmax``).
    """
    v = np.asarray(v, dtype=float)
    A = np.asarray(A, dtype=float)

    dots = np.dot(v, A.T)
    norms = np.linalg.norm(v, ord=2) * np.linalg.norm(A, axis=1, ord=2)

    # A zero-norm vector would make the division yield NaN, and np.argmax
    # treats NaN as the maximum. Leave those entries at -inf instead so an
    # empty vector can never be selected over a real match.
    similarities = np.full(np.shape(dots), -np.inf)
    np.divide(dots, norms, out=similarities, where=norms > 0)
    return np.argmax(similarities)
# Stack the `.vector` of every item yielded by `member_bio` along a new
# leading axis, giving one row per item.
# NOTE(review): this presumably expects `member_bio` to be a collection of
# spaCy Docs (one per candidate); iterating a single Doc would yield
# per-token vectors instead — confirm against the caller.
A = np.stack([member.vector for member in member_bio])
# Vector of the search query.
v = cur_search.vector
# Value returned by cosine_similarity for the query against the rows of A.
closest_idx = cosine_similarity(v, A)