9240 0.401958
9241 0.105928
我有以下 Python 代码,希望得到测试集中每个 id 及其对应的预测概率:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Train a random forest on the training CSV and print, for every test-set id,
# the predicted probability of a single class.
train_df = pd.read_csv('ML_EX4_train.csv')
test_df = pd.read_csv('ML_EX4_test.csv')

feature_names = [
    'title_word_count',
    'document_entropy',
    'freshness',
    'easiness',
    'fraction_stopword_presence',
    'normalization_rate',
    'speaker_speed',
    'silent_period_rate',
]

# Index both frames by their 'id' column (the original referenced the
# undefined names `test` / `train` here).
test_df.index = test_df['id']
train_df.index = train_df['id']

X = train_df[feature_names]
y = train_df['engagement']
X_test = test_df[feature_names]

# Fit on the full training data: no train/validation split is made in this
# script, so X / y are the only training arrays in scope.
rf_clf = RandomForestClassifier(
    random_state=0,
    max_depth=20,
    min_samples_leaf=1,
    n_estimators=300,
).fit(X, y)

# predict_proba returns a 2-D array with one column per class, which cannot be
# passed to pd.Series directly. Keep a single column; index 1 is the second
# entry of rf_clf.classes_ (assumed to be the positive class of a binary
# target -- confirm against rf_clf.classes_).
y_test_proba = rf_clf.predict_proba(X_test)[:, 1]
ans = pd.Series(y_test_proba, index=test_df['id'])
print(ans)
输出应为如下形式:每个 id 对应一个概率值。
据我理解,这是一个二元分类问题。predict_proba 会返回所有类别的概率,每个类别占一列。
如果只需要其中一个类别的概率,可以像下面这样只选取其中一列:
# predict_proba yields one column per class; slice out the single column wanted.
proba_matrix = rf_clf.predict_proba(X_test)
y_test_proba = proba_matrix[:, 1]  # or [:, 0] depending on the class you are interested in
# Label each probability with the matching test-set id.
ans = pd.Series(y_test_proba, index=test_df['id'])
为了确保所选的列确实对应您关心的类别(列的顺序由 rf_clf.classes_ 决定),可以这样做:
class_to_predict = 1  # or 0 depending on the class you are interested in
# Look up which predict_proba column belongs to the chosen class label instead
# of hard-coding the position; columns follow the order of rf_clf.classes_.
column = np.where(rf_clf.classes_ == class_to_predict)[0][0]
y_test_proba = rf_clf.predict_proba(X_test)[:, column]
# Index by the test-set ids (the original used the undefined name `df`).
ans = pd.Series(y_test_proba, index=test_df['id'])