Python随机森林和机器学习 - 改进



我是使用Python进行机器学习的新手。我来自Fortran的编程背景,因此您可以想象,Python是一个相当飞跃。我从事化学工作,已经参与化学形象(将数据科学技术应用于化学)。因此,Python广泛的机器学习库的应用很重要。我还需要我的代码有效。我编写了一个运行的代码,似乎可以正常工作。我想知道的是:

1如何最好地改善它/使其更有效。

2关于我使用过的替代配方的任何建议,如果可能的话,另一个路线可能会出色的原因?

我倾向于与连续的数据和回归模型一起工作。

任何建议都很棒,并事先感谢您。

import scipy
import math
import numpy as np
import pandas as pd
import plotly.plotly as py
import os.path
import sys
from time import time
from sklearn import preprocessing, metrics, cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
fname = str(raw_input('Please enter the input file name containing total dataset and descriptors (assumes csv file, column headings and first column are labelsn'))
if os.path.isfile(fname) :
    SubFeAll = pd.read_csv(fname, sep=",")
else:
    sys.exit("ERROR: input file does not exist")
#SubFeAll = pd.read_csv(fname, sep=",")
SubFeAll = SubFeAll.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor
header = SubFeAll.columns.values # Use the column headers as the descriptor labels
SubFeAll.head()
# Set the numpy global random number seed (similar effect to random_state) 
np.random.seed(1)  
# Random Forest results initialised
RFr2 = []
RFmse = []
RFrmse = []
# Predictions results initialised 
RFpredictions = []
metcount = 0
# Give the array from pandas to numpy
npArray = np.array(SubFeAll)
print header.shape
npheader = np.array(header[1:-1])
print("Array shape X = %d, Y = %d " % (npArray.shape))
datax, datay =  npArray.shape
# Print specific nparray values to check the data
print("The first element of the input data set, as a minial check please ensure this is as expected = %s" % npArray[0,0])
# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point
names = npArray[:,0]
X = npArray[:,1:-1].astype(float)
y = npArray[:,-1] .astype(float)
X = preprocessing.scale(X)
print X.shape
# Open output files
train_name = "Training.csv"
test_name = "Predictions.csv"
fi_name = "Feature_importance.csv"
with open(train_name,'w') as ftrain, open(test_name,'w') as fpred, open(fi_name,'w') as ffeatimp:
        ftrain.write("This file contains the training information for the Random Forest modelsn")
        ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,n")
        ftrain.write("Interation %d ,n" %(metcount+1))
        fpred.write("This file contains the prediction information for the Random Forest modelsn")
        fpred.write("Predictions are made over a ten fold cross validation hence training on 90% test on 10%. The final prediction are return iteratively over this ten fold cros validation once,n")
        fpred.write("optimised parameters are located via a grid search at each fold,n")
        fpred.write("Interation %d ,n" %(metcount+1))
        ffeatimp.write("This file contains the feature importance information for the Random Forest model,n")
        ffeatimp.write("Interation %d ,n" %(metcount+1))
        # Begin the K-fold cross validation over ten folds
        kf = KFold(datax, n_folds=10, shuffle=True, random_state=0)
        print "------------------- Begining Ten Fold Cross Validation -------------------"
        for train, test in kf:
            XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test]
            ytestdim = yTest.shape[0]
                print("The test set values are : ")
                i = 0
                if ytestdim%5 == 0:
                        while i < ytestdim:
                                print round(yTest[i],2),'t', round(yTest[i+1],2),'t', round(yTest[i+2],2),'t', round(yTest[i+3],2),'t', round(yTest[i+4],2)
                                ftrain.write(str(round(yTest[i],2))+','+ str(round(yTest[i+1],2))+','+str(round(yTest[i+2],2))+','+str(round(yTest[i+3],2))+','+str(round(yTest[i+4],2))+',n')
                                i += 5
                elif ytestdim%4 == 0:
                        while i < ytestdim:
                                print round(yTest[i],2),'t', round(yTest[i+1],2),'t', round(yTest[i+2],2),'t', round(yTest[i+3],2)
                                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+','+str(round(yTest[i+2],2))+','+str(round(yTest[i+3],2))+',n')
                                i += 4
                elif ytestdim%3 == 0 :
                        while i < ytestdim :
                                print round(yTest[i],2),'t', round(yTest[i+1],2),'t', round(yTest[i+2],2)
                                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+','+str(round(yTest[i+2],2))+',n')
                                i += 3
                elif ytestdim%2 == 0 :
                        while i < ytestdim :
                                print round(yTest[i],2), 't', round(yTest[i+1],2)
                                ftrain.write(str(round(yTest[i],2))+','+str(round(yTest[i+1],2))+',n')
                                i += 2
                        else :
                                while i< ytestdim :
                                        print round(yTest[i],2)
                                        ftrain.write(str(round(yTest[i],2))+',n')
                                        i += 1        
                print "n"
                # random forest grid search parameters
            print "------------------- Begining Random Forest Grid Search -------------------"
                rfparamgrid = {"n_estimators": [10], "max_features": ["auto", "sqrt", "log2"], "max_depth": [5,7]}
                rf = RandomForestRegressor(random_state=0,n_jobs=2)
                RfGridSearch = GridSearchCV(rf,param_grid=rfparamgrid,scoring='mean_squared_error',cv=10)
                start = time()
                RfGridSearch.fit(XTrain,yTrain)
                # Get best random forest parameters
                print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_)))
                RFtime = time() - start,len(RfGridSearch.grid_scores_)
                #print(RfGridSearch.grid_scores_)  # Diagnos
                print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators'])
                ne = RfGridSearch.best_params_['n_estimators']
                print("max_features = %s " % RfGridSearch.best_params_['max_features'])
                mf = RfGridSearch.best_params_['max_features']
                print("max_depth = %d " % RfGridSearch.best_params_['max_depth'])
                md = RfGridSearch.best_params_['max_depth']
                ftrain.write("Random Forest")
                ftrain.write("RF search time, %s ,n" % (str(RFtime)))
                ftrain.write("Number of Trees, %s ,n" % str(ne))
                ftrain.write("Number of feature at split, %s ,n" % str(mf))
                ftrain.write("Max depth of tree, %s ,n" % str(md))
                # Train random forest and predict with optimised parameters
                print("nn------------------- Starting opitimised RF training -------------------")
                optRF = RandomForestRegressor(n_estimators = ne, max_features = mf, max_depth = md, random_state=0)
                optRF.fit(XTrain, yTrain)       # Train the model
                RFfeatimp = optRF.feature_importances_
                indices = np.argsort(RFfeatimp)[::-1]
                print("Training R2 = %5.2f" % optRF.score(XTrain,yTrain))
                print("Starting optimised RF prediction")
                RFpreds = optRF.predict(XTest)
                print("The predicted values now follow :")
                RFpredsdim = RFpreds.shape[0]
                i = 0
                if RFpredsdim%5 == 0:
                        while i < RFpredsdim:
                                print round(RFpreds[i],2),'t', round(RFpreds[i+1],2),'t', round(RFpreds[i+2],2),'t', round(RFpreds[i+3],2),'t', round(RFpreds[i+4],2)
                                i += 5
                elif RFpredsdim%4 == 0:
                        while i < RFpredsdim:
                                print round(RFpreds[i],2),'t', round(RFpreds[i+1],2),'t', round(RFpreds[i+2],2),'t', round(RFpreds[i+3],2)
                                i += 4
                elif RFpredsdim%3 == 0 :
                        while i < RFpredsdim :
                                print round(RFpreds[i],2),'t', round(RFpreds[i+1],2),'t', round(RFpreds[i+2],2)
                                i += 3
                elif RFpredsdim%2 == 0 :
                        while i < RFpredsdim :
                                print round(RFpreds[i],2), 't', round(RFpreds[i+1],2)
                                i += 2
                else :
                        while i< RFpredsdim :
                                print round(RFpreds[i],2)
                i += 1
                print "n"
                RFr2.append(optRF.score(XTest, yTest))
                RFmse.append( metrics.mean_squared_error(yTest,RFpreds))
                RFrmse.append(math.sqrt(RFmse[metcount]))
                print ("Random Forest prediction statistics for fold %d are; MSE = %5.2f RMSE = %5.2f R2 = %5.2fnn" % (metcount+1, RFmse[metcount], RFrmse[metcount],RFr2[metcount]))
                ftrain.write("Random Forest prediction statistics for fold %d are, MSE =, %5.2f, RMSE =, %5.2f, R2 =, %5.2f,nn" % (metcount+1, RFmse[metcount], RFrmse[metcount],RFr2[metcount]))

                ffeatimp.write("Feature importance rankings from random forest,n")
                for i in range(RFfeatimp.shape[0]) :
                        ffeatimp.write("%d. , feature %d , %s,  (%f),n" % (i + 1, indices[i], npheader[indices[i]], RFfeatimp[indices[i]]))

                # Store prediction in original order of data (itest) whilst following through the current test set order (j)
            metcount += 1
                ftrain.write("Fold %d, n" %(metcount))
            print "------------------- Next Fold %d -------------------" %(metcount+1)
            j = 0
            for itest in test :
                RFpredictions.append(RFpreds[j])
                j += 1

        lennames = names.shape[0]
        lenpredictions = len(RFpredictions)
        lentrue = y.shape[0]
        if lennames == lenpredictions == lentrue :
                fpred.write("Names/Label,, Prediction Random Forest,, True Value,n") 
                for i in range(0,lennames) :
                        fpred.write(str(names[i])+",,"+str(RFpredictions[i])+",,"+str(y[i])+",n")
        else :
                fpred.write("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csvn")
                fpred.write("Array printed in the order names/Labels, predictions RF and true valuesn")
                fpred.write(names+"n")
                fpred.write(RFpredictions+"n")
                fpred.write(y+"n")
                sys.exit("ERROR - names, prediction and true value array size mismatch. Dumping arrays for manual inspection in predictions.csv")
        print "Final averaged Random Forest metrics : "
        RFamse  = sum(RFmse)/10
        RFmse_sd = np.std(RFmse)
        RFarmse = sum(RFrmse)/10
        RFrmse_sd = np.std(RFrmse)
        RFslope, RFintercept, RFr_value, RFp_value, RFstd_err = scipy.stats.linregress(RFpredictions, y)
        RFR2 = RFr_value**2
        print "Average Mean Squared Error = ", RFamse, " +/- ", RFmse_sd 
        print "Average Root Mean Squared Error = ", RFarmse, " +/- ", RFrmse_sd
        print "R2 Final prediction against True values = ", RFR2
        fpred.write("n")
        fpred.write("FINAL PREDICTION STATISTICS,n")
        fpred.write("Random Forest average MSE, %s, +/-, %s,n" %(str(RFamse), str(RFmse_sd)))
        fpred.write("Random Forest average RMSE, %s, +/-, %s,n" %(str(RFarmse), str(RFrmse_sd)))
    fpred.write("Random Forest slope, %s,   Random Forest intercept, %s,n" %(str(RFslope), str(RFintercept)))
        fpred.write("Random Forest standard error, %s,n" %(str(RFstd_err)))
    fpred.write("Random Forest R, %s,n" %(str(RFr_value)))
        fpred.write("Random Forest R2, %s,n" %(str(RFR2)))
ftrain.close()
fpred.close()
ffeatimp.close()

您还可以将功能选择添加到数据:

Sickit学习功能选择

Sickit学习提供了一些功能选择技术,您可以使用它来改善DM项目的某些方面

相关内容

  • 没有找到相关文章

最新更新