在多个数据集上使用R自动化机器学习过程



我有多个列数不同的数据集。我想应用相关性函数,删除相关系数高于 0.98(98%)的变量。如何使用循环把该函数一次性应用于多个数据集,并将筛选后保留的变量存储在新的数据帧中?

同样,如何用循环在多个数据集上进行套索(lasso)回归?谢谢。

# Remove highly correlated predictors from a single data set (H).
# caret provides findCorrelation(); dplyr is loaded as in the original snippet.
library("caret")
library("dplyr")

# Four example data sets with 10 rows each and a different number of columns.
# `replace` is spelled out instead of relying on partial matching of `rep`.
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))

# Correlation
# X10 is left out of the correlation matrix, so it is never a removal
# candidate and always stays in the result.
data.cor <- cor(subset(H, select = -c(X10)))
high.cor <- findCorrelation(data.cor, cutoff = 0.98)

# findCorrelation() returns column positions within `data.cor`, so the names
# are looked up in that matrix rather than in H; this stays correct even if
# the excluded column is not the last one.
remove <- colnames(data.cor)[high.cor]

# Keep every column of H whose name is not flagged for removal.
myvars <- names(H) %in% remove
var_selected <- H[!myvars]
new_data_H <- var_selected

这是(几种方法中的)一种做法:

# Correlation filter applied to several data sets in one loop.
library(caret)
library(dplyr)
set.seed(99)
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))
# Combine the input data sets into a list
inputs <- list(H, C, R, E)
# Cutoff passed to findCorrelation(). The value 0.40 is kept from the original
# example; the question itself asks for 0.98.
cor_cutoff <- 0.40
# Preallocated list to hold one filtered data set per input
outputs <- vector("list", length(inputs))
# Loop over each data set, one at a time
for (i in seq_along(inputs)) {
  df <- inputs[[i]]
  data.cor <- cor(df)
  high.cor <- findCorrelation(data.cor, cutoff = cor_cutoff)
  # Select the columns to keep by position. Negative indexing with an empty
  # `high.cor` (`df[, -integer(0)]`) would select zero columns, so the
  # complement is built explicitly.
  keep <- setdiff(seq_along(df), high.cor)
  # drop = FALSE keeps a data frame even when a single column survives.
  outputs[[i]] <- df[, keep, drop = FALSE]
}
# This is the first data set processed by the loop
outputs[[1]]
# Second...
outputs[[2]]
# Third...
outputs[[3]]

编辑:集成你的套索程序

# Correlation filter followed by one lasso fit per data set.
library(glmnet)
library(caret)
set.seed(99)
## Define data (independent variables)
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))
inputs <- list(H, C, R, E)
## Define targets (dependent variables); targets[[i]] belongs to inputs[[i]]
Y_H <- data.frame(label_1 = replicate(1, sample(20:35, 10, replace = TRUE)))
Y_C <- data.frame(label_2 = replicate(1, sample(15:65, 10, replace = TRUE)))
Y_R <- data.frame(label_3 = replicate(1, sample(25:45, 10, replace = TRUE)))
Y_E <- data.frame(label_4 = replicate(1, sample(21:80, 10, replace = TRUE)))
targets <- list(Y_H, Y_C, Y_R, Y_E)
## Remove correlated independent variables
outputs <- vector("list", length(inputs))
for (i in seq_along(inputs)) {
  df <- inputs[[i]]
  data.cor <- cor(df)
  high.cor <- findCorrelation(data.cor, cutoff = 0.40)
  # Build the kept positions explicitly: `df[, -high.cor]` would select zero
  # columns when `high.cor` is empty, and would collapse to a vector when only
  # one column is left.
  keep <- setdiff(seq_along(df), high.cor)
  outputs[[i]] <- df[, keep, drop = FALSE]
}
## Do lasso regression
# Each filtered data set is paired with its own target, so both lists must
# have the same length.
stopifnot(length(outputs) == length(targets))
lasso_cv <- vector("list", length(outputs))
lasso_model <- vector("list", length(outputs))
for (i in seq_along(outputs)) {
  x <- as.matrix(outputs[[i]])
  y <- as.matrix(targets[[i]])
  # glmnet needs at least two predictor columns; leave the slot NULL otherwise.
  if (ncol(x) < 2L) {
    warning(
      "Data set ", i, " has fewer than 2 predictors left; lasso skipped.",
      call. = FALSE
    )
    next
  }
  # Cross-validate to choose the penalty strength.
  lasso_cv[[i]] <- cv.glmnet(
    x, y,
    standardize = TRUE, type.measure = "mse", alpha = 1, nfolds = 3
  )
  # Refit at the lambda that minimises the cross-validated error. The element
  # is named `lambda.min`; `$lambda_cv` does not exist and would pass NULL.
  lasso_model[[i]] <- glmnet(
    x, y,
    lambda = lasso_cv[[i]]$lambda.min, standardize = TRUE, alpha = 1
  )
}
  • 为每个数据帧创建目标变量
  • 将所有数据帧合并到一个列表中
  • 将所有目标变量合并到另一个列表中
  • 注意:每个目标变量对应一个数据帧
  • 相关性:删除相关变量
  • 对所有列表执行套索回归

创建数据帧

# Build example data, filter correlated predictors, then fit one lasso model
# per data set against its matching target.
library(caret)
library(glmnet)

set.seed(99)
H <- data.frame(replicate(10, sample(0:20, 10, replace = TRUE)))
C <- data.frame(replicate(5, sample(0:100, 10, replace = FALSE)))
R <- data.frame(replicate(7, sample(0:30, 10, replace = TRUE)))
E <- data.frame(replicate(4, sample(0:40, 10, replace = FALSE)))

# Target variables, one per data set. The column is named at construction
# time instead of renaming an auto-generated name by string comparison afterwards.
Y_H <- data.frame(label_1 = sample(20:35, 10, replace = TRUE))
Y_H
Y_C <- data.frame(label_2 = sample(15:65, 10, replace = TRUE))
Y_R <- data.frame(label_3 = sample(25:45, 10, replace = TRUE))
Y_E <- data.frame(label_4 = sample(21:80, 10, replace = TRUE))

# inputs[[i]] and targets[[i]] belong together.
inputs <- list(H, C, R, E)
targets <- list(Y_H, Y_C, Y_R, Y_E)

# Drop correlated predictors from every data set.
outputs <- vector("list", length(inputs))
for (i in seq_along(inputs)) {
  df <- inputs[[i]]
  data.cor <- cor(df)
  high.cor <- findCorrelation(data.cor, cutoff = 0.40)
  # `df[, -high.cor]` would select zero columns when `high.cor` is empty, so
  # the positions to keep are computed explicitly; drop = FALSE keeps a data
  # frame even if only one column remains.
  keep <- setdiff(seq_along(df), high.cor)
  outputs[[i]] <- df[, keep, drop = FALSE]
}

# Lasso regression: iterate over positions (not over the data frames
# themselves) so that outputs[[i]] and targets[[i]] can be indexed.
stopifnot(length(outputs) == length(targets))
lasso_cv <- vector("list", length(outputs))
lasso_model <- vector("list", length(outputs))
for (i in seq_along(outputs)) {
  x <- as.matrix(outputs[[i]])
  y <- as.matrix(targets[[i]])
  # glmnet needs at least two predictor columns; leave the slot NULL otherwise.
  if (ncol(x) < 2L) {
    warning(
      "Data set ", i, " has fewer than 2 predictors left; lasso skipped.",
      call. = FALSE
    )
    next
  }
  # `[[<-` stores the fitted object as one list element; `[<-` would not.
  lasso_cv[[i]] <- cv.glmnet(
    x, y,
    standardize = TRUE, type.measure = "mse", alpha = 1, nfolds = 3
  )
  # Refit at the CV-selected penalty; the element is `lambda.min`, whereas
  # `$lambda_cv` does not exist and would pass NULL.
  lasso_model[[i]] <- glmnet(
    x, y,
    lambda = lasso_cv[[i]]$lambda.min, alpha = 1, standardize = TRUE
  )
}

最新更新