我有来自同一数据集的若干子集,并希望对每个子集分别执行回归。我可以为每个子集单独编写代码,但我想找一个简洁、快速的解决方案,例如使用循环。
我从mtcars中创建了一个示例数据集,其中包含3个子集:df1, df2, df3。
如果该解决方案原则上也能应用于其他分析(例如 VIF、逐步回归、ANN 等),那就更好了。
# Define 3 datasets from mtcars: df1, df2, df3
library(dplyr)

# df1 is the full data set; df2 and df3 split it at the median of `cyl`
# (rows at or below the median vs. rows strictly above it).
df1 <- mtcars
df2 <- mtcars %>%
  filter(cyl <= median(cyl, na.rm = TRUE))
df3 <- mtcars %>%
  filter(cyl > median(cyl, na.rm = TRUE))

# Every model regresses mpg on disp and hp. The data frame is supplied through
# `data =` instead of writing df1$mpg etc. inside the formula: the coefficients
# are then named "disp" / "hp" for all three models, and predict() honours a
# `newdata` argument.

# regression 1
model_df1 <- lm(mpg ~ disp + hp, data = df1)
# regression 2
model_df2 <- lm(mpg ~ disp + hp, data = df2)
# regression 3
model_df3 <- lm(mpg ~ disp + hp, data = df3)
将这些子集存储在一个(命名的)列表中,就可以方便地使用 apply 系列或 map 系列的函数:
library(dplyr)
library(purrr)
library(broom)

# Collect the full data set and its two median-split subsets (split on `cyl`)
# in a single named list; the element names double as dataset labels.
df_list <- list(
  mtcars = mtcars,
  lt_median = mtcars %>% filter(cyl <= median(cyl, na.rm = TRUE)),
  gt_median = mtcars %>% filter(cyl > median(cyl, na.rm = TRUE))
)

# fit a model on each dataset in df_list, returns list of models
# (same names as df_list):
model_list <- map(df_list, ~ lm(mpg ~ disp + hp, data = .x))
# inspect the model fitted on the full data set (first element of the list)
summary(model_list$mtcars)
#>
#> Call:
#> lm(formula = mpg ~ disp + hp, data = .x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.7945 -2.3036 -0.8246 1.8582 6.9363
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.735904 1.331566 23.083 < 2e-16 ***
#> disp -0.030346 0.007405 -4.098 0.000306 ***
#> hp -0.024840 0.013385 -1.856 0.073679 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.127 on 29 degrees of freedom
#> Multiple R-squared: 0.7482, Adjusted R-squared: 0.7309
#> F-statistic: 43.09 on 2 and 29 DF, p-value: 2.062e-09
# apply tidy / glance / augment from broom to every model in the list;
# either stack the results into one data frame, tagging each row with the
# name of the list element (dataset) its model belongs to
imap(model_list, ~ mutate(tidy(.x), dataset = .y)) %>%
  bind_rows()
#> # A tibble: 9 × 6
#> term estimate std.error statistic p.value dataset
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 30.7 1.33 23.1 3.26e-20 mtcars
#> 2 disp -0.0303 0.00740 -4.10 3.06e- 4 mtcars
#> 3 hp -0.0248 0.0134 -1.86 7.37e- 2 mtcars
#> 4 (Intercept) 38.1 2.59 14.7 2.50e-10 lt_median
#> 5 disp -0.0546 0.0160 -3.40 3.93e- 3 lt_median
#> 6 hp -0.0688 0.0277 -2.48 2.53e- 2 lt_median
#> 7 (Intercept) 24.0 4.05 5.94 9.69e- 5 gt_median
#> 8 disp -0.0186 0.00946 -1.97 7.46e- 2 gt_median
#> 9 hp -0.0113 0.0126 -0.900 3.87e- 1 gt_median
# one row of model-level statistics per model, labelled by list element name
imap(model_list, ~ mutate(glance(.x), dataset = .y)) %>%
  bind_rows()
#> # A tibble: 3 × 13
#> r.squ…¹ adj.r…² sigma stati…³ p.value df logLik AIC BIC devia…⁴ df.re…⁵
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 0.748 0.731 3.13 43.1 2.06e-9 2 -80.3 169. 174. 283. 29
#> 2 0.699 0.659 2.91 17.4 1.24e-4 2 -43.1 94.2 97.8 127. 15
#> 3 0.320 0.197 2.29 2.59 1.20e-1 2 -29.8 67.6 70.2 57.9 11
#> # … with 2 more variables: nobs <int>, dataset <chr>, and abbreviated variable
#> # names ¹r.squared, ²adj.r.squared, ³statistic, ⁴deviance, ⁵df.residual
# or keep as a list: augment() is run for every model against the first five
# rows of mtcars (mpg, disp and hp columns only)
map(
  model_list,
  ~ augment(.x, newdata = head(mtcars[c("mpg", "disp", "hp")], n = 5))
)
#> $mtcars
#> # A tibble: 5 × 6
#> .rownames mpg disp hp .fitted .resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 160 110 23.1 -2.15
#> 2 Mazda RX4 Wag 21 160 110 23.1 -2.15
#> 3 Datsun 710 22.8 108 93 25.1 -2.35
#> 4 Hornet 4 Drive 21.4 258 110 20.2 1.23
#> 5 Hornet Sportabout 18.7 360 175 15.5 3.24
#>
#> $lt_median
#> # A tibble: 5 × 6
#> .rownames mpg disp hp .fitted .resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 160 110 21.8 -0.815
#> 2 Mazda RX4 Wag 21 160 110 21.8 -0.815
#> 3 Datsun 710 22.8 108 93 25.8 -3.02
#> 4 Hornet 4 Drive 21.4 258 110 16.5 4.93
#> 5 Hornet Sportabout 18.7 360 175 6.43 12.3
#>
#> $gt_median
#> # A tibble: 5 × 6
#> .rownames mpg disp hp .fitted .resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 160 110 19.8 1.18
#> 2 Mazda RX4 Wag 21 160 110 19.8 1.18
#> 3 Datsun 710 22.8 108 93 21.0 1.82
#> 4 Hornet 4 Drive 21.4 258 110 18.0 3.41
#> 5 Hornet Sportabout 18.7 360 175 15.4 3.34
由 reprex v2.0.2 创建于 2023-01-16