>我在 R 中导入了一个 excel 工作表,它是具有相同列名的几个数据帧的编译。为了说明它,它看起来像这样:
df <- tibble( empty = c(runif(3), NA, NA, NA, NA),
A = c(runif(3), NA, NA, NA, NA),
B = c(runif(3), NA, NA, NA, NA),
C = c(runif(3), NA, NA, NA, NA),
empty = c(runif(6), NA),
A = c(runif(6), NA),
B = c(runif(6), NA),
C = c(runif(6), NA),
empty = c(runif(5), NA, NA),
A = c(runif(5), NA, NA),
B = c(runif(5), NA, NA),
C = c(runif(5), NA, NA),
.name_repair = "minimal")
如何在此结果中转换此数据帧:
> df1
# A tibble: 3 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.200 0.0665 0.723 0.487
2 0.576 0.990 0.969 0.289
3 0.727 0.192 0.780 0.243
> df2
# A tibble: 6 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.556 0.698 0.796 0.357
2 0.308 0.542 0.867 0.103
3 0.643 0.792 0.385 0.882
4 0.675 0.504 0.489 0.0515
5 0.426 0.775 0.410 0.748
6 0.343 0.752 0.185 0.542
> df3
# A tibble: 5 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.229 0.0508 0.0880 0.486
2 0.146 0.295 0.562 0.731
3 0.292 0.804 0.133 0.0480
4 0.0404 0.399 0.366 0.152
5 0.226 0.702 0.476 0.416
名称为空的列实际上没有名称,尽管我不知道如何在此示例中分配它。
我问这个问题的原因是因为我还有其他几张纸,每张纸的相似列数量不同(D、E 等)。
我在这里找到了一个不错的帖子: 使用重复列名称拆分数据框 虽然这篇文章看起来是一样的,但它是不同的。
谢谢!
这会将结果放在一个list
中,这应该比按顺序命名的数据框更方便。
first_col = "empty"
name_groups = cumsum(names(df) == "empty")
result = split.default(df, name_groups)
# omit rows that have only missing values
result = lapply(result, (x) x[rowSums(is.na(x)) < ncol(x), ])
result
# $`1`
# # A tibble: 3 × 4
# empty A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 0.590 0.602 0.527 0.900
# 2 0.0450 0.713 0.936 0.911
# 3 0.567 0.781 0.349 0.686
#
# $`2`
# # A tibble: 6 × 4
# empty A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 0.480 0.543 0.744 0.0684
# 2 0.0423 0.799 0.927 0.537
# 3 0.962 0.0745 0.851 0.0639
# 4 0.615 0.546 0.390 0.0985
# 5 0.258 0.857 0.139 0.172
# 6 0.944 0.375 0.356 0.715
#
# $`3`
# # A tibble: 5 × 4
# empty A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 0.790 0.572 0.600 0.701
# 2 0.732 0.610 0.0395 0.283
# 3 0.130 0.168 0.120 0.0682
# 4 0.112 0.682 0.586 0.640
# 5 0.211 0.267 0.0189 0.606
如果确实要在全局环境中df1, df2, ...
,请添加以下行:
names(result) = paste0("df", names(result))
list2env(result, envir = .GlobalEnv)
当重复次数恒定(此处为 4)时,我们可以做这样的事情:
基数 R:
df1 <- df[,1:4]
df2 <- df[,5:8]
df3 <- df[,9:12]
> df1
# A tibble: 7 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.120 0.448 0.0453 0.315
2 0.337 0.296 0.757 0.448
3 0.533 0.574 0.681 0.324
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
7 NA NA NA NA
> df2
# A tibble: 7 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.420 0.306 0.472 0.107
2 0.639 0.666 0.349 0.768
3 0.469 0.311 0.100 0.744
4 0.00122 0.586 0.437 0.796
5 0.122 0.00989 0.289 0.408
6 0.570 0.253 0.877 0.197
7 NA NA NA NA
> df3
# A tibble: 7 x 4
empty A B C
<dbl> <dbl> <dbl> <dbl>
1 0.812 0.0464 0.473 0.638
2 0.340 0.482 0.269 0.164
3 0.0323 0.952 0.842 0.282
4 0.511 0.263 0.934 0.183
5 0.0711 0.483 0.763 0.639
6 NA NA NA NA
7 NA NA NA NA
df1 <- df[,1:4][1:3,]
df2 <- df[,5:8][1:6,]
df3 <- df[,9:12][1:5,]
另一种可能的解决方案,基于tidyverse
:
library(tidyverse)
stack(df) %>%
filter(!is.na(values)) %>%
group_by(aux = cumsum(ind == "empty" & lag(ind, default = "") != "empty")) %>%
group_split() %>%
map(~ pivot_wider(.x %>% select(-aux), names_from = "ind",
values_from = "values", values_fn = list) %>% unnest(everything()))
#> [[1]]
#> # A tibble: 3 × 4
#> empty A B C
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.865 0.0634 0.127 0.136
#> 2 0.343 0.431 0.943 0.985
#> 3 0.482 0.635 0.150 0.263
#>
#> [[2]]
#> # A tibble: 6 × 4
#> empty A B C
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.0656 0.514 0.834 0.662
#> 2 0.977 0.657 0.878 0.427
#> 3 0.670 0.641 0.910 0.175
#> 4 0.402 0.0494 0.433 0.0241
#> 5 0.211 0.388 0.971 0.273
#> 6 0.681 0.355 0.749 0.0536
#>
#> [[3]]
#> # A tibble: 5 × 4
#> empty A B C
#> <dbl> <dbl> <dbl> <dbl>
#> 1 0.440 0.856 0.00734 0.0474
#> 2 0.0347 0.328 0.471 0.845
#> 3 0.106 0.393 0.303 0.811
#> 4 0.385 0.184 0.540 0.180
#> 5 0.564 0.579 0.414 0.0110