r-使用dplyr来加宽数据,自动对变量排序并对加宽的列求和



我正在尝试执行一些已经完成/得到帮助的函数,但我想使用我的结果添加更多的列。我想在每年的年底做一个总数,在年底做一次总数。有两列被删除,我想保留为总数。如果需要更多信息,请告诉我。提前谢谢。

np<-structure(list(STATE_ABBR = structure(c(2L, 1L, 2L,2L, 1L, 3L, 3L,2L, 2L),
.Label = c("CA", "KY", "NM"), class = "factor"),

REPORTING_YEAR = structure(c(3L,1L, 3L,3L, 2L, 1L, 2L, 1L, 2L),
.Label = c("1990", "2000", "2005"), class = "factor"),

COUNTY_NAME = structure(c(2L,1L, 3L,3L, 2L, 3L, 1L, 5L, 4L),
.Label = c("FRESNO", "ORANGE", "CARROLL","JEFFERSON", "TAYLOR"), class = "factor"), 

CATEGORY = structure(c(1L, 2L, 1L,2L, 2L, 1L, 3L, 3L, 2L),
.Label = c("AIR","OnSite_LAND", "POTW METALS"), class = "factor"),

`SUM(REL_EST_AMT_SUM)` = c(14000,23149, 5617000,123, 23, 250, 1300, 0, 5),

CARCINOGEN = structure(c(2L,1L, 1L,1L, 1L, 2L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"), 
CLEAN_AIR = structure(c(2L, 1L, 2L,2L, 2L, 2L, 2L, 1L, 2L), .Label = c("N","Y"), class = "factor")),
row.names = c("62993", "49717","63290","00005", "19700", "332510", "323501", "67491", "62086"), class = "data.frame")

NP<-arrange(np,STATE_ABBR,REPORTING_YEAR,COUNTY_NAME)

NP
STATE_ABBR REPORTING_YEAR COUNTY_NAME    CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717          CA           1990      FRESNO OnSite_LAND                23149          N         N
19700          CA           2000      ORANGE OnSite_LAND                   23          N         Y
67491          KY           1990      TAYLOR POTW METALS                    0          N         N
62086          KY           2000   JEFFERSON OnSite_LAND                    5          N         Y
62993          KY           2005      ORANGE         AIR                14000          Y         Y
63290          KY           2005     CARROLL         AIR              5617000          N         Y
00005          KY           2005     CARROLL OnSite_LAND                  123          N         Y
332510         NM           1990     CARROLL         AIR                  250          Y         Y
323501         NM           2000      FRESNO POTW METALS                 1300          N         Y
NP<-pivot_wider(NP, 
id_cols = c(STATE_ABBR, COUNTY_NAME), 
names_from = c(REPORTING_YEAR,CATEGORY), 
values_from = `SUM(REL_EST_AMT_SUM)`, 
values_fn = sum, 
values_fill = 0L)

NP
# A tibble: 8 x 9
STATE_ABBR  COUNTY_NAME  `1990_OnSite_LAN~`  `1990_POTW METAL~`  `1990_AIR`    "Total_1990"   2000_OnSite_LAN~`  `2000_POTW METAL~`  "total_2000"  `2005_AIR`  `2005_OnSite_LAN   "total_2005"   CARCINOGEN   CLEAN_AIR
<fct>      <fct>                   <dbl>             <dbl>         <dbl>                             <dbl>                 <dbl>                     <dbl>              <dbl>                          0           0
CA         FRESNO                  23149                 0             0            23149                0                     0             0           0                  0               0           0           1
CA         ORANGE                      0                 0             0                0               23                     0            23           0                  0               0           0           0
KY         TAYLOR                      0                 0             0                0                0                     0             0           0                  0               0           0           1
KY         JEFFERSON                   0                 0             0                0                5                     0             5           0                  0               0           1           1
KY         ORANGE                      0                 0             0                0                0                     0             0       14000                  0           14000           0           1
KY         CARROLL                     0                 0             0                0                0                     0             0     5617000                123         5617123           0           2
NM         CARROLL                     0                 0           250              250                0                     0             0           0                  0               0           1           1
NM         FRESNO                      0                 0             0                0                0                  1300          1300           0                  0               0           0           1

假设您还有两列,我认为这种方法效果更好(也许更容易理解(。

library(dplyr)
library(tidyr)
make_yearly_total_cols <- 
. %>% 
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
summarise(
CATEGORY = "total", 
across(`SUM(REL_EST_AMT_SUM)`, sum), 
.groups = "drop"
) %>% 
pivot_wider(
names_from = c(CATEGORY, REPORTING_YEAR), 
values_from = `SUM(REL_EST_AMT_SUM)`, 
values_fill = 0L
)
make_grand_total_cols <- 
. %>% 
group_by(STATE_ABBR, COUNTY_NAME) %>% 
summarise(
CATEGORY = "grand_total", 
across(`SUM(REL_EST_AMT_SUM)`, sum), 
across(c(CARCINOGEN, CLEAN_AIR), ~sum(c("N" = 0L, "Y" = 1L)[.])), 
.groups = "drop"
) %>% 
pivot_wider(
names_from = CATEGORY, 
values_from = `SUM(REL_EST_AMT_SUM)`, 
values_fill = 0L
)
make_category_cols <- 
. %>% 
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME), 
names_from = c(CATEGORY, REPORTING_YEAR), 
values_from = `SUM(REL_EST_AMT_SUM)`, 
values_fn = sum, 
values_fill = 0L
)
make_category_cols(NP) %>% 
left_join(make_yearly_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>% 
left_join(make_grand_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>% 
select(
STATE_ABBR, COUNTY_NAME, 
ends_with("1990"), ends_with("2000"), ends_with("2005"), 
grand_total, CARCINOGEN, CLEAN_AIR
)

输出

# A tibble: 8 x 15
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 AIR_2005 OnSite_LAND_2005 total_2005 grand_total CARCINOGEN CLEAN_AIR
<fct>      <fct>                  <dbl>              <dbl>    <dbl>      <dbl>            <dbl>              <dbl>      <dbl>    <dbl>            <dbl>      <dbl>       <dbl>      <int>     <int>
1 CA         FRESNO                 23149                  0        0      23149                0                  0          0        0                0          0       23149          0         0
2 CA         ORANGE                     0                  0        0          0               23                  0         23        0                0          0          23          0         1
3 KY         TAYLOR                     0                  0        0          0                0                  0          0        0                0          0           0          0         0
4 KY         JEFFERSON                  0                  0        0          0                5                  0          5        0                0          0           5          0         1
5 KY         ORANGE                     0                  0        0          0                0                  0          0    14000                0      14000       14000          1         1
6 KY         CARROLL                    0                  0        0          0                0                  0          0  5617000              123    5617123     5617123          0         2
7 NM         CARROLL                    0                  0      250        250                0                  0          0        0                0          0         250          1         1
8 NM         FRESNO                     0                  0        0          0                0               1300       1300        0                0          0        1300          0         1

如果您想了解更多关于旧方法背后的逻辑,请参阅下文。

这是tidyverse解决方案

library(dplyr)
library(tidyr)
NP %>% 
rows_insert(
(.) %>% 
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum)), 
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>% 
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY))) %>% 
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME), 
names_from = c(CATEGORY, REPORTING_YEAR), 
values_from = `SUM(REL_EST_AMT_SUM)`, 
values_fn = sum, 
values_fill = 0L
) %>% 
mutate(grand_total = rowSums(across(starts_with("total"))))

输出

`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
# A tibble: 8 x 13
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 OnSite_LAND_2005 AIR_2005 total_2005 grand_total
<fct>      <fct>                  <dbl>              <dbl>    <dbl>      <dbl>            <dbl>              <dbl>      <dbl>            <dbl>    <dbl>      <dbl>       <dbl>
1 CA         FRESNO                 23149                  0        0      23149                0                  0          0                0        0          0       23149
2 KY         TAYLOR                     0                  0        0          0                0                  0          0                0        0          0           0
3 NM         CARROLL                    0                  0      250        250                0                  0          0                0        0          0         250
4 CA         ORANGE                     0                  0        0          0               23                  0         23                0        0          0          23
5 KY         JEFFERSON                  0                  0        0          0                5                  0          5                0        0          0           5
6 NM         FRESNO                     0                  0        0          0                0               1300       1300                0        0          0        1300
7 KY         CARROLL                    0                  0        0          0                0                  0          0              123  5617000    5617123     5617123
8 KY         ORANGE                     0                  0        0          0                0                  0          0                0    14000      14000       14000

的一些解释

首先,看看这个

NP %>% 
rows_insert(
SOME_DATA, 
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)

它通过展开行将SOME_DATA插入NP中。此外,它还使用STATE_ABBR、COUNTY_NAME、REPORTING_YEAR和CATEGORY的组合来标识要插入的正确位置。例如,尝试像这样的

library(dplyr)
data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data
data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))

你应该得到

> data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
> data
a    b   c
1 1    a 0.5
2 2    b 1.5
3 3 <NA> 2.5
> data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
a    b   c
1 1    a 0.5
2 2    b 1.5
3 3 <NA> 2.5
4 4    z  NA
> data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
a    b    c
1 1    a  0.5
2 2    b  1.5
3 3 <NA>  2.5
4 2    x 99.0

那么,问题是什么是SOME_DATA?好吧,这是我们从这个管道得到的总结

(.) %>% 
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))

在管道中,.可以有多种含义。请参阅以下我从文件中复制的解释

用法lhs%>%rhs

将点用于次要目的通常,除了lhs本身的值之外,rhs调用中还需要lhs的某些属性或属性,例如行数或列数。在rhs调用中多次使用点占位符是完全有效的,但在嵌套函数调用中使用点占位符时,其行为在设计上略有不同。特别是,如果占位符仅用于嵌套函数调用,那么lhs也将作为第一个参数!原因是在大多数用例中,这会生成可读性最强的代码。例如,虹膜%>%子集(1:nrow(.(%%2==0(等价于iris%>%子集(.,1:nrow(.(%%2==0(,但稍微紧凑一些。可以通过将rhs括在大括号中来否决这种行为。例如,1:10%>%{c(min(.(,max(.((}相当于c(min,max(1:10((

使用点占位符作为lhs当点用作lhs时,结果将是一个函数序列,即一个将整个右侧链依次应用于其输入的函数。请参阅示例。

因此,此处的点从外部管道的lhs捕获NP。然而,它也可以被认为是内部函数序列的开始。后一种用法的优先级高于前一种用法。因此,我们在点周围加上大括号,以避免它被错误地认为是函数序列的开始。

(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))

此步骤后

NP %>% 
rows_insert(
(.) %>% 
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), 
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)

数据看起来像这个

STATE_ABBR REPORTING_YEAR COUNTY_NAME    CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717          CA           1990      FRESNO OnSite_LAND                23149          N         N
19700          CA           2000      ORANGE OnSite_LAND                   23          N         Y
67491          KY           1990      TAYLOR POTW METALS                    0          N         N
62086          KY           2000   JEFFERSON OnSite_LAND                    5          N         Y
62993          KY           2005      ORANGE         AIR                14000          Y         Y
63290          KY           2005     CARROLL         AIR              5617000          N         Y
00005          KY           2005     CARROLL OnSite_LAND                  123          N         Y
332510         NM           1990     CARROLL         AIR                  250          Y         Y
323501         NM           2000      FRESNO POTW METALS                 1300          N         Y
...10          CA           1990      FRESNO       total                23149       <NA>      <NA>
...11          CA           2000      ORANGE       total                   23       <NA>      <NA>
...12          KY           2005      ORANGE       total                14000       <NA>      <NA>
...13          KY           2005     CARROLL       total              5617123       <NA>      <NA>
...14          KY           2000   JEFFERSON       total                    5       <NA>      <NA>
...15          KY           1990      TAYLOR       total                    0       <NA>      <NA>
...16          NM           2000      FRESNO       total                 1300       <NA>      <NA>
...17          NM           1990     CARROLL       total                  250       <NA>      <NA>

然后,我们需要重新排列数据,以便将具有相同REPORTING_YEAR的行分组在一起。我们还希望保持CATEGORY列的顺序不变。这就是我们使用factor(CATEGORY, unique(CATEGORY))的原因。重新排列后,

STATE_ABBR REPORTING_YEAR COUNTY_NAME    CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
1          CA           1990      FRESNO OnSite_LAND                23149          N         N
2          KY           1990      TAYLOR POTW METALS                    0          N         N
3          NM           1990     CARROLL         AIR                  250          Y         Y
4          CA           1990      FRESNO       total                23149       <NA>      <NA>
5          KY           1990      TAYLOR       total                    0       <NA>      <NA>
6          NM           1990     CARROLL       total                  250       <NA>      <NA>
7          CA           2000      ORANGE OnSite_LAND                   23          N         Y
8          KY           2000   JEFFERSON OnSite_LAND                    5          N         Y
9          NM           2000      FRESNO POTW METALS                 1300          N         Y
10         CA           2000      ORANGE       total                   23       <NA>      <NA>
11         KY           2000   JEFFERSON       total                    5       <NA>      <NA>
12         NM           2000      FRESNO       total                 1300       <NA>      <NA>
13         KY           2005     CARROLL OnSite_LAND                  123          N         Y
14         KY           2005      ORANGE         AIR                14000          Y         Y
15         KY           2005     CARROLL         AIR              5617000          N         Y
16         KY           2005      ORANGE       total                14000       <NA>      <NA>
17         KY           2005     CARROLL       total              5617123       <NA>      <NA>

然后我们CCD_ 5并得到结果。这就是逻辑。注意,如果你想摆脱这个消息

`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with 
`.groups` argument)

只需执行类似的操作

NP %>% 
rows_insert(
(.) %>% 
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>% 
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), # drop the groups
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY)))

然而,我无法重现你的另一个错误。

最新更新