我正在尝试执行一些已经完成/得到帮助的函数,但我想使用我的结果添加更多的列。我想在每年的年底做一个总数,在年底做一次总数。有两列被删除,我想保留为总数。如果需要更多信息,请告诉我。提前谢谢。
np<-structure(list(STATE_ABBR = structure(c(2L, 1L, 2L,2L, 1L, 3L, 3L,2L, 2L),
.Label = c("CA", "KY", "NM"), class = "factor"),
REPORTING_YEAR = structure(c(3L,1L, 3L,3L, 2L, 1L, 2L, 1L, 2L),
.Label = c("1990", "2000", "2005"), class = "factor"),
COUNTY_NAME = structure(c(2L,1L, 3L,3L, 2L, 3L, 1L, 5L, 4L),
.Label = c("FRESNO", "ORANGE", "CARROLL","JEFFERSON", "TAYLOR"), class = "factor"),
CATEGORY = structure(c(1L, 2L, 1L,2L, 2L, 1L, 3L, 3L, 2L),
.Label = c("AIR","OnSite_LAND", "POTW METALS"), class = "factor"),
`SUM(REL_EST_AMT_SUM)` = c(14000,23149, 5617000,123, 23, 250, 1300, 0, 5),
CARCINOGEN = structure(c(2L,1L, 1L,1L, 1L, 2L, 1L, 1L, 1L), .Label = c("N", "Y"), class = "factor"),
CLEAN_AIR = structure(c(2L, 1L, 2L,2L, 2L, 2L, 2L, 1L, 2L), .Label = c("N","Y"), class = "factor")),
row.names = c("62993", "49717","63290","00005", "19700", "332510", "323501", "67491", "62086"), class = "data.frame")
NP<-arrange(np,STATE_ABBR,REPORTING_YEAR,COUNTY_NAME)
NP
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717 CA 1990 FRESNO OnSite_LAND 23149 N N
19700 CA 2000 ORANGE OnSite_LAND 23 N Y
67491 KY 1990 TAYLOR POTW METALS 0 N N
62086 KY 2000 JEFFERSON OnSite_LAND 5 N Y
62993 KY 2005 ORANGE AIR 14000 Y Y
63290 KY 2005 CARROLL AIR 5617000 N Y
00005 KY 2005 CARROLL OnSite_LAND 123 N Y
332510 NM 1990 CARROLL AIR 250 Y Y
323501 NM 2000 FRESNO POTW METALS 1300 N Y
NP<-pivot_wider(NP,
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(REPORTING_YEAR,CATEGORY),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L)
NP
# A tibble: 8 x 9
STATE_ABBR COUNTY_NAME `1990_OnSite_LAN~` `1990_POTW METAL~` `1990_AIR` "Total_1990" 2000_OnSite_LAN~` `2000_POTW METAL~` "total_2000" `2005_AIR` `2005_OnSite_LAN "total_2005" CARCINOGEN CLEAN_AIR
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> 0 0
CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 0 1
CA ORANGE 0 0 0 0 23 0 23 0 0 0 0 0
KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0 1
KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 1 1
KY ORANGE 0 0 0 0 0 0 0 14000 0 14000 0 1
KY CARROLL 0 0 0 0 0 0 0 5617000 123 5617123 0 2
NM CARROLL 0 0 250 250 0 0 0 0 0 0 1 1
NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 0 1
假设您还有两列,我认为这种方法效果更好(也许更容易理解(。
library(dplyr)
library(tidyr)
make_yearly_total_cols <-
. %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(
CATEGORY = "total",
across(`SUM(REL_EST_AMT_SUM)`, sum),
.groups = "drop"
) %>%
pivot_wider(
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fill = 0L
)
make_grand_total_cols <-
. %>%
group_by(STATE_ABBR, COUNTY_NAME) %>%
summarise(
CATEGORY = "grand_total",
across(`SUM(REL_EST_AMT_SUM)`, sum),
across(c(CARCINOGEN, CLEAN_AIR), ~sum(c("N" = 0L, "Y" = 1L)[.])),
.groups = "drop"
) %>%
pivot_wider(
names_from = CATEGORY,
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fill = 0L
)
make_category_cols <-
. %>%
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L
)
make_category_cols(NP) %>%
left_join(make_yearly_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>%
left_join(make_grand_total_cols(NP), by = c("STATE_ABBR", "COUNTY_NAME")) %>%
select(
STATE_ABBR, COUNTY_NAME,
ends_with("1990"), ends_with("2000"), ends_with("2005"),
grand_total, CARCINOGEN, CLEAN_AIR
)
输出
# A tibble: 8 x 15
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 AIR_2005 OnSite_LAND_2005 total_2005 grand_total CARCINOGEN CLEAN_AIR
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int>
1 CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 23149 0 0
2 CA ORANGE 0 0 0 0 23 0 23 0 0 0 23 0 1
3 KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0 0 0
4 KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 5 0 1
5 KY ORANGE 0 0 0 0 0 0 0 14000 0 14000 14000 1 1
6 KY CARROLL 0 0 0 0 0 0 0 5617000 123 5617123 5617123 0 2
7 NM CARROLL 0 0 250 250 0 0 0 0 0 0 250 1 1
8 NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 1300 0 1
如果您想了解更多关于旧方法背后的逻辑,请参阅下文。
旧
这是tidyverse
解决方案
library(dplyr)
library(tidyr)
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum)),
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY))) %>%
pivot_wider(
id_cols = c(STATE_ABBR, COUNTY_NAME),
names_from = c(CATEGORY, REPORTING_YEAR),
values_from = `SUM(REL_EST_AMT_SUM)`,
values_fn = sum,
values_fill = 0L
) %>%
mutate(grand_total = rowSums(across(starts_with("total"))))
输出
`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
# A tibble: 8 x 13
STATE_ABBR COUNTY_NAME OnSite_LAND_1990 `POTW METALS_1990` AIR_1990 total_1990 OnSite_LAND_2000 `POTW METALS_2000` total_2000 OnSite_LAND_2005 AIR_2005 total_2005 grand_total
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CA FRESNO 23149 0 0 23149 0 0 0 0 0 0 23149
2 KY TAYLOR 0 0 0 0 0 0 0 0 0 0 0
3 NM CARROLL 0 0 250 250 0 0 0 0 0 0 250
4 CA ORANGE 0 0 0 0 23 0 23 0 0 0 23
5 KY JEFFERSON 0 0 0 0 5 0 5 0 0 0 5
6 NM FRESNO 0 0 0 0 0 1300 1300 0 0 0 1300
7 KY CARROLL 0 0 0 0 0 0 0 123 5617000 5617123 5617123
8 KY ORANGE 0 0 0 0 0 0 0 0 14000 14000 14000
的一些解释
首先,看看这个
NP %>%
rows_insert(
SOME_DATA,
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)
它通过展开行将SOME_DATA插入NP中。此外,它还使用STATE_ABBR、COUNTY_NAME、REPORTING_YEAR和CATEGORY的组合来标识要插入的正确位置。例如,尝试像这样的
library(dplyr)
data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
data
data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
你应该得到
> data <- data.frame(a = 1:3, b = letters[c(1:2, NA)], c = 0.5 + 0:2)
> data
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
> data %>% rows_insert(data.frame(a = 4, b = "z"), by = "a")
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
4 4 z NA
> data %>% rows_insert(data.frame(a = 2, b = "x", c = 99), by = c("a", "b"))
a b c
1 1 a 0.5
2 2 b 1.5
3 3 <NA> 2.5
4 2 x 99.0
那么,问题是什么是SOME_DATA?好吧,这是我们从这个管道得到的总结
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))
在管道中,.
可以有多种含义。请参阅以下我从文件中复制的解释
用法lhs%>%rhs
将点用于次要目的通常,除了lhs本身的值之外,rhs调用中还需要lhs的某些属性或属性,例如行数或列数。在rhs调用中多次使用点占位符是完全有效的,但在嵌套函数调用中使用点占位符时,其行为在设计上略有不同。特别是,如果占位符仅用于嵌套函数调用,那么lhs也将作为第一个参数!原因是在大多数用例中,这会生成可读性最强的代码。例如,虹膜%>%子集(1:nrow(.(%%2==0(等价于iris%>%子集(.,1:nrow(.(%%2==0(,但稍微紧凑一些。可以通过将rhs括在大括号中来否决这种行为。例如,1:10%>%{c(min(.(,max(.((}相当于c(min,max(1:10((
使用点占位符作为lhs当点用作lhs时,结果将是一个函数序列,即一个将整个右侧链依次应用于其输入的函数。请参阅示例。
因此,此处的点从外部管道的lhs捕获NP
。然而,它也可以被认为是内部函数序列的开始。后一种用法的优先级高于前一种用法。因此,我们在点周围加上大括号,以避免它被错误地认为是函数序列的开始。
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum))
此步骤后
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"),
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
)
数据看起来像这个
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
49717 CA 1990 FRESNO OnSite_LAND 23149 N N
19700 CA 2000 ORANGE OnSite_LAND 23 N Y
67491 KY 1990 TAYLOR POTW METALS 0 N N
62086 KY 2000 JEFFERSON OnSite_LAND 5 N Y
62993 KY 2005 ORANGE AIR 14000 Y Y
63290 KY 2005 CARROLL AIR 5617000 N Y
00005 KY 2005 CARROLL OnSite_LAND 123 N Y
332510 NM 1990 CARROLL AIR 250 Y Y
323501 NM 2000 FRESNO POTW METALS 1300 N Y
...10 CA 1990 FRESNO total 23149 <NA> <NA>
...11 CA 2000 ORANGE total 23 <NA> <NA>
...12 KY 2005 ORANGE total 14000 <NA> <NA>
...13 KY 2005 CARROLL total 5617123 <NA> <NA>
...14 KY 2000 JEFFERSON total 5 <NA> <NA>
...15 KY 1990 TAYLOR total 0 <NA> <NA>
...16 NM 2000 FRESNO total 1300 <NA> <NA>
...17 NM 1990 CARROLL total 250 <NA> <NA>
然后,我们需要重新排列数据,以便将具有相同REPORTING_YEAR的行分组在一起。我们还希望保持CATEGORY列的顺序不变。这就是我们使用factor(CATEGORY, unique(CATEGORY))
的原因。重新排列后,
STATE_ABBR REPORTING_YEAR COUNTY_NAME CATEGORY SUM(REL_EST_AMT_SUM) CARCINOGEN CLEAN_AIR
1 CA 1990 FRESNO OnSite_LAND 23149 N N
2 KY 1990 TAYLOR POTW METALS 0 N N
3 NM 1990 CARROLL AIR 250 Y Y
4 CA 1990 FRESNO total 23149 <NA> <NA>
5 KY 1990 TAYLOR total 0 <NA> <NA>
6 NM 1990 CARROLL total 250 <NA> <NA>
7 CA 2000 ORANGE OnSite_LAND 23 N Y
8 KY 2000 JEFFERSON OnSite_LAND 5 N Y
9 NM 2000 FRESNO POTW METALS 1300 N Y
10 CA 2000 ORANGE total 23 <NA> <NA>
11 KY 2000 JEFFERSON total 5 <NA> <NA>
12 NM 2000 FRESNO total 1300 <NA> <NA>
13 KY 2005 CARROLL OnSite_LAND 123 N Y
14 KY 2005 ORANGE AIR 14000 Y Y
15 KY 2005 CARROLL AIR 5617000 N Y
16 KY 2005 ORANGE total 14000 <NA> <NA>
17 KY 2005 CARROLL total 5617123 <NA> <NA>
然后我们CCD_ 5并得到结果。这就是逻辑。注意,如果你想摆脱这个消息
`summarise()` regrouping output by 'STATE_ABBR', 'COUNTY_NAME' (override with `.groups` argument)
只需执行类似的操作
NP %>%
rows_insert(
(.) %>%
group_by(STATE_ABBR, COUNTY_NAME, REPORTING_YEAR) %>%
summarise(CATEGORY = "total", across(`SUM(REL_EST_AMT_SUM)`, sum), .groups = "drop"), # drop the groups
by = c("STATE_ABBR", "COUNTY_NAME", "REPORTING_YEAR", "CATEGORY")
) %>%
arrange(REPORTING_YEAR, factor(CATEGORY, unique(CATEGORY)))
然而,我无法重现你的另一个错误。