我有以下数据,这让我头疼
编辑:更新了列名和数据,以更好地代表我的数据,因为我没有技能应用已经带来的第四个答案(对不起(
dat3 <- data.frame(
sex = c("F","F","F", "F","M","M","M", "M","TOT","TOT","TOT","TOT","F","F","F", "F","M","M","M", "M","TOT","TOT","TOT","TOT"),
age = c(rep("Y70-74",12),rep("Y75-79",12)),
geo = c("UK","GER","FRA", "POL","UK","GER","FRA", "POL","UK","GER","FRA","POL","UK","GER","FRA", "POL","UK","GER","FRA", "POL","UK","GER","FRA","POL"),
"2021W3" = c(5,3,7,5,8,2,6,5,13,5,13,10, 1,2,3,4,1,1,1,1,2,3,4,5),
"2021W2" = c(6,8,4,5,9,1,0,4,15,9,12,9, 1,2,3,4,1,1,1,1,2,3,4,5),
"2021W1" = c(8,7,9,2,1,2,3,6,9, 9,12,8 , 1,2,3,4,1,1,1,1,2,3,4,5),
"2020W52"= c(1,2,8,2,5,1,2,4,6, 3,10,6, 1,2,3,4,1,1,1,1,2,3,4,5),
"2020W51"= c(4,4,3,6,4,5,1,0,9, 5,4, 6 , 1,2,3,4,1,1,1,1,2,3,4,5),
"2020W50"= c(3,6,3,5,2,1,2,5,5, 7,5, 10, 1,2,3,4,1,1,1,1,2,3,4,5))
dat3
sex age geo X2021W3 X2021W2 X2021W1 X2020W52 X2020W51 X2020W50
1 F Y70-74 UK 5 6 8 1 4 3
2 F Y70-74 GER 3 8 7 2 4 6
3 F Y70-74 FRA 7 4 9 8 3 3
4 F Y70-74 POL 5 5 2 2 6 5
5 M Y70-74 UK 8 9 1 5 4 2
6 M Y70-74 GER 2 1 2 1 5 1
7 M Y70-74 FRA 6 0 3 2 1 2
8 M Y70-74 POL 5 4 6 4 0 5
9 TOT Y70-74 UK 13 15 9 6 9 5
10 TOT Y70-74 GER 5 9 9 3 5 7
11 TOT Y70-74 FRA 13 12 12 10 4 5
12 TOT Y70-74 POL 10 9 8 6 6 10
13 F Y75-79 UK 1 1 1 1 1 1
14 F Y75-79 GER 2 2 2 2 2 2
15 F Y75-79 FRA 3 3 3 3 3 3
16 F Y75-79 POL 4 4 4 4 4 4
17 M Y75-79 UK 1 1 1 1 1 1
18 M Y75-79 GER 1 1 1 1 1 1
19 M Y75-79 FRA 1 1 1 1 1 1
20 M Y75-79 POL 1 1 1 1 1 1
21 TOT Y75-79 UK 2 2 2 2 2 2
22 TOT Y75-79 GER 3 3 3 3 3 3
23 TOT Y75-79 FRA 4 4 4 4 4 4
24 TOT Y75-79 POL 5 5 5 5 5 5
dat4 <- data.frame(
Time = rep(c("2020W50","2020W51","2020W52","20210W1","2021W2","2021W3"),4),
geo = c(rep("UK",6),rep("GER",6), rep("FRA",6),rep("POL",6)),
"FY70-74" = c(5,6,8,1,4,3,3,8,7,2,4,6,7,4,9,8,3,3,5,5,2,2,6,5),
"MY70-74" = c(8,9,1,5,4,2,2,1,2,1,5,1,6,0,3,2,1,2,5,4,6,4,0,5),
"TY70-74" = c(13,15,9,6,8,5,5,9,9,3,9,7,13,4,12,10,4,5,10,9,8,6,6,10),
"FY75-79" = c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4),
"MY75-74" = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
"TY75-79" = c(2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5))
> dat4
Time geo FY70.74 MY70.74 TY70.74 FY75.79 MY75.74 TY75.79
1 2020W50 UK 5 8 13 1 1 2
2 2020W51 UK 6 9 15 1 1 2
3 2020W52 UK 8 1 9 1 1 2
4 20210W1 UK 1 5 6 1 1 2
5 2021W2 UK 4 4 8 1 1 2
6 2021W3 UK 3 2 5 1 1 2
7 2020W50 GER 3 2 5 2 1 3
8 2020W51 GER 8 1 9 2 1 3
9 2020W52 GER 7 2 9 2 1 3
10 20210W1 GER 2 1 3 2 1 3
11 2021W2 GER 4 5 9 2 1 3
12 2021W3 GER 6 1 7 2 1 3
13 2020W50 FRA 7 6 13 3 1 4
14 2020W51 FRA 4 0 4 3 1 4
15 2020W52 FRA 9 3 12 3 1 4
16 20210W1 FRA 8 2 10 3 1 4
17 2021W2 FRA 3 1 4 3 1 4
18 2021W3 FRA 3 2 5 3 1 4
19 2020W50 POL 5 5 10 4 1 5
20 2020W51 POL 5 4 9 4 1 5
21 2020W52 POL 2 6 8 4 1 5
22 20210W1 POL 2 4 6 4 1 5
23 2021W2 POL 6 0 6 4 1 5
24 2021W3 POL 5 5 10 4 1 5
>
编辑:我非常抱歉,我无法想象我想要我的数据集,但我现在知道了——我想要三个性别类别;女性";,男性"总计";分布在所有年龄段:
"LT5 Y5-9 Y14-19 Y20-24 Y25-29 Y30-34 Y35-39 Y40-44 Y45-49 Y50-54 Y55-59 Y60-64 Y65-69 Y70-74 Y75-79 Y80-84 Y85-89 OT90〃;
因此,在每个年龄段内:女性+男性=总
注意:我的TIME索引必须从第一个日期-2020W50开始,所以我需要在melt
之前以某种方式翻转列的顺序
在我的例子中,数字和日期现在混淆了。我想要正确的日期顺序(从第一到最后,当然要附上正确的数字(
我也不知道为什么会有";X’es";在我的列名后面。同样地,R改变了";70-74〃;至";70.74〃;。不知道为什么。
如果你喜欢data.table
包,这里有一种方法:
library(data.table)
setDT(datWide)
datLong <- datWide[, melt(.SD , id.vars = c("sex", "age", "geo"), variable.name = "Time")
][, dcast(.SD, age + geo + Time ~ sex, value.var = "value")]
datLong
样本输出:
age geo Time F M TOT
1: Y70-74 FRA Week1 7 6 13
2: Y70-74 FRA Week2 4 0 12
3: Y70-74 FRA Week3 9 3 12
4: Y70-74 FRA Week4 8 2 10
5: Y70-74 FRA Week5 3 1 4
使用这种方法,无论您有多少周的时间,都只能指定您的id.vars
。
library(tidyr)
library(stringr)
n <- 1129 # change this to the number accordingly
names_list <- as.character()
names_list[1:n] <- str_c("Week",seq(1:n)) %>% unlist()
datLong <- datWide %>%
pivot_longer(cols = names_list,
names_to = "Time") %>%
pivot_wider(names_from = "sex")
# A tibble: 24 x 6
age geo Time F M TOT
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Y70-74 UK Week1 5 8 13
2 Y70-74 UK Week2 6 9 15
3 Y70-74 UK Week3 8 1 9
4 Y70-74 UK Week4 1 5 6
5 Y70-74 UK Week5 4 4 9
6 Y70-74 UK Week6 3 2 5
7 Y70-74 GER Week1 3 2 5
8 Y70-74 GER Week2 8 1 9
9 Y70-74 GER Week3 7 2 9
10 Y70-74 GER Week4 2 1 3
# ... with 14 more rows
更新:R将X放在日期列之前,因为R变量"应该"以字符而非数字开头。列名正在更改,因为-字符是非标准的对象名字符。要解决这两个问题,请将check.names=FALSE
添加到数据帧构造函数的末尾:
dat3 <- data.frame(
sex = c("F","F","F", "F","M","M","M", "M","TOT","TOT","TOT","TOT","F","F","F", "F","M","M","M", "M","TOT","TOT","TOT","TOT"),
age = c(rep("Y70-74",12),rep("Y75-79",12)),
geo = c("UK","GER","FRA", "POL","UK","GER","FRA", "POL","UK","GER","FRA","POL","UK","GER","FRA", "POL","UK","GER","FRA", "POL","UK","GER","FRA","POL"),
"2021W3" = c(5,3,7,5,8,2,6,5,13,5,13,10, 1,2,3,4,1,1,1,1,2,3,4,5),
"2021W2" = c(6,8,4,5,9,1,0,4,15,9,12,9, 1,2,3,4,1,1,1,1,2,3,4,5),
"2021W1" = c(8,7,9,2,1,2,3,6,9, 9,12,8 , 1,2,3,4,1,1,1,1,2,3,4,5),
"2020W52"= c(1,2,8,2,5,1,2,4,6, 3,10,6, 1,2,3,4,1,1,1,1,2,3,4,5),
"2020W51"= c(4,4,3,6,4,5,1,0,9, 5,4, 6 , 1,2,3,4,1,1,1,1,2,3,4,5),
"2020W50"= c(3,6,3,5,2,1,2,5,5, 7,5, 10, 1,2,3,4,1,1,1,1,2,3,4,5),
check.names = FALSE)
dat3
sex age geo 2021W3 2021W2 2021W1 2020W52 2020W51 2020W50
1 F Y70-74 UK 5 6 8 1 4 3
2 F Y70-74 GER 3 8 7 2 4 6
3 F Y70-74 FRA 7 4 9 8 3 3
4 F Y70-74 POL 5 5 2 2 6 5
5 M Y70-74 UK 8 9 1 5 4 2
6 M Y70-74 GER 2 1 2 1 5 1
7 M Y70-74 FRA 6 0 3 2 1 2
8 M Y70-74 POL 5 4 6 4 0 5
9 TOT Y70-74 UK 13 15 9 6 9 5
10 TOT Y70-74 GER 5 9 9 3 5 7
11 TOT Y70-74 FRA 13 12 12 10 4 5
12 TOT Y70-74 POL 10 9 8 6 6 10
13 F Y75-79 UK 1 1 1 1 1 1
这允许您以数字开始列名,但不建议使用,应尽可能避免使用。
要将格式从dat3更改为dat4,您可以使用:
library(dplyr)
names_list <- colnames(dat3[4:length(dat3)])
datNew <- dat3 %>%
pivot_longer(cols = names_list, names_to = "Time") %>%
pivot_wider(names_from = c("sex","age"), )
datNew <- arrange(datNew, desc("Time"))
datNew
# A tibble: 24 x 8
geo Time `F_Y70-74` `M_Y70-74` `TOT_Y70-74` `F_Y75-79` `M_Y75-79` `TOT_Y75-79`
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 UK 2021W3 5 8 13 1 1 2
2 GER 2021W3 3 2 5 2 1 3
3 FRA 2021W3 7 6 13 3 1 4
4 POL 2021W3 5 5 10 4 1 5
5 UK 2021W2 6 9 15 1 1 2
6 GER 2021W2 8 1 9 2 1 3
7 FRA 2021W2 4 0 12 3 1 4
8 POL 2021W2 5 4 9 4 1 5
9 UK 2021W1 8 1 9 1 1 2
10 GER 2021W1 7 2 9 2 1 3
# ... with 14 more rows
其将创建相关列并按日期降序排列行。
由于R将"-"字符解释为减号,因此必须使用引号来调用这些列。
> datNew$`F_Y70-74`
[1] 5 6 8 1 4 3 3 8 7 2 4 6 7 4 9 8 3 3 5 5 2 2 6 5
> datNew$F_Y70-74
numeric(0)
Warning message:
Unknown or uninitialised column: `F_Y70`.
我建议将这些更改为下划线。
使用dcast
+melt
的data.table
选项
dcast(
melt(setDT(datWide),
id.var = 1:3,
variable.name = "Time"
),
Time + geo + age ~ sex
)[order(-geo, Time)]
给出
Time geo age F M TOT
1: Week1 UK Y70-74 5 8 13
2: Week2 UK Y70-74 6 9 15
3: Week3 UK Y70-74 8 1 9
4: Week4 UK Y70-74 1 5 6
5: Week5 UK Y70-74 4 4 9
6: Week6 UK Y70-74 3 2 5
7: Week1 POL Y70-74 5 5 10
8: Week2 POL Y70-74 5 4 9
9: Week3 POL Y70-74 2 6 8
10: Week4 POL Y70-74 2 4 6
11: Week5 POL Y70-74 6 0 6
12: Week6 POL Y70-74 5 5 10
13: Week1 GER Y70-74 3 2 5
14: Week2 GER Y70-74 8 1 9
15: Week3 GER Y70-74 7 2 9
16: Week4 GER Y70-74 2 1 3
17: Week5 GER Y70-74 4 5 5
18: Week6 GER Y70-74 6 1 7
19: Week1 FRA Y70-74 7 6 13
20: Week2 FRA Y70-74 4 0 12
21: Week3 FRA Y70-74 9 3 12
22: Week4 FRA Y70-74 8 2 10
23: Week5 FRA Y70-74 3 1 4
24: Week6 FRA Y70-74 3 2 5
Time geo age F M TOT