这是我的数据示例
# Sample data as printed by dput(): a 16-row data.frame in which every
# odd-numbered row is empty (NA in the integer columns, "" in the factors).
dput(mydat)
structure(list(ID.group = c(NA, 10150591L, NA, 10150591L, NA,
10150591L, NA, 68837296L, NA, 68837296L, NA, 68837296L, NA, 124771228L,
NA, 124771228L), UserID = c(NA, 181078814L, NA, 88578209L, NA,
30240768L, NA, 334686951L, NA, 297170412L, NA, 265332359L, NA,
216632504L, NA, 5272133L), countlike = c(NA, 44L, NA, 50L, NA,
99L, NA, 1L, NA, 1L, NA, 15L, NA, 41L, NA, 20L), statistics.snt = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("",
"fb"), class = "factor"), statistics.created_at = structure(c(1L,
8L, 1L, 4L, 1L, 7L, 1L, 2L, 1L, 2L, 1L, 5L, 1L, 3L, 1L, 6L), .Label = c("",
"10.04.2020 9:14", "11.04.2020 0:01", "11.04.2020 19:22", "12.04.2020 19:45",
"12.04.2020 6:54", "13.04.2020 20:47", "17.04.2020 23:02"), class = "factor"),
statistics.updated_at = structure(c(1L, 8L, 1L, 7L, 1L, 6L,
1L, 3L, 1L, 3L, 1L, 4L, 1L, 5L, 1L, 2L), .Label = c("", "22.04.2020 12:27",
"22.04.2020 12:51", "22.04.2020 14:19", "22.04.2020 5:41",
"22.04.2020 6:18", "22.04.2020 7:37", "30.04.2020 16:55"), class = "factor"),
statistics.is_recount = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("", "False"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
我想按 ID.group 分组,计算每组 countlike 的中位数
library(psych)
# Attempted approach: describeBy() reports a full set of descriptive
# statistics for each ID.group, not just the median of countlike.
describeBy(mydat,mydat$ID.group)
但我没有得到想要的结果,而是得到了全部的描述性统计量。我怎样才能得到如下结果?
ID group median countlike
10150591 50
68837296 1
那么,如何根据每个 UserID 的取值计算一个分类变量呢?例如,ID.group = 10150591 的中位数为 50。如果某个 UserID 的 countlike 比该组中位数高出 25% 以上,则为"红色"。50 的 25% 是 $50/100 \times 25 = 12.5$,所以阈值是 $50 + 12.5 = 62.5$:如果 UserID = 30240768 的 countlike 大于 62.5,则为"红色"。UserID = 30240768 的 countlike 为 99,所以它是"红色"。
如果某个 UserID 的 countlike 比该组中位数低 25% 以上,即小于 $50 - 12.5 = 37.5$,则为"绿色";这一组中没有这样的值。
最后,如果值落在该组中位数的 ±24% 范围内,则为"橙色"。50 的 24% 是 $50/100 \times 24 = 12$,所以如果某个 UserID 的 countlike 在 $50 \pm 12$(即 38–62)之间,则为"橙色"。所以想要的输出是:
ID group UserID countlike median countlike
10150591 181078814 44 orange
10150591 88578209 50 orange
10150591 30240768 99 red
68837296 334686951 1 green
68837296 297170412 1 green
68837296 265332359 15 red
我该如何实现这些条件?
下面是使用 dplyr 的解答。我们先按组把数据聚合为中位数,再把中位数与原始数据合并,然后计算 color。
首先,我们读取 OP 提供的 dput() 数据,并删除含缺失值的行。
# Recreate the sample data frame from the dput() output given in the question
# (16 rows; the odd-numbered rows are NA / "" placeholders).
data <- structure(list(ID.group = c(NA, 10150591L, NA, 10150591L, NA,
10150591L, NA, 68837296L, NA, 68837296L, NA, 68837296L, NA, 124771228L,
NA, 124771228L), UserID = c(NA, 181078814L, NA, 88578209L, NA,
30240768L, NA, 334686951L, NA, 297170412L, NA, 265332359L, NA,
216632504L, NA, 5272133L), countlike = c(NA, 44L, NA, 50L, NA,
99L, NA, 1L, NA, 1L, NA, 15L, NA, 41L, NA, 20L), statistics.snt = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("",
"fb"), class = "factor"), statistics.created_at = structure(c(1L,
8L, 1L, 4L, 1L, 7L, 1L, 2L, 1L, 2L, 1L, 5L, 1L, 3L, 1L, 6L), .Label = c("",
"10.04.2020 9:14", "11.04.2020 0:01", "11.04.2020 19:22", "12.04.2020 19:45",
"12.04.2020 6:54", "13.04.2020 20:47", "17.04.2020 23:02"), class = "factor"),
statistics.updated_at = structure(c(1L, 8L, 1L, 7L, 1L, 6L,
1L, 3L, 1L, 3L, 1L, 4L, 1L, 5L, 1L, 2L), .Label = c("", "22.04.2020 12:27",
"22.04.2020 12:51", "22.04.2020 14:19", "22.04.2020 5:41",
"22.04.2020 6:18", "22.04.2020 7:37", "30.04.2020 16:55"), class = "factor"),
statistics.is_recount = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("", "False"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L))
# Keep only the rows that have a group id; the NA placeholder rows are dropped.
data <- data[!is.na(data[["ID.group"]]), , drop = FALSE]
接下来,我们加载 dplyr 并计算所需的输出。
library(dplyr)
# Compute the median of countlike per ID.group, join it back onto the
# individual rows, and label each row by how its countlike compares with the
# median of its own group.
mergedData <- data %>%
  group_by(ID.group) %>%
  summarise(mdn_countlike = median(countlike)) %>%
  # Name the key explicitly: this avoids the "Joining, by = ..." message and
  # keeps the join from silently picking up any other shared column.
  inner_join(data, by = "ID.group") %>%
  # red:    more than 25% above the group median
  # green:  more than 25% below the group median
  # orange: within +/-25% of the group median (boundaries included)
  mutate(color = case_when(
    countlike > 1.25 * mdn_countlike ~ "red",
    countlike < 0.75 * mdn_countlike ~ "green",
    countlike >= 0.75 * mdn_countlike &
      countlike <= 1.25 * mdn_countlike ~ "orange"
  ))
# Show only the columns relevant to the question.
mergedData[, c("ID.group", "UserID", "countlike", "mdn_countlike", "color")]
输出如下:
> mergedData[,c("ID.group","UserID","countlike","mdn_countlike","color")]
# A tibble: 8 x 5
ID.group UserID countlike mdn_countlike color
<int> <int> <int> <dbl> <chr>
1 10150591 181078814 44 50 orange
2 10150591 88578209 50 50 orange
3 10150591 30240768 99 50 red
4 68837296 334686951 1 1 orange
5 68837296 297170412 1 1 orange
6 68837296 265332359 15 1 red
7 124771228 216632504 41 30.5 red
8 124771228 5272133 20 30.5 green
>