我正在对一些数据做 QA/QC,想查看每个组中被标记为"可能更容易出错"的数据所占的百分比。我有 15 个组需要重复这一计算,但不确定最好的做法是什么。也许用 for 循环?
# Keep only the rows that belong to group 1.
group_1 <- filter(flow_group_df, GROUP == 1)
# Count the cells of the group 1 subset equal to "flagged"
# (the comparison runs over every column; NA cells are ignored).
group_1_flagged <- sum(group_1 == "flagged", na.rm = TRUE)
# Count the cells equal to "checked", i.e. checked but not flagged.
class_1_checked <- sum(group_1 == "checked", na.rm = TRUE)
考虑使用 aggregate:它来自 R 内置的 stats 包,用于进行完整的分组聚合。
# One row per GROUP x group_1 combination; the num_col column of the result
# holds the number of rows in that combination.
agg_df <- aggregate(
  num_col ~ GROUP + group_1,
  data = flow_group_df,
  FUN = length
)
或者使用 ave:进行内联聚合,即把每组的聚合结果直接写回到原数据的每一行。
# Attach to every row the size of its (GROUP, group_1) combination.
flow_group_df$group_1_count <- ave(
  flow_group_df$num_col,
  flow_group_df$GROUP,
  flow_group_df$group_1,
  FUN = length
)
用随机数据进行演示:
# Reproducible demo data: 60 rows, three group labels cycling row by row,
# a random flagged/checked status and a uniform numeric column in [0, 100].
set.seed(72318)
flow_group_df <- local({
  n_obs <- 60
  # Draw the status before the numeric values so the RNG stream is consumed
  # in the same order as the data.frame() arguments below.
  status <- sample(c("flagged", "checked"), n_obs, replace = TRUE)
  values <- runif(n_obs, 0, 100)
  data.frame(
    GROUP = rep(c("julia", "r", "pandas"), length.out = n_obs),
    group_1 = status,
    num_col = values
  )
})
使用 aggregate:
# Count the rows in every GROUP x group_1 combination.
agg_df <- aggregate(
  num_col ~ GROUP + group_1,
  data = flow_group_df,
  FUN = length
)
# Sort by GROUP, then by status within each GROUP.
agg_df <- agg_df[order(agg_df$GROUP, agg_df$group_1), ]
# Drop the shuffled row names left over from sorting.
rownames(agg_df) <- NULL
# The third column holds the counts, so give it a descriptive name.
names(agg_df)[3] <- "count"
agg_df
# Output shown by the original author:
# GROUP group_1 count
# 1 julia checked 10
# 2 julia flagged 10
# 3 pandas checked 8
# 4 pandas flagged 12
# 5 r checked 7
# 6 r flagged 13
使用 ave(计数和百分比计算):
# Size of each (GROUP, group_1) combination, repeated on every row of it.
flow_group_df$group_1_count <- ave(
  flow_group_df$num_col,
  flow_group_df$GROUP,
  flow_group_df$group_1,
  FUN = length
)
# Share of the GROUP's rows that fall into this row's status:
# combination size divided by the total number of rows in the GROUP.
flow_group_df$group_1_pct <- flow_group_df$group_1_count /
  ave(flow_group_df$num_col, flow_group_df$GROUP, FUN = length)
# Sort by GROUP, then by status, and renumber the rows.
flow_group_df <- flow_group_df[
  order(flow_group_df$GROUP, flow_group_df$group_1),
]
rownames(flow_group_df) <- NULL
tail(flow_group_df, n = 20)
# Output shown by the original author:
# GROUP group_1 num_col group_1_count group_1_pct
# 41 r checked 8.128056 7 0.35
# 42 r checked 86.439911 7 0.35
# 43 r checked 75.488474 7 0.35
# 44 r checked 88.120510 7 0.35
# 45 r checked 43.058268 7 0.35
# 46 r checked 46.662674 7 0.35
# 47 r checked 42.329505 7 0.35
# 48 r flagged 94.959380 13 0.65
# 49 r flagged 64.817015 13 0.65
# 50 r flagged 61.118952 13 0.65
# 51 r flagged 69.104977 13 0.65
# 52 r flagged 98.078729 13 0.65
# 53 r flagged 74.857959 13 0.65
# 54 r flagged 83.813440 13 0.65
# 55 r flagged 99.069011 13 0.65
# 56 r flagged 62.298414 13 0.65
# 57 r flagged 14.335920 13 0.65
# 58 r flagged 70.404048 13 0.65
# 59 r flagged 18.744892 13 0.65
# 60 r flagged 21.598072 13 0.65