我试图将当前数据集的格式更改为每行有1个用户的格式,并且将颜色和食品列中的所有唯一值(动态值数)拆分为具有Yes和No的列。每个用户都有一个唯一的ID。
Current format:
ID | Name | Color | Food
1 | John | Blue | Pizza
1 | John | Red | Pizza
1 | John | Yellow | Pizza
1 | John | Blue | Ice Cream
1 | John | Red | Ice Cream
1 | John | Yellow | Ice Cream
2 | Kelly | Blue | Pizza
2 | Kelly | Red | Pizza
Desired format:
ID | Name | Color_Blue | Color_Red | Color_Yellow | Food_Pizza | Food_Ice Cream |
1 | John | Yes | Yes | Yes | Yes | Yes |
2 | Kelly | Yes | Yes | No | Yes | No |
library(dplyr); library(tidyr)
df %>%
pivot_longer(-c(ID:Name)) %>%
unite("col", c(name, value)) %>%
distinct(ID, Name, col) %>%
mutate(val = "Yes") %>%
pivot_wider(names_from = col, values_from = "val", values_fill = "No")
# A tibble: 2 x 7
ID Name Color_Blue Food_Pizza Color_Red Color_Yellow `Food_Ice Cream`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 John Yes Yes Yes Yes Yes
2 2 Kelly Yes Yes Yes No No
如果你想要一个基R的等价物,这里有一个使用相同步骤的。(谁能请帮我弄清楚如何删除rownames和"val !"附加到最后的列名?)
df2 <- reshape(df,
direction = "long",
varying = c("Color", "Food"),
v.names = "Value",
timevar = "col_name",
times = c("Color", "Food"))
df2$col = paste(df2$col_name, df2$Value, sep = "_")
df3 <- unique(df2[c("ID", "Name", "col")])
df3$val = "Yes"
df4 <- reshape(df3,
direction = "wide",
idvar = c("ID", "Name"),
timevar = "col")
df4[is.na(df4)] <- "No"
> df4
ID Name val.Color_Blue val.Color_Red val.Color_Yellow val.Food_Pizza val.Food_Ice Cream
1.Color 1 John Yes Yes Yes Yes Yes
7.Color 2 Kelly Yes Yes No Yes No
示例数据df <- tribble(~ID , ~Name , ~Color , ~Food,
"1" , "John", "Blue", "Pizza",
"1" , "John" , "Red", "Pizza",
"1" , "John", "Yellow", "Pizza",
"1" , "John" , "Blue", "Ice Cream",
"1" , "John", "Red", "Ice Cream",
"1" , "John" , "Yellow", "Ice Cream",
"2" , "Kelly", "Blue", "Pizza",
"2" , "Kelly", "Red", "Pizza")
基本R脚本:
# Data to import: df => data.frame
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), Name = c("John",
"John", "John", "John", "John", "John", "Kelly", "Kelly"), Color = c("Blue",
"Red", "Yellow", "Blue", "Red", "Yellow", "Blue", "Red"), Food = c("Pizza",
"Pizza", "Pizza", "Ice Cream", "Ice Cream", "Ice Cream", "Pizza",
"Pizza")), class = "data.frame", row.names = c(NA, -8L))
# Function to extract the column names of data.frame
# not contained in a character vector:
# resolve_other_vec_names => function
resolve_other_vec_names <- function(df, vec_names){
# Explicitly define returned object: character vector => env
return(
colnames(df)[!(
colnames(df) %in% vec_names
)
]
)
}
# Create a formula to aggregate a data.frame by:
# resolve_agg_formula => function()
resolve_agg_formula <- function(keep_vecs){
# Formula object to aggregate data.frame by:
# res => formula object
res <- as.formula(
paste(
".",
paste0(
keep_vecs,
collapse = "+"
),
sep = "~"
)
)
# Explicitly define returned object: formula => env
return(res)
}
# Function required to aggregate vector by:
# agg_func => function
agg_func <- function(df, agg_formula){
# Function to agg by: .agg_vec_by => function
.agg_vec_by <- function(x){
ifelse(
any(x),
"Yes",
"No"
)
}
# Aggregate data.frame: res => data.frame
res <- aggregate(
agg_formula,
df,
FUN = .agg_vec_by
)
# Explicitly define the returned object:
# data.frame => env
return(res)
}
# Function to spread a data.frame's vector,
# from unique row-values to column vectors:
# spread_func => function()
spread_func <- function(df, vec_name){
# Extract the unique values of a given vector:
# y => vector
y <- unique(df[,vec_name])
# Determine if a row contains a given value in y:
# row_contains_value_df => boolean data.frame
row_contains_value_df <- data.frame(
outer(
df[,vec_name],
y,
`==`
),
row.names = NULL
)
# Create the data.frame vector names:
# df_vec_names => character vector
df_vec_names <- paste(
vec_name,
y,
sep = "_"
)
# Rename the data.frame vectors: res => data.frame
res <- setNames(
row_contains_value_df,
df_vec_names
)
# Explicitly define the returned object: data.frame => env
return(res)
}
# Function to combine list of data.frames into df:
# df_list_2_df => function
df_list_2_df <- function(df_list, cmb_func = c(rbind, cbind)){
# Resolve the desired combination function:
# cmb_func_resolved => character scalar
cmb_func_resolved <- match.fun(cmb_func)
# Combine list of data.frames into a data.frame
# using a given combination function: res => data.frame
res <- data.frame(
do.call(
cmb_func_resolved,
df_list
),
row.names = NULL
)
# Explicitly define the returned object:
# data.frame => Env
return(res)
}
# Define the main function: main => function
main <- function(){
# Vectors to spread values to columns:
# spread_vecs => character vector
spread_vecs <- c("Color", "Food")
# Vectors to keep as columns: keep_vecs => character vector
keep_vecs <- resolve_other_vec_names(df, spread_vecs)
# Formula to aggregate the data.frame by:
# agg_formula => formula object
agg_formula <- resolve_agg_formula(keep_vecs)
# Resolve if person/id has observed value:
# res => data.frame
res <- agg_func(
cbind(
df[,keep_vecs],
df_list_2_df(
lapply(
spread_vecs,
function(x){
spread_func(df, x)
}
),
cbind
)
),
agg_formula
)
# Print data.frame to console: data.frame => stdout(console)
res
}
# Execute main if called:
if (sys.nframe() == 0){
# Execute the main function: data.frame => stdout(console)
main()
}