r语言 - 为给定行的每个唯一值添加列



我试图将当前数据集的格式更改为每行有1个用户的格式,并且将颜色和食品列中的所有唯一值(动态值数)拆分为具有Yes和No的列。每个用户都有一个唯一的ID。

Current format: 
ID | Name  | Color  | Food 
1  | John  | Blue   | Pizza
1  | John  | Red    | Pizza
1  | John  | Yellow | Pizza
1  | John  | Blue   | Ice Cream
1  | John  | Red    | Ice Cream
1  | John  | Yellow | Ice Cream
2  | Kelly | Blue   | Pizza
2  | Kelly | Red    | Pizza

Desired format: 
ID | Name  | Color_Blue | Color_Red | Color_Yellow | Food_Pizza | Food_Ice Cream |
1  | John  | Yes        | Yes       | Yes          | Yes        | Yes            |
2  | Kelly | Yes        | Yes       | No           | Yes        | No             |
library(dplyr); library(tidyr)
df %>% 
pivot_longer(-c(ID:Name)) %>%
unite("col", c(name, value)) %>%
distinct(ID, Name, col) %>%
mutate(val = "Yes") %>%
pivot_wider(names_from = col, values_from = "val", values_fill = "No")
# A tibble: 2 x 7
ID    Name  Color_Blue Food_Pizza Color_Red Color_Yellow `Food_Ice Cream`
<chr> <chr> <chr>      <chr>      <chr>     <chr>        <chr>           
1 1     John  Yes        Yes        Yes       Yes          Yes             
2 2     Kelly Yes        Yes        Yes       No           No   

如果你想要一个基R的等价物,这里有一个使用相同步骤的。(谁能请帮我弄清楚如何删除rownames和"val !"附加到最后的列名?)

df2 <- reshape(df, 
direction = "long", 
varying = c("Color", "Food"),
v.names = "Value",
timevar = "col_name",
times = c("Color", "Food"))
df2$col = paste(df2$col_name, df2$Value, sep = "_")
df3 <- unique(df2[c("ID", "Name", "col")])
df3$val = "Yes"
df4 <- reshape(df3,
direction = "wide",
idvar = c("ID", "Name"),
timevar = "col")
df4[is.na(df4)] <- "No"
> df4
ID  Name val.Color_Blue val.Color_Red val.Color_Yellow val.Food_Pizza val.Food_Ice Cream
1.Color  1  John            Yes           Yes              Yes            Yes                Yes
7.Color  2 Kelly            Yes           Yes               No            Yes                 No

示例数据
df <- tribble(~ID , ~Name  , ~Color  , ~Food,
"1"  , "John",  "Blue",    "Pizza",
"1"  , "John" , "Red",    "Pizza",
"1"  , "John",  "Yellow",  "Pizza",
"1"  , "John" , "Blue",   "Ice Cream",
"1"  , "John",  "Red",    "Ice Cream",
"1"  , "John" , "Yellow", "Ice Cream",
"2"  , "Kelly", "Blue",    "Pizza",
"2"  , "Kelly", "Red",    "Pizza")

基本R脚本:

# Data to import: df => data.frame
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), Name = c("John", 
"John", "John", "John", "John", "John", "Kelly", "Kelly"), Color = c("Blue", 
"Red", "Yellow", "Blue", "Red", "Yellow", "Blue", "Red"), Food = c("Pizza", 
"Pizza", "Pizza", "Ice Cream", "Ice Cream", "Ice Cream", "Pizza", 
"Pizza")), class = "data.frame", row.names = c(NA, -8L))
# Function to extract the column names of data.frame 
# not contained in a character vector: 
# resolve_other_vec_names => function
resolve_other_vec_names <- function(df, vec_names){
# Explicitly define returned object: character vector => env
return(
colnames(df)[!(
colnames(df) %in% vec_names
)
]
)
}
# Create a formula to aggregate a data.frame by:
# resolve_agg_formula => function()
resolve_agg_formula <- function(keep_vecs){
# Formula object to aggregate data.frame by:
# res => formula object
res <- as.formula(
paste(
".", 
paste0(
keep_vecs,
collapse = "+"
),
sep = "~"
)
)
# Explicitly define returned object: formula => env
return(res)
}
# Function required to aggregate vector by:
# agg_func => function
agg_func <- function(df, agg_formula){
# Function to agg by: .agg_vec_by => function
.agg_vec_by <- function(x){
ifelse(
any(x),
"Yes",
"No"
)
}

# Aggregate data.frame: res => data.frame 
res <- aggregate(
agg_formula, 
df,
FUN = .agg_vec_by
)

# Explicitly define the returned object:
# data.frame => env
return(res)
}
# Function to spread a data.frame's vector, 
# from unique row-values to column vectors:
# spread_func => function()
spread_func <- function(df, vec_name){
# Extract the unique values of a given vector:
# y => vector
y <- unique(df[,vec_name])

# Determine if a row contains a given value in y:
# row_contains_value_df => boolean data.frame
row_contains_value_df <- data.frame(
outer(
df[,vec_name], 
y,
`==`
),
row.names = NULL
)

# Create the data.frame vector names: 
# df_vec_names => character vector
df_vec_names <- paste(
vec_name,
y,
sep = "_"
)

# Rename the data.frame vectors: res => data.frame
res <- setNames(
row_contains_value_df,
df_vec_names
)

# Explicitly define the returned object: data.frame => env
return(res)
}
# Function to combine list of data.frames into df: 
# df_list_2_df => function
df_list_2_df <- function(df_list, cmb_func = c(rbind, cbind)){
# Resolve the desired combination function: 
# cmb_func_resolved => character scalar
cmb_func_resolved <- match.fun(cmb_func)
# Combine list of data.frames into a data.frame 
# using a given combination function: res => data.frame
res <- data.frame(
do.call(
cmb_func_resolved,
df_list
),
row.names = NULL
)
# Explicitly define the returned object: 
# data.frame => Env
return(res)
}
# Define the main function: main => function
main <- function(){
# Vectors to spread values to columns: 
# spread_vecs => character vector
spread_vecs <- c("Color", "Food")

# Vectors to keep as columns: keep_vecs => character vector
keep_vecs <- resolve_other_vec_names(df, spread_vecs)

# Formula to aggregate the data.frame by: 
# agg_formula => formula object
agg_formula <- resolve_agg_formula(keep_vecs)

# Resolve if person/id has observed value: 
# res => data.frame
res <- agg_func(
cbind(
df[,keep_vecs],
df_list_2_df(
lapply(
spread_vecs,
function(x){
spread_func(df, x)
}
),
cbind
)
),
agg_formula
)

# Print data.frame to console: data.frame => stdout(console)
res

}
# Execute main if called:
if (sys.nframe() == 0){
# Execute the main function: data.frame => stdout(console)
main()
}