R - 多列 CSV 文本转换为存在缺席矩阵



我有一个这样的csv文件:

col1    col2    col3
r1  a,b,c   e,f      g
r2  h,i             j,k
r3  l      m,n,o

有些单元格有多个文本逗号分隔,有些单元格只有单个,有些则没有。我想像这样转换它:

col1  col2  col3
a   1     0     0
b   1     0     0
c   1     0     0
e   0     1     0
f   0     1     0
g   0     0     1
h   1     0     0
i   1     0     0
j   0     0     1
k   0     0     1
l   1     0     0
m   0     1     0
n   0     1     0
o   0     1     0

有什么建议吗?我尝试在 excel 中使用数据透视表,但没有获得所需的输出。 提前谢谢。

此致敬意 齐鲁尔

不确定这是否是最短的解决方案(可能不是(,但它会产生所需的输出。基本上,我们遍历所有三列并计算字符串的出现次数,并获得一个长格式数据框,然后将其翻转到您想要的宽格式。

library(tidyr)
library(purrr)
df <- data_frame(col1 = c("a,b,c", "h,i", "l"),
col2 = c("e,f", "", "m,n,o"),
col3 = c("g", "j,k", ""))
let_df <- map_df(df, function(col){
# map_df applies the function to each column of df
# split strings at "," and unlist to get vector of letters 
letters <- unlist(str_split(col, ","))
# delete "" 
letters <- letters[nchar(letters) > 0]
# count occurrences for each letter
tab <- table(letters)
# replace with 1 if occurs more often
tab[tab > 1] <- 1 
# create data frame from table
df <- data_frame(letter = names(tab), count = tab) 
return(df)
}, .id = "col") # id adds a column col that contains col1 - col3
# bring data frame into wide format 
let_df %>% 
spread(col, count, fill = 0)

这么好的问题要解决。这是我在基础R中对它的看法:

col1 <- c("a,b,c","h,i","l")
col2 <- c("e,f","","m,n,o")
col3 <- c("g","j,k","")
data <- data.frame(col1, col2, col3, stringsAsFactors = F)
restructure <- function(df){
df[df==""] <- "missing"
result_rows <- as.character()
l <- list()
for (i in seq_along(colnames(df)) ){
df_col <- sort(unique(unlist(strsplit(gsub(" ", "",toString(df[[i]])), ","))))
df_col <- df_col[!df_col %in% "missing"]
result_rows <- sort(unique(c(result_rows, df_col)))
l[i] <- list(df_col)
}
result <- data.frame(result_rows)
for (j in seq_along(l)){
result$temp <- NA
result$temp[match(l[[j]], result_rows)] <- 1
colnames(result)[colnames(result)=="temp"] <- colnames(df)[j]
}
result[is.na(result)] <- 0
return(result)
}
> restructure(data)
#  result_rows col1 col2 col3
#1            a    1    0    0
#2            b    1    0    0
#3            c    1    0    0
#4            e    0    1    0
#5            f    0    1    0
#6            g    0    0    1
#7            h    1    0    0
#8            i    1    0    0
#9            j    0    0    1
#10           k    0    0    1
#11           l    1    0    0
#12           m    0    1    0
#13           n    0    1    0
#14           o    0    1    0

最新更新