R用分号将一列分隔成行



我使用的是RStudio 2.15.0,并使用XLConnect从Excel创建了一个包含3000多行和12列的对象。我正在尝试将一列分隔/拆分为多行,但不知道这是否可能,也不知道如何做到。下面的数据示例使用连接中的3列。这方面的任何帮助都将是巨大的。

下面是适用于其中两列的代码。

v1 <- with(df, tapply(PolId, Description,  FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '',     x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1),  str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)  

样本数据

PolId   Description  Document.Type
ABC123;ABC456;ABC789;   TEST1  Pol1
ABC123;ABC456;ABC789;   TEST1  Pol1
ABC123;ABC456;ABC789;   TEST1  Pol1
AAA123; TEST1  End1
AAA123; TEST2  End2
ABB123;ABC123;  TEST3  End1
ABB123;ABC123;  TEST3  End1

我希望输出像这样(替换重复的Polid)

PolId   Description  Document.Type
ABC123  TEST1        Pol1
ABC456  TEST1        Pol1
ABC789  TEST1        Pol1
AAA123  TEST1        End1
AAA123  TEST2        End2
ABB123  TEST3        End1
ABC123  TEST3        End1

这里是一个基本的R解决方案。使用strplit分割PolId字段,并且对于每个这样的分割字段cbind它与对应的Description。这给出了我们共同rbind的矩阵列表。最后设置列名。

out <- do.call(rbind, Map(cbind, strsplit(DF$PolId, ";"), DF$Description))
colnames(out) <- colnames(DF)

给予:

> out
      PolId    Description
 [1,] "ABC123" "TEST1"    
 [2,] "ABC456" "TEST1"    
 [3,] "ABC789" "TEST1"    
 [4,] "ABC123" "TEST1"    
 [5,] "ABC456" "TEST1"    
 [6,] "ABC789" "TEST1"    
 [7,] "ABC123" "TEST1"    
 [8,] "ABC456" "TEST1"    
 [9,] "ABC789" "TEST1"    
[10,] "AAA123" "TEST1"    
[11,] "AAA123" "TEST2"    
[12,] "ABB123" "TEST3"    
[13,] "ABC123" "TEST3"    
[14,] "ABB123" "TEST3"    
[15,] "ABC123" "TEST3" 

注意:我们将其用作输入:

DF <-
structure(list(PolId = c("ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;", 
"ABC123;ABC456;ABC789;", "AAA123;", "AAA123;", "ABB123;ABC123;", 
"ABB123;ABC123;"), Description = c("TEST1", "TEST1", "TEST1", 
"TEST1", "TEST2", "TEST3", "TEST3")), .Names = c("PolId", "Description"
), class = "data.frame", row.names = c(NA, -7L))

这里有一个快速的data.table可能的解决方案

library(data.table)
unique(setDT(df)[, .(PolId = unlist(strsplit(as.character(PolId), ";"))), by = Description])
#    Description  PolId
# 1:       TEST1 ABC123
# 2:       TEST1 ABC456
# 3:       TEST1 ABC789
# 4:       TEST1 AAA123
# 5:       TEST2 AAA123
# 6:       TEST3 ABB123
# 7:       TEST3 ABC123

根据您的编辑-另一个选项(如果您有两列以上)

library(splitstackshape)
unique(cSplit(df, "PolId", ";", "long"))
#     PolId Description Document.Type
# 1: ABC123       TEST1          Pol1
# 2: ABC456       TEST1          Pol1
# 3: ABC789       TEST1          Pol1
# 4: AAA123       TEST1          End1
# 5: AAA123       TEST2          End2
# 6: ABB123       TEST3          End1
# 7: ABC123       TEST3          End1

您可以在拆分"PolId"列并获得unique行后,从tidyr尝试unnest

library(dplyr)
library(tidyr)
 unnest(setNames(strsplit(df$PolId, ';'), df$Description), 
                                  Description) %>% unique()

或者将base Rstack/strsplit/duplicated一起使用。用分隔符(;)拆分"PolId"(strsplit),用"Description"列命名输出列表元素,stack列表以获得"data.frame",并使用duplicated删除重复行。

df1 <- stack(setNames(strsplit(df$PolId, ';'), df$Description))
setNames(df1[!duplicated(df1),], names(df))
#     PolId Description
#1  ABC123       TEST1
#2  ABC456       TEST1
#3  ABC789       TEST1
#10 AAA123       TEST1
#11 AAA123       TEST2
#12 ABB123       TEST3
#13 ABC123       TEST3

或者不使用strsplit 的另一个选项

v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
            x1 <- paste(x, collapse=";")
        gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1),  str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
#   PolId Description
#1 ABC123       TEST1
#2 ABC456       TEST1
#3 ABC789       TEST1
#4 AAA123       TEST1
#5 AAA123       TEST2
#6 ABB123       TEST3
#7 ABC123       TEST3

最新更新