我使用的是RStudio 2.15.0,并使用XLConnect从Excel创建了一个包含3000多行和12列的对象。我正在尝试将一列分隔/拆分为多行,但不知道这是否可能,也不知道如何做到。下面的数据示例使用连接中的3列。这方面的任何帮助都将是巨大的。
下面是适用于其中两列的代码。
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1), str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
样本数据
PolId Description Document.Type
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
ABC123;ABC456;ABC789; TEST1 Pol1
AAA123; TEST1 End1
AAA123; TEST2 End2
ABB123;ABC123; TEST3 End1
ABB123;ABC123; TEST3 End1
我希望输出像这样(替换重复的Polid)
PolId Description Document.Type
ABC123 TEST1 Pol1
ABC456 TEST1 Pol1
ABC789 TEST1 Pol1
AAA123 TEST1 End1
AAA123 TEST2 End2
ABB123 TEST3 End1
ABC123 TEST3 End1
这里是一个基本的R解决方案。使用strplit
分割PolId
字段,并且对于每个这样的分割字段cbind它与对应的Description
。这给出了我们共同rbind
的矩阵列表。最后设置列名。
out <- do.call(rbind, Map(cbind, strsplit(DF$PolId, ";"), DF$Description))
colnames(out) <- colnames(DF)
给予:
> out
PolId Description
[1,] "ABC123" "TEST1"
[2,] "ABC456" "TEST1"
[3,] "ABC789" "TEST1"
[4,] "ABC123" "TEST1"
[5,] "ABC456" "TEST1"
[6,] "ABC789" "TEST1"
[7,] "ABC123" "TEST1"
[8,] "ABC456" "TEST1"
[9,] "ABC789" "TEST1"
[10,] "AAA123" "TEST1"
[11,] "AAA123" "TEST2"
[12,] "ABB123" "TEST3"
[13,] "ABC123" "TEST3"
[14,] "ABB123" "TEST3"
[15,] "ABC123" "TEST3"
注意:我们将其用作输入:
DF <-
structure(list(PolId = c("ABC123;ABC456;ABC789;", "ABC123;ABC456;ABC789;",
"ABC123;ABC456;ABC789;", "AAA123;", "AAA123;", "ABB123;ABC123;",
"ABB123;ABC123;"), Description = c("TEST1", "TEST1", "TEST1",
"TEST1", "TEST2", "TEST3", "TEST3")), .Names = c("PolId", "Description"
), class = "data.frame", row.names = c(NA, -7L))
这里有一个快速的data.table
可能的解决方案
library(data.table)
unique(setDT(df)[, .(PolId = unlist(strsplit(as.character(PolId), ";"))), by = Description])
# Description PolId
# 1: TEST1 ABC123
# 2: TEST1 ABC456
# 3: TEST1 ABC789
# 4: TEST1 AAA123
# 5: TEST2 AAA123
# 6: TEST3 ABB123
# 7: TEST3 ABC123
根据您的编辑-另一个选项(如果您有两列以上)
library(splitstackshape)
unique(cSplit(df, "PolId", ";", "long"))
# PolId Description Document.Type
# 1: ABC123 TEST1 Pol1
# 2: ABC456 TEST1 Pol1
# 3: ABC789 TEST1 Pol1
# 4: AAA123 TEST1 End1
# 5: AAA123 TEST2 End2
# 6: ABB123 TEST3 End1
# 7: ABC123 TEST3 End1
您可以在拆分"PolId"列并获得unique
行后,从tidyr
尝试unnest
library(dplyr)
library(tidyr)
unnest(setNames(strsplit(df$PolId, ';'), df$Description),
Description) %>% unique()
或者将base R
与stack/strsplit/duplicated
一起使用。用分隔符(;
)拆分"PolId"(strsplit
),用"Description"列命名输出列表元素,stack
列表以获得"data.frame",并使用duplicated
删除重复行。
df1 <- stack(setNames(strsplit(df$PolId, ';'), df$Description))
setNames(df1[!duplicated(df1),], names(df))
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#10 AAA123 TEST1
#11 AAA123 TEST2
#12 ABB123 TEST3
#13 ABC123 TEST3
或者不使用strsplit
的另一个选项
v1 <- with(df, tapply(PolId, Description, FUN= function(x) {
x1 <- paste(x, collapse=";")
gsub('(\b\S+\b)(?=.*\b\1\b.*);', '', x1, perl=TRUE)}))
library(stringr)
Description <- rep(names(v1), str_count(v1, '\w+'))
PolId <- scan(text=gsub(';+', ' ', v1), what='', quiet=TRUE)
data.frame(PolId, Description)
# PolId Description
#1 ABC123 TEST1
#2 ABC456 TEST1
#3 ABC789 TEST1
#4 AAA123 TEST1
#5 AAA123 TEST2
#6 ABB123 TEST3
#7 ABC123 TEST3