R:先用 group_by 分组,然后遍历各个分组并提取原始列的值



我的数据框(df):

ID1 | ID2 |  V1  |  V2 |  V3
A   | B   | var1 | foo |  1   
C   | D   | var2 | bar |  2
E   | F   | var3 | foo |  3
G   | F   | var3 | foo |  3
H   | I   | var4 | zap |  2
...

ID1 和 ID2 包含重叠的值,因为该数据是矩阵上三角的长格式版本,其中去掉了自身与自身的比较(如 A 与 A),并附加了一些额外的元数据(V1、V2、V3)。

以上数据必须按 V1、V2 和 V3 分组,最终输出是组成每个组的 ID 列表(ID1 和 ID2 包含重叠的值),每个列表写入一个单独的文件。

到目前为止,我已经按这些变量完成了分组,但卡在了如何用 dplyr 遍历每个组并取得每个组的取值这一步。

我脑海中的伪代码如下:

# Pseudo-code sketch: lines starting with "[?]" mark steps that are not
# implemented yet and are not valid R.
# Group the rows by the three metadata columns
cluster <- df %>% group_by(V1,V2,V3) 
[?] # loop through each group in cluster

[?] # get group values as x, y and z

# Get IDs into lists and merge
# Both filters select the rows of the current group (V1, V2, V3 equal to
# x, y, z); one extracts the ID1 column, the other the ID2 column.
ID1 <- df %>% filter(V1 == x, V2 ==y, V3 == z) %>%
pull(ID1)
ID2 <- df %>% filter(V1 == x, V2 ==y, V3 == z) %>%
pull(ID2)
merged <- c(ID1,ID2) 

# Drop IDs that occur more than once in the merged result
merged_unique <- unique(unlist(merged))
# Print out to file
# The file name joins the three values and "txt" with "." as separator.
# NOTE(review): the group values are called x, y and z above, but X, Y and Z
# are used here; R is case-sensitive, so these are different names — confirm
# which spelling is intended.
fileConn <- file(paste(X ,Y, Z,"txt", sep="."))
writeLines(merged_unique, fileConn)
close(fileConn)

我希望我的最终输出是:

  • 文件var1.foo.1.txt:
A
B
  • 文件var2.bar.2.txt:
C
D
  • 文件var3.foo.3.txt:
E
F
G
  • 文件var4.zap.2.txt:
H
I

感谢您的帮助。

我不确定预期的输出是什么,希望下面的代码能对您有所帮助:

# Split the two ID columns by a per-row key made of V1, V2 and V3 (joined with
# paste()'s default single-space separator), then reduce every piece to the
# vector of distinct IDs it contains.
lapply(
  split(df[c("ID1", "ID2")], paste(df$V1, df$V2, df$V3)),
  function(ids) unique(unlist(ids))
)

它给出

$`var1 foo 1`
[1] "A" "B"
$`var2 bar 2`
[1] "C" "D"
$`var3 foo 3`
[1] "E" "G" "F"
$`var4 zap 2`
[1] "H" "I"

如果您想将所有组保存到不同的*.txt文件中,您可以尝试下面的代码

# Collect the distinct IDs of every (V1, V2, V3) group. The key joins the three
# grouping values with "." so that each list name is the requested file stem,
# e.g. "var1.foo.1".
lst <- lapply(
  split(
    df[c("ID1", "ID2")],
    paste(df$V1, df$V2, df$V3, sep = ".")
  ),
  # as.character() keeps the IDs writable by writeLines() even when the ID
  # columns are stored as factors.
  function(v) as.character(unique(unlist(v)))
)

# Write one file per group, one ID per line. A for loop is used because this
# step is run purely for its side effect; sapply() would also build and print
# a list of NULLs.
for (group_name in names(lst)) {
  writeLines(lst[[group_name]], paste0(group_name, ".txt"))
}

生成您的"数据":

# Example data: two ID columns followed by the three grouping columns.
df <- data.frame(
  ID1 = c("A", "B", "C", "E", "G", "H"),
  ID2 = c("B", "B", "D", "Fe", "Fe", "I"),
  V1 = c("var1", "var1", "var2", "var3", "var3", "var4"),
  V2 = c("foo", "foo", "bar", "foo", "foo", "zed"),
  V3 = c(1, 1, 2, 3, 3, 2)
)

对数据进行聚类并获得唯一的聚类:

library(dplyr)
# Group the rows by the three feature columns, then count the rows in each
# group; the tally has one row per distinct (V1, V2, V3) combination.
df_clust <- group_by(df, V1, V2, V3)
df_tally <- tally(df_clust)

遍历每个分组(假设只有两个 ID 列和 3 个特征列),并将每组的结果写入一个新文件:

# Write the distinct IDs of every (V1, V2, V3) group to its own file,
# TEST_<i>.txt, one ID per line.
# Assumes `df` has the ID columns ID1 and ID2 and that `df_tally` holds one row
# per group with the grouping values in its first three columns.
# seq_len() is used so the loop body is skipped, instead of running for
# i = 1 and i = 0, when df_tally has no rows.
for (i in seq_len(nrow(df_tally))) {
  # Select the rows of the current group once and reuse them for both columns.
  group_rows <- df %>%
    filter(
      V1 == df_tally[[1]][i],
      V2 == df_tally[[2]][i],
      V3 == df_tally[[3]][i]
    )
  # as.character() keeps the IDs writable by writeLines() whether they are
  # stored as character, factor or numeric values.
  mergeID <- unique(c(
    as.character(group_rows$ID1),
    as.character(group_rows$ID2)
  ))
  filename <- paste0("TEST_", i, ".txt")
  # Given a file name, writeLines() opens and closes the file itself.
  writeLines(mergeID, filename)
}

最新更新