循环索引在R中被忽略了吗



我在R:中有这个数据集

set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
col1       col2       col3       col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E     A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H     C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O     A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N     B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B     D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D     C

我现在正试图运行以下双循环:

library(stringdist)
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
results = list()
l = length(unique(example$group))
for (j in 1:l) {
for (i in 1:length(method)) {


g = unique(example$group)
groups_j = g[j]
my_data_i = example[which(example$group == groups_j  ), ]


method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)

p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method =  method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)

p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method =  method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)

p1_i = p1_i[,3]
p2_i = p2_i[,3]

final_i = cbind(p1_i, p2_i, groups_j)
results[[i]] = final_i

}

}
final = do.call(cbind.data.frame, results)

循环似乎在运行-,但当我检查最终结果时,我注意到";j";循环似乎已被忽略:

> table(final$groups_j)
A 
441 

正如我们可以看到的原始数据,似乎有4组:

> table(example$group)
A  B  C  D 
21 28 19 32 

有人能帮我弄清楚为什么其他3组没有被我的循环处理吗

谢谢!

这不应该是一个正确的答案。我只是在玩你的代码。尽管如此,它可能会帮助你调试它

library(stringdist)
library(tidyverse)
results = list()
res_j <- list()
l = length(unique(example$group))
g = unique(example$group)
for (j in 1:l) {

groups_j = g[j]

for (i in 1:length(method)) {

my_data_i = example[which(example$group == groups_j  ), ]

method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)

p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method =  method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)

p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method =  method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)

p1_i = p1_i[,3]
p2_i = p2_i[,3]

final_i = cbind(p1_i, p2_i)
results[[i]] = final_i

}
res_j[[j]] <- flatten(results)
res_j[[j]]$group <- groups_j
}
test <- map_dfr(res_j, as.tibble) 
# here’s a summary table of the result set.
library(gtExtras)
gt_plt_summary(test) 

这里有一种方法。

代替unique(example$group)和使用该值循环通过数据集,split按组和lapply内部for循环到子数据集。

set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
head(example)
#>         col1       col2       col3       col4 group
#> 1 OOPBR0319H XFNIX1029D UFTLD7446Q LLRTH2385Q     C
#> 2 SUWML2894Y JWGSU4238I HRGIF0793H MTHSV3221Z     B
#> 3 NEAXO7570I OQWCR4065E EQVSJ7607Y PTIGN4766W     D
#> 4 CHHQS1666T ONOBS9571P EMLSS6601V JEFZH0164K     D
#> 5 JSHCU8312A TGWWI3712K SLKFF4079K EXKGJ1406W     A
#> 6 RJJRF2760C LMWLS5552P LORMI7587V OYPGF5046D     C

suppressPackageStartupMessages({
library(stringdist)
library(magrittr)
library(tidyr)
})
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
ex_split <- split(example, example$group)
temp <- vector("list", length = length(method))
results <- lapply(ex_split, (x) {
group <- x$group[1]
for (i in seq_along(method)) {
name_1 <- paste0("col1_col_2_", method[i])
name_2 <- paste0("col3_col_4_", method[i])

p1 <- stringdistmatrix(x$col1, x$col2, method = method[i], useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1)

p2 <- stringdistmatrix(x$col3, x$col4, method = method[i], useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2)

temp[[i]] <- cbind(p1[3], p2[3])
}
y <- do.call(cbind.data.frame, temp)
y$group <- group
y
})
final <- do.call(rbind.data.frame, results)
row.names(final) <- NULL
str(final)
#> 'data.frame':    2610 obs. of  21 variables:
#>  $ col1_col_2_osa    : num  8 10 10 9 10 7 9 9 9 10 ...
#>  $ col3_col_4_osa    : num  8 9 9 10 10 9 10 10 9 8 ...
#>  $ col1_col_2_lv     : num  8 10 10 9 10 7 9 9 9 10 ...
#>  $ col3_col_4_lv     : num  8 9 9 10 10 9 10 10 9 9 ...
#>  $ col1_col_2_dl     : num  8 10 10 9 10 7 9 9 9 10 ...
#>  $ col3_col_4_dl     : num  8 9 9 10 10 9 10 10 9 8 ...
#>  $ col1_col_2_hamming: num  8 10 10 9 10 9 9 9 9 10 ...
#>  $ col3_col_4_hamming: num  9 9 9 10 10 9 10 10 9 9 ...
#>  $ col1_col_2_lcs    : num  14 18 16 18 18 12 16 16 14 18 ...
#>  $ col3_col_4_lcs    : num  14 18 16 16 18 14 16 18 18 16 ...
#>  $ col1_col_2_qgram  : num  14 18 16 14 18 12 16 14 14 16 ...
#>  $ col3_col_4_qgram  : num  14 18 16 16 18 12 16 18 18 14 ...
#>  $ col1_col_2_cosine : num  0.726 0.817 0.8 0.763 0.915 ...
#>  $ col3_col_4_cosine : num  0.662 0.923 0.831 0.746 0.923 ...
#>  $ col1_col_2_jaccard: num  0.812 0.944 0.889 0.8 0.941 ...
#>  $ col3_col_4_jaccard: num  0.8 0.938 0.875 0.875 0.938 ...
#>  $ col1_col_2_jw     : num  0.467 0.6 0.533 0.578 0.6 ...
#>  $ col3_col_4_jw     : num  0.467 0.6 0.533 0.533 0.6 ...
#>  $ col1_col_2_soundex: num  1 1 1 1 1 1 1 1 1 1 ...
#>  $ col3_col_4_soundex: num  1 1 1 1 1 1 1 1 1 1 ...
#>  $ group             : chr  "A" "A" "A" "A" ...

创建于2022-11-27,reprex v2.0.2

最新更新