R - 将频率添加到转换表中



我有以下数据框:

> data6
                                 verb_object SESSION_ID
1:   BA31C1CC63E5043483FAE25F085E25E5 INSERT   41595370
2: BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE   41595371
3:   26D695C8CA82CAFFDF985201F3AA44D7 UPDATE   41595282
4:   26D695C8CA82CAFFDF985201F3AA44D7 UPDATE   41595282
5: 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE   41595373
6:   6D944D54C54ED75D487288FE1505BB59 INSERT   41595368

我有一个转换表:

> u1
                                      items newitem
1   BA31C1CC63E5043483FAE25F085E25E5 INSERT     OV1
2 BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE     OV2
3   26D695C8CA82CAFFDF985201F3AA44D7 UPDATE     OV3
4 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE     OV4
5   6D944D54C54ED75D487288FE1505BB59 INSERT     OV5

我想在转换表(u1)中添加各项目在原始数据框(data6)中出现的相对频率。在这个例子中,我期望u1的结果是:

> u1
                                      items newitem  freq
1   BA31C1CC63E5043483FAE25F085E25E5 INSERT     OV1    0.1667
2 BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE     OV2    0.1667
3   26D695C8CA82CAFFDF985201F3AA44D7 UPDATE     OV3    0.3333
4 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE     OV4    0.1667
5   6D944D54C54ED75D487288FE1505BB59 INSERT     OV5    0.1667

以下是下文两种解决方案(data.table 与基础 R 的 merge)的基准测试结果:

> microbenchmark(
+   setDT(u100)[setDT(data100)[, .(freq = .N/nrow(data100)), by = verb_object], on=c("items"="verb_object")],
+   
+   merge(u100, xtabs(~ verb_object, data100)/length(data100$verb_object) , 
+         by.x = "items", by.y = "verb_object", all.x = TRUE, sort = FALSE),
+   times=1000
+ )
Unit: milliseconds
                                                                                                                                         expr
                              setDT(u100)[setDT(data100)[, .(freq = .N/nrow(data100)), by = verb_object],      on = c(items = "verb_object")]
 merge(u100, xtabs(~verb_object, data100)/length(data100$verb_object),      by.x = "items", by.y = "verb_object", all.x = TRUE, sort = FALSE)
      min       lq     mean   median       uq      max neval cld
 1.269799 1.394808 1.586311 1.439762 1.493543 66.58702  1000  a 
 1.842091 2.030118 2.634712 2.099499 2.182838 67.77471  1000   b

使用基础 R(base 包):

# Left-join the relative frequency of each `verb_object` value in `data6`
# onto the translation table `u1`. The contingency table is divided by the
# total number of rows to turn counts into proportions; merge() coerces it
# to a data frame with the columns `verb_object` and `Freq`.
merge(
  x = u1,
  y = table(verb_object = data6$verb_object) / nrow(data6),
  by.x = "items",
  by.y = "verb_object",
  all.x = TRUE, # keep every row of u1, matched or not
  sort = FALSE
)

输出:

                                     items newitem      Freq
1   BA31C1CC63E5043483FAE25F085E25E5 INSERT     OV1 0.1666667
2 BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE     OV2 0.1666667
3   26D695C8CA82CAFFDF985201F3AA44D7 UPDATE     OV3 0.3333333
4 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE     OV4 0.1666667
5   6D944D54C54ED75D487288FE1505BB59 INSERT     OV5 0.1666667

数据:

# Sample input: one row per logged statement, keyed by `verb_object`
# (note that the UPDATE entry occurs twice).
data6 <- data.frame(
  verb_object = c(
    "BA31C1CC63E5043483FAE25F085E25E5 INSERT",
    "BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE",
    "26D695C8CA82CAFFDF985201F3AA44D7 UPDATE",
    "26D695C8CA82CAFFDF985201F3AA44D7 UPDATE",
    "2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE",
    "6D944D54C54ED75D487288FE1505BB59 INSERT"
  ),
  SESSION_ID = c(
    41595370L, 41595371L, 41595282L, 41595282L, 41595373L, 41595368L
  ),
  stringsAsFactors = FALSE
)

# Translation table: maps each distinct item to its replacement label.
# `items` stays character, `newitem` is a factor with levels OV1..OV5.
u1 <- data.frame(
  items = c(
    "BA31C1CC63E5043483FAE25F085E25E5 INSERT",
    "BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE",
    "26D695C8CA82CAFFDF985201F3AA44D7 UPDATE",
    "2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE",
    "6D944D54C54ED75D487288FE1505BB59 INSERT"
  ),
  newitem = factor(c("OV1", "OV2", "OV3", "OV4", "OV5")),
  stringsAsFactors = FALSE
)

使用 data.table 包:

library(data.table)
# Add the relative frequency of each item to the translation table `u1`.
#
# The inner expression computes, per distinct `verb_object` in `data6`, the
# share of rows it accounts for (.N / total rows). The outer call is an
# update join: it matches u1$items against verb_object and assigns the
# matching `freq` into `u1` by reference.
#
# An update join is used instead of a plain `u1[freq_tbl, on = ...]` because
# that form returns one row per row of the frequency table: items of `u1`
# that never occur in `data6` would be dropped, and values of `data6` missing
# from `u1` would show up with an NA `newitem`. Here every row of `u1` is
# kept and unmatched items simply get `freq = NA`, matching the behaviour of
# merge(..., all.x = TRUE).
setDT(u1)[
  setDT(data6)[, .(freq = .N / nrow(data6)), by = verb_object],
  on = c(items = "verb_object"),
  freq := i.freq
][] # trailing [] prints the updated table, since `:=` returns invisibly

它给出:

                                       items newitem      freq
1:   BA31C1CC63E5043483FAE25F085E25E5 INSERT     OV1 0.1666667
2: BECE6374D91D47E6285EFDEBA6D65BB9 DATABASE     OV2 0.1666667
3:   26D695C8CA82CAFFDF985201F3AA44D7 UPDATE     OV3 0.3333333
4: 2BC5A4199A0DDA16FA17A9CA1AA17C02 DATABASE     OV4 0.1666667
5:   6D944D54C54ED75D487288FE1505BB59 INSERT     OV5 0.1666667