我在 R 中使用 `h2o` 包并尝试进行一些数据操作,但 `sub`/`gsub` 函数存在一些问题。
这是我的代码:
library(h2o)
# Start a local H2O cluster restricted to two threads. The returned object is
# kept because it is passed to as.h2o() when the data is uploaded.
localH2O <- h2o.init(nthreads = 2)
# Build the ten-row sample data set. Every column is stored as a character
# vector; the column names come from the names of the list elements.
dat1.mini <- structure(
  list(
    id = c(
      "7927751403363142656", "18236986451472797696", "5654946373641778176",
      "14195690822403907584", "1693303484298446848", "1.1362181921561e+19",
      "11694645532962195456", "1221431312630614784", "1987127670789791488",
      "379819848497418688"
    ),
    click = c("0", "0", "0", "0", "0", "0", "0", "1", "0", "0"),
    hour = c(
      "14102118", "14102217", "14102812", "14102912", "14102820",
      "14102401", "14102117", "14102312", "14102301", "14102414"
    ),
    C1 = c(
      "1005", "1005", "1005", "1002", "1005",
      "1005", "1005", "1005", "1005", "1005"
    ),
    banner_pos = c("1", "1", "0", "0", "0", "0", "1", "1", "0", "0"),
    site_id = c(
      "b7e9786d", "e151e245", "85f751fd", "ee4c822c", "85f751fd",
      "85f751fd", "e5c60a05", "e151e245", "1fbe01fe", "1fbe01fe"
    ),
    site_domain = c(
      "b12b9f85", "7e091613", "c4e18dd6", "c4e18dd6", "c4e18dd6",
      "c4e18dd6", "7256c623", "7e091613", "f3845767", "f3845767"
    ),
    site_category = c(
      "f028772b", "f028772b", "50e219e0", "50e219e0", "50e219e0",
      "50e219e0", "f028772b", "f028772b", "28905ebd", "28905ebd"
    ),
    app_id = c(
      "ecad2386", "ecad2386", "685d1c4c", "ecad2386", "92f5800b",
      "f02cb7ab", "ecad2386", "ecad2386", "ecad2386", "ecad2386"
    ),
    app_domain = c(
      "7801e8d9", "7801e8d9", "2347f47a", "7801e8d9", "ae637522",
      "2347f47a", "7801e8d9", "7801e8d9", "7801e8d9", "7801e8d9"
    ),
    app_category = c(
      "07d7df22", "07d7df22", "8ded1f7a", "07d7df22", "0f2161f8",
      "f95efa07", "07d7df22", "07d7df22", "07d7df22", "07d7df22"
    ),
    device_id = c(
      "a99f214a", "a99f214a", "a99f214a", "8374cacf", "a99f214a",
      "8a5908a5", "a99f214a", "a99f214a", "a99f214a", "a99f214a"
    ),
    device_ip = c(
      "3214d61e", "d5623936", "419e166e", "698846d6", "c2d9c2f2",
      "40817190", "edd10fc1", "e4c6e857", "05d3adbe", "6929d972"
    ),
    device_model = c(
      "a0f5f879", "69f9dd0e", "46a414f4", "12edfe21", "4ffd3a7e",
      "04f5b394", "779d90c2", "1f0bc64f", "293291c1", "d787e91b"
    ),
    device_type = c("1", "1", "1", "0", "1", "1", "1", "1", "1", "1"),
    device_conn_type = c("0", "0", "3", "0", "3", "0", "0", "0", "0", "0"),
    C14 = c(
      "16208", "20277", "23224", "17566", "21189",
      "20633", "19771", "17264", "15703", "20108"
    ),
    C15 = rep("320", 10L),
    C16 = rep("50", 10L),
    C17 = c(
      "1800", "2281", "2676", "479", "2424",
      "2374", "2227", "1872", "1722", "2299"
    ),
    C18 = c("3", "3", "0", "3", "1", "3", "0", "3", "0", "2"),
    C19 = c("167", "47", "35", "39", "161", "39", "679", "39", "35", "1327"),
    C20 = c(
      "100077", "100181", "100176", "100074", "100189",
      "-1", "100074", "-1", "-1", "-1"
    ),
    C21 = c("23", "42", "221", "23", "71", "23", "48", "23", "79", "52")
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
# Load data to cluster
dat.mini.hex <- as.h2o(localH2O, dat1.mini)
# Attempt to grab substring of first 6 characters from hour column.
# The backreference must be written "\\1": in an R string literal a single
# backslash followed by a digit is an octal escape (the character with code 1),
# not a backslash followed by "1".
# Both attempts assign to the same hr column.
dat.mini.hex$hr <- h2o.sub("^(.{6}).*$", "\\1", dat.mini.hex$hour)
dat.mini.hex$hr <- h2o.gsub("(.+)..", "\\1", dat.mini.hex$hour)
所有这些尝试都会导致以下错误:
Error in .h2o.__remoteSend(client, .h2o.__PAGE_EXEC2, str = expr) :
http://127.0.0.1:54321/2/Exec2.json returned the following error:
class java.lang.NullPointerException
发生此错误是因为 `hour` 是数值列。函数 `h2o.sub` 和 `h2o.gsub` 不适用于数值数据。
命令 `str(dat.mini.hex$hour)` 将显示 `hour` 是数值列。
# Display the structure of the hour column of the H2O frame
str(dat.mini.hex$hour)
您可以将 `hour` 转换为因子,并将结果保存在新的列 `hour2` 中。
# Convert hour to a factor and store it as a new column, hour2
dat.mini.hex$hour2 <- as.factor(dat.mini.hex$hour)
现在,您可以使用 `h2o.sub`。但是,我想您不会喜欢这个结果……
# Try the backreference on the factor column. The replacement is written
# "\\1" so that the string holds a backslash followed by "1"; a single
# backslash before the digit would be parsed by R as an octal escape.
h2o.sub("^(.{6}).*$", "\\1", dat.mini.hex$hour2)
# hour2
# 1 \1
# 2 \1
# 3 \1
# 4 \1
# 5 \1
# 6 \1
如您所见,`h2o.sub` 按字面意思使用 `\1`,而不是将其替换为第一个匹配组。这种行为与 base R 的 `sub` 不同。
您可以更改正则表达式并将前六个字符后面的字符替换为空字符串。
# Replace everything that follows the first six characters with an empty
# string, so no backreference is needed.
h2o.sub("(?<=^.{6}).*$", "", dat.mini.hex$hour2)
# hour2
# 1 141021
# 2 141022
# 3 141028
# 4 141029
# 5 141028
# 6 141024
在这里,`(?<=^.{6})` 是一个正向后行断言(positive lookbehind)。它匹配这样一个位置:该位置之前紧邻的是字符串开头及其后的 6 个字符。