Other Useful Tools

Synthetic Data

Use h2o.createFrame to create synthetic random data in H2O. This method can also be used to quickly create very large datasets for scaling tests. Note that there is no intrinsic structure in the data (it's either constant or random), so results from many machine learning methods will not be very meaningful.

library(h2o)
# Start (or connect to) H2O and keep the handle that the later calls are given;
# nthreads = -1 asks H2O to use all available cores (per the h2o.init docs).
h2oServer <- h2o.init(nthreads = -1)

# Generate a small random frame (rows = 20, cols = 5) under the key "framekey".
# The fixed seed keeps the generated values reproducible between runs.
myframe <- h2o.createFrame(
  h2oServer, "framekey",
  rows = 20, cols = 5,
  seed = -12301283, randomize = TRUE, value = 0,
  categorical_fraction = 0.8, factors = 10,
  integer_fraction = 0.2, integer_range = 10,
  real_range = 1,
  missing_fraction = 0.2,
  response_factors = 1
)

We created a small random frame in H2O that contains missing values as well as categorical and numerical columns.

# Show the first 20 rows, i.e. the whole frame created above.
head(myframe, n = 20)

#         response    C1    C2 C3    C4    C5
#    1  -0.4227451 4a41c a1290  3 326a0 82dce
#    2   0.4594655 0a79a a1290 -6 05f22 f4772
#    3   0.2784008 70d75 4c3a7  7 320a8 6b17b
#    4  -0.5674698       07067  6 320a8 a78d9
#    5  -0.7041911 24800 4c3a7  8       82dce
#    6   0.5853957 8e64f a1290 NA ea644
#    7   0.8540204              4 05f22 a6b1e
#    8  -0.1466706 4a41c        1 cef5a 89717
#    9          NA 0a79a c402d  3 070d5 a78d9
#    10 -0.7357408 4a41c a1290  9 e055d 64439
#    11 -0.2798403       4c3a7  3       f4772
#    12  0.3454386 24800       NA
#    13         NA 0a79a ca1de  9 070d5 a78d9
#    14 -0.6732674 8e64f 50b47  9 320a8 6b17b
#    15         NA cc3d5 ca1de  4 e055d f4772
#    16         NA cc3d5 07067  1 070d5 89717
#    17  0.8564855 4f8b9 a1290 -1 326a0 6b17b
#    18 -0.5555804 0a79a 50b47 NA       82dce
#    19  0.5639324              3 e055d
#    20  0.2511743              7 30c85 cae4c

We remove the response column and convert the integer column to a factor.

# Drop the first column (the response).
myframe <- myframe[, -1]
# With the response gone, C3 (the integer column) sits at position 3;
# overwrite it with its factor version.
myframe[, 3] <- as.factor(myframe$C3)
summary(myframe)
head(myframe, n = 20)

Interaction Features between Factors

Create pairwise interactions for 2 groups of columns, keeping only up to 10 (most common) factors per interaction.

# Two column groups, (1, 2) and (2, 3, 4); pairwise terms are built within
# each group and the result is stored under the key "pairwise".
pairwise <- h2o.interaction(
  myframe,
  key = "pairwise",
  factors = list(c(1, 2), c(2, 3, 4)),
  pairwise = TRUE,
  max_factors = 10,
  min_occurrence = 1
)
head(pairwise, n = 20)
# Inspect the factor levels of the second interaction column.
levels(pairwise[, 2])

Create a single 5th-order interaction between the specified columns (pairwise = FALSE), and allow up to 10k resulting factors.

# Combine columns 1 through 5 into one interaction term (pairwise = FALSE),
# stored under the key "higherorder".
higherorder <- h2o.interaction(
  myframe,
  key = "higherorder",
  factors = c(1, 2, 3, 4, 5),
  pairwise = FALSE,
  max_factors = 10000,
  min_occurrence = 1
)
head(higherorder, n = 20)

Create a categorical variable out of the integer column via self-interaction, keeping at most 3 factors and only those that occur at least twice.

# Look at C3 before trimming its levels.
summary(myframe$C3)
head(myframe$C3, n = 20)
# Interact column 3 with itself only, capping the result at 3 levels that
# each occur at least twice; stored under the key "trim_integers".
trim_integer_levels <- h2o.interaction(
  myframe,
  key = "trim_integers",
  factors = 3,
  pairwise = FALSE,
  max_factors = 3,
  min_occurrence = 2
)
head(trim_integer_levels, n = 20)

Append all interactions to the original frame and clean up temporaries.

# Column-bind the three interaction frames onto the original frame and store
# the result under a stable key.
myframe <- cbind(myframe, pairwise, higherorder, trim_integer_levels)
myframe <- h2o.assign(myframe, "final.key")

# Collect the temporary keys whose name contains the literal text "Last.value".
# fixed = TRUE stops the "." from acting as a regex wildcard.
temp_keys <- grep(
  pattern = "Last.value",
  x = h2o.ls(h2oServer)$Key,
  value = TRUE,
  fixed = TRUE
)
# Only call h2o.rm when there is something to remove.
if (length(temp_keys) > 0) {
  h2o.rm(h2oServer, temp_keys)
}

myframe
head(myframe, n = 20)
summary(myframe)

#    > head(myframe,20)
#          C1    C2 C3    C4    C5       C1_C2    C2_C3       C2_C4    C3_C4             C1_C2_C3_C4_C5 C3_C3
#    1  49ed9 d9ff0  3 c9523 00599 49ed9_d9ff0  d9ff0_3 d9ff0_c9523    other  49ed9_d9ff0_3_c9523_00599     3
#    2  e2271 d9ff0 -6 fe2d9 cb67d e2271_d9ff0 d9ff0_-6 d9ff0_fe2d9 -6_fe2d9 e2271_d9ff0_-6_fe2d9_cb67d other
#    3  408d2 6c5ce  7 28b4d 3e4cb 408d2_6c5ce    other       other    other  408d2_6c5ce_7_28b4d_3e4cb other
#    4        ae93f  6 28b4d da0c6       other    other       other    other     NA_ae93f_6_28b4d_da0c6 other
#    5  722ea 6c5ce  8       00599 722ea_6c5ce    other    6c5ce_NA     8_NA     722ea_6c5ce_8_NA_00599 other
#    6  5e310 d9ff0 NA 9dbca       5e310_d9ff0 d9ff0_NA       other    other    5e310_d9ff0_NA_9dbca_NA
#    7               4 fe2d9 8d2b5       NA_NA    other    NA_fe2d9  4_fe2d9        NA_NA_4_fe2d9_8d2b5 other
#    8  49ed9        1 87bef 92d9c    49ed9_NA     NA_1       other  1_87bef     49ed9_NA_1_87bef_92d9c     1
#    9  e2271 14aa5  3 d77b0 da0c6       other  14aa5_3 14aa5_d77b0  3_d77b0  e2271_14aa5_3_d77b0_da0c6     3
#    10 49ed9 d9ff0  9 79727 b7a40 49ed9_d9ff0    other       other    other  49ed9_d9ff0_9_79727_b7a40     9
#    11       6c5ce  3       cb67d    NA_6c5ce  6c5ce_3    6c5ce_NA     3_NA        NA_6c5ce_3_NA_cb67d     3
#    12 722ea       NA                722ea_NA    NA_NA       NA_NA    NA_NA          722ea_NA_NA_NA_NA
#    13 e2271 0036a  9 d77b0 da0c6       other    other 0036a_d77b0  9_d77b0  e2271_0036a_9_d77b0_da0c6     9
#    14 5e310 0a9ed  9 28b4d 3e4cb       other    other       other    other  5e310_0a9ed_9_28b4d_3e4cb     9
#    15 f76de 0036a  4 79727 cb67d       other    other       other    other  f76de_0036a_4_79727_cb67d other
#    16 f76de ae93f  1 d77b0 92d9c       other  ae93f_1 ae93f_d77b0  1_d77b0  f76de_ae93f_1_d77b0_92d9c     1
#    17 853d4 d9ff0 -1 c9523 3e4cb 853d4_d9ff0 d9ff0_-1 d9ff0_c9523    other 853d4_d9ff0_-1_c9523_3e4cb other
#    18 e2271 0a9ed NA       00599       other 0a9ed_NA    0a9ed_NA    NA_NA    e2271_0a9ed_NA_NA_00599
#    19              3 79727             NA_NA    other       other    other           NA_NA_3_79727_NA     3
#    20              7 8007d 1280c       NA_NA    other    NA_8007d  7_8007d        NA_NA_7_8007d_1280c other

Imputation of Missing Values

First, we randomly replace 50 rows in each column of the iris dataset with missing values.

# Work on a copy of the built-in iris data.
ds <- iris
# For each column in turn, draw 50 random row indices and blank those cells.
# The draws happen in column order 1..5, one sample() call per column.
for (col_idx in seq_len(ncol(ds))) {
  ds[sample(nrow(ds), 50), col_idx] <- NA
}
summary(ds)

Upload the NA'ed dataset to H2O.

# Push the local data frame into H2O and preview the first 20 rows.
hex <- as.h2o(h2oServer, ds)
head(hex, n = 20)

Impute the NAs in the first column in place with "median"

# Fill the missing Sepal.Length values using the "median" method; the result
# is not assigned because hex is updated in place.
h2o.impute(
  hex, "Sepal.Length",
  method = "median"
)
head(hex, n = 20)

Impute the NAs in the second column with the mean, grouped by the columns Sepal.Length, Petal.Width, and Species.

# Fill the missing Sepal.Width values with the mean, grouped by three columns.
h2o.impute(
  hex, "Sepal.Width",
  method = "mean",
  groupBy = c("Sepal.Length", "Petal.Width", "Species")
)
head(hex, n = 20)

Impute the Species column with the "mode" based on the columns 1 and 4

# Column 5 (Species) is filled with the "mode", grouped by columns 1 and 4,
# which are given here by position rather than by name.
h2o.impute(
  hex, 5,
  method = "mode",
  groupBy = c(1, 4)
)
head(hex, n = 20)

Splitting H2O Frames into Consecutive Subsets

First, we create a large frame

# Same generator settings as the small example, scaled to 1,000,000 rows and
# 10 columns, stored under the key "large".
myframe <- h2o.createFrame(
  h2oServer, "large",
  rows = 1000000, cols = 10,
  seed = -12301283, randomize = TRUE, value = 0,
  categorical_fraction = 0.8, factors = 10,
  integer_fraction = 0.2, integer_range = 10,
  real_range = 1,
  missing_fraction = 0.2,
  response_factors = 1
)
dim(myframe)

Now, we split that dataset into 4 consecutive pieces, so we need to specify the sizes of the first 3 splits

# Three ratios are given; the result is indexed below as four pieces.
splits <- h2o.splitFrame(myframe, ratios = c(0.4, 0.2, 0.1))
# Report the dimensions of each of the four pieces in order.
dim(splits[[1]])
dim(splits[[2]])
dim(splits[[3]])
dim(splits[[4]])

Splitting H2O Frames into Random Subsets

We create a 1D vector with uniform values sampled from the interval 0...1 and use that to assign rows to the splits.

# One uniform draw per row of myframe, seeded for reproducibility.
random <- h2o.runif(myframe, seed = 123456789)
# Rows are assigned by where their draw falls:
# below 0.8 -> train, [0.8, 0.9) -> valid, 0.9 and above -> test.
train <- myframe[random < 0.8, ]
valid <- myframe[random >= 0.8 & random < 0.9, ]
test <- myframe[random >= 0.9, ]
dim(train)
dim(valid)
dim(test)