Chapter 3 Missing values and imputation

3.1 Load data

# Created in 1-clean-merge.Rmd

# Objects included: data, vars
# renv also includes a load() method, so we specify base:: here.
base::load("data/clean-merge-unimputed.RData")

3.2 Examine predictor missingness

3.2.1 Missingness table

# Look at missingness among predictors.
missing = is.na(data[, vars$predictors])

# This will be a zero-row tibble if there is no missingness in the data.
missing_df =
  data.frame(var = colnames(missing),
             missing_mean = colMeans(missing),
             missing_count = colSums(missing)) %>%
  filter(missing_count > 0) %>% arrange(desc(missing_mean))

missing_df
##             var missing_mean missing_count
## oldpeak oldpeak   0.02310231             7
## ca           ca   0.01980198             6
## thal       thal   0.01980198             6
## age         age   0.01650165             5
## restecg restecg   0.01650165             5
## cp           cp   0.01320132             4
## fbs         fbs   0.01320132             4
## slope     slope   0.01320132             4
## chol       chol   0.00990099             3
## thalach thalach   0.00990099             3
## sex         sex   0.00660066             2
## exang     exang   0.00330033             1
if (nrow(missing_df) == 0) {
  cat("No missinginess found in the predictors.")
} else {

  missing_df$missing_mean = paste0(round(missing_df$missing_mean * 100, 1), "%")
  missing_df$missing_count = prettyNum(missing_df$missing_count, big.mark = ",")
  
  colnames(missing_df) = c("Variable", "Missing rate", "Missing values")
  
  kab_table = kable(missing_df, format = "latex", digits = c(0, 3, 0),
                    booktabs = TRUE)
  print(kab_table)
  cat(kab_table %>% kable_styling(latex_options = "striped"),
      file = "tables/missingness-table.tex")
}
## 
## \begin{tabular}{llll}
## \toprule
##   & Variable & Missing rate & Missing values\\
## \midrule
## oldpeak & oldpeak & 2.3\% & 7\\
## ca & ca & 2\% & 6\\
## thal & thal & 2\% & 6\\
## age & age & 1.7\% & 5\\
## restecg & restecg & 1.7\% & 5\\
## \addlinespace
## cp & cp & 1.3\% & 4\\
## fbs & fbs & 1.3\% & 4\\
## slope & slope & 1.3\% & 4\\
## chol & chol & 1\% & 3\\
## thalach & thalach & 1\% & 3\\
## \addlinespace
## sex & sex & 0.7\% & 2\\
## exang & exang & 0.3\% & 1\\
## \bottomrule
## \end{tabular}
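
As a cross-check, the naniar package (if installed) computes the same per-variable summary in one call; a sketch:

naniar::miss_var_summary(data[, vars$predictors])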

3.2.2 Missingness heatmap

if (nrow(missing_df) == 0) {
  cat("Skipping missingness heatmap, no missigness found in predictors.")
} else {

  # Missingness indicators, only for variables with missingness > 0%.
  missing2 = is.na(data[, as.character(missing_df$Variable)])
  
  colMeans(missing2)
  
  # Correlation matrix of missingness.
  (missing_cor = cor(missing2))
  
  # Replace the unit diagonal with NAs so that it doesn't show as yellow.
  diag(missing_cor) = NA
  
  # Heatmap of correlation table.
  #png("visuals/missingness-superheat.png", height = 600, width = 900)
  superheat::superheat(missing_cor,
            # Change the angle of the label text
            bottom.label.text.angle = 90,
            pretty.order.rows = TRUE,
            pretty.order.cols = TRUE,
            row.dendrogram = TRUE,
            scale = FALSE)
  #dev.off()
}

3.2.3 Missingness count plot

if (nrow(missing_df) == 0L) {
  cat("Skipping missingness count plot, no missingness found in predictors.")
} else {
  
  # Table with count of missing covariates by observation.
  missing_counts = rowSums(missing2)
  table(missing_counts)
  # The typical observation is missing 0 covariates.
  summary(missing_counts)
  
  # Code from:
  # https://stackoverflow.com/questions/27850123/ggplot2-have-shorter-tick-marks-for-tick-marks-without-labels?noredirect=1&lq=1
  
  # Major tick marks
  major = 100
  
  # Minor tick marks
  minor = 20
  
  # Range of the count axis: pad the tallest bar by two minor tick marks.
  # Ensure that we always start at 0.
  (range = c(0, 2 * minor + sum(missing_counts == as.integer(names(which.max(table(missing_counts)))))))
  
  # Function to insert blank labels
  # Borrowed from https://stackoverflow.com/questions/14490071/adding-minor-tick-marks-to-the-x-axis-in-ggplot2-with-no-labels/14490652#14490652
  insert_minor <- function(major, n_minor) {
        labs <- c(sapply(major, function(x, y) c(x, rep("", y) ), y = round(n_minor)))
        labs[1:(length(labs) - n_minor)]
  }
  
  # Getting the 'breaks' and 'labels' for the ggplot
  n_minor = major / minor - 1
  (breaks = seq(min(range), max(range), minor))
  (labels = insert_minor(seq(min(range), max(range), major), n_minor))
  if (length(breaks) > length(labels)) labels = c(labels, rep("", length(breaks) - length(labels)))


  print(ggplot(data.frame(missing_counts), aes(x = missing_counts)) +
  geom_bar(aes(y = ..count..)) +
  theme_minimal() +
  geom_text(aes(label = scales::percent(..prop..), y = ..count..),
             stat = "count", hjust = -0.2, size = 3, nudge_x = 0.05,
            color = "gray30",
            NULL) + 
  scale_x_continuous(breaks = seq(0, max(missing_counts))) +
  scale_y_continuous(breaks = breaks,
                     labels = ifelse(labels != "", prettyNum(labels, big.mark = ",", preserve.width = "none"), ""),
                     limits = c(0, max(range))) +
  labs(title = "Distribution of number of missing covariates",
       x = "Number of covariates that are missing",
       y = "Count of observations in dataset") +
  # Remove grid axes, add gray background.
  # Label each value on x axis.
  theme(panel.grid = element_blank(),
        axis.ticks.x = element_line(color = "gray60", size = 0.5),
        panel.background = element_rect(fill = "white", color = "gray50"),
        plot.background = element_rect(fill = "gray95")) +
  coord_flip())
  ggsave("visuals/missing-count-hist.png", width = 8, height = 4)
  
  # Number of variables with missingness.
  print(ncol(missing2))

}

## [1] 12

3.3 Examine outcome missingness

table(data[[vars$outcomes[1]]], useNA = "ifany")
## 
##   0   1 
## 138 165

3.4 Impute missing predictor values

3.4.1 Missingness indicators

# Briefly review missingness.
colMeans(is.na(data[, vars$predictors]))
##        age        sex         cp   trestbps       chol        fbs    restecg 
## 0.01650165 0.00660066 0.01320132 0.00000000 0.00990099 0.01320132 0.01650165 
##    thalach      exang    oldpeak      slope         ca       thal 
## 0.00990099 0.00330033 0.02310231 0.01320132 0.01980198 0.01980198
# First create matrix of missingness indicators for all covariates.
miss_inds =
  ck37r::missingness_indicators(data,
                                skip_vars = c(vars$exclude, vars$outcomes),
                                verbose = TRUE)
## Generating 12 missingness indicators.
## Checking for collinearity of indicators.
## Final number of indicators: 12
colMeans(miss_inds)
##     miss_age     miss_sex      miss_cp    miss_chol     miss_fbs miss_restecg 
##   0.01650165   0.00660066   0.01320132   0.00990099   0.01320132   0.01650165 
## miss_thalach   miss_exang miss_oldpeak   miss_slope      miss_ca    miss_thal 
##   0.00990099   0.00330033   0.02310231   0.01320132   0.01980198   0.01980198
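
For intuition, a rough base-R equivalent of this indicator matrix is below; ck37r additionally drops indicators that are constant or collinear, and miss_manual is a hypothetical name.

# Indicator matrix: 1 if the value is missing, 0 otherwise.
miss_manual = 1L * is.na(data[, vars$predictors])
colnames(miss_manual) = paste0("miss_", colnames(miss_manual))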

3.4.2 Impute to 0

For certain variables we want to explicitly set unobserved values to 0 rather than impute them.

# Manually impute certain variables to 0 rather than use the sample median (or GLRM).
impute_to_0_vars = c("exang")

# Review missingness one last time for these vars.
colMeans(is.na(data[, impute_to_0_vars, drop = FALSE]))
##      exang 
## 0.00330033
# Also review the median before we conduct imputation.
summary(data[, impute_to_0_vars])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.0000  0.0000  0.3278  1.0000  1.0000       1
# Impute these variables specifically to 0, rather than sample median (although
# in many cases the median was already 0).
data[, impute_to_0_vars] = lapply(data[, impute_to_0_vars, drop = FALSE], function(col) {
  col[is.na(col)] = 0L
  col
})

# Confirm we have no more missingness in these vars.
colMeans(is.na(data[, impute_to_0_vars, drop = FALSE]))
## exang 
##     0
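
An equivalent one-liner, assuming the tidyr package is available:

# Replace NAs in the impute-to-0 variables with an explicit zero.
data = tidyr::replace_na(data, replace = list(exang = 0L))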

Next we will use generalized low-rank models (GLRM), as implemented in the h2o machine learning platform, to impute the remaining missing values.

3.4.3 GLRM prep

# Subset using var_df$var so that it's in the same order as var_df.
impute_df = data[, var_df$var]

# Convert binary variables to logical
(binary_vars = var_df$var[var_df$type == "binary"])
## character(0)
for (binary_name in binary_vars) {
  impute_df[[binary_name]] = as.logical(impute_df[[binary_name]])
}

# NOTE: these will be turned into factor variables within h2o.
table(sapply(impute_df, class))
## 
##  factor integer numeric 
##       5       7       1
# Create a dataframe describing the loss function by variable; the first variable must have index = 0
losses = data.frame("index" = seq(ncol(impute_df)) - 1,
                    "feature" = var_df$var,
                    "class" = var_df$class,
                    "type" = var_df$type,
                    stringsAsFactors = FALSE)


# Update class for binary variables.
for (binary_name in binary_vars) {
  losses[var_df$var == binary_name, "class"] = class(impute_df[[binary_name]])
}

losses$loss[losses$class == "numeric"] = "Huber"

losses$loss[losses$class == "integer"] = "Huber"
#losses$loss[losses$class == "integer"] = "Poisson"

losses$loss[losses$class == "factor"] = "Categorical"

losses$loss[losses$type == "binary"] = "Hinge"
# Logistic seems to yield worse reconstruction RMSE overall.
#losses$loss[losses$type == "binary"] = "Logistic"


losses
##    index  feature   class        type        loss
## 1      0       ca  factor categorical Categorical
## 2      1  oldpeak numeric  continuous       Huber
## 3      2  restecg integer     integer       Huber
## 4      3    slope  factor categorical Categorical
## 5      4      age integer     integer       Huber
## 6      5      sex  factor categorical Categorical
## 7      6       cp  factor categorical Categorical
## 8      7    exang integer     integer       Huber
## 9      8     thal  factor categorical Categorical
## 10     9  thalach integer     integer       Huber
## 11    10     chol integer     integer       Huber
## 12    11      fbs integer     integer       Huber
## 13    12 trestbps integer     integer       Huber
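
The cascade of assignments above could be condensed with dplyr::case_when; an equivalent sketch:

losses$loss = dplyr::case_when(
  losses$type == "binary"  ~ "Hinge",       # binary overrides the class-based loss
  losses$class == "factor" ~ "Categorical",
  TRUE                     ~ "Huber")       # numeric and integer columns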

3.4.4 Start h2o

# We are avoiding library(h2o) due to namespace conflicts with dplyr & related packages.
# Initialize h2o
h2o::h2o.no_progress()  # Turn off progress bars
analyst_name = "chris-kennedy"
h2o::h2o.init(max_mem_size = "15g",
              name = paste0("h2o-", analyst_name),
              # Default port is 54321, but other analysts may be using that.
              port = 54320,
              # This can reduce accidental sharing of h2o processes on a shared server.
              username = analyst_name,
              password = paste0("pw-", analyst_name),
              # Use half of available cores for h2o.
              nthreads = get_cores())
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\chris\AppData\Local\Temp\Rtmpmov5jY\file5fcc2573ccc/h2o_chris_started_from_r.out
##     C:\Users\chris\AppData\Local\Temp\Rtmpmov5jY\file5fcc77202047/h2o_chris_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 750 milliseconds 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    2 months and 16 days  
##     H2O cluster name:           h2o-chris-kennedy 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   15.00 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  3 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54320 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.2 (2019-12-12)
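
get_cores() is a helper defined in the project's setup code; a minimal stand-in, assuming the intent is half of the physical cores (consistent with the 3 allowed cores reported above):

get_cores = function() {
  # Half of the physical cores, with a floor of 1.
  max(1L, floor(parallel::detectCores(logical = FALSE) / 2))
}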

3.4.5 Load data into h2o

# Convert data to h2o object
h2o_df = h2o::as.h2o(impute_df)
## Warning in use.package("data.table"): data.table cannot be used without R
## package bit64 version 0.9.7 or higher. Please upgrade to take advangage of
## data.table speedups.
(h2o_types = unlist(h2o::h2o.getTypes(h2o_df)))
##  [1] "enum" "real" "int"  "enum" "int"  "enum" "enum" "int"  "enum" "int" 
## [11] "int"  "int"  "int"
# Double-check side-by-side.
cbind(losses, h2o_types)
##    index  feature   class        type        loss h2o_types
## 1      0       ca  factor categorical Categorical      enum
## 2      1  oldpeak numeric  continuous       Huber      real
## 3      2  restecg integer     integer       Huber       int
## 4      3    slope  factor categorical Categorical      enum
## 5      4      age integer     integer       Huber       int
## 6      5      sex  factor categorical Categorical      enum
## 7      6       cp  factor categorical Categorical      enum
## 8      7    exang integer     integer       Huber       int
## 9      8     thal  factor categorical Categorical      enum
## 10     9  thalach integer     integer       Huber       int
## 11    10     chol integer     integer       Huber       int
## 12    11      fbs integer     integer       Huber       int
## 13    12 trestbps integer     integer       Huber       int

3.4.6 GLRM train/test split

# Split data into train & validation
split = h2o::h2o.splitFrame(h2o_df, ratios = 0.75, seed = 1)
train = split[[1]]
valid = split[[2]]

val_df = as.data.frame(valid)

3.4.7 Define GLRM grid

Follow the hyperparameter optimization method shown at:

* https://github.com/h2oai/h2o-tutorials/blob/master/best-practices/glrm/GLRM-BestPractices.Rmd
* https://bradleyboehmke.github.io/HOML/GLRM.html#tuning-to-optimize-for-unseen-data

# Create hyperparameter search grid
params = expand.grid(
  # Try 3 values on the exponential scale up to the maximum number of predictors.
  k = round(exp(log(length(vars$predictors)) * exp(c(-0.8, -0.5, -0.1)))),
  regularization_x = c("None", "Quadratic", "L1"),
  regularization_y = c("None", "Quadratic", "L1"),
  gamma_x = c(0, 1, 4),
  gamma_y = c(0, 1, 4),
  error_num = NA,
  error_cat = NA,
  objective = NA,
  stringsAsFactors = FALSE)

# 243 combinations!
dim(params)
## [1] 243   8
# Remove combinations in which regularization_x = None and gamma_x != 0
params = subset(params, regularization_x != "None" | gamma_x == 0)

# Remove combinations in which regularization_x != None and gamma_x == 0
params = subset(params, regularization_x == "None" | gamma_x != 0)

# Remove combinations in which regularization_y = None and gamma_y != 0
params = subset(params, regularization_y != "None" | gamma_y == 0)

# Remove combinations in which regularization_y != None and gamma_y == 0
params = subset(params, regularization_y == "None" | gamma_y != 0)

# Down to 75 combinations.
dim(params)
## [1] 75  8
params
##      k regularization_x regularization_y gamma_x gamma_y error_num error_cat
## 1    3             None             None       0       0        NA        NA
## 2    5             None             None       0       0        NA        NA
## 3   10             None             None       0       0        NA        NA
## 31   3        Quadratic             None       1       0        NA        NA
## 32   5        Quadratic             None       1       0        NA        NA
## 33  10        Quadratic             None       1       0        NA        NA
## 34   3               L1             None       1       0        NA        NA
## 35   5               L1             None       1       0        NA        NA
## 36  10               L1             None       1       0        NA        NA
## 58   3        Quadratic             None       4       0        NA        NA
## 59   5        Quadratic             None       4       0        NA        NA
## 60  10        Quadratic             None       4       0        NA        NA
## 61   3               L1             None       4       0        NA        NA
## 62   5               L1             None       4       0        NA        NA
## 63  10               L1             None       4       0        NA        NA
## 91   3             None        Quadratic       0       1        NA        NA
## 92   5             None        Quadratic       0       1        NA        NA
## 93  10             None        Quadratic       0       1        NA        NA
## 100  3             None               L1       0       1        NA        NA
## 101  5             None               L1       0       1        NA        NA
## 102 10             None               L1       0       1        NA        NA
## 121  3        Quadratic        Quadratic       1       1        NA        NA
## 122  5        Quadratic        Quadratic       1       1        NA        NA
## 123 10        Quadratic        Quadratic       1       1        NA        NA
## 124  3               L1        Quadratic       1       1        NA        NA
## 125  5               L1        Quadratic       1       1        NA        NA
## 126 10               L1        Quadratic       1       1        NA        NA
## 130  3        Quadratic               L1       1       1        NA        NA
## 131  5        Quadratic               L1       1       1        NA        NA
## 132 10        Quadratic               L1       1       1        NA        NA
## 133  3               L1               L1       1       1        NA        NA
## 134  5               L1               L1       1       1        NA        NA
## 135 10               L1               L1       1       1        NA        NA
## 148  3        Quadratic        Quadratic       4       1        NA        NA
## 149  5        Quadratic        Quadratic       4       1        NA        NA
## 150 10        Quadratic        Quadratic       4       1        NA        NA
## 151  3               L1        Quadratic       4       1        NA        NA
## 152  5               L1        Quadratic       4       1        NA        NA
## 153 10               L1        Quadratic       4       1        NA        NA
## 157  3        Quadratic               L1       4       1        NA        NA
## 158  5        Quadratic               L1       4       1        NA        NA
## 159 10        Quadratic               L1       4       1        NA        NA
## 160  3               L1               L1       4       1        NA        NA
## 161  5               L1               L1       4       1        NA        NA
## 162 10               L1               L1       4       1        NA        NA
## 172  3             None        Quadratic       0       4        NA        NA
## 173  5             None        Quadratic       0       4        NA        NA
## 174 10             None        Quadratic       0       4        NA        NA
## 181  3             None               L1       0       4        NA        NA
## 182  5             None               L1       0       4        NA        NA
## 183 10             None               L1       0       4        NA        NA
## 202  3        Quadratic        Quadratic       1       4        NA        NA
## 203  5        Quadratic        Quadratic       1       4        NA        NA
## 204 10        Quadratic        Quadratic       1       4        NA        NA
## 205  3               L1        Quadratic       1       4        NA        NA
## 206  5               L1        Quadratic       1       4        NA        NA
## 207 10               L1        Quadratic       1       4        NA        NA
## 211  3        Quadratic               L1       1       4        NA        NA
## 212  5        Quadratic               L1       1       4        NA        NA
## 213 10        Quadratic               L1       1       4        NA        NA
## 214  3               L1               L1       1       4        NA        NA
## 215  5               L1               L1       1       4        NA        NA
## 216 10               L1               L1       1       4        NA        NA
## 229  3        Quadratic        Quadratic       4       4        NA        NA
## 230  5        Quadratic        Quadratic       4       4        NA        NA
## 231 10        Quadratic        Quadratic       4       4        NA        NA
## 232  3               L1        Quadratic       4       4        NA        NA
## 233  5               L1        Quadratic       4       4        NA        NA
## 234 10               L1        Quadratic       4       4        NA        NA
## 238  3        Quadratic               L1       4       4        NA        NA
## 239  5        Quadratic               L1       4       4        NA        NA
## 240 10        Quadratic               L1       4       4        NA        NA
## 241  3               L1               L1       4       4        NA        NA
## 242  5               L1               L1       4       4        NA        NA
## 243 10               L1               L1       4       4        NA        NA
##     objective
## 1          NA
## 2          NA
## 3          NA
## 31         NA
## 32         NA
## 33         NA
## 34         NA
## 35         NA
## 36         NA
## 58         NA
## 59         NA
## 60         NA
## 61         NA
## 62         NA
## 63         NA
## 91         NA
## 92         NA
## 93         NA
## 100        NA
## 101        NA
## 102        NA
## 121        NA
## 122        NA
## 123        NA
## 124        NA
## 125        NA
## 126        NA
## 130        NA
## 131        NA
## 132        NA
## 133        NA
## 134        NA
## 135        NA
## 148        NA
## 149        NA
## 150        NA
## 151        NA
## 152        NA
## 153        NA
## 157        NA
## 158        NA
## 159        NA
## 160        NA
## 161        NA
## 162        NA
## 172        NA
## 173        NA
## 174        NA
## 181        NA
## 182        NA
## 183        NA
## 202        NA
## 203        NA
## 204        NA
## 205        NA
## 206        NA
## 207        NA
## 211        NA
## 212        NA
## 213        NA
## 214        NA
## 215        NA
## 216        NA
## 229        NA
## 230        NA
## 231        NA
## 232        NA
## 233        NA
## 234        NA
## 238        NA
## 239        NA
## 240        NA
## 241        NA
## 242        NA
## 243        NA
# Randomly order the params so that stopping the search early still covers a
# random subset of the grid.
set.seed(1)
params = params[sample(nrow(params)), ]
params
##      k regularization_x regularization_y gamma_x gamma_y error_num error_cat
## 233  5               L1        Quadratic       4       4        NA        NA
## 153 10               L1        Quadratic       4       1        NA        NA
## 1    3             None             None       0       0        NA        NA
## 148  3        Quadratic        Quadratic       4       1        NA        NA
## 160  3               L1               L1       4       1        NA        NA
## 62   5               L1             None       4       0        NA        NA
## 212  5        Quadratic               L1       1       4        NA        NA
## 183 10             None               L1       0       4        NA        NA
## 102 10             None               L1       0       1        NA        NA
## 204 10        Quadratic        Quadratic       1       4        NA        NA
## 34   3               L1             None       1       0        NA        NA
## 36  10               L1             None       1       0        NA        NA
## 63  10               L1             None       4       0        NA        NA
## 232  3               L1        Quadratic       4       4        NA        NA
## 151  3               L1        Quadratic       4       1        NA        NA
## 158  5        Quadratic               L1       4       1        NA        NA
## 124  3               L1        Quadratic       1       1        NA        NA
## 172  3             None        Quadratic       0       4        NA        NA
## 214  3               L1               L1       1       4        NA        NA
## 207 10               L1        Quadratic       1       4        NA        NA
## 240 10        Quadratic               L1       4       4        NA        NA
## 159 10        Quadratic               L1       4       1        NA        NA
## 234 10               L1        Quadratic       4       4        NA        NA
## 161  5               L1               L1       4       1        NA        NA
## 216 10               L1               L1       1       4        NA        NA
## 135 10               L1               L1       1       1        NA        NA
## 101  5             None               L1       0       1        NA        NA
## 149  5        Quadratic        Quadratic       4       1        NA        NA
## 33  10        Quadratic             None       1       0        NA        NA
## 58   3        Quadratic             None       4       0        NA        NA
## 231 10        Quadratic        Quadratic       4       4        NA        NA
## 152  5               L1        Quadratic       4       1        NA        NA
## 181  3             None               L1       0       4        NA        NA
## 130  3        Quadratic               L1       1       1        NA        NA
## 239  5        Quadratic               L1       4       4        NA        NA
## 122  5        Quadratic        Quadratic       1       1        NA        NA
## 173  5             None        Quadratic       0       4        NA        NA
## 203  5        Quadratic        Quadratic       1       4        NA        NA
## 242  5               L1               L1       4       4        NA        NA
## 206  5               L1        Quadratic       1       4        NA        NA
## 123 10        Quadratic        Quadratic       1       1        NA        NA
## 134  5               L1               L1       1       1        NA        NA
## 238  3        Quadratic               L1       4       4        NA        NA
## 2    5             None             None       0       0        NA        NA
## 61   3               L1             None       4       0        NA        NA
## 93  10             None        Quadratic       0       1        NA        NA
## 121  3        Quadratic        Quadratic       1       1        NA        NA
## 182  5             None               L1       0       4        NA        NA
## 150 10        Quadratic        Quadratic       4       1        NA        NA
## 241  3               L1               L1       4       4        NA        NA
## 100  3             None               L1       0       1        NA        NA
## 202  3        Quadratic        Quadratic       1       4        NA        NA
## 35   5               L1             None       1       0        NA        NA
## 126 10               L1        Quadratic       1       1        NA        NA
## 60  10        Quadratic             None       4       0        NA        NA
## 131  5        Quadratic               L1       1       1        NA        NA
## 157  3        Quadratic               L1       4       1        NA        NA
## 230  5        Quadratic        Quadratic       4       4        NA        NA
## 59   5        Quadratic             None       4       0        NA        NA
## 125  5               L1        Quadratic       1       1        NA        NA
## 31   3        Quadratic             None       1       0        NA        NA
## 133  3               L1               L1       1       1        NA        NA
## 174 10             None        Quadratic       0       4        NA        NA
## 229  3        Quadratic        Quadratic       4       4        NA        NA
## 215  5               L1               L1       1       4        NA        NA
## 132 10        Quadratic               L1       1       1        NA        NA
## 243 10               L1               L1       4       4        NA        NA
## 211  3        Quadratic               L1       1       4        NA        NA
## 32   5        Quadratic             None       1       0        NA        NA
## 162 10               L1               L1       4       1        NA        NA
## 92   5             None        Quadratic       0       1        NA        NA
## 205  3               L1        Quadratic       1       4        NA        NA
## 91   3             None        Quadratic       0       1        NA        NA
## 213 10        Quadratic               L1       1       4        NA        NA
## 3   10             None             None       0       0        NA        NA
##     objective
## 233        NA
## 153        NA
## 1          NA
## 148        NA
## 160        NA
## 62         NA
## 212        NA
## 183        NA
## 102        NA
## 204        NA
## 34         NA
## 36         NA
## 63         NA
## 232        NA
## 151        NA
## 158        NA
## 124        NA
## 172        NA
## 214        NA
## 207        NA
## 240        NA
## 159        NA
## 234        NA
## 161        NA
## 216        NA
## 135        NA
## 101        NA
## 149        NA
## 33         NA
## 58         NA
## 231        NA
## 152        NA
## 181        NA
## 130        NA
## 239        NA
## 122        NA
## 173        NA
## 203        NA
## 242        NA
## 206        NA
## 123        NA
## 134        NA
## 238        NA
## 2          NA
## 61         NA
## 93         NA
## 121        NA
## 182        NA
## 150        NA
## 241        NA
## 100        NA
## 202        NA
## 35         NA
## 126        NA
## 60         NA
## 131        NA
## 157        NA
## 230        NA
## 59         NA
## 125        NA
## 31         NA
## 133        NA
## 174        NA
## 229        NA
## 215        NA
## 132        NA
## 243        NA
## 211        NA
## 32         NA
## 162        NA
## 92         NA
## 205        NA
## 91         NA
## 213        NA
## 3          NA

3.4.9 Apply best GLRM

# Load the cached grid-search results produced by the loop above.
params = rio::import("tables/glrm-grid-search.xlsx")

(best_params = params %>% arrange(error) %>% as.data.frame() %>% head(1))
##    k regularization_x regularization_y gamma_x gamma_y error_num error_cat
## 1 10        Quadratic             None       4       0  242.0873        29
##   objective    error
## 1  981.6612 271.0873
system.time({
  # Now run on the full dataset.
  glrm_result =
    h2o::h2o.glrm(training_frame = h2o_df, cols = colnames(h2o_df),
                  loss = "Quadratic",
                  model_id = "impute_glrm",
                  seed = 1,
                  k = best_params$k,
                  max_iterations = 2000,
                  # This is necessary to ensure that the model can optimize;
                  # otherwise there may be no improvement in the objective.
                  transform = "STANDARDIZE",
                  regularization_x = best_params$regularization_x,
                  regularization_y = best_params$regularization_y,
                  gamma_x = best_params$gamma_x,
                  gamma_y = best_params$gamma_y,
                  loss_by_col_idx = losses$index,
                  loss_by_col = losses$loss)
})
##    user  system elapsed 
##    0.12    0.01    2.25
h2o::summary(glrm_result)
## Model Details:
## ==============
## 
## H2ODimReductionModel: glrm
## Model Key:  impute_glrm 
## Model Summary: 
##   number_of_iterations final_step_size final_objective_value
## 1                  237         0.00009            1045.33367
## 
## H2ODimReductionMetrics: glrm
## ** Reported on training data. **
## 
## Sum of Squared Error (Numeric):  864.1832
## Misclassification Error (Categorical):  81
## Number of Numeric Entries:  2397
## Number of Categorical Entries:  1493
## 
## 
## 
## Scoring History: 
##             timestamp   duration iterations step_size  objective
## 1 2020-06-20 06:44:13  0.032 sec          0   1.05000 3144.82797
## 2 2020-06-20 06:44:13  0.038 sec          1   0.70000 3144.82797
## 3 2020-06-20 06:44:13  0.044 sec          2   0.46667 3144.82797
## 4 2020-06-20 06:44:13  0.049 sec          3   0.31111 3144.82797
## 5 2020-06-20 06:44:13  0.054 sec          4   0.32667 2931.70821
## 
## ---
##               timestamp   duration iterations step_size  objective
## 232 2020-06-20 06:44:15  1.264 sec        231   0.00017 1045.37528
## 233 2020-06-20 06:44:15  1.269 sec        232   0.00011 1045.37528
## 234 2020-06-20 06:44:15  1.274 sec        233   0.00012 1045.34632
## 235 2020-06-20 06:44:15  1.278 sec        234   0.00013 1045.33851
## 236 2020-06-20 06:44:15  1.282 sec        235   0.00013 1045.33367
## 237 2020-06-20 06:44:15  1.287 sec        236   0.00009 1045.33367
glrm_result
## Model Details:
## ==============
## 
## H2ODimReductionModel: glrm
## Model ID:  impute_glrm 
## Model Summary: 
##   number_of_iterations final_step_size final_objective_value
## 1                  237         0.00009            1045.33367
## 
## 
## H2ODimReductionMetrics: glrm
## ** Reported on training data. **
## 
## Sum of Squared Error (Numeric):  864.1832
## Misclassification Error (Categorical):  81
## Number of Numeric Entries:  2397
## Number of Categorical Entries:  1493
plot(glrm_result)

3.4.10 Review GLRM

# Don't use h2o's provided model$importance statistics; they are flawed.
# We need to calculate these manually for now (Apr. 2020).

# Extract compressed dataset.
new_data = as.data.frame(h2o::h2o.getFrame(glrm_result@model$representation_name))

# Calculate variances for each archetype.
(variances = sapply(new_data, stats::var))
##      Arch1      Arch2      Arch3      Arch4      Arch5      Arch6      Arch7 
## 0.02260095 0.02191188 0.02752955 0.02226097 0.02311360 0.02408451 0.02040544 
##      Arch8      Arch9     Arch10 
## 0.01575170 0.03315694 0.02512172
# Sort variances in descending order
(variances = variances[order(variances, decreasing = TRUE)])
##      Arch9      Arch3     Arch10      Arch6      Arch5      Arch1      Arch4 
## 0.03315694 0.02752955 0.02512172 0.02408451 0.02311360 0.02260095 0.02226097 
##      Arch2      Arch7      Arch8 
## 0.02191188 0.02040544 0.01575170
glrm_vars = data.frame(variances, pct_total = variances / sum(variances))
glrm_vars$cumulative_pct = cumsum(glrm_vars$pct_total)
glrm_vars$order = seq(nrow(glrm_vars))

glrm_vars
##         variances  pct_total cumulative_pct order
## Arch9  0.03315694 0.14053286      0.1405329     1
## Arch3  0.02752955 0.11668165      0.2572145     2
## Arch10 0.02512172 0.10647628      0.3636908     3
## Arch6  0.02408451 0.10208015      0.4657710     4
## Arch5  0.02311360 0.09796503      0.5637360     5
## Arch1  0.02260095 0.09579221      0.6595282     6
## Arch4  0.02226097 0.09435124      0.7538794     7
## Arch2  0.02191188 0.09287162      0.8467510     8
## Arch7  0.02040544 0.08648673      0.9332378     9
## Arch8  0.01575170 0.06676223      1.0000000    10
data.frame(
    component  = glrm_vars$order,
    PVE = glrm_vars$pct_total,
    CVE = glrm_vars$cumulative_pct
) %>%
    tidyr::gather(metric, variance_explained, -component) %>%
    ggplot(aes(component, variance_explained)) +
    geom_point() + theme_minimal() + 
    facet_wrap(~ metric, ncol = 1, scales = "free")

ggsave("visuals/imputation-glrm-component-plot-custom.png")
## Saving 7 x 5 in image
# The component plot above helps examine how many archetypes are informative.

# Reconstructed data from GLRM.
recon_df = h2o::h2o.reconstruct(glrm_result, h2o_df,
                                reverse_transform = TRUE)
# Fix column names.
names(recon_df) = names(impute_df)

# Convert from h2o object back to an R df.
recon_df = as.data.frame(recon_df)

#####################
# Quick quality review on age variable.

# Compare imputed values to known values.
known_age = !is.na(impute_df$age)
# Examine the RMSE: about 5.0.
sqrt(mean((impute_df$age[known_age] - recon_df$age[known_age])^2))
## [1] 5.018388
# Compare to median imputation, RMSE = 9.1
sqrt(mean((impute_df$age[known_age] - median(impute_df$age[known_age]))^2))
## [1] 9.122765
# Compare to mean imputation, RMSE = 9.1
sqrt(mean((impute_df$age[known_age] - mean(impute_df$age[known_age]))^2))
## [1] 9.095246

3.4.11 Evaluate imputation

# TODO: serialize GLRM h2o object for future reference.
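# One option (untested here): h2o::h2o.saveModel(glrm_result, path = "data")
# writes the model to disk; h2o::h2o.loadModel() restores it in a later session.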

# Calculate median/mode imputation for comparison to GLRM.
impute_info =
  ck37r::impute_missing_values(data,
                               # TODO: need to skip date variables, e.g. POSIXct.
                               # This is yielding an h2o error currently.
                               skip_vars = c(vars$exclude, vars$outcomes),
                               # Don't add indicators as we've already created those.
                               add_indicators = FALSE,
                               type = "standard",
                               verbose = TRUE)
## Found 11 variables with NAs.
## Running standard imputation.
## Imputing age (1 integer) with 5 NAs. Impute value: 55 
## Imputing sex (2 factor) with 2 NAs. Impute value: 1 
## Imputing cp (3 factor) with 4 NAs. Impute value: 0 
## Imputing chol (5 integer) with 3 NAs. Impute value: 240 
## Imputing fbs (6 integer) with 4 NAs. Impute value: 0 
## Imputing restecg (7 integer) with 5 NAs. Impute value: 1 
## Imputing thalach (8 integer) with 3 NAs. Impute value: 152 
## Imputing oldpeak (10 numeric) with 7 NAs. Impute value: 0.8 
## Imputing slope (11 factor) with 4 NAs. Impute value: 2 
## Imputing ca (12 factor) with 6 NAs. Impute value: 0 
## Imputing thal (13 factor) with 6 NAs. Impute value: 2
# Skip race because it's categorical (not present in this dataset, so this is a no-op here).
# Also skip the "impute to 0" variables.
(vars_with_missingness =
  var_df$var[var_df$missingness > 0 & !var_df$var %in% c("race") &
             !var_df$var %in% impute_to_0_vars])
##  [1] "ca"      "oldpeak" "restecg" "slope"   "age"     "sex"     "cp"     
##  [8] "thal"    "thalach" "chol"    "fbs"
# Bound the GLRM-reconstructed variables to the original [min, max] ranges.
for (var in vars_with_missingness) {
  row = var_df[var_df$var == var, , drop = FALSE]
  
  # Skip factor vars.
  if (row$class != "factor") {
    recon_df[[var]] = pmin(pmax(recon_df[[var]], row$min), row$max)
  }
}

# Round integer and ordinal vars back to be integers.
for (var in c(vars$integers, vars$ordinal)) {
  # round() snaps to the nearest integer; as.integer() then converts the storage
  # type (without the round(), as.integer() would truncate toward zero).
  recon_df[[var]] = as.integer(round(recon_df[[var]]))
}


# Loop over each variable and compare GLRM imputation to median/mode imputation
# Use RMSE as a comparison metric.
# TODO: use a training/test split to make this kosher.
impute_compare = data.frame(var = vars_with_missingness,
                            loss = losses[var_df$var %in% vars_with_missingness, "loss"],
                            missingness = var_df[var_df$var %in% vars_with_missingness, "missingness"],
                            error_glrm = NA,
                            error_median = NA,
                            pct_reduction = NA,
                            stringsAsFactors = FALSE)


# TODO: get this to work with categorical variables.
# For now, remove categorical variables.
#(impute_compare = subset(impute_compare, loss != "Categorical"))


for (var in impute_compare$var) {
  # Print classes to spot type mismatches between the original and reconstructed data.
  cat("Analyzing", var, class(data[[var]]), class(recon_df[[var]]), "\n")
  
  # Analyze the rows in which the variable is not missing.
  observed_rows = !is.na(data[[var]])
  
  # Calculate RMSE for GLRM.
  error_glrm = sqrt(mean((impute_df[observed_rows, var] -
                          recon_df[observed_rows, var])^2))
  
  # Compare to median imputation.
  error_median = sqrt(mean((impute_df[observed_rows, var] -
                            impute_info$impute_values[[var]])^2))
  
  # Save results
  impute_compare[impute_compare$var == var,
                 c("error_glrm", "error_median")] = c(error_glrm, error_median)
}
## Analyzing ca factor factor
## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors
## Warning in Ops.factor(impute_df[observed_rows, var],
## impute_info$impute_values[[var]]): '-' not meaningful for factors
## Analyzing oldpeak numeric numeric 
## Analyzing restecg integer numeric 
## Analyzing slope factor factor
## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors

## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors
## Analyzing age integer numeric 
## Analyzing sex factor factor
## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors

## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors
## Analyzing cp factor factor
## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors

## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors
## Analyzing thal factor factor
## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors

## Warning in Ops.factor(impute_df[observed_rows, var], recon_df[observed_rows, :
## '-' not meaningful for factors
## Analyzing thalach integer numeric 
## Analyzing chol integer numeric 
## Analyzing fbs integer numeric
impute_compare$pct_reduction =  1 - impute_compare$error_glrm / impute_compare$error_median

(impute_compare = impute_compare %>% arrange(desc(missingness)) %>% as.data.frame())
##        var        loss missingness  error_glrm error_median pct_reduction
## 1  oldpeak       Huber  0.02310231  0.78744295    1.1901624     0.3383735
## 2       ca Categorical  0.01980198          NA           NA            NA
## 3     thal Categorical  0.01980198          NA           NA            NA
## 4  restecg       Huber  0.01650165  0.16977460    0.7071068     0.7599025
## 5      age       Huber  0.01650165  5.01838842    9.1227645     0.4499049
## 6    slope Categorical  0.01320132          NA           NA            NA
## 7       cp Categorical  0.01320132          NA           NA            NA
## 8      fbs       Huber  0.01320132  0.09768869    0.3879455     0.7481896
## 9  thalach       Huber  0.00990099 16.00993723   22.8834729     0.3003712
## 10    chol       Huber  0.00990099 37.75530167   52.2104715     0.2768634
## 11     sex Categorical  0.00660066          NA           NA            NA
cat("Average percent reduction in RMSE:",
    round(100 * mean(impute_compare$pct_reduction, na.rm = TRUE), 1), "\n")
## Average percent reduction in RMSE: 47.9
save(impute_compare, file = "data/imputation-comparison-glrm.RData")

# Make a separate copy for use in the paper.
imput_comp = impute_compare

imput_comp$pct_reduction = round(imput_comp$pct_reduction * 100, 2)
imput_comp$missingness = round(imput_comp$missingness * 100, 2)

# Remove loss column.
imput_comp$loss = NULL

names(imput_comp) = c("Variable", "Missingness", "Error GLRM", "Error Median", "Percent reduction")


(kab_table = kable(imput_comp, format = "latex", digits = c(1, 1, 3, 3, 1),
                   caption = "Comparing missing value imputation using GLRM versus median/mode",
                   label = "imputation-comparison",
                   booktabs = TRUE))
cat(kab_table %>% kable_styling(latex_options = "striped"),
    file = "tables/imputation-comparison-glrm.tex")

rio::export(imput_comp, file = "tables/imputation-comparison-glrm.xlsx")
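
The warnings above show that RMSE is undefined for the factor variables, which is why they appear as NA in the comparison table. One possible extension (a sketch, not run here) would score those variables by misclassification rate instead:

for (var in impute_compare$var[impute_compare$loss == "Categorical"]) {
  observed_rows = !is.na(data[[var]])
  # Share of observed values that the GLRM reconstruction gets wrong.
  error_glrm = mean(as.character(impute_df[[var]][observed_rows]) !=
                    as.character(recon_df[[var]][observed_rows]))
  # Share of observed values that differ from the mode.
  error_mode = mean(as.character(impute_df[[var]][observed_rows]) !=
                    as.character(impute_info$impute_values[[var]]))
  impute_compare[impute_compare$var == var,
                 c("error_glrm", "error_median")] = c(error_glrm, error_mode)
}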

3.4.12 Replace missing values

# Now replace the missing values with imputed values.
for (var in impute_compare$var) {
  
  # Find the rows in which the variable is missing.
  missing_rows = is.na(data[[var]])
  
  data[missing_rows, var] = recon_df[missing_rows, var]
}

# Should be all 0's.
summary(colMeans(is.na(data)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0
colSums(is.na(data[, vars$predictors]))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        0        0        0        0        0        0        0        0 
##    exang  oldpeak    slope       ca     thal 
##        0        0        0        0        0
data = cbind(data, miss_inds)
# Drop the data copy stored inside impute_info to save memory.
impute_info$data = NULL

colnames(data)
##  [1] "age"          "sex"          "cp"           "trestbps"     "chol"        
##  [6] "fbs"          "restecg"      "thalach"      "exang"        "oldpeak"     
## [11] "slope"        "ca"           "thal"         "target"       "miss_age"    
## [16] "miss_sex"     "miss_cp"      "miss_chol"    "miss_fbs"     "miss_restecg"
## [21] "miss_thalach" "miss_exang"   "miss_oldpeak" "miss_slope"   "miss_ca"     
## [26] "miss_thal"
# Update the predictors with the new missingness indicators.
(vars$predictors = setdiff(names(data), c(vars$exclude, vars$outcomes)))
##  [1] "age"          "sex"          "cp"           "trestbps"     "chol"        
##  [6] "fbs"          "restecg"      "thalach"      "exang"        "oldpeak"     
## [11] "slope"        "ca"           "thal"         "miss_age"     "miss_sex"    
## [16] "miss_cp"      "miss_chol"    "miss_fbs"     "miss_restecg" "miss_thalach"
## [21] "miss_exang"   "miss_oldpeak" "miss_slope"   "miss_ca"      "miss_thal"
# Double-check missingness.
colSums(is.na(data))
##          age          sex           cp     trestbps         chol          fbs 
##            0            0            0            0            0            0 
##      restecg      thalach        exang      oldpeak        slope           ca 
##            0            0            0            0            0            0 
##         thal       target     miss_age     miss_sex      miss_cp    miss_chol 
##            0            0            0            0            0            0 
##     miss_fbs miss_restecg miss_thalach   miss_exang miss_oldpeak   miss_slope 
##            0            0            0            0            0            0 
##      miss_ca    miss_thal 
##            0            0

3.4.13 Shutdown h2o

This frees RAM, which is especially helpful on shared servers.

h2o::h2o.shutdown(prompt = FALSE)
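
As an alternative when the cluster will be reused later in the session, h2o::h2o.removeAll() frees all frames and models without shutting the cluster down.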

3.5 Update predictor summary

result = summarize_vars(data, vars$predictors, groups = vars$groups)
## Var: fbs 
## 
##                  0 0.0771160802470997  0.170076509369194  0.363667408225791 
##                255                  1                  1                  1 
##                  1 
##                 45 
## Var: restecg 
## 
##                 0 0.276255291712452 0.620070370142201  0.66372197263869 
##               146                 1                 1                 1 
## 0.950072559816494                 1  1.19642276841703                 2 
##                 1               149                 1                 3
# Export as a spreadsheet
# TODO: use prettier variable names for this export and the latex table.
rio::export(result$table, file = "tables/predictor-summary-imputed.xlsx")

# TODO: output as a kableExtra latex table.

var_df = result$table
data = result$data

3.6 Histogram condense

Apply histogram condensing to high-cardinality features to reduce their number of unique values.

uniq_val_threshold = 80L

# These are the continuous vars with a large number of unique values.
(dense_vars = var_df[var_df$uniq_vals > uniq_val_threshold, c("var", "uniq_vals")])
##        var uniq_vals
## 10 thalach        93
## 11    chol       155
hist_bins = uniq_val_threshold

for (dense_var in dense_vars$var) {
  # Confirm it has a large number of unique values.
  num_unique = length(unique(data[[dense_var]]))
  if (num_unique > uniq_val_threshold) {
    print(qplot(data[[dense_var]]) + theme_minimal() +
      labs(x = dense_var, y = "original values"))
    
    # Apply data-adaptive histogram binning to condense the variable.
    hist_vec2 = histogram::histogram(data[[dense_var]],
                                     control = list(maxbin = hist_bins))
    
    # Apply histogram binning to original data vector.
    cuts = cut(data[[dense_var]], breaks = hist_vec2$breaks,
               # If we don't specify this, all obs with lowest value will get an NA.
               include.lowest = TRUE)
    
    
    # Use the midpoint of each bin as the new value.
    mid_vals = hist_vec2$mids[as.numeric(cuts)]
    
    # Check for missing values in the dense vars.
    if (sum(is.na(mid_vals)) > 0) {
      stop("missing values in mid_vals")
    }
    
    # Update variable to use the mid_vals
    data[[dense_var]] = mid_vals
    print(qplot(mid_vals) + labs(x = dense_var, y = "mid_vals") + theme_minimal())
  }
  
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Choosing between regular and irregular histogram:
## 
## 1.Building regular histogram with maximum number of bins 53.
## - Choosing number of bins via maximum likelihood with BR penalty.
## - Number of bins chosen: 10.
## 
## 
## 2.Building irregular histogram.
## - Using finest grid based on observations.
## - Choosing number of bins via maximum likelihood with PENB penalty.
## - Computing weights for dynamic programming algorithm.
## - Now performing dynamic optimization.
## - Number of bins chosen: 4.
## 
## 
## 
## Regular histogram chosen.

## $breaks
##  [1]  71.0  84.1  97.2 110.3 123.4 136.5 149.6 162.7 175.8 188.9 202.0
## 
## $counts
##  [1]  1  6 11 26 35 53 78 63 26  4
## 
## $density
##  [1] 0.0002519336 0.0015116015 0.0027712695 0.0065502733 0.0088176757
##  [6] 0.0133524803 0.0196508200 0.0158718162 0.0065502733 0.0010077344
## 
## $mids
##  [1]  77.55  90.65 103.75 116.85 129.95 143.05 156.15 169.25 182.35 195.45
## 
## $xname
## [1] "data[[dense_var]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Choosing between regular and irregular histogram:
## 
## 1.Building regular histogram with maximum number of bins 53.
## - Choosing number of bins via maximum likelihood with BR penalty.
## - Number of bins chosen: 9.
## 
## 
## 2.Building irregular histogram.
## - Using finest grid based on observations.
## - Choosing number of bins via maximum likelihood with PENB penalty.
## - Using greedy procedure to recursively build a finest partition with at most 100 bins.
## - Pre-selected finest partition with 100 bins.
## - Computing weights for dynamic programming algorithm.
## - Now performing dynamic optimization.
## - Number of bins chosen: 5.
## 
## 
## 
## Regular histogram chosen.

## $breaks
##  [1] 126.0000 174.6667 223.3333 272.0000 320.6667 369.3333 418.0000 466.6667
##  [9] 515.3333 564.0000
## 
## $counts
## [1]  14  92 118  58  16   4   0   0   1
## 
## $density
## [1] 0.000949410 0.006238980 0.008002170 0.003933270 0.001085040 0.000271260
## [7] 0.000000000 0.000000000 0.000067815
## 
## $mids
## [1] 150.3333 199.0000 247.6667 296.3333 345.0000 393.6667 442.3333 491.0000
## [9] 539.6667
## 
## $xname
## [1] "data[[dense_var]]"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Check for missing data.
colSums(is.na(data[, dense_vars$var]))
## thalach    chol 
##       0       0
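
For intuition, the same midpoint condensing can be done with fixed-width bins using base R alone; a sketch (histogram::histogram instead chooses the bins by penalized likelihood):

x = data$chol  # any high-cardinality numeric vector
h = hist(x, breaks = 30, plot = FALSE)
# Map each value to the midpoint of its bin.
bins = cut(x, breaks = h$breaks, include.lowest = TRUE)
x_condensed = h$mids[as.integer(bins)]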

3.7 Update predictor summary

result = summarize_vars(data, vars$predictors, groups = vars$groups)
## Var: chol 
## 
## 150.333333333333              199 247.666666666667 296.333333333333 
##               14               92              118               58 
##              345 393.666666666667 539.666666666667 
##               16                4                1 
## Var: fbs 
## 
##                  0 0.0771160802470997  0.170076509369194  0.363667408225791 
##                255                  1                  1                  1 
##                  1 
##                 45 
## Var: restecg 
## 
##                 0 0.276255291712452 0.620070370142201  0.66372197263869 
##               146                 1                 1                 1 
## 0.950072559816494                 1  1.19642276841703                 2 
##                 1               149                 1                 3 
## Var: thalach 
## 
##  77.55  90.65 103.75 116.85 129.95 143.05 156.15 169.25 182.35 195.45 
##      1      6     11     26     35     53     78     63     26      4
# Export as a spreadsheet
# TODO: use prettier variable names for this export and the latex table.
rio::export(result$table, file = "tables/predictor-summary-imputed-condensed.xlsx")

# TODO: output as a kableExtra latex table.
var_df = result$table

3.8 Save imputed dataset

save(data, vars, var_df,
     file = "data/clean-impute.RData")