Chapter 4 Dataset finalization

Load data

# Restore the objects written by 2-clean-impute.Rmd: `data` and `vars`.
# Call load() through the base:: namespace explicitly because renv
# provides its own load() as well.
base::load(file = "data/clean-impute.RData")

4.1 Factors to indicators

# Expand the factor predictors into indicator columns; verbose = TRUE logs
# which factors are converted and the indicator names that are created.
result = ck37r::factors_to_indicators(
  data[vars$predictors],
  verbose = TRUE
)
## Converting factors (5): sex, cp, slope, ca, thal
## Converting sex from a factor to a matrix (2 levels).
## : sex_1 
## Converting cp from a factor to a matrix (4 levels).
## : cp_1 cp_2 cp_3 
## Converting slope from a factor to a matrix (3 levels).
## : slope_1 slope_2 
## Converting ca from a factor to a matrix (5 levels).
## : ca_1 ca_2 ca_3 ca_4 
## Converting thal from a factor to a matrix (4 levels).
## : thal_1 thal_2 thal_3 
## Combining factor matrices into a data frame.
# Show which components the conversion returned.
names(result)
## [1] "data"         "predictors"   "factor_vars"  "factor_names"
# Swap the predictor columns in one step: keep only the non-predictor columns
# of the data frame, then append the converted predictor data to them.
data = cbind(
  data[!names(data) %in% vars$predictors],
  result$data
)

# Rebuild the predictor list: drop the original factor predictors and append
# the names of the new indicator predictors.
vars$predictors = c(
  setdiff(vars$predictors, result$factor_vars),
  unlist(result$factor_names)
)
# Display the updated predictor names.
print(vars$predictors)
##  [1] "age"          "trestbps"     "chol"         "fbs"          "restecg"     
##  [6] "thalach"      "exang"        "oldpeak"      "miss_age"     "miss_sex"    
## [11] "miss_cp"      "miss_chol"    "miss_fbs"     "miss_restecg" "miss_thalach"
## [16] "miss_exang"   "miss_oldpeak" "miss_slope"   "miss_ca"      "miss_thal"   
## [21] "sex_1"        "cp_1"         "cp_2"         "cp_3"         "slope_1"     
## [26] "slope_2"      "ca_1"         "ca_2"         "ca_3"         "ca_4"        
## [31] "thal_1"       "thal_2"       "thal_3"
# Sanity check: every listed predictor must exist as a column of the data.
if (any(!(vars$predictors %in% names(data)))) {
  missing_vars = setdiff(vars$predictors, names(data))
  stop("Missing new indicators that were added: ", toString(missing_vars))
}

# The conversion result is no longer needed.
rm(result)

4.2 Remove collinear predictors

This is not essential, but nice for the linear regression estimators.

This step needs to run after imputation, because the collinearity check currently cannot handle missing values.

# Remove predictors that are linear combinations of other predictors.
# NOTE: assumes that there are no factor variables.
# drop = FALSE keeps a data frame even when there is a single predictor.
linear_combos = caret::findLinearCombos(data[, vars$predictors, drop = FALSE])

# The positions in $remove index the predictor columns passed above, so
# translate them to names once. Empty when nothing needs to be removed.
collinear_vars = vars$predictors[linear_combos$remove]

# `conf` is not one of the objects loaded at the top of this file
# (data, vars). Use conf$verbose when a configuration object is available;
# otherwise default to reporting rather than failing with
# "object 'conf' not found" in the branch that actually removes columns.
verbose_output = TRUE
if (exists("conf") && (is.list(conf) || is.environment(conf)) &&
    !is.null(conf$verbose)) {
  verbose_output = conf$verbose
}

if (length(collinear_vars) > 0L) {

  if (verbose_output) {
    cat("Removing", length(collinear_vars),
        "predictors due to collinearity.\n")
    cat("Vars:", paste0(collinear_vars, collapse = ", "), "\n")
  }

  # Make sure we don't switch to a vector if only 1 column remains.
  data = data[, !colnames(data) %in% collinear_vars, drop = FALSE]

  vars$predictors = setdiff(vars$predictors, collinear_vars)

  if (verbose_output) {
    cat("Updated predictor count:", length(vars$predictors), "\n")
  }
} else {
  cat("No linear duplication found.\n")
}
## No linear duplication found.
# Clean up the temporaries created by this step.
rm(linear_combos, collinear_vars, verbose_output)

4.3 Confirm predictor matrix invertibility

This is not essential, but nice for the linear regression estimators.

# Covariance matrix of the predictors.
# NOTE: this requires that no factors be included.
predictor_cov = stats::cov(data[vars$predictors])

# QR decomposition of the covariance matrix; its rank is compared against
# the number of predictor columns below.
cov_qr = base::qr(predictor_cov)

# Full rank means the QR rank equals the number of columns.
if (cov_qr$rank == ncol(predictor_cov)) {
  cat("Predictor matrix is full rank.\n")
} else {
  cat("Warning: matrix of predictors is not full rank.\n")
  cat("Predictor columns:", ncol(predictor_cov), "QR rank:", cov_qr$rank, "\n")
}
## Predictor matrix is full rank.
rm(predictor_cov, cov_qr)

Save finalized dataset

# Write the finalized data frame and the variable lists to disk.
save(
  list = c("data", "vars"),
  file = "data/clean-finalize-imputed.RData"
)