Chapter 4 Dataset finalization
Load data
4.1 Factors to indicators
## Converting factors (5): sex, cp, slope, ca, thal
## Converting sex from a factor to a matrix (2 levels).
## : sex_1
## Converting cp from a factor to a matrix (4 levels).
## : cp_1 cp_2 cp_3
## Converting slope from a factor to a matrix (3 levels).
## : slope_1 slope_2
## Converting ca from a factor to a matrix (5 levels).
## : ca_1 ca_2 ca_3 ca_4
## Converting thal from a factor to a matrix (4 levels).
## : thal_1 thal_2 thal_3
## Combining factor matrices into a data frame.
## [1] "data" "predictors" "factor_vars" "factor_names"
# Drop every current predictor column from the data frame; the converted
# columns are re-attached in the next step.
data[vars$predictors] = NULL
# Append the converted columns returned by the factor-to-indicator step.
# NOTE(review): presumably result$data holds all predictors (not only the
# expanded factors) -- confirm against the conversion call.
data = cbind(data, result$data)
# Refresh the predictor list: drop the original factor predictors and
# append the names of the new indicator predictors.
vars$predictors = c(
  setdiff(vars$predictors, result$factor_vars),
  unlist(result$factor_names)
)
# Show the updated predictor list.
print(vars$predictors)
## [1] "age" "trestbps" "chol" "fbs" "restecg"
## [6] "thalach" "exang" "oldpeak" "miss_age" "miss_sex"
## [11] "miss_cp" "miss_chol" "miss_fbs" "miss_restecg" "miss_thalach"
## [16] "miss_exang" "miss_oldpeak" "miss_slope" "miss_ca" "miss_thal"
## [21] "sex_1" "cp_1" "cp_2" "cp_3" "slope_1"
## [26] "slope_2" "ca_1" "ca_2" "ca_3" "ca_4"
## [31] "thal_1" "thal_2" "thal_3"
4.2 Remove collinear predictors
This is not essential, but nice for the linear regression estimators.
This step must run after imputation, because the collinearity check currently cannot handle missing values.
# Detect predictors that are linear combinations of other predictors and
# drop them from both the data frame and the predictor list.
# NOTE: assumes that there are no factor variables.
linear_combos = caret::findLinearCombos(data[, vars$predictors])
if (length(linear_combos$remove) == 0L) {
  cat("No linear duplication found.\n")
} else {
  # Names of the predictors flagged for removal (indices refer to the
  # column order of vars$predictors).
  collinear_vars = vars$predictors[linear_combos$remove]
  if (conf$verbose) {
    cat("Removing", length(linear_combos$remove),
        "predictors due to collinearity.\n")
    cat("Vars:", paste0(collinear_vars, collapse = ", "), "\n")
  }
  # drop = FALSE keeps a data frame even if only 1 column remains.
  data = data[, !(colnames(data) %in% collinear_vars), drop = FALSE]
  vars$predictors = setdiff(vars$predictors, collinear_vars)
  if (conf$verbose) {
    cat("Updated predictor count:", length(vars$predictors), "\n")
  }
}
## No linear duplication found.
4.3 Confirm predictor matrix invertibility
This is not essential, but nice for the linear regression estimators.
# Covariance matrix of the predictor columns.
# NOTE: this requires that no factors be included.
cov_mat = stats::cov(data[vars$predictors])
# QR decomposition of the covariance matrix; its rank is compared against
# the number of predictor columns below.
qr_cov = base::qr(cov_mat)
# Full rank means the QR rank equals the column count.
if (qr_cov$rank == ncol(cov_mat)) {
  cat("Predictor matrix is full rank.\n")
} else {
  cat("Warning: matrix of predictors is not full rank.\n")
  cat("Predictor columns:", ncol(cov_mat), "QR rank:", qr_cov$rank, "\n")
}
## Predictor matrix is full rank.