Chapter 2 Initial data import
2.1 Import raw data
# The "rio::" prefix makes R resolve import() from the rio package explicitly.
# That documents where import() comes from, and it guarantees that a function
# of the same name from another attached package cannot be picked up by
# accident because of package loading order.
data = rio::import(file = "data-raw/heart.csv")
# Inspect the dimensions (rows, columns) of the imported data.
dim(data)
## [1] 303 14
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
2.5 Categoricals to factors
Ensure that any categorical variables are specified as factors and not numeric/integer variables.
# Convert the listed categorical columns to factors; verbose = TRUE reports
# each conversion as it happens.
# TODO: treat ordinal variables as ordinal rather than categorical.
data = ck37r::categoricals_to_factors(
  data,
  categoricals = c("sex", "ca", "cp", "slope", "thal"),
  verbose = TRUE
)
## Converting sex from integer to factor. Unique vals: 2
## Converting ca from integer to factor. Unique vals: 5
## Converting cp from integer to factor. Unique vals: 4
## Converting slope from integer to factor. Unique vals: 3
## Converting thal from integer to factor. Unique vals: 4
## 'data.frame': 303 obs. of 14 variables:
## $ age : int 63 37 41 56 57 57 56 44 52 57 ...
## $ sex : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 2 2 2 ...
## $ cp : Factor w/ 4 levels "0","1","2","3": 4 3 2 2 1 1 2 2 3 3 ...
## $ trestbps: int 145 130 130 120 120 140 140 120 172 150 ...
## $ chol : int 233 250 204 236 354 192 294 263 199 168 ...
## $ fbs : int 1 0 0 0 0 0 0 0 1 0 ...
## $ restecg : int 0 1 0 1 1 1 0 1 1 1 ...
## $ thalach : int 150 187 172 178 163 148 153 173 162 174 ...
## $ exang : int 0 0 0 0 1 0 0 0 0 0 ...
## $ oldpeak : num 2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
## $ slope : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
## $ ca : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ thal : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
## $ target : int 1 1 1 1 1 1 1 1 1 1 ...
2.6 Data structure
Specify the outcome variable name, variables excluded from the analysis, and predictor variables (covariates).
# Record the role that each column plays in the analysis.
vars = list(
  # Variables to exclude from the analysis, such as ID fields (none here).
  exclude = NULL,
  # Outcome variable(s); several may be listed (e.g. for sensitivity analyses).
  outcomes = "target",
  # Placeholder: the predictors are derived from the data just below.
  predictors = NULL
)
# Every column that is neither excluded nor an outcome is treated as a predictor.
vars$predictors = setdiff(names(data), c(vars$exclude, vars$outcomes))
# Display the resulting predictor names.
vars$predictors
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal"
2.7 Extreme value review
2.8 Remove constant predictors
We don’t have any constant predictors, but it is good to confirm.
# Count the unique values in each predictor, excluding NAs.
# vapply() is used instead of sapply() so that the result is always a named
# integer vector, even when there are no predictors at all (sapply() would
# return an empty list in that case).
unique_vals = vapply(
  data[, vars$predictors, drop = FALSE],
  # Make sure that we don't count NA as a unique value.
  function(col_vals) length(setdiff(unique(col_vals), NA)),
  FUN.VALUE = integer(1L)
)
# A minimum of 2 or more in this summary means there are no constant columns.
summary(unique_vals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 3.00 4.00 30.62 41.00 152.00
# Predictors with fewer than two distinct non-missing values carry no
# information, so drop them from both the data and the predictor list.
constant_columns = vars$predictors[unique_vals < 2L]
if (length(constant_columns) != 0L) {
  # Keep only the columns whose names are not flagged as constant.
  data = data[!(names(data) %in% constant_columns)]
  vars$predictors = setdiff(vars$predictors, constant_columns)
}
# Report how many predictors were dropped (possibly zero).
cat("Removed", length(constant_columns), "constant predictors.\n")
## Removed 0 constant predictors.
2.9 Tutorial-only: add random missingness
Our example dataset has no missingness, so for this tutorial we are artificially adding missingness.
# Add missingness to certain predictors.
# Number of data cells to set to be missing. Rows and predictors are both
# sampled with replacement, so the same (row, predictor) cell can be drawn
# more than once and the number of distinct missing cells may be smaller.
missing_cells = 50L
# Fix the seed so the injected missingness is reproducible.
set.seed(1)
# Randomly sample the row (patient) for each cell that will be set to missing.
miss_rows = sample(nrow(data), missing_cells, replace = TRUE)
# Randomly sample the predictor for each cell that will be set to missing.
miss_preds = sample(vars$predictors, missing_cells, replace = TRUE)
miss_df = data.frame(miss_rows, miss_preds, stringsAsFactors = FALSE)
# seq_len() gives an empty sequence when there is nothing to blank out,
# whereas seq() would iterate over c(1, 0) and index a non-existent row.
for (row_i in seq_len(nrow(miss_df))) {
  # Named "cell" rather than "row" to avoid masking base::row().
  cell = miss_df[row_i, , drop = FALSE]
  data[cell$miss_rows, cell$miss_preds] = NA
}
# Confirm that we now have some missing data.
colSums(is.na(data))
## age sex cp trestbps chol fbs restecg thalach
## 5 2 4 0 3 4 5 3
## exang oldpeak slope ca thal target
## 1 7 4 6 6 0
2.10 Summarize predictors
We will use this summary to support:
- Review by the team, such as to identify additional cleaning of outliers,
- Informing the loss functions used for GLRM interpretation, and
- A possible table in the manuscript (most likely as supplemental information).
# Columns: variable name, type, # of unique values, mode, mean, median, min,
# max, missingness.
# Possible groups: demographic, biomarker, notes, score, clinical history
# (including family). The groups actually used for this dataset are defined
# in the list below.
vars[["groups"]] = list(
  demo = c("age", "sex"),
  vitals = "trestbps",
  exam = c("cp", "thalach", "exang", "thal"),
  labs = c("chol", "fbs"),
  biomarkers = c("restecg", "oldpeak", "slope", "ca")
)
# Note which predictors we consider to be integers (none are specified here).
(vars$integers = NULL)
## NULL
## NULL
## $exclude
## NULL
##
## $outcomes
## [1] "target"
##
## $predictors
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal"
##
## $groups
## $groups$demo
## [1] "age" "sex"
##
## $groups$vitals
## [1] "trestbps"
##
## $groups$exam
## [1] "cp" "thalach" "exang" "thal"
##
## $groups$labs
## [1] "chol" "fbs"
##
## $groups$biomarkers
## [1] "restecg" "oldpeak" "slope" "ca"
# Summarize the predictors by group.
# Could specify integers and ordinal arguments here.
result = summarize_vars(
  data,
  vars = vars$predictors,
  groups = vars$groups
)
# Export as a spreadsheet.
# TODO: use prettier variable names for this export and the latex table.
rio::export(
  x = result$table,
  file = "tables/predictor-summary-unimputed.xlsx"
)
# TODO: output as a kableExtra latex table.
# Keep the summary table and the data returned by summarize_vars().
var_df = result$table
data = result$data