Chapter 2 Initial data import
2.1 Import raw data
# Calling import() through the explicit "rio::" namespace prefix documents which
# package supplies it, and prevents another attached package's import() from
# being picked up by accident because of package loading order.
data <- rio::import("data-raw/heart.csv")
# Check dimensions and variables.
dim(data)
## [1] 303  14
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"
## [13] "thal"     "target"
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"
## [13] "thal"     "target"
2.5 Categoricals to factors
Ensure that any categorical variables are specified as factors and not numeric/integer variables.
# Convert the listed integer-coded categorical columns to factors.
# TODO: treat ordinal variables as ordinal rather than categorical.
data <- ck37r::categoricals_to_factors(
  data,
  categoricals = c("sex", "ca", "cp", "slope", "thal"),
  verbose = TRUE
)
## Converting sex from integer to factor. Unique vals: 2 
## Converting ca from integer to factor. Unique vals: 5 
## Converting cp from integer to factor. Unique vals: 4 
## Converting slope from integer to factor. Unique vals: 3 
## Converting thal from integer to factor. Unique vals: 4
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 4 3 2 2 1 1 2 2 3 3 ...
##  $ trestbps: int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach : int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
##  $ target  : int  1 1 1 1 1 1 1 1 1 1 ...
2.6 Data structure
Specify the outcome variable name, variables excluded from the analysis, and predictor variables (covariates).
# Central registry of variable roles used throughout the analysis.
vars <- list(
  # Variables to exclude from the analysis, such as ID fields (none here).
  exclude = NULL,

  # Outcome variable(s): one, or several for e.g. sensitivity analyses.
  outcomes = "target",

  # Placeholder; the predictors are filled in by the code that follows.
  predictors = NULL
)
# Every column that is neither excluded nor an outcome is treated as a predictor.
vars$predictors <- setdiff(names(data), c(vars$exclude, vars$outcomes))
print(vars$predictors)
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"
2.7 Extreme value review
2.8 Remove constant predictors
We don’t have any constant predictors, but it is good to confirm.
# Count the distinct non-missing values in each predictor. A predictor with
# fewer than two distinct values is constant.
# vapply() is used instead of sapply() so the result is always a named integer
# vector, even if there happen to be no predictors (sapply() would return an
# empty list in that case).
unique_vals <- vapply(
  data[, vars$predictors, drop = FALSE],
  # Make sure that we don't count NA as a unique value.
  function(col_vals) length(setdiff(unique(col_vals), NA)),
  FUN.VALUE = integer(1L)
)
# Looks good, no constant columns (the minimum below is 2).
summary(unique_vals)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    3.00    4.00   30.62   41.00  152.00
# Remove constant columns from the covariate file.
# Predictors with fewer than two distinct non-missing values are constant.
constant_columns <- vars$predictors[unique_vals < 2L]

# Only touch the data and the predictor list when there is something to drop.
if (length(constant_columns) > 0L) {
  data <- data[, is.na(match(names(data), constant_columns)), drop = FALSE]
  vars$predictors <- setdiff(vars$predictors, constant_columns)
}
  
cat("Removed", length(constant_columns), "constant predictors.\n")
## Removed 0 constant predictors.
2.9 Tutorial-only: add random missingness
Our example dataset has no missingness, so for this tutorial we are artificially adding missingness.
# Add missingness to certain predictors.
# Number of (row, predictor) draws to set to missing. Both draws are made with
# replacement, so a repeated pair would yield fewer distinct missing cells.
missing_cells <- 50L
set.seed(1)
# Randomly sample one row index per draw.
miss_rows <- sample(nrow(data), missing_cells, replace = TRUE)
# Randomly sample one predictor name per draw.
miss_preds <- sample(vars$predictors, missing_cells, replace = TRUE)
miss_df <- data.frame(miss_rows, miss_preds, stringsAsFactors = FALSE)
# seq_len() gives an empty sequence when there are no draws, whereas
# seq(0) would iterate over c(1, 0) and index a non-existent row.
for (row_i in seq_len(nrow(miss_df))) {
  row <- miss_df[row_i, , drop = FALSE]
  data[row$miss_rows, row$miss_preds] <- NA
}
# Confirm that we now have some missing data.
colSums(is.na(data))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        5        2        4        0        3        4        5        3 
##    exang  oldpeak    slope       ca     thal   target 
##        1        7        4        6        6        0
2.10 Summarize predictors
We will use this to support:
- Review by the team, such as to identify additional cleaning of outliers
- To inform the loss functions used for GLRM interpretation, and
- As a possible table in the manuscript (supplemental info most likely).
# Columns: variable name, type, # of unique values, mode, mean, median, min,
# max, missingness.
# Groups defined below: demo, vitals, exam, labs, biomarkers.
vars$groups <- list(
  demo = c("age", "sex"),
  vitals = "trestbps",
  exam = c("cp", "thalach", "exang", "thal"),
  labs = c("chol", "fbs"),
  biomarkers = c("restecg", "oldpeak", "slope", "ca")
)
# Note which predictors we consider to be integers (none are flagged here).
vars$integers <- NULL
print(vars$integers)
## NULL
## NULL
## $exclude
## NULL
## 
## $outcomes
## [1] "target"
## 
## $predictors
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"    
## 
## $groups
## $groups$demo
## [1] "age" "sex"
## 
## $groups$vitals
## [1] "trestbps"
## 
## $groups$exam
## [1] "cp"      "thalach" "exang"   "thal"   
## 
## $groups$labs
## [1] "chol" "fbs" 
## 
## $groups$biomarkers
## [1] "restecg" "oldpeak" "slope"   "ca"
# Could specify integers and ordinal arguments here.
# Summarize the predictors; the `table` and `data` elements of the result are
# used below.
result <- summarize_vars(
  data,
  vars = vars$predictors,
  groups = vars$groups
)
# Export as a spreadsheet.
# TODO: use prettier variable names for this export and the latex table.
rio::export(
  result$table,
  file = "tables/predictor-summary-unimputed.xlsx"
)
# TODO: output as a kableExtra latex table.
var_df <- result$table
# NOTE(review): presumably result$data is `data` with any adjustments made by
# summarize_vars() -- confirm against that function's definition.
data <- result$data