Chapter 2 Initial data import

2.1 Import raw data

# Call import() through its "rio::" namespace prefix: this documents which
# package supplies the function and guards against another package's import()
# being picked up by accident because of package loading order.
data = rio::import(file = "data-raw/heart.csv")

# Check dimensions and variables.
dim(data)
## [1] 303  14
names(data)
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"
# Convert every column name to lowercase, then print the names to confirm.
names(data) = tolower(names(data))
names(data)
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

2.2 Merge data

# Merge raw dataframes here.
# dplyr::left_join()

2.3 Recoding

# Recode any values here.

2.4 Exclusions

Exclude observations if desired.

# Apply any exclusion criteria here.

2.5 Categoricals to factors

Ensure that any categorical variables are specified as factors and not numeric/integer variables.

# Convert the listed categorical columns to factors; verbose = TRUE reports
# each conversion as it happens.
# TODO: treat ordinal variables as ordinal rather than categorical.
data = ck37r::categoricals_to_factors(
  data,
  categoricals = c("sex", "ca", "cp", "slope", "thal"),
  verbose = TRUE
)
## Converting sex from integer to factor. Unique vals: 2 
## Converting ca from integer to factor. Unique vals: 5 
## Converting cp from integer to factor. Unique vals: 4 
## Converting slope from integer to factor. Unique vals: 3 
## Converting thal from integer to factor. Unique vals: 4
# Inspect the updated data frame
str(data)
## 'data.frame':    303 obs. of  14 variables:
##  $ age     : int  63 37 41 56 57 57 56 44 52 57 ...
##  $ sex     : Factor w/ 2 levels "0","1": 2 2 1 2 1 2 1 2 2 2 ...
##  $ cp      : Factor w/ 4 levels "0","1","2","3": 4 3 2 2 1 1 2 2 3 3 ...
##  $ trestbps: int  145 130 130 120 120 140 140 120 172 150 ...
##  $ chol    : int  233 250 204 236 354 192 294 263 199 168 ...
##  $ fbs     : int  1 0 0 0 0 0 0 0 1 0 ...
##  $ restecg : int  0 1 0 1 1 1 0 1 1 1 ...
##  $ thalach : int  150 187 172 178 163 148 153 173 162 174 ...
##  $ exang   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ oldpeak : num  2.3 3.5 1.4 0.8 0.6 0.4 1.3 0 0.5 1.6 ...
##  $ slope   : Factor w/ 3 levels "0","1","2": 1 1 3 3 3 2 2 3 3 3 ...
##  $ ca      : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ thal    : Factor w/ 4 levels "0","1","2","3": 2 3 3 3 3 2 3 4 4 3 ...
##  $ target  : int  1 1 1 1 1 1 1 1 1 1 ...

2.6 Data structure

Specify the outcome variable name, variables excluded from the analysis, and predictor variables (covariates).

# Describe the role that each column of `data` plays in the analysis.
vars = list(
  # Variables to exclude from the analysis, such as ID fields (none here).
  exclude = NULL,

  # Outcome variables - could be one or multiple (e.g. sensitivity analyses).
  outcomes = "target",

  # Predictor variables are filled in by the statement right after this list.
  predictors = NULL
)

# Every column that is neither excluded nor an outcome is a predictor.
(vars[["predictors"]] = setdiff(names(data),
                                c(vars[["exclude"]], vars[["outcomes"]])))
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"

2.7 Extreme value review

# Possibly recode certain extreme values, especially after reviewing the
# predictor summary results (below).

2.8 Remove constant predictors

We don’t have any constant predictors, but it is good to confirm.

# Count the unique values in each predictor, excluding NAs.
# vapply() is used instead of sapply() so that the result is always a named
# integer vector, even if there are no predictor columns (sapply() would
# return an empty list in that case).
unique_vals = vapply(data[, vars$predictors, drop = FALSE],
                     # Make sure that we don't count NA as a unique value.
                     function(col_vals) length(setdiff(unique(col_vals), NA)),
                     FUN.VALUE = integer(1L))

# Looks good, no constant columns.
summary(unique_vals)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00    3.00    4.00   30.62   41.00  152.00
# Predictors with fewer than two distinct non-missing values are constant.
constant_columns = vars$predictors[unique_vals < 2L]

# Drop any constant columns from the predictor list and from the data itself.
if (length(constant_columns) > 0L) {
  vars$predictors = setdiff(vars$predictors, constant_columns)
  data = data[, !(names(data) %in% constant_columns), drop = FALSE]
}

cat("Removed", length(constant_columns), "constant predictors.\n")
## Removed 0 constant predictors.
rm(constant_columns)

2.9 Tutorial-only: add random missingness

Our example dataset has no missingness, so for this tutorial we are artificially adding missingness.

# Add missingness to certain predictors.

# Number of (row, predictor) draws to set to missing. Both samples below are
# drawn with replacement, so a repeated pair would yield fewer distinct
# missing cells than draws.
missing_cells = 50L

# Fix the seed so the same cells are blanked out on every run.
set.seed(1)

# Randomly sample the patient (row) for each draw.
miss_rows = sample(nrow(data), missing_cells, replace = TRUE)

# Randomly sample the predictor (column) for each draw.
miss_preds = sample(vars$predictors, missing_cells, replace = TRUE)

miss_df = data.frame(miss_rows, miss_preds, stringsAsFactors = FALSE)

# seq_len() yields an empty loop when there are zero draws, whereas
# seq(0) would iterate over c(1, 0) and index a non-existent row.
for (row_i in seq_len(nrow(miss_df))) {
  data[miss_df$miss_rows[row_i], miss_df$miss_preds[row_i]] = NA
}

# Confirm that we now have some missing data.
colSums(is.na(data))
##      age      sex       cp trestbps     chol      fbs  restecg  thalach 
##        5        2        4        0        3        4        5        3 
##    exang  oldpeak    slope       ca     thal   target 
##        1        7        4        6        6        0

2.10 Summarize predictors

We will use this summary to support:

  • Review by the team, such as to identify additional cleaning of outliers,
  • Choosing the loss functions used for GLRM interpretation, and
  • A possible table in the manuscript (supplemental info most likely).
# Summary columns: variable name, type, # of unique values, mode, mean, median,
# min, max, missingness

# Assign each predictor to one group: demographics, vitals, exam findings,
# labs, and biomarkers.
vars[["groups"]] = list(
  demo = c("age", "sex"),
  vitals = "trestbps",
  exam = c("cp", "thalach", "exang", "thal"),
  labs = c("chol", "fbs"),
  biomarkers = c("restecg", "oldpeak", "slope", "ca")
)

# Predictors we consider to be integers: none declared. Assigning NULL to a
# list element does not create it, so `vars` gains no `integers` entry.
(vars[["integers"]] = NULL)
## NULL
# Predictors that are ordinal: none declared, so no `ordinal` entry either.
(vars[["ordinal"]] = NULL)
## NULL
vars
## $exclude
## NULL
## 
## $outcomes
## [1] "target"
## 
## $predictors
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"    
## 
## $groups
## $groups$demo
## [1] "age" "sex"
## 
## $groups$vitals
## [1] "trestbps"
## 
## $groups$exam
## [1] "cp"      "thalach" "exang"   "thal"   
## 
## $groups$labs
## [1] "chol" "fbs" 
## 
## $groups$biomarkers
## [1] "restecg" "oldpeak" "slope"   "ca"
# Summarize the predictors by group.
# Could specify integers and ordinal arguments here.
result = summarize_vars(data, vars = vars$predictors, groups = vars$groups)

# Keep the summary table and the data frame returned alongside it.
var_df = result$table
data = result$data

# Export the summary table as a spreadsheet
# TODO: use prettier variable names for this export and the latex table.
rio::export(var_df, file = "tables/predictor-summary-unimputed.xlsx")

# TODO: output as a kableExtra latex table.

Save unimputed dataset

# Write the data frame, the vars list that defines the data structure, and the
# predictor summary table to a single .RData file.
save(list = c("data", "vars", "var_df"),
     file = "data/clean-merge-unimputed.RData")