# This script compares two imputation methods, missForest and MICE.
# The preschool training dataset (with no missing values) used during the initial model development stage was used in this comparison. 
# 20% of missing data was randomly introduced into this complete training dataset and the two methods were used to impute the data separately. 
# R version 3.5.1 was used for imputation

# Set the working directory. The imputed datasets written later in this script
# use bare file names, so they are saved relative to this folder.
# NOTE(review): this is a hard-coded absolute path; adjust it before running
# the script on another machine.
setwd("/scratch/dk2e18/Asthma_Prediction_Model/Imputation/")

# libraries
library(dplyr)
library(missForest)
library(mice)

# Load data - data found in IOWBC_data.xlsx, sheet: "Preschool data"
data <- read.csv(file = "/../../Preschool_QC_1368IDs.csv", header = TRUE)
# Drop the column named X (a no-op if the file has no such column)
data$X <- NULL

# Columns used for the preschool model: the participant ID, the 12 candidate
# predictors and the outcome (Asthma_10YR)
model_columns <- c(
  "Study_ID", "Mat_age", "Birthweight", "Solid_food", "SDS_BMI_1",
  "SDS_BMI_4", "Total.Bf.duration", "Wheeze_4YR", "Cough_4YR",
  "Noct_Symp_4YR", "Atopy_4YR", "Polysensitisation_4YR", "SES",
  "Asthma_10YR"
)
selected_data <- data[, model_columns]

# Load initial model test set data - data found in
# IOWBC_training_test_data.xlsx, sheet: "Preschool test data"
test_data <- read.csv(file = "/../../Preschool_test_dataset_183IDs.csv", header = TRUE)

# Flag the participants that belong to the original test set
in_test_set <- selected_data$Study_ID %in% test_data$Study_ID

# Rows of the original test set - n=183
test <- selected_data[in_test_set, ]
# Everything else is the potential training set - n=1185
training <- selected_data[!in_test_set, ]

# From here on selected_data refers to the training rows only (same columns)
selected_data <- training[, model_columns]

# Keep only participants with no missing value in any selected column
complete_data <- na.omit(selected_data)

# Keep the IDs of the complete cases so they can be re-attached after imputation
IDs <- complete_data$Study_ID

# Report number of missing values for each variable - should be none
colSums(is.na(complete_data))
# The ID is not a predictor, so remove it before introducing missingness
complete_data$Study_ID <- NULL


# Introduce missing data - 20%
# Values are removed only from the 12 predictors (columns 1-12 of
# complete_data); the outcome Asthma_10YR (column 13) is left fully observed.
set.seed(123)
data_missing <- prodNA(complete_data[, 1:12], noNA = 0.2)

# Re-attach the outcome. cbind() names the new column after the expression it
# was given, i.e. literally "complete_data$Asthma_10YR"; that name is used by
# the rest of the script, so it is kept as is.
data_4 <- cbind(data_missing, complete_data$Asthma_10YR)

# Variables treated as categorical by the imputation methods
cols <- c(
  "Total.Bf.duration", "Wheeze_4YR", "Cough_4YR", "Noct_Symp_4YR",
  "Atopy_4YR", "Polysensitisation_4YR", "SES", "complete_data$Asthma_10YR"
)
# Convert each column on its own. apply() would first coerce the data frame to
# a matrix and hand back a character matrix, which (a) stays character instead
# of factor under R >= 4.0 and (b) rebuilds factor levels from strings, losing
# the numeric ordering of the levels.
data_4[cols] <- lapply(data_4[cols], as.factor)

# Check data summary
colSums(is.na(data_4))

#######################################
### Impute the data with missforest ###
#######################################
# missForest() signature, kept for reference:
#function (xmis, maxiter = 10, ntree = 100, variablewise = FALSE,
#    decreasing = FALSE, verbose = FALSE, mtry = floor(sqrt(ncol(xmis))),
#    replace = TRUE, classwt = NULL, cutoff = NULL, strata = NULL,
#    sampsize = NULL, nodesize = NULL, maxnodes = NULL, xtrue = NA,
#    parallelize = c("no", "variables", "forests"))

# Passing the complete data as xtrue makes missForest report the true
# imputation error alongside the out-of-bag estimate.
set.seed(123)
data_4imp <- missForest(
  xmis = data_4,
  variablewise = TRUE,
  verbose = TRUE,
  xtrue = complete_data
)
# 5 iterations (as recorded by the original author)

# OOB imputation error estimate
data_4imp$OOBerror

# True imputation error
data_4imp$error

# Put the participant IDs in front of the imputed values
imputed <- cbind(IDs, data_4imp$ximp)
# save dataset - data found in IOWBC_imputed_data.xlsx, sheet: "missForest complete data"
write.csv(imputed, file = "Imputed_missForest_preschool_complete_training_dataset_365IDs.csv")

#######################
### MICE imputation ###
#######################
# Imputation method for each column, keyed by column name so that the column
# order handed to mice() and the method vector cannot drift apart.
#   "norm"   - Bayesian linear regression (continuous variables)
#   "polr"   - proportional odds model (ordered categorical variables)
#   "logreg" - logistic regression (binary variables)
#   ""       - not imputed (the outcome has no missing values)
mice_methods <- c(
  Mat_age = "norm",
  Birthweight = "norm",
  Solid_food = "norm",
  Total.Bf.duration = "polr",
  SDS_BMI_1 = "norm",
  SDS_BMI_4 = "norm",
  Wheeze_4YR = "polr",
  Cough_4YR = "logreg",
  Noct_Symp_4YR = "logreg",
  Atopy_4YR = "logreg",
  Polysensitisation_4YR = "logreg",
  SES = "polr",
  "complete_data$Asthma_10YR" = ""
)

# Reorder the columns to match the method vector
data_4 <- data_4[, names(mice_methods)]

imp <- mice(
  data_4,
  m = 5,
  method = unname(mice_methods),
  where = is.na(data_4),
  maxit = 5,
  printFlag = TRUE,
  seed = 123
)

# Combine imputed datasets into one (long format: one row per participant per
# imputation, identified by the .id and .imp columns)
imputed <- complete(imp, action = "long")

# Alias of the long-format data; the categorical pooling step also reads it
ids <- imputed

# Calculate means for continuous variables
# Only the continuous columns are averaged: taking mean() of the factor
# columns would just yield NA plus a warning for every group.
avidscont <- ids %>%
  select(.id, Mat_age, Birthweight, Solid_food, SDS_BMI_1, SDS_BMI_4) %>%
  group_by(.id) %>%
  summarise_all(.funs = mean) %>%
  select(-.id)

cont <- as.data.frame(avidscont)
 
# Return the most frequent non-missing value of x.
#   x: an atomic vector or factor.
# Returns a numeric scalar when x is numeric, otherwise the value as a
# character string. Ties go to the first value in table() order. When x has no
# non-missing values, a typed NA is returned instead of a zero-length result.
mode_fun <- function(x) {
  counts <- table(x)
  if (length(counts) == 0L) {
    # Nothing to count (empty or all-NA input)
    return(if (is.numeric(x)) NA_real_ else NA_character_)
  }
  mode0 <- names(counts)[which.max(counts)]
  if (is.numeric(x)) {
    # table() stores the values as names, so convert back to a number
    return(as.numeric(mode0))
  }
  mode0
}
 
# Pool the categorical variables: for each participant (.id) take the most
# frequent value across the imputed datasets
avidscat <- ids %>%
  group_by(.id) %>%
  summarise_all(.funs = mode_fun) %>%
  select(-.id, -.imp)

# In the long-format data the outcome column is called
# complete_data.Asthma_10YR (dot rather than $) - presumably made syntactic
# when the long data frame was built.
categorical_vars <- c(
  "Total.Bf.duration", "Wheeze_4YR", "Cough_4YR", "Noct_Symp_4YR",
  "Atopy_4YR", "Polysensitisation_4YR", "SES", "complete_data.Asthma_10YR"
)
# NOTE(review): the variable name `cat` masks base::cat as a variable; kept
# unchanged so existing references keep working.
cat <- as.data.frame(avidscat)[, categorical_vars]

# Continuous means first, then categorical modes. Unlike the missForest
# output, no participant ID column is attached here.
impdata <- cbind(cont, cat)

# save dataset - data found in IOWBC_imputed_data.xlsx, sheet: "MICE complete data"
write.csv(impdata, file = "Imputed_MICE_preschool_complete_training_dataset_365IDs.csv")


