# This script applies MICE imputation to the 12 features selected for inclusion in the CAPP model. All individuals not included in the CAPP test set were included in this imputed training dataset.
# R version 3.5.1 was used for imputation

# set working directory
setwd("/../../")

# libraries
library(dplyr)
library(mice)

###########################################################
### 1 - Check missingness patterns of the training data ###
###########################################################
# Load preschool data for all individuals with an asthma outcome at age 10 -
# data found in IOWBC_data.xlsx, sheet: "Preschool data"
data <- read.csv(file = "Preschool_QC_1368IDs.csv", header = TRUE)
# Drop the column named "X" if present (presumably row names written by an
# earlier write.csv - TODO confirm)
data$X <- NULL

# Subset the 12 variables included in the preschool model, together with the
# individual identifier (Study_ID) and the outcome (Asthma_10YR)
selected_data <- data %>%
  select(
    "Study_ID", "Mat_age", "Birthweight", "Solid_food", "SDS_BMI_1",
    "SDS_BMI_4", "Total.Bf.duration", "Wheeze_4YR", "Cough_4YR",
    "Noct_Symp_4YR", "Atopy_4YR", "Polysensitisation_4YR", "SES",
    "Asthma_10YR"
  )

# Load the cleaned, unstandardised, preschool model test set used during the
# initial model development - data found in IOWBC_training_test_data.xlsx,
# sheet: "Preschool test set"
test_data <- read.csv(file = "Preschool_test_dataset_183IDs.csv", header = TRUE)

# Rows whose Study_ID appears in the original test set
# (the original notes give n=176)
test <- subset(selected_data, Study_ID %in% test_data$Study_ID)
# Potential training set: every row whose Study_ID is NOT in the test set.
# NOTE(review): the original notes give n=1192 and then n=1185 for this set
# (the output file name uses 1185) - confirm which count is current.
training <- subset(selected_data, !Study_ID %in% test_data$Study_ID)

# Remove Study ID as a covariate in the imputation model; the IDs are kept
# separately, in row order, so they can be re-attached after imputation
training_IDs <- training$Study_ID
training$Study_ID <- NULL

# Correct training set data types to have categorical variables be factors.
# lapply() converts each column on its own. apply() would first coerce the
# data frame to a matrix and hand back character data, which only became
# factors through the stringsAsFactors = TRUE default of R < 4.0.
cols <- c(
  "Total.Bf.duration", "Wheeze_4YR", "Cough_4YR", "Noct_Symp_4YR",
  "Atopy_4YR", "Polysensitisation_4YR", "SES", "Asthma_10YR"
)
training[cols] <- lapply(training[cols], as.factor)

# Fix the column order: the imputation methods passed to mice() further down
# are listed in exactly this order
training <- training[, c(
  "Mat_age", "Birthweight", "Solid_food", "Total.Bf.duration", "SDS_BMI_1",
  "SDS_BMI_4", "Wheeze_4YR", "Cough_4YR", "Noct_Symp_4YR", "Atopy_4YR",
  "Polysensitisation_4YR", "SES", "Asthma_10YR"
)]

# Report number of missing values for each variable
# (print() so the report also appears when the script is run via source())
print(colSums(is.na(training)))

#######################
### MICE imputation ### 
#######################
# Imputation method per variable, keyed by column name:
#   ""       - variable is not imputed
#   "norm"   - Bayesian linear regression (continuous variables)
#   "logreg" - logistic regression (two-level factors)
#   "polr"   - proportional odds model (factors with more than two levels)
# NOTE(review): Wheeze_4YR uses "polr" while the other 4YR symptom variables
# use "logreg" - presumably it has more than two levels; confirm.
imputation_methods <- c(
  Mat_age = "",
  Birthweight = "norm",
  Solid_food = "norm",
  Total.Bf.duration = "polr",
  SDS_BMI_1 = "norm",
  SDS_BMI_4 = "norm",
  Wheeze_4YR = "polr",
  Cough_4YR = "logreg",
  Noct_Symp_4YR = "logreg",
  Atopy_4YR = "logreg",
  Polysensitisation_4YR = "logreg",
  SES = "polr",
  Asthma_10YR = ""
)

# Generate 5 imputed datasets with 5 iterations each; every missing cell is
# a target for imputation and the seed makes the run reproducible
imp <- mice(
  data = training,
  m = 5,
  method = unname(imputation_methods[names(training)]),
  where = is.na(training),
  maxit = 5,
  printFlag = TRUE,
  seed = 123
)

# Combine imputed datasets into one long data frame: the completed datasets
# are stacked on top of each other, each in the row order of `training`.
# mice::complete is called explicitly so another attached package that also
# exports complete() cannot mask it.
imputed <- mice::complete(imp, action = "long")

# The pooled rows are later bound to training_IDs by position, so the
# grouping key must sort in the original row order. The .id column returned
# by complete() can hold row names as text depending on the mice version
# (assumed, not checked here), which would sort "1", "10", "100", ... and
# silently misalign the IDs.
# Replace it with the integer row position within `training`.
stopifnot(nrow(imputed) == imp$m * nrow(training))
ids <- imputed
ids$.id <- rep(seq_len(nrow(training)), times = imp$m)

# Calculate means for continuous variables across the imputed datasets
# (restricted to the continuous columns; the mean of a factor is NA)
avidscont <- ids %>%
  select(.id, Mat_age, Birthweight, Solid_food, SDS_BMI_1, SDS_BMI_4) %>%
  group_by(.id) %>%
  summarise_all(.funs = mean) %>%
  select(-.id)

cont <- as.data.frame(avidscont)
cont <- cont %>%
  select(Mat_age, Birthweight, Solid_food, SDS_BMI_1, SDS_BMI_4)

# Calculate modal value for categorical variables
#
# mode_fun(x) returns the most frequent non-missing value of x.
#   x       - a vector or factor
#   returns - a single value: numeric when x is numeric (integer input is
#             returned as double), otherwise the value/level as a character
#             string. Ties go to the first value in table() order. When x
#             holds no non-missing values the result is NA (NA_real_ for
#             numeric x, NA_character_ otherwise).
mode_fun <- function(x) {
  counts <- table(x)
  # table() ignores NA, so a zero total means there is nothing to count.
  # Without this guard an all-NA factor would return its first level and an
  # all-NA vector would return a zero-length result.
  if (sum(counts) == 0L) {
    if (is.numeric(x)) {
      return(NA_real_)
    }
    return(NA_character_)
  }
  mode0 <- names(counts)[which.max(counts)]
  if (is.numeric(x)) {
    return(as.numeric(mode0))
  }
  mode0
}
 
# Modal value of each categorical variable across the imputed datasets.
# Only the categorical columns are summarised: taking the mode of the
# continuous columns is unnecessary, and a continuous column that is left
# un-imputed and still contains NA would make the summary fail.
avidscat <- ids %>%
  select(
    .id, Total.Bf.duration, Wheeze_4YR, Cough_4YR, Noct_Symp_4YR,
    Atopy_4YR, Polysensitisation_4YR, SES, Asthma_10YR
  ) %>%
  group_by(.id) %>%
  summarise_all(.funs = mode_fun) %>%
  select(-.id)

cat <- as.data.frame(avidscat)
cat <- cat %>%
  select(
    Total.Bf.duration, Wheeze_4YR, Cough_4YR, Noct_Symp_4YR, Atopy_4YR,
    Polysensitisation_4YR, SES, Asthma_10YR
  )

# The pooled tables and the stored IDs are combined by row position, so
# they must all have one row per training individual
stopifnot(
  nrow(cont) == nrow(cat),
  nrow(cat) == length(training_IDs)
)
impdata <- cbind(cont, cat)
impdata <- cbind(training_IDs, impdata)

# Save imputed training dataset - data found in IOWBC_imputed_data.xlsx,
# sheet: "MICE imputed preschool training"
write.csv(impdata, file="MICE_imputed_preschool_training_dataset_1185ID.csv")