# This script applies MICE imputation to the 8 features selected for inclusion in the CAPE model. All individuals not included in the CAPE test set were included in this imputed training dataset.
# R version 3.5.1 was used for imputation

# Set the working directory; the CSV files read and written below are
# referenced by bare file name and are therefore resolved against it.
# NOTE(review): "/../../" looks like a placeholder path - point it at the
# real data folder before running.
setwd("/../../")

# libraries
library(dplyr)
library(mice)

###########################################################
### 1 - Check missingness patterns of the training data ###
###########################################################
# Read the early life data for all individuals with an asthma outcome at
# age 10 - data found in IOWBC_data.xlsx, sheet: "Early life data"
data <- read.csv("Early_life_QC_1368IDs.csv", header = TRUE)
# Drop the column named X if present
# (presumably a row-index column left by an earlier write.csv - confirm)
data[["X"]] <- NULL

# Keep the ID, the 8 predictors used by the infancy model and the outcome
selected_data <- select(
  data,
  "Study_ID", "Mat_age", "Birthweight", "Solid_food", "SDS_BMI_1",
  "Total.Bf.duration", "Wheeze_2YR", "Cough_2YR", "SES", "Asthma_10YR"
)

# Read the cleaned, unstandardised early life test set used during the
# initial model development - data found in IOWBC_training_test_data.xlsx,
# sheet: "Early life test set"
test_data <- read.csv("Early_life_test_dataset_255IDs.csv", header = TRUE)

# Flag every row whose Study_ID also occurs in the test set
in_test_set <- selected_data$Study_ID %in% test_data$Study_ID

# Rows belonging to the original test set (original note: n=176)
test <- selected_data[in_test_set, ]
# All remaining rows form the training set
# (original notes: "potential training set" n=1192; n=1113)
# NOTE(review): the n= figures above disagree with each other; the file names
# suggest 255 test IDs and 1113 training IDs - confirm and correct.
training <- selected_data[!in_test_set, ]

# Keep the IDs aside and drop Study_ID so that it is not used as a covariate
# in the imputation model
training_IDs <- training$Study_ID
training[["Study_ID"]] <- NULL

# Convert the categorical variables of the training set to factors.
# lapply() converts each column on its own. The previous apply() call first
# coerced the data frame to a matrix, which (a) only yielded factors when
# data.frame() still defaulted to stringsAsFactors = TRUE (R < 4.0) and
# (b) could pad or re-sort the level labels as text.
cols <- c("Total.Bf.duration", "Wheeze_2YR", "Cough_2YR", "SES", "Asthma_10YR")
training[cols] <- lapply(training[cols], as.factor)


# Count the missing values in every column of the training set
training %>%
  is.na() %>%
  colSums()
# Output recorded from the original run:
#          Mat_age       Birthweight        Solid_food         SDS_BMI_1
#                0                27               174               382
#Total.Bf.duration        Wheeze_2YR         Cough_2YR               SES
#              150               294               297                68
#      Asthma_10YR
#                0


###########################
### 2 - MICE imputation ###
###########################
# Imputation method for each column, given in the column order of `training`.
# An empty string means the column is not imputed.
imputation_methods <- c(
  "",       # Mat_age
  "norm",   # Birthweight
  "norm",   # Solid_food
  "norm",   # SDS_BMI_1
  "polr",   # Total.Bf.duration
  "polr",   # Wheeze_2YR
  "logreg", # Cough_2YR
  "polr",   # SES
  ""        # Asthma_10YR
)

# Cells to impute: every missing entry of the training set
cells_to_impute <- is.na(training)

# Generate 5 imputed datasets with 5 iterations each; the seed is fixed so
# that the run can be reproduced
imp <- mice(
  data = training,
  m = 5,
  method = imputation_methods,
  where = cells_to_impute,
  maxit = 5,
  printFlag = TRUE,
  seed = 123
)

# Stack all imputed datasets into one long data frame; complete() adds the
# columns .imp (imputation number) and .id (row identifier)
imputed <- complete(imp, action = "long")

# Pool the continuous variables: average each individual's values across the
# imputed datasets.
# Only the continuous columns are passed to mean(). Averaging every column
# also hit the factor columns, which raised a warning per individual and
# produced NA columns that were discarded afterwards.
# Rows come back in group_by(.id) order, exactly as before.
ids <- imputed
avidscont <- ids %>%
  select(.id, Mat_age, Birthweight, Solid_food, SDS_BMI_1) %>%
  group_by(.id) %>%
  summarise_all(.funs = mean) %>%
  select(-.id)

cont <- as.data.frame(avidscont)
cont <- cont %>%
  select(Mat_age, Birthweight, Solid_food, SDS_BMI_1)

# Calculate modal value for categorical variables
#
# mode_fun(x) returns the most frequent non-missing value of x.
#   x       : atomic vector or factor
#   returns : a length-one numeric when x is numeric, otherwise a length-one
#             character string (factor input yields the level label).
#             Ties are resolved in favour of the value that comes first in
#             table(x). If x has no non-missing values, NA is returned
#             (NA_real_ for numeric x, NA_character_ otherwise) rather than a
#             zero-length result or a level that never occurs.
mode_fun <- function(x) {
  counts <- table(x)
  # Nothing observed: empty input, all NA, or a factor with unused levels only
  if (sum(counts) == 0L) {
    if (is.numeric(x)) {
      return(NA_real_)
    }
    return(NA_character_)
  }
  mode0 <- names(counts)[which.max(counts)]
  if (is.numeric(x)) {
    return(as.numeric(mode0))
  }
  mode0
}
 
# Pool the categorical variables: take each individual's modal value across
# the imputed datasets. Only the categorical columns are passed to mode_fun().
avidscat_keyed <- ids %>%
  select(.id, Total.Bf.duration, Wheeze_2YR, Cough_2YR, SES, Asthma_10YR) %>%
  group_by(.id) %>%
  summarise_all(.funs = mode_fun)

# Row identifiers in the order produced by group_by(); `cont` was built with
# the same grouping and therefore shares this row order
group_keys <- avidscat_keyed$.id
avidscat <- avidscat_keyed %>%
  select(-.id)

cat <- as.data.frame(avidscat)
cat <- cat %>%
  select(Total.Bf.duration, Wheeze_2YR, Cough_2YR, SES, Asthma_10YR)
impdata <- cbind(cont, cat)

# group_by() returns rows sorted by .id. If .id holds row names stored as
# text or factor levels, that order differs from the row order of the
# training set, and binding training_IDs by position would attach IDs to the
# wrong individuals. Restore the order in which the identifiers first appear
# in the stacked data before adding the IDs.
# NOTE(review): this assumes the first imputed dataset in the long format
# keeps the row order of the training set - confirm against mice::complete().
row_order <- unique(ids$.id)
stopifnot(
  length(row_order) == length(training_IDs),
  nrow(impdata) == length(training_IDs)
)
impdata <- impdata[match(row_order, group_keys), , drop = FALSE]
rownames(impdata) <- NULL
impdata <- cbind(training_IDs, impdata)

# Save imputed training dataset - data found in IOWBC_imputed_data.xlsx, sheet: "MICE imputed earlylife training"
write.csv(impdata, file="MICE_imputed_earlylife_training_dataset_1113ID.csv")