# This script is used to remove outliers among the 54 raw phenotypic variables used to develop the CAPE and CAPP models
# R version 3.6.1 is used

# Set working directory (three levels above the directory the script is run from)
setwd("../../..")

# Load packages
library(dplyr)

########################################
### Check distributions and Outliers ###
########################################
# Import cleaned data for all individuals with asthma outcome data at age 10 - found in IOWBC_data.xlsx, sheet: "Cleaned data (n=1368)"
dataset <- read.csv(
  file = "Asthma_Prediction_Model_Dataset_1368Ids.csv",
  header = TRUE
)

# Keep the ID plus the continuous variables that get a distribution check and outlier screen
cont_var <- dataset %>%
  select(Study_ID, Mat_age, Birthweight, Solid_food, SDS_BMI_1, SDS_BMI_4)

####################
### Maternal age ###
####################
# Histogram of maternal age with a normal curve (same mean and SD) overlaid,
# written to the graphics file "Mat_age_histogram_1368"
pdf("Mat_age_histogram_1368")
mat_age_h <- hist(
  cont_var$Mat_age,
  breaks = 10, density = 10, col = "lightgray",
  xlab = "Maternal Age", main = "Maternal Age Histogram"
)
# Normal density evaluated at 40 evenly spaced points across the observed range
xfit <- seq(min(cont_var$Mat_age), max(cont_var$Mat_age), length.out = 40)
# Rescale density to the count axis: density x bin width x number of observations
bin_width <- diff(mat_age_h$mids[1:2])
yfit <- dnorm(xfit, mean = mean(cont_var$Mat_age), sd = sd(cont_var$Mat_age)) *
  bin_width * length(cont_var$Mat_age)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

### Mat age outlier ID
mean(cont_var$Mat_age)  # 26.98026
sd(cont_var$Mat_age)    # 5.288216

# cut_offs - 4SD
lower <- mean(cont_var$Mat_age) - 4 * sd(cont_var$Mat_age)  # 5.827399
upper <- mean(cont_var$Mat_age) + 4 * sd(cont_var$Mat_age)  # 48.13313

# Rows of the full dataset whose maternal age lies outside the 4SD band.
# The literals are the printed (rounded) values of `lower` and `upper` above.
mat_age_outside <- cont_var$Mat_age < 5.827399 | cont_var$Mat_age > 48.13313
Mat_age_4SD <- dataset[which(mat_age_outside), , drop = FALSE]
# 0 outliers


###################
### Birthweight ###
###################
# Birthweight values with missing entries dropped; used for both the plot and the cut-offs
bw <- na.omit(cont_var$Birthweight)

# Histogram with a normal curve (same mean and SD) overlaid,
# written to the graphics file "Birthweight_histogram_1368"
pdf("Birthweight_histogram_1368")
birthweight_h <- hist(
  bw,
  breaks = 10, density = 10, col = "lightgray",
  xlab = "Birthweight", main = "Birthweight Histogram"
)
xfit <- seq(min(bw), max(bw), length.out = 40)
# Rescale density to the count axis: density x bin width x number of observations
bin_width <- diff(birthweight_h$mids[1:2])
yfit <- dnorm(xfit, mean = mean(bw), sd = sd(bw)) * bin_width * length(bw)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()


### Birthweight outlier ID
mean(bw)  # 3.411954
sd(bw)    # 0.5243568

# cut_offs - 4SD
lower <- mean(bw) - 4 * sd(bw)  # 1.314527
upper <- mean(bw) + 4 * sd(bw)  # 5.509381

# Rows of the full dataset whose birthweight lies outside the 4SD band
# (rows with missing birthweight are not selected).
# The literals are the printed (rounded) values of `lower` and `upper` above.
birthweight_outside <- cont_var$Birthweight < 1.314527 | cont_var$Birthweight > 5.509381
Birthweight_4SD <- dataset[which(birthweight_outside), , drop = FALSE]
# 5 outliers

##################
### Solid_food ###
##################
# Age at solid food introduction (weeks) with missing entries dropped, for plotting
solids <- na.omit(cont_var$Solid_food)

# Histogram with a normal curve (same mean and SD) overlaid,
# written to the graphics file "Solid_food_histogram_1368"
pdf("Solid_food_histogram_1368")
solid_food_h <- hist(
  solids,
  breaks = 10, density = 10, col = "lightgray",
  xlab = "Solid food introduction (weeks)",
  main = "Solid Food Introduction Histogram"
)
xfit <- seq(min(solids), max(solids), length.out = 40)
# Rescale density to the count axis: density x bin width x number of observations
bin_width <- diff(solid_food_h$mids[1:2])
yfit <- dnorm(xfit, mean = mean(solids), sd = sd(solids)) * bin_width * length(solids)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()


### Solid food outliers ###
# Same non-missing values as `solids`, under the name used for the cut-off calculation
sf <- na.omit(cont_var$Solid_food)
mean(sf)  # 14.52951
sd(sf)    # 4.70721

# cut_offs - 4SD
lower <- mean(sf) - 4 * sd(sf)  # -4.299329
upper <- mean(sf) + 4 * sd(sf)  # 33.35835

# Rows of the full dataset whose solid-food age lies outside the 4SD band
# (rows with a missing value are not selected).
# The literals are the printed (rounded) values of `lower` and `upper` above.
solid_food_outside <- cont_var$Solid_food < -4.299329 | cont_var$Solid_food > 33.35835
Solid_food_4SD <- dataset[which(solid_food_outside), , drop = FALSE]
# 9 outliers


#############
### BMI_1 ###
#############
# Checked original BMI and standardised BMI distributions - both are normally distributed

### Original BMI ###
# `data_A` is not created in the visible part of this script. Use it when it
# already exists in the session; otherwise fall back to `dataset`, and stop
# with a clear message when the chosen data frame has no BMI_1 column.
bmi1_source <- if (exists("data_A")) data_A else dataset
if (!("BMI_1" %in% names(bmi1_source))) {
  stop("BMI_1 column not found: load the data frame holding raw BMI_1 ",
       "(data_A) before running this section", call. = FALSE)
}
bmi1 <- na.omit(bmi1_source$BMI_1)
# Missing values are already removed, so a plain summary is sufficient
summary(bmi1)
# Values below 5 are set to missing and dropped before plotting
bmi1[bmi1 < 5] <- NA
bmi1 <- na.omit(bmi1)

# Histogram of raw BMI at 1 year with a normal curve (same mean and SD) overlaid
pdf("BMI_1_histogram_1368")
bmi1_h <- hist(bmi1, breaks = 10, density = 10,
               col = "lightgray", xlab = "BMI at 1 year",
               main = "BMI at One Year Old Histogram")
xfit <- seq(min(bmi1), max(bmi1), length.out = 40)
yfit <- dnorm(xfit, mean = mean(bmi1), sd = sd(bmi1))
# Rescale density to the count axis: density x bin width x number of observations
yfit <- yfit * diff(bmi1_h$mids[1:2]) * length(bmi1)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

### SDS_BMI_1 ###
# Same plot for the standardised BMI score; `bmi1` and `bmi1_h` are reused
bmi1 <- na.omit(cont_var$SDS_BMI_1)
pdf("SDS_BMI_1_histogram_1368")
bmi1_h <- hist(bmi1, breaks = 10, density = 10,
               col = "lightgray", xlab = "SDS BMI at 1 year",
               main = "SDS BMI at One Year Old Histogram")
xfit <- seq(min(bmi1), max(bmi1), length.out = 40)
yfit <- dnorm(xfit, mean = mean(bmi1), sd = sd(bmi1))
yfit <- yfit * diff(bmi1_h$mids[1:2]) * length(bmi1)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

# To check if there are any outliers, returned to the original height and weight variables, identified 4SD outlier IDs and excluded from the SDS BMI variables
# Data found in IOWBC_data.xlsx, sheet: "Raw BMI data"
# stringsAsFactors = TRUE pins the R 3.6.1 default so text-coded columns arrive as
# factors, which the factor-based conversions further down expect.
data <- read.csv(file = "BMI_variables_for_outlier_check.csv", header = TRUE,
                 stringsAsFactors = TRUE)

### BMI @1 outlier ID
## Height @1
# Column 2 of `data` is used as height at 1 year; it carries text codes for
# unavailable values. Converting through as.character() gives the right numbers
# whether read.csv() returned a factor or a character column, whereas
# as.numeric(levels(x)[x]) silently yields an empty vector for character input.
h1 <- as.character(data[[2]])
h1[h1 %in% c("Not seen", "Missing", "Drop out")] <- NA
h1 <- na.omit(as.numeric(h1))

mean(h1)  # 71.85471
sd(h1)    # 4.12185

# Histogram with a normal curve (same mean and SD) overlaid
pdf("BMI_1YR_height_histogram_1368")
height1 <- hist(h1, breaks = 10, density = 10,
                col = "lightgray", xlab = "Height (cm)",
                main = "Height 1 year Histogram")
xfit <- seq(min(h1), max(h1), length.out = 40)
yfit <- dnorm(xfit, mean = mean(h1), sd = sd(h1))
# Rescale density to the count axis: density x bin width x number of observations
yfit <- yfit * diff(height1$mids[1:2]) * length(h1)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

# cut_offs - 4SD
lower <- mean(h1) - 4 * sd(h1)  # 55.36731
upper <- mean(h1) + 4 * sd(h1)  # 88.34211

# The literals are the printed (rounded) values of `lower` and `upper` above.
# NOTE(review): if HeightCM.1 is the text-coded column parsed above and is stored
# as a factor, `<` / `>` on it return NA and no rows are selected - confirm that
# this column is numeric in `data`.
h1_4SD <- subset(data, data$HeightCM.1 < 55.36731 | data$HeightCM.1 > 88.34211)
# 13 outliers

## Weight @1
# Column 3 of `data` is used as weight at 1 year; it carries text codes for
# unavailable values. Converting through as.character() gives the right numbers
# whether read.csv() returned a factor or a character column, whereas
# as.numeric(levels(x)[x]) silently yields an empty vector for character input.
w1 <- as.character(data[[3]])
w1[w1 %in% c("Not seen", "Missing", "Drop out")] <- NA
w1 <- na.omit(as.numeric(w1))

mean(w1)  # 9.019452
sd(w1)    # 1.09608

# Histogram with a normal curve (same mean and SD) overlaid
pdf("BMI_1YR_weight_histogram_1368")
weight1 <- hist(w1, breaks = 5, density = 10,
                col = "lightgray", xlab = "Weight (kg)",
                main = "Weight 1 year Histogram")
xfit <- seq(min(w1), max(w1), length.out = 40)
yfit <- dnorm(xfit, mean = mean(w1), sd = sd(w1))
# Rescale density to the count axis: density x bin width x number of observations
yfit <- yfit * diff(weight1$mids[1:2]) * length(w1)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

# cut_offs - 4SD
lower <- mean(w1) - 4 * sd(w1)  #  4.635132
upper <- mean(w1) + 4 * sd(w1)  #  13.40377

# The literals are the printed (rounded) values of `lower` and `upper` above.
# NOTE(review): if WeightKG.1 is the text-coded column parsed above and is stored
# as a factor, `<` / `>` on it return NA and no rows are selected - confirm that
# this column is numeric in `data`.
w1_4SD <- subset(data, data$WeightKG.1 < 4.635132 | data$WeightKG.1 > 13.40377)
# 0 outliers



#############
### BMI_4 ###
#############
# Checked original BMI and standardised BMI distributions - both are normally distributed

### Original BMI ###
# `data_A` is not created in the visible part of this script. Use it when it
# already exists in the session; otherwise fall back to `dataset`, and stop
# with a clear message when the chosen data frame has no BMI_4 column.
bmi4_source <- if (exists("data_A")) data_A else dataset
if (!("BMI_4" %in% names(bmi4_source))) {
  stop("BMI_4 column not found: load the data frame holding raw BMI_4 ",
       "(data_A) before running this section", call. = FALSE)
}
bmi4 <- na.omit(bmi4_source$BMI_4)
# Missing values are already removed, so a plain summary is sufficient
summary(bmi4)
# Values below 5 are set to missing and dropped before plotting
bmi4[bmi4 < 5] <- NA
bmi4 <- na.omit(bmi4)

# Histogram of raw BMI at 4 years with a normal curve (same mean and SD) overlaid
pdf("BMI_4_histogram_1368")
bmi4_h <- hist(bmi4, breaks = 10, density = 10,
               col = "lightgray", xlab = "BMI at 4 year",
               main = "BMI at Four Years Old Histogram")
xfit <- seq(min(bmi4), max(bmi4), length.out = 40)
yfit <- dnorm(xfit, mean = mean(bmi4), sd = sd(bmi4))
# Rescale density to the count axis: density x bin width x number of observations
yfit <- yfit * diff(bmi4_h$mids[1:2]) * length(bmi4)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

### SDS_BMI_4 ###
# Same plot for the standardised BMI score; `bmi4` and `bmi4_h` are reused
bmi4 <- na.omit(cont_var$SDS_BMI_4)
pdf("SDS_BMI_4_histogram_1368")
bmi4_h <- hist(bmi4, breaks = 10, density = 10,
               col = "lightgray", xlab = "SDS BMI at 4 year",
               main = "SDS BMI at Four Years Old Histogram")
xfit <- seq(min(bmi4), max(bmi4), length.out = 40)
yfit <- dnorm(xfit, mean = mean(bmi4), sd = sd(bmi4))
yfit <- yfit * diff(bmi4_h$mids[1:2]) * length(bmi4)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

# To check if there are any outliers, returned to the original height and weight variables, identified 4SD outlier IDs and excluded from the SDS BMI variables
# Data found in IOWBC_data.xlsx, sheet: "Raw BMI data"
# stringsAsFactors = TRUE pins the R 3.6.1 default so text-coded columns arrive as
# factors, which the factor-based conversions further down expect.
data <- read.csv(file = "BMI_variables_for_outlier_check.csv", header = TRUE,
                 stringsAsFactors = TRUE)

### BMI @4 outlier ID
## Height @4
# Column 4 of `data` is used as height at 4 years; it carries text codes for
# unavailable values. Converting through as.character() gives the right numbers
# whether read.csv() returned a factor or a character column, whereas
# as.numeric(levels(x)[x]) silently yields an empty vector for character input.
h4 <- as.character(data[[4]])
h4[h4 %in% c("Not seen", "Missing", "Drop out")] <- NA
h4 <- na.omit(as.numeric(h4))

mean(h4)  # 103.953
sd(h4)    # 4.449111

# Histogram with a normal curve (same mean and SD) overlaid
pdf("BMI_4YR_height_histogram_1368")
height4 <- hist(h4, breaks = 10, density = 10,
                col = "lightgray", xlab = "Height (cm)",
                main = "Height 4 year Histogram")
xfit <- seq(min(h4), max(h4), length.out = 40)
yfit <- dnorm(xfit, mean = mean(h4), sd = sd(h4))
# Rescale density to the count axis: density x bin width x number of observations
yfit <- yfit * diff(height4$mids[1:2]) * length(h4)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

# cut_offs - 4SD
lower <- mean(h4) - 4 * sd(h4)  # 86.1565
upper <- mean(h4) + 4 * sd(h4)  # 121.7495

# The literals are the printed (rounded) values of `lower` and `upper` above.
# NOTE(review): if HeightCM.4 is the text-coded column parsed above and is stored
# as a factor, `<` / `>` on it return NA and no rows are selected - confirm that
# this column is numeric in `data`.
h4_4SD <- subset(data, data$HeightCM.4 < 86.1565 | data$HeightCM.4 > 121.7495)
# 1 outliers

## Weight @4
# Column 5 of `data` is used as weight at 4 years; the codes -200, -102 and -100
# are treated as missing before the values are made numeric
w4 <- data[[5]]
w4_is_code <- w4 == "-200" | w4 == "-102" | w4 == "-100"
w4[which(w4_is_code)] <- NA
w4 <- na.omit(as.numeric(w4))

mean(w4)  # 17.53775
sd(w4)    # 2.283572

# Histogram with a normal curve (same mean and SD) overlaid
pdf("BMI_4YR_weight_histogram_1368")
weight4 <- hist(
  w4,
  breaks = 10, density = 10, col = "lightgray",
  xlab = "Weight (kg)", main = "Weight 4 year Histogram"
)
xfit <- seq(min(w4), max(w4), length.out = 40)
# Rescale density to the count axis: density x bin width x number of observations
bin_width <- diff(weight4$mids[1:2])
yfit <- dnorm(xfit, mean = mean(w4), sd = sd(w4)) * bin_width * length(w4)
lines(xfit, yfit, col = "black", lwd = 2)
dev.off()

# cut_offs - 4SD
lower <- mean(w4) - 4 * sd(w4)  #  8.403464
upper <- mean(w4) + 4 * sd(w4)  #  26.67204

# Rows with a positive weight (i.e. not one of the negative codes) that fall
# outside the 4SD band. The literals are the printed (rounded) cut-offs above.
w4_outside <- data$WeightKG.4 > 0 &
  (data$WeightKG.4 < 8.403464 | data$WeightKG.4 > 26.67204)
w4_4SD <- data[which(w4_outside), , drop = FALSE]
# 6 outliers


#######################
### Remove outliers ###
#######################
# Print the dimensions (rows, columns) of each outlier subset, in the same order as before
for (outlier_set in list(Birthweight_4SD, Solid_food_4SD,
                         h1_4SD, w1_4SD, h4_4SD, w4_4SD)) {
  print(dim(outlier_set))
}


# Mark all outliers as NA in the 1368 dataset
# Each step prints the number of missing values before and after the replacement.
data_B <- read.csv(
  file = "Asthma_Prediction_Model_Dataset_1368Ids.csv",
  header = TRUE
)

# Remove Mat age outliers
sum(is.na(data_B$Mat_age))
# 0
mat_age_rows <- which(data_B$Mat_age < 5.827399 | data_B$Mat_age > 48.13313)
data_B$Mat_age[mat_age_rows] <- NA
sum(is.na(data_B$Mat_age))
# 0

# Remove Birthweight outliers
sum(is.na(data_B$Birthweight))
# 22
birthweight_rows <- which(data_B$Birthweight < 1.314527 | data_B$Birthweight > 5.509381)
data_B$Birthweight[birthweight_rows] <- NA
sum(is.na(data_B$Birthweight))
# 27

# Remove Solid food outliers
sum(is.na(data_B$Solid_food))
# 165
solid_food_rows <- which(data_B$Solid_food < -4.299329 | data_B$Solid_food > 33.35835)
data_B$Solid_food[solid_food_rows] <- NA
sum(is.na(data_B$Solid_food))
# 174


# Remove SDS_BMI_1 outliers (Study_IDs listed explicitly)
sum(is.na(data_B$SDS_BMI_1))
# 369
bmi1_outlier_ids <- c(3, 68, 124, 159, 172, 229, 422, 945, 985,
                      1179, 1194, 1195, 1394)
data_B$SDS_BMI_1[data_B$Study_ID %in% bmi1_outlier_ids] <- NA
sum(is.na(data_B$SDS_BMI_1))
# 382

# Remove SDS_BMI_4 outliers (Study_IDs listed explicitly)
sum(is.na(data_B$SDS_BMI_4))
# 360
bmi4_outlier_ids <- c(723, 91, 474, 758, 1221, 1445, 1526)
data_B$SDS_BMI_4[data_B$Study_ID %in% bmi4_outlier_ids] <- NA
sum(is.na(data_B$SDS_BMI_4))
# 367


##############################################################
### One-hot encoding of nominal variable - season of birth ###
##############################################################

# Indicator for one Birth_season code: 1 when the value equals `code`, 0 when it
# is one of the other three codes ("0"-"3"), NA for anything else (including NA).
season_flag <- function(code) {
  season <- data_B$Birth_season
  other_codes <- setdiff(c("0", "1", "2", "3"), code)
  ifelse(season == code, 1,
         ifelse(season %in% other_codes, 0, NA))
}

Season_autumn <- season_flag("0")
Season_winter <- season_flag("1")
Season_spring <- season_flag("2")
Season_summer <- season_flag("3")
	
# Recombine dataset with 4x season of birth variables in correct place in dataframe
# Columns 1-21 and 23-56 of data_B are kept and the four season indicators are
# placed between them (presumably column 22 is Birth_season - confirm).
x <- data_B[, 1:21]

y <- cbind(Season_autumn, Season_winter, Season_spring, Season_summer)

z <- data_B[, 23:56]

data_C <- cbind(x, y, z)
# Data found in IOWBC_data.xlsx, sheet: "Cleaned QCed data (n=1368)"
# (kept as a comment: as trailing text after the call it is a parse error)
write.csv(data_C, file = "Asthma_Prediction_Model_Dataset_outliersNA_nominalfix_1368Ids.csv")

# Write datafiles for 2 models:
# Preschool model predictors - data found in IOWBC_data.xlsx, sheet: "Preschool data"
# (all columns of data_C)
write.csv(data_C, file = "Preschool_QC_1368Ids.csv")

# Early life model predictors - data found in IOWBC_data.xlsx, sheet: "Early life data"
# (columns 1-41 of data_C followed by columns 57-59)
z <- data_C[57:59]
y <- cbind(data_C[1:41], z)
write.csv(y, file = "Early_life_QC_1368IDs.csv")
