# This script details the analysis conducted to replicate all existing childhood asthma regression-based prediction models in the IOWBC
# Only 5 models had data for all predictors available in the IOWBC for replication
# R version 3.5.1 was used for this analysis

# Set working directory
# NOTE(review): "/../.." looks like a placeholder path - presumably it must be replaced with the
# folder that holds the *_variables.csv input files before the script is run; confirm.
setwd("/../..")

# Load packages
library("pROC")

###########
### IOW ###
###########
# Load IOW model variables - data found in IOWBC_existing_model_replication_data.xlsx, sheet: "IOW"
IOW <- read.csv(file = "IOW_variables.csv", header = TRUE)

# Recode variables as needed
# Parental asthma: 0 when both parents are coded "0", 1 when either parent is coded "1", NA otherwise
IOW$Parental_asthma <- ifelse(IOW$Mat_asthma == "0" & IOW$Pat_asthma == "0", 0,
  ifelse(IOW$Mat_asthma == "1" | IOW$Pat_asthma == "1", 1, NA))

# Extract only those variables used in the final model
# (selecting by name also drops Mat_asthma / Pat_asthma, so no separate removal is needed)
IOW <- IOW[c("Study_ID", "Parental_asthma", "Nasal_symp_2YR", "Chest_infection_2YR", "Atopy_4YR", "Asthma_10YR")]

# Remove individuals with missing data
IOWv <- na.omit(IOW)
# 804 with complete data

# Calculate score: unweighted row sum of the four predictors
# (assumes each predictor is coded 0/1 - consistent with the 0-4 range recorded below)
colnms <- c("Parental_asthma", "Nasal_symp_2YR", "Chest_infection_2YR", "Atopy_4YR")
IOWv$Score <- rowSums(IOWv[, colnms])

# Score distribution
table(IOWv$Score, useNA = "ifany")

#   0   1   2   3   4
# 372 268 133  23   8

# IOW ROC
pdf("IOW_Rep_IOW_ROC.pdf")
roc(IOWv$Asthma_10YR, IOWv$Score,
  plot = TRUE, legacy.axes = TRUE,
  main = "IOW in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "brown", lwd = 2, add = FALSE)
# The AUC in the legend is hard-coded from the run recorded below
legend("bottomright", legend = c("IOW - AUC: 0.73"), col = c("brown"), lwd = 3)
dev.off()

# Data: IOWv$Score in 684 controls (IOWv$Asthma_10YR 0) < 120 cases (IOWv$Asthma_10YR 1).
# Area under the curve: 0.7346

# Save scores: outcome (rows) by score (columns) cross-tabulation
# The outcome column is referenced by its full name rather than relying on `$` partial matching
matrix <- table(IOWv$Asthma_10YR, IOWv$Score)
write.csv(matrix, file = "IOW_scores.csv")


#############
### ucAPI ###
#############
# Load ucAPI model variables - data found in IOWBC_existing_model_replication_data.xlsx, sheet: "ucAPI"
ucAPI <- read.csv(file = "ucAPI_variables.csv", header = TRUE)

# Recode variables as needed
# Frequent wheeze: Wheeze_4YR code 2 -> 1, codes 0/1 -> 0, anything else -> NA
ucAPI$freqwheeze <- ifelse(ucAPI$Wheeze_4YR == 2, 1,
  ifelse((ucAPI$Wheeze_4YR == 1 | ucAPI$Wheeze_4YR == 0), 0, NA))

# Parental asthma: 0 when both parents are coded "0", 1 when either parent is coded "1", NA otherwise
ucAPI$Parental_asthma <- ifelse(ucAPI$Mat_asthma == "0" & ucAPI$Pat_asthma == "0", 0,
  ifelse(ucAPI$Mat_asthma == "1" | ucAPI$Pat_asthma == "1", 1, NA))

# Aeroallergen sensitisation: 1 if any of the six results is "Positive",
# 0 only if all six are "Negative", NA otherwise
ucAPI$aeroallergy <- ifelse(
  ucAPI$HDMResult.4 == "Positive" | ucAPI$CatResult.4 == "Positive" |
    ucAPI$DogResult.4 == "Positive" | ucAPI$GrassResult.4 == "Positive" |
    ucAPI$CladosporiumResult.4 == "Positive" | ucAPI$AlternariaResult.4 == "Positive",
  1,
  ifelse(
    ucAPI$HDMResult.4 == "Negative" & ucAPI$CatResult.4 == "Negative" &
      ucAPI$DogResult.4 == "Negative" & ucAPI$GrassResult.4 == "Negative" &
      ucAPI$CladosporiumResult.4 == "Negative" & ucAPI$AlternariaResult.4 == "Negative",
    0, NA))

# Food sensitisation: 1 if milk or egg is "Positive", 0 only if both are "Negative", NA otherwise
ucAPI$foodallergy <- ifelse(ucAPI$MilkResult.4 == "Positive" | ucAPI$EggResult.4 == "Positive", 1,
  ifelse(ucAPI$MilkResult.4 == "Negative" & ucAPI$EggResult.4 == "Negative", 0, NA))

# Extract only those variables used in the final model
ucAPIv <- ucAPI[c("Study_ID", "freqwheeze", "Parental_asthma", "aeroallergy", "Eczema_4YR", "Wheeze_without_cold_4YR", "Hayfever_4YR", "foodallergy", "Asthma_10YR")]

# Remove individuals with missing data
ucAPIv <- na.omit(ucAPIv)
# 740 with complete data

# Summarise major criteria (count of criteria met; no NA remains after na.omit)
major_cols <- c("Parental_asthma", "aeroallergy", "Eczema_4YR")
ucAPIv$major <- rowSums(ucAPIv[, major_cols])

# Summarise minor criteria (count of criteria met; no NA remains after na.omit)
minor_cols <- c("Wheeze_without_cold_4YR", "Hayfever_4YR", "foodallergy")
ucAPIv$minor <- rowSums(ucAPIv[, minor_cols])

# Calculate score
# Positive = frequent wheeze AND (at least one major criterion OR at least two minor criteria).
# The minor test is ">= 2" so that children meeting all three minor criteria are also counted;
# the previous "== 2" test excluded them.
# (assumes the criteria columns are coded 0/1 so the row sums are counts - TODO confirm)
ucAPIs <- ucAPIv[c("Study_ID", "freqwheeze", "major", "minor", "Asthma_10YR")]
ucAPIs$Score <- ifelse(ucAPIs$freqwheeze == 1 & (ucAPIs$major >= 1 | ucAPIs$minor >= 2), 1, 0)

# Score distribution
table(ucAPIs$Score, useNA = "ifany")

# NOTE(review): the figures recorded below (and the AUC in the legend) come from the run that
# used the "minor == 2" condition - re-check them after re-running.
#   0   1
# 688  52

# ucAPI ROC
pdf("ucAPI_Rep_IOW_ROC.pdf")
roc(ucAPIs$Asthma_10YR, ucAPIs$Score,
  plot = TRUE, legacy.axes = TRUE,
  main = "ucAPI in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "darkorchid", lwd = 2, add = FALSE)
legend("bottomright", legend = c("ucAPI - AUC: 0.59"), col = c("darkorchid"), lwd = 3)
dev.off()

# Data: ucAPIs$Score in 636 controls (ucAPIs$Asthma_10YR 0) < 104 cases (ucAPIs$Asthma_10YR 1).
# Area under the curve: 0.5878

# Save scores: outcome (rows) by score (columns) cross-tabulation
# The outcome column is referenced by its full name rather than relying on `$` partial matching
matrix <- table(ucAPIs$Asthma_10YR, ucAPIs$Score)
write.csv(matrix, file = "ucAPI_scores.csv")


##############
### uPIAMA ###
##############
# Load uPIAMA model variables - data found in IOWBC_existing_model_replication_data.xlsx, sheet: "uPIAMA"
uPIAMA <- read.csv(file = "uPIAMA_variables.csv", header = TRUE)

# Recode variables as needed
# Parental asthma: 0 when both parents are coded "0", 1 when either parent is coded "1", NA otherwise
uPIAMA$Parental_asthma <- ifelse(uPIAMA$Mat_asthma == "0" & uPIAMA$Pat_asthma == "0", 0,
  ifelse(uPIAMA$Mat_asthma == "1" | uPIAMA$Pat_asthma == "1", 1, NA))

# Preterm: Prematurity code 0 -> 1, codes 1/2 -> 0, anything else -> NA
# NOTE(review): this presumes Prematurity code 0 denotes a preterm birth - confirm against the data dictionary
uPIAMA$preterm <- ifelse(uPIAMA$Prematurity == 0, 1,
  ifelse(uPIAMA$Prematurity == 1 | uPIAMA$Prematurity == 2, 0, NA))

# Infrequent wheeze: Wheeze_4YR code 1 -> 1, codes 0/2 -> 0, anything else -> NA
uPIAMA$infreqwheeze <- ifelse(uPIAMA$Wheeze_4YR == 1, 1,
  ifelse(uPIAMA$Wheeze_4YR == 0 | uPIAMA$Wheeze_4YR == 2, 0, NA))

# Frequent wheeze: Wheeze_4YR code 2 -> 1, codes 0/1 -> 0, anything else -> NA
uPIAMA$freqwheeze <- ifelse(uPIAMA$Wheeze_4YR == 2, 1,
  ifelse(uPIAMA$Wheeze_4YR == 0 | uPIAMA$Wheeze_4YR == 1, 0, NA))

# Extract only those variables used in the final model
uPIAMAv <- uPIAMA[c("Study_ID", "Sex", "Mothed.10", "Parental_asthma", "preterm", "infreqwheeze", "freqwheeze", "Wheeze_without_cold_4YR", "Eczema_4YR", "Asthma_10YR")]

# Remove individuals with missing data
uPIAMAv <- na.omit(uPIAMAv)
# 1045 with complete data

# Calculate score: weighted sum of the predictors
# (assumes every predictor, including Sex and Mothed.10, is coded 0/1 with 1 = risk category - TODO confirm)
uPIAMAv$Score <- (uPIAMAv$Sex * 2) +
  (uPIAMAv$Mothed.10 * 1) +
  (uPIAMAv$Parental_asthma * 4) +
  (uPIAMAv$preterm * 1) +
  (uPIAMAv$infreqwheeze * 4) +
  (uPIAMAv$freqwheeze * 7) +
  (uPIAMAv$Wheeze_without_cold_4YR * 2) +
  (uPIAMAv$Eczema_4YR * 6)

# Score distribution
table(uPIAMAv$Score, useNA = "ifany")

#   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  20  21  22
# 125 197 146 175  37  39  48  66  23  36  29  16  31  20  10  15  12   5   6   3   4   2

# uPIAMA ROC
pdf("uPIAMA_Rep_IOW_ROC.pdf")
roc(uPIAMAv$Asthma_10YR, uPIAMAv$Score,
  plot = TRUE, legacy.axes = TRUE,
  main = "uPIAMA in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "chocolate1", lwd = 2, add = FALSE)
# The AUC in the legend is hard-coded from the run recorded below
legend("bottomright", legend = c("uPIAMA - AUC: 0.75"), col = c("chocolate1"), lwd = 3)
dev.off()

# Data: uPIAMAv$Score in 886 controls (uPIAMAv$Asthma_10YR 0) < 159 cases (uPIAMAv$Asthma_10YR 1).
# Area under the curve: 0.754

# Save scores: outcome (rows) by score (columns) cross-tabulation
# The outcome column is referenced by its full name rather than relying on `$` partial matching
matrix <- table(uPIAMAv$Asthma_10YR, uPIAMAv$Score)
write.csv(matrix, file = "uPIAMA_scores.csv")
	
#############
### BAMSE ###
#############
# Load BAMSE model variables - data found in IOWBC_existing_model_replication_data.xlsx, sheet: "BAMSE"
BAMSE <- read.csv(file = "BAMSE_variables.csv", header = TRUE)

# Recode variables as needed
# Parental asthma: 0 when both parents are coded "0", 1 when either parent is coded "1", NA otherwise
BAMSE$Parental_asthma <- ifelse(BAMSE$Mat_asthma == "0" & BAMSE$Pat_asthma == "0", 0,
  ifelse(BAMSE$Mat_asthma == "1" | BAMSE$Pat_asthma == "1", 1, NA))

# Obesity: IOTFgrade 2 -> 1, grades -3 to 1 -> 0, anything else (including missing) -> NA
BAMSE$Obesity <- ifelse(BAMSE$IOTFgrade == 2, 1,
  ifelse(BAMSE$IOTFgrade %in% c(-3, -2, -1, 0, 1), 0, NA))

# Extract only those variables used in the final model
BAMSEv <- BAMSE[c("Study_ID", "Sex", "Parental_asthma", "Obesity", "Hayfever_4YR", "Asthma_10YR")]

# Remove individuals with missing data
BAMSEv <- na.omit(BAMSEv)
# 998 with complete data

# Calculate score: unweighted row sum of the four predictors
# (assumes each predictor, including Sex, is coded 0/1 - TODO confirm)
colnms <- c("Sex", "Parental_asthma", "Obesity", "Hayfever_4YR")
BAMSEv$Score <- rowSums(BAMSEv[, colnms])

# Score distribution
table(BAMSEv$Score, useNA = "ifany")

#   0   1   2   3
# 371 486 125  16

# BAMSE ROC
pdf("BAMSE_Rep_IOW_ROC.pdf")
roc(BAMSEv$Asthma_10YR, BAMSEv$Score,
  plot = TRUE, legacy.axes = TRUE,
  main = "BAMSE in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "deepskyblue", lwd = 2, add = FALSE)
# The AUC in the legend is hard-coded from the run recorded below
legend("bottomright", legend = c("BAMSE - AUC: 0.63"), col = c("deepskyblue"), lwd = 3)
dev.off()

# Data: BAMSEv$Score in 851 controls (BAMSEv$Asthma_10YR 0) < 147 cases (BAMSEv$Asthma_10YR 1).
# Area under the curve: 0.6305

# Save scores: outcome (rows) by score (columns) cross-tabulation
# The outcome column is referenced by its full name rather than relying on `$` partial matching
matrix <- table(BAMSEv$Asthma_10YR, BAMSEv$Score)
write.csv(matrix, file = "BAMSE_scores.csv")

############
### PARS ###
############
# Load PARS model variables - data found in IOWBC_existing_model_replication_data.xlsx, sheet: "PARS"
PARS <- read.csv(file = "PARS_variables.csv", header = TRUE)

# Recode variables as needed
# Parental asthma: 0 when both parents are coded "0", 1 when either parent is coded "1", NA otherwise
PARS$Parental_asthma <- ifelse(PARS$Mat_asthma == "0" & PARS$Pat_asthma == "0", 0,
  ifelse(PARS$Mat_asthma == "1" | PARS$Pat_asthma == "1", 1, NA))

# Early wheeze: 1 if wheeze code 1 or 2 at either the 2YR or 4YR variable,
# 0 only if both are coded 0, NA otherwise
PARS$earlywheeze <- ifelse(
  PARS$Wheeze_2YR == 1 | PARS$Wheeze_2YR == 2 | PARS$Wheeze_4YR == 1 | PARS$Wheeze_4YR == 2, 1,
  ifelse(PARS$Wheeze_2YR == 0 & PARS$Wheeze_4YR == 0, 0, NA))

# Eczema: 1 if coded 1 at either the 2YR or 4YR variable, 0 only if both are coded 0, NA otherwise
PARS$eczema <- ifelse(PARS$Eczema_2YR == 1 | PARS$Eczema_4YR == 1, 1,
  ifelse(PARS$Eczema_2YR == 0 & PARS$Eczema_4YR == 0, 0, NA))

# Extract only those variables used in the final model
PARSv <- PARS[c("Study_ID", "Parental_asthma", "eczema", "Wheeze_without_cold_4YR", "earlywheeze", "Polysensitisation_4YR", "Race", "Asthma_10YR")]

# Remove individuals with missing data
PARSv <- na.omit(PARSv)
# 913 with complete data

# Calculate score: weighted sum of the predictors
# (assumes every predictor, including Race, is coded 0/1 with 1 = risk category - TODO confirm)
PARSv$Score <- (PARSv$Parental_asthma * 2) +
  (PARSv$eczema * 2) +
  (PARSv$Wheeze_without_cold_4YR * 3) +
  (PARSv$earlywheeze * 3) +
  (PARSv$Polysensitisation_4YR * 2) +
  (PARSv$Race * 2)

# Score distribution
table(PARSv$Score, useNA = "ifany")

#   0   2   3   4   5   6   7   8   9  10  12
# 400 200  79  41  50  51  20  43   4  16   9

# PARS ROC
pdf("PARS_Rep_IOW_ROC.pdf")
roc(PARSv$Asthma_10YR, PARSv$Score,
  plot = TRUE, legacy.axes = TRUE,
  main = "PARS in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "chartreuse", lwd = 2, add = FALSE)
# The AUC in the legend is hard-coded from the run recorded below
legend("bottomright", legend = c("PARS - AUC: 0.77"), col = c("chartreuse"), lwd = 3)
dev.off()

# Data: PARSv$Score in 786 controls (PARSv$Asthma_10YR 0) < 127 cases (PARSv$Asthma_10YR 1).
# Area under the curve: 0.7659

# Save scores: outcome (rows) by score (columns) cross-tabulation
# The outcome column is referenced by its full name rather than relying on `$` partial matching
matrix <- table(PARSv$Asthma_10YR, PARSv$Score)
write.csv(matrix, file = "PARS_scores.csv")

# optimal suggested pars cutoff = 6 (a score of 6 or more is classed as high risk)
t <- ifelse(PARSv$Score > 5, 1, 0)
cm <- table(PARSv$Asthma_10YR, t)
# Display the confusion matrix (rows: observed Asthma_10YR, columns: predicted class)
print(cm)
### Confusion matrix
##       0     1
##  0  599   131
##  1   49    79
# NOTE(review): the recorded matrix sums to 858 individuals with 128 cases, whereas 913
# individuals and 127 cases are reported above - re-check these figures after re-running.


# Comparison of all models - ROC curve
# (this heading was previously missing its "#", which made the file fail to parse)
# Draws the smoothed ROC curve of each replicated model on one set of axes:
# the first call creates the plot (add = FALSE), the remaining calls overlay onto it (add = TRUE).
pdf("Existing_models_Rep_IOW_ROC_smooth_31032020.pdf")
roc(IOWv$Asthma_10YR, IOWv$Score,
  smooth = TRUE, plot = TRUE, legacy.axes = TRUE,
  main = "Existing childhood asthma prediction models in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "firebrick4", lwd = 2, add = FALSE)
roc(ucAPIs$Asthma_10YR, ucAPIs$Score,
  smooth = TRUE, plot = TRUE, legacy.axes = TRUE,
  main = "Existing childhood asthma prediction models in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "darkorchid", lwd = 2, add = TRUE)
roc(uPIAMAv$Asthma_10YR, uPIAMAv$Score,
  smooth = TRUE, plot = TRUE, legacy.axes = TRUE,
  main = "Existing childhood asthma prediction models in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "violetred3", lwd = 2, add = TRUE)
roc(BAMSEv$Asthma_10YR, BAMSEv$Score,
  smooth = TRUE, plot = TRUE, legacy.axes = TRUE,
  main = "Existing childhood asthma prediction models in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "deepskyblue3", lwd = 2, add = TRUE)
roc(PARSv$Asthma_10YR, PARSv$Score,
  smooth = TRUE, plot = TRUE, legacy.axes = TRUE,
  main = "Existing childhood asthma prediction models in IOWBC",
  xlab = "False Positive Rate", ylab = "True Positive Rate",
  col = "chartreuse4", lwd = 2, add = TRUE)
# Legend entries are listed in the same order and colours as the curves above;
# the fourth entry ("Szentpetery et al.") labels the BAMSEv curve (deepskyblue3).
# The AUC values are hard-coded text, not computed from the smoothed curves drawn here.
legend("bottomright",
  legend = c("IOW - AUC: 0.73", "ucAPI - AUC: 0.59", "uPIAMA - AUC: 0.75", "Szentpetery et al.  - AUC: 0.63", "PARS - AUC: 0.77"),
  col = c("firebrick4", "darkorchid", "violetred3", "deepskyblue3", "chartreuse4"),
  lwd = 3)
dev.off()



























