# This script evaluates the performance of the best CAPE and CAPP models:
	# Best early life (CAPE) model identified from this thesis project was:
		# dataset = complete, oversampled 0%, undersampled
		# algorithm = rbfsvm
		# hyper-parameters = 'C': 45.1, 'gamma': 0.0054
	# Best preschool (CAPP) model identified from this thesis project was:
		# dataset = complete, oversampled 300%, undersampled
		# algorithm = linear SVM
		# hyper-parameters = 'C': 0.33
# For each model, performance in the respective IOWBC test sets was evaluated. The optimal classification threshold cut-off was selected based on the Youden's index and model performance was reported with 95% confidence intervals using 2000 bootstrapped samples.
# This script illustrates the use of this code for the CAPP model but it can be easily adapted for the CAPE model.
# Python version 3.6.8 was used 

# Imports
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import arange
from sklearn.model_selection import cross_val_score, StratifiedKFold
from collections import Counter
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score
from sklearn.utils import resample, shuffle
from sklearn.metrics import f1_score, average_precision_score
from sklearn.metrics import roc_auc_score, roc_curve
import pickle

# Classifiers
from sklearn.svm import SVC


# Set working directory
# NOTE(review): "/../../" resolves to the filesystem root ("/.." is "/"), so this
# looks like a placeholder — point it at the directory containing the input CSVs
# before running. Relies on `import os` at the top of the file.
os.chdir("/../../")

# Define function to obtain performance metrics
def performance(y_test, y_pred, y_probs):
	"""Print a panel of classification metrics for one set of predictions.

	Parameters
	----------
	y_test : array-like of 0/1 true outcome labels.
	y_pred : array-like of 0/1 predicted classes (already thresholded).
	y_probs : array-like of positive-class probabilities; used only for the
		ranking metrics (ROC AUC and precision-recall AUC).

	Returns None; everything is reported to stdout.
	"""
	cm_test = confusion_matrix(y_test, y_pred)
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	# sklearn confusion-matrix layout: cm[0,0]=TN, cm[0,1]=FP, cm[1,0]=FN, cm[1,1]=TP
	sensitivity = cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	# Likelihood ratios; if specificity is exactly 1 (or 0) the division is by
	# zero — numpy scalars then yield inf/nan with a RuntimeWarning rather than
	# raising, so the remaining metrics are still printed.
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print(cm_test)
	print(test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	# Bug fix: F1 was previously computed but never reported.
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return

# Load training data which developed the best performing model - complete data, oversampled 300%, undersampled
data_300_O = pd.read_csv("Oversampled_preschool_dataset_300%.csv", index_col=False)
data_300_O = data_300_O.iloc[0:518,:]
print('Original dataset shape %s' % Counter(data_300_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 204})

# Undersample the controls (class 0) down to the number of cases (class 1)
s1 = data_300_O.loc[data_300_O['Asthma_10YR'] == 1]
s0 = data_300_O.loc[data_300_O['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:204,]
# Bug fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames identically.
data_300_OU = pd.concat([s1, s0], ignore_index=True)
data_300_OU = shuffle(data_300_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_300_OU.Asthma_10YR))
# Original dataset shape Counter({0: 204, 1: 204})

# Features are every column between the ID (first) and the outcome (last)
X_train = data_300_OU.iloc[:,1:-1]
y_train = data_300_OU.iloc[:,-1]

# Import the standardised preschool test data - data found in IOWBC_training_test_data.xlsx, sheet: "Standardised preschool test set"
test = pd.read_csv("Preschool_standardised_test_dataset_183IDs.csv", index_col=False)
# Split test data into features and outcome
X_test = test.drop(['Study_ID','Asthma_10YR'], axis=1)
y_test = test['Asthma_10YR']

# Specify the algorithm and hyperparameters which gave the best performance
# (linear SVM with C=0.33; probability=True enables predict_proba, needed below
# for the threshold search and ranking metrics)
best_clf = SVC(C=0.33, kernel='linear', probability=True, random_state=123)

# Fit optimised model
best_clf.fit(X_train,y_train)

### Training set Performance
y_train_pred = best_clf.predict(X_train)
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)

# Training-set ROC AUC from the positive-class probabilities
probs = best_clf.predict_proba(X_train)
preds = probs[:,1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)

# Training-set precision-recall AUC (computed for inspection; not printed)
PR_AUC = average_precision_score(y_train, preds)

### Test set performance at the default 0.5 classification threshold
y_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred)
print (cm_test)

test_report = classification_report(y_test, y_pred)
print (test_report)

test_probs = best_clf.predict_proba(X_test)
test_preds = test_probs[:,1]
ROCAUC_test = roc_auc_score(y_test, test_preds)
print(ROCAUC_test)

# Calculate Brier score (probability calibration; computed but not printed)
clf_score = brier_score_loss(y_test, test_preds, pos_label=1)

fpr, tpr, threshold = metrics.roc_curve(y_test, test_preds)
auc = metrics.auc(fpr, tpr)

# save the model to file
# Bug fix: use a context manager so the file handle is closed even on error
filename = 'Final_preschool_lSVM_COU300.sav'
with open(filename, 'wb') as model_file:
	pickle.dump(best_clf, model_file)

#### Identify optimal threshold based on Youden's index ####
test_probs = best_clf.predict_proba(X_test)
# keep probabilities for the positive outcome only
test_preds = test_probs[:,1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
# get the best threshold: Youden's J = sensitivity + specificity - 1 = TPR - FPR,
# maximised across all candidate thresholds on the ROC curve
J = tpr - fpr
ix = argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))

# Obtain classifications based on optimal threshold cutoff
# NOTE(review): this adds a 'preds' column to X_test in place; X_test is
# reassigned before it is used as a feature matrix again, so this is benign here
probs_opt = best_clf.predict_proba(X_test)
X_test['preds'] = probs_opt[:,1]
pred_opt = X_test['preds'].map(lambda x: 1 if x >= best_thresh else 0)

# Check performance in test set (classes at the Youden cut-off, probabilities for AUCs)
performance(y_test, pred_opt, X_test['preds'])	
# Example of output:
# [[139  19]
 # [  7  18]]
              # precision    recall  f1-score   support

           # 0       0.95      0.88      0.91       158
           # 1       0.49      0.72      0.58        25
    # accuracy                           0.86       183
   # macro avg       0.72      0.80      0.75       183
# weighted avg       0.89      0.86      0.87       183
# accuracy:=0.857923
# balanced_accuracy:=0.799873
# Sensitivity:=0.720000
# Specificity:=0.879747
# PPV:=0.486486
# NPV:=0.952055
# LRp:=5.987368
# LRn:=0.318273
# ROCAUC:=0.820253
# PR_AUC:=0.498162

# Obtain 95% confidence intervals for performance metrics
# configure bootstrap to get confidence intervals - 2000 bootstrapped samples of size 183 IDs, bootstrapped with replacement.
# Confidence intervals extracted as the 2.5 and 97.5 percentiles

n_iterations = 2000
n_size = int(len(test))  # each bootstrap sample matches the test-set size
# Bug fix: a malformed, never-used line (`randomlist = random.sample(), 5)`)
# was removed here; it was a syntax error that stopped the script from running.

# run bootstrap: one accumulator list per performance metric
stats_accuracy = list()
stats_balanced_accuracy = list()
stats_sensitivity = list()
stats_specificity = list()
stats_PPV = list()
stats_NPV = list()
stats_LRp = list()
stats_LRn = list()
stats_F1 = list()
stats_ROCAUC = list()
stats_PR_AUC = list()

# Reimport test data (same file as loaded above; the relative path keeps the
# script portable and consistent with the earlier read)
test = pd.read_csv("Preschool_standardised_test_dataset_183IDs.csv", index_col=False)
# Split test data into features and outcome
X_test = test.drop(['Study_ID','Asthma_10YR'], axis=1)
y_test = test['Asthma_10YR']

# Bootstrap loop: resample the test set with replacement (stratified on the
# outcome so each sample keeps the case/control ratio), score the already-fitted
# model at the Youden-optimal threshold, and collect every metric so
# percentile-based confidence intervals can be extracted afterwards.
for i in range(n_iterations):
	# prepare the bootstrapped test sample (random_state=i makes the run reproducible)
	testset = resample(test, replace=True, n_samples=n_size, random_state=i, stratify=test.Asthma_10YR)
	# (a no-op `Counter(testset.Asthma_10YR)` whose result was discarded was removed here)
	X_test = testset.drop(['Study_ID','Asthma_10YR'], axis=1)
	y_test = testset['Asthma_10YR']
	probs_opt = best_clf.predict_proba(X_test)
	X_test['preds'] = probs_opt[:,1]
	pred_opt = X_test['preds'].map(lambda x: 1 if x >= best_thresh else 0)
	# Get performance measures (same definitions as in performance() above)
	cm_test = confusion_matrix(y_test, pred_opt)
	test_report = classification_report(y_test, pred_opt)
	accuracy = accuracy_score(y_test, pred_opt)
	balanced_accuracy = balanced_accuracy_score(y_test, pred_opt)
	# cm[0,0]=TN, cm[0,1]=FP, cm[1,0]=FN, cm[1,1]=TP
	sensitivity = cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, pred_opt)
	ROCAUC = roc_auc_score(y_test, X_test['preds'])
	PR_AUC = average_precision_score(y_test, X_test['preds'])
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	stats_accuracy.append(accuracy)
	stats_balanced_accuracy.append(balanced_accuracy)
	stats_sensitivity.append(sensitivity)
	stats_specificity.append(specificity)
	stats_PPV.append(PPV)
	stats_NPV.append(NPV)
	stats_LRp.append(LRp)
	stats_LRn.append(LRn)
	stats_F1.append(F1)
	stats_ROCAUC.append(ROCAUC)
	stats_PR_AUC.append(PR_AUC)

# confidence intervals: report the 2.5th and 97.5th percentiles of each
# metric's bootstrap distribution (empirical/percentile bootstrap CI)
names = ['accuracy','balanced_accuracy','sensitivity','specificity','PPV','NPV','LRp', 'LRn','F1', 'ROCAUC', 'PR_AUC']
stats = [stats_accuracy,stats_balanced_accuracy,stats_sensitivity,stats_specificity,stats_PPV,stats_NPV,stats_LRp, stats_LRn, stats_F1, stats_ROCAUC, stats_PR_AUC]

alpha = 0.95
# percentile positions for a 95% interval: 2.5 and 97.5
p_lower = ((1.0-alpha)/2.0) * 100
p_upper = (alpha+((1.0-alpha)/2.0)) * 100

# Fixes: the original indexed via a helper list named `set` (shadowing the
# builtin) and recomputed p_lower/p_upper on every iteration.
for name, stat in zip(names, stats):
	lower = max(0.0, np.percentile(stat, p_lower))
	upper = min(100.0, np.percentile(stat, p_upper))
	# metrics are fractions, so *100 reports them as percentages
	print(str(name)+" %.0f percent confidence interval:  %.2f%% and %.2f%%" % (alpha*100, lower*100, upper*100))

# Example of output:
# accuracy 95 percent confidence interval:  80.87% and 90.16%
# balanced_accuracy 95 percent confidence interval:  70.09% and 88.62%
# sensitivity 95 percent confidence interval:  52.00% and 88.00%
# specificity 95 percent confidence interval:  82.91% and 92.41%
# PPV 95 percent confidence interval:  37.50% and 61.54%
# NPV 95 percent confidence interval:  92.16% and 97.92%
# LRp 95 percent confidence interval:  379.20% and 1011.20%
# LRn 95 percent confidence interval:  13.44% and 53.79%
# F1 95 percent confidence interval:  45.16% and 70.00%
# ROCAUC 95 percent confidence interval:  71.49% and 91.47%
# PR_AUC 95 percent confidence interval:  34.93% and 70.18%