# This script develops a logistic regression model equivalent to the best CAPP model. 
# Python version 3.6.8 was used 

# Imports
import os
from collections import Counter

import numpy as np
import pandas as pd
from numpy import arange, argmax

from sklearn import metrics
from sklearn.linear_model import LogisticRegression  # classifier
from sklearn.metrics import (accuracy_score, average_precision_score,
                             balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, roc_auc_score,
                             roc_curve)
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
# resample was missing in the original imports but is called in the
# bootstrap loop below (it would have raised NameError at runtime).
from sklearn.utils import resample, shuffle

# Set working directory
# NOTE(review): "/../../.." normalises to the filesystem root "/" — the real
# project path was presumably anonymised for publication; set the correct
# directory (containing the CSV inputs) before running.
os.chdir("/../../..")

# Define function to obtain performance metrics
def performance(y_test, y_pred, y_probs):
	"""Print and return classification performance metrics.

	Parameters
	----------
	y_test : array-like of 0/1 true labels.
	y_pred : array-like of 0/1 predicted labels (same length as y_test).
	y_probs : array-like of predicted probabilities for the positive class,
		used only for the ranking metrics (ROC AUC, PR AUC).

	Returns
	-------
	dict
		Metric name -> value. The original returned None; existing callers
		ignore the return value, so this is backward compatible.

	Notes
	-----
	LRp divides by (1 - specificity) and LRn by specificity, so a perfect
	specificity of 1.0 (or 0.0) yields a division error / inf, matching the
	original behaviour.
	"""
	cm_test = confusion_matrix(y_test, y_pred)
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	# Confusion-matrix layout: rows = true class, columns = predicted class.
	sensitivity = cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print(cm_test)
	print(test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	# Bug fix: F1 was computed but never reported in the original.
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return {'accuracy': accuracy, 'balanced_accuracy': balanced_accuracy,
		'sensitivity': sensitivity, 'specificity': specificity,
		'PPV': PPV, 'NPV': NPV, 'LRp': LRp, 'LRn': LRn,
		'F1': F1, 'ROCAUC': ROCAUC, 'PR_AUC': PR_AUC}
	
# Load training data which developed the best performing model- complete data, oversampled 300%, undersampled
data_300_O = pd.read_csv("Oversampled_preschool_dataset_300%.csv", index_col=False)
data_300_O = data_300_O.iloc[0:518,:]
print('Original dataset shape %s' % Counter(data_300_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 204})

# Undersample the controls 
s1 = data_300_O.loc[data_300_O['Asthma_10YR'] == 1]
s0 = data_300_O.loc[data_300_O['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:204,]
data_300_OU = s1.append(pd.DataFrame(data = s0), ignore_index=True)
data_300_OU = shuffle(data_300_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_300_OU.Asthma_10YR))
# Original dataset shape Counter({0: 204, 1: 204}

X_train = data_300_OU.iloc[:,1:-1]
y_train = data_300_OU.iloc[:,-1]

# Import the standardised preschool test data - data found in IOWBC_training_test_data.xlsx, sheet: "Standardised preschool test set"
test = pd.read_csv("Preschool_standardised_test_dataset_183IDs.csv", index_col=False)
# Split test data into features and outcome
X_test = test.drop(['Study_ID','Asthma_10YR'], axis=1)
y_test = test['Asthma_10YR']

### Define logistic regression model
# Unpenalised logistic regression fitted with the LBFGS solver.
# NOTE(review): penalty='none' (string) was removed in scikit-learn 1.2,
# where penalty=None must be used instead; the string form matches the old
# scikit-learn release this script targets — confirm the installed version.
logreg = LogisticRegression(solver='lbfgs', penalty='none')

# Fit model to training dataset
logreg.fit(X_train, y_train)

# Fitted model, as reported by repr(logreg):
# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   # intercept_scaling=1, l1_ratio=None, max_iter=100,
                   # multi_class='warn', n_jobs=None, penalty='none',
                   # random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   # warm_start=False)
# Coefficients: array([[ 0.03600483, -0.06998519, -0.44156169, -0.54588219, -0.12056283,-0.08243078,  0.21817591,  2.76577989, -0.52369354,  2.06699894,0.78580957,  0.12060537]])
# Intercept: -1.77158539

# Evaluate training performance
y_train_pred = logreg.predict(X_train)
# Keep the predicted probability of the positive class only.
preds = logreg.predict_proba(X_train)[:, 1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)
# 0.8725009611687812

performance(y_train, y_train_pred, preds)
# [161  43]
 # [ 49 155]]
              # precision    recall  f1-score   support

           # 0       0.77      0.79      0.78       204
           # 1       0.78      0.76      0.77       204

    # accuracy                           0.77       408
   # macro avg       0.77      0.77      0.77       408
# weighted avg       0.77      0.77      0.77       408

# accuracy:=0.774510
# balanced_accuracy:=0.774510
# Sensitivity:=0.759804
# Specificity:=0.789216
# PPV:=0.782828
# NPV:=0.766667
# LRp:=3.604651
# LRn:=0.304348
# ROCAUC:=0.872501
# PR_AUC:=0.875309

# Evaluate performance in the test set
# Bug fix: the original called performance(y_test, y_pred, preds) while
# `preds` still held the TRAINING-set probabilities (408 values) from the
# previous section; together with the 183 test labels this makes
# roc_auc_score raise on mismatched lengths. Compute the test-set
# probabilities first and use them throughout.
y_pred = logreg.predict(X_test)
probs = logreg.predict_proba(X_test)
preds = probs[:,1]
performance(y_test, y_pred, preds)
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)
#0.7572151898734177

#### Identify optimal threshold based on Youden's index ####
# Predicted probability of the positive outcome on the test set.
test_preds = logreg.predict_proba(X_test)[:, 1]
# The ROC curve yields one (FPR, TPR) pair per candidate threshold.
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
# Youden's J statistic is TPR - FPR; the threshold maximising J is optimal.
best_thresh = thresholds[argmax(tpr - fpr)]
print('Best Threshold=%f' % (best_thresh))
# 0.4178288219413288

# Obtain classifications based on optimal threshold cutoff
# Fix: the original stored the probabilities as a new 'preds' column on
# X_test, contaminating the feature matrix (a later predict/predict_proba
# call on X_test would then see an extra feature and fail). Keep the
# probabilities in a separate Series instead; printed output is unchanged.
test_prob_pos = pd.Series(logreg.predict_proba(X_test)[:, 1],
                          index=X_test.index, name='preds')
pred_opt = test_prob_pos.map(lambda x: 1 if x >= best_thresh else 0)

# Check performance in test set
cm_test = confusion_matrix(y_test, pred_opt)
print (cm_test)
performance(y_test, pred_opt, test_prob_pos)
# [117  41]
 # [  5  20]]
              # precision    recall  f1-score   support

           # 0       0.96      0.74      0.84       158
           # 1       0.33      0.80      0.47        25

    # accuracy                           0.75       183
   # macro avg       0.64      0.77      0.65       183
# weighted avg       0.87      0.75      0.79       183

# accuracy:=0.748634
# balanced_accuracy:=0.770253
# Sensitivity:=0.800000
# Specificity:=0.740506
# PPV:=0.327869
# NPV:=0.959016
# LRp:=3.082927
# LRn:=0.270085
# ROCAUC:=0.757215
# PR_AUC:=0.480810

# Obtain 95% confidence intervals for performance metrics
# configure bootstrap to get confidence intervals - 2000 bootstrapped samples of size 183 IDs, bootstrapped with replacement. 
# Confidence intervals extracted as the 2.5 and 97.5 percentiles 
n_iterations = 2000
n_size = int(len(test))
# run bootstrap: one list of sampled values per metric
stats_accuracy = list()
stats_balanced_accuracy = list()
stats_sensitivity = list()
stats_specificity = list()
stats_PPV = list()
stats_NPV = list()
stats_LRp = list()
stats_LRn = list()
stats_F1 = list()
stats_ROCAUC = list()
stats_PR_AUC = list()

for i in range(n_iterations):
	# Stratified bootstrap resample of the test set. `resample` was called
	# without being imported in the original (NameError at runtime); it is
	# now imported from sklearn.utils at the top of the file.
	testset = resample(test, replace=True, n_samples=n_size, random_state=i, stratify=test.Asthma_10YR)
	# Fix: use loop-local names so the bootstrap no longer clobbers the
	# global X_test/y_test, and keep the probabilities in a Series instead
	# of mutating the feature frame. (A dead `Counter(...)` statement whose
	# result was discarded has also been removed.)
	X_boot = testset.drop(['Asthma_10YR','Study_ID'], axis=1)
	y_boot = testset['Asthma_10YR']
	boot_probs = pd.Series(logreg.predict_proba(X_boot)[:, 1], index=X_boot.index)
	pred_opt = boot_probs.map(lambda x: 1 if x >= best_thresh else 0)
	# Get performance measures (same formulas as performance(); rows of the
	# confusion matrix are true classes, columns predicted classes)
	cm_test = confusion_matrix(y_boot, pred_opt)
	test_report = classification_report(y_boot, pred_opt)
	accuracy = accuracy_score(y_boot, pred_opt)
	balanced_accuracy = balanced_accuracy_score(y_boot, pred_opt)
	sensitivity =  cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_boot, pred_opt)
	ROCAUC = roc_auc_score(y_boot, boot_probs)
	PR_AUC = average_precision_score(y_boot, boot_probs)
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	stats_accuracy.append(accuracy)
	stats_balanced_accuracy.append(balanced_accuracy)
	stats_sensitivity.append(sensitivity)
	stats_specificity.append(specificity)
	stats_PPV.append(PPV)
	stats_NPV.append(NPV)
	stats_LRp.append(LRp)
	stats_LRn.append(LRn)
	stats_F1.append(F1)
	stats_ROCAUC.append(ROCAUC)
	stats_PR_AUC.append(PR_AUC)

# confidence intervals
# Fixes: the original bound the name `set` (shadowing the builtin) to an
# index list, iterated by index, and recomputed the percentile bounds on
# every loop iteration. Iterate the (name, samples) pairs directly instead;
# printed output is unchanged.
names = ['accuracy','balanced_accuracy','sensitivity','specificity','PPV','NPV','LRp', 'LRn','F1', 'ROCAUC', 'PR_AUC']
stats = [stats_accuracy,stats_balanced_accuracy,stats_sensitivity,stats_specificity,stats_PPV,stats_NPV,stats_LRp, stats_LRn, stats_F1, stats_ROCAUC, stats_PR_AUC]
alpha = 0.95
p_lower = ((1.0-alpha)/2.0) * 100            # 2.5th percentile
p_upper = (alpha+((1.0-alpha)/2.0)) * 100    # 97.5th percentile

for name, stat in zip(names, stats):
	lower = max(0.0, np.percentile(stat, p_lower))
	upper = min(100.0, np.percentile(stat, p_upper))
	print(str(name)+" %.0f percent confidence interval:  %.2f%% and %.2f%%" % (alpha*100, lower*100, upper*100))
# Example output:
# accuracy 95 percent confidence interval:  68.85% and 80.87%
# balanced_accuracy 95 percent confidence interval:  68.29% and 84.93%
# sensitivity 95 percent confidence interval:  64.00% and 96.00%
# specificity 95 percent confidence interval:  67.09% and 80.38%
# PPV 95 percent confidence interval:  26.15% and 40.74%
# NPV 95 percent confidence interval:  92.68% and 99.12%
# LRp 95 percent confidence interval:  223.83% and 434.50%
# LRn 95 percent confidence interval:  5.64% and 49.89%
# F1 95 percent confidence interval:  37.50% and 55.70%
# ROCAUC 95 percent confidence interval:  62.81% and 87.57%
# PR_AUC 95 percent confidence interval:  33.03% and 66.67%