# This script develops a logistic regression model equivalent to the best CAPE model. 
# Python version 3.6.8 was used 

# Imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, roc_curve
from numpy import argmax
from numpy import arange

# Classifiers
from sklearn.linear_model import LogisticRegression

# Set working directory
# NOTE(review): "/../../.." resolves to the filesystem root ("/"); this looks
# like a redacted/placeholder path. Point it at the directory containing the
# CSV files loaded below before running — TODO confirm intended location.
os.chdir("/../../..")

# Define function to obtain performance metrics
def performance(y_test, y_pred, y_probs):
	"""Print a panel of binary-classification performance metrics.

	Parameters
	----------
	y_test : array-like of 0/1 true labels.
	y_pred : array-like of 0/1 predicted labels (already thresholded).
	y_probs : array-like of predicted probabilities for the positive class,
		used for the ranking metrics (ROC AUC, PR AUC).

	Prints the confusion matrix, classification report, accuracy, balanced
	accuracy, sensitivity, specificity, PPV, NPV, likelihood ratios, F1,
	ROC AUC and PR AUC. Returns None.
	"""
	# Confusion matrix layout: rows = true class, columns = predicted class.
	cm_test = confusion_matrix(y_test, y_pred)	
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	sensitivity =  cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])								
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	# NOTE: likelihood ratios divide by (1 - specificity) / specificity; with
	# numpy floats a zero denominator yields inf plus a runtime warning
	# rather than an exception.
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	# Fix: F1 was computed above but never reported, even though the
	# bootstrap CI section later reports an F1 interval.
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return
	
# Load training data which developed the best performing model - complete data, oversampled 0%, undersampled - data found in IOWBC_training_test_data.xlsx, sheet: "Standardised earlylife training"
data_0 = pd.read_csv("Earlylife_standardised_training_dataset_510IDs.csv", index_col=False)
print('Original dataset shape %s' % Counter(data_0.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 68})

# Undersample the controls to a 1:1 case/control balance.
s1 = data_0.loc[data_0['Asthma_10YR'] == 1]
s0 = data_0.loc[data_0['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)
# Keep as many controls as there are cases (68 in this dataset) instead of a
# hard-coded count, so the balancing still holds if the input file changes.
s0 = s0.iloc[:len(s1),]
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the result is identical.
data_0_U = pd.concat([s1, s0], ignore_index=True)
data_0_U = shuffle(data_0_U, random_state=123)
# NOTE(review): message says "Original" but this is the undersampled set;
# kept verbatim to match previously recorded output.
print('Original dataset shape %s' % Counter(data_0_U.Asthma_10YR))
# Original dataset shape Counter({0: 68, 1: 68})

# Features are columns between the first and last; assumes first column is
# Study_ID and last is Asthma_10YR — consistent with the named drop applied
# to the test set below.
X_train = data_0_U.iloc[:,1:-1]
y_train = data_0_U.iloc[:,-1]

# Import the standardised test data - data found in IOWBC_training_test_data.xlsx, sheet: "Standardised earlylife test set"
test = pd.read_csv("Earlylife_standardised_test_dataset_255IDs.csv", index_col=False) 
# Separate the outcome column from the feature matrix.
y_test = test['Asthma_10YR']
X_test = test.drop(columns=['Study_ID', 'Asthma_10YR'])

### Define logistic regression model
# Unpenalised logistic regression; penalty='none' is the spelling accepted
# by the scikit-learn release this script targets (newer releases spell it
# penalty=None — TODO confirm installed version before upgrading).
logreg = LogisticRegression(penalty='none', solver='lbfgs')

# Fit model to training dataset
logreg.fit(X_train, y_train)

# LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
#                   intercept_scaling=1, l1_ratio=None, max_iter=100,
#                   multi_class='warn', n_jobs=None, penalty='none',
#                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
#                   warm_start=False)
# Intercept: -0.67115964
# Coefficients: -0.12796094,  0.08446293,  0.10623255, -0.09368358,  0.05142765,-0.48265643,  2.17862054,  0.04020671

# Evaluate training performance
y_train_pred = logreg.predict(X_train)
# Column 1 of predict_proba = probability of the positive class (asthma).
preds = logreg.predict_proba(X_train)[:, 1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)
# 0.6714965397923875

performance(y_train, y_train_pred, preds)
# [50 18][30 38]

              # precision    recall  f1-score   support

           # 0       0.62      0.74      0.68        68
           # 1       0.68      0.56      0.61        68

    # accuracy                           0.65       136
   # macro avg       0.65      0.65      0.64       136
# weighted avg       0.65      0.65      0.64       136

# accuracy:=0.647059
# balanced_accuracy:=0.647059
# Sensitivity:=0.558824
# Specificity:=0.735294
# PPV:=0.678571
# NPV:=0.625000
# LRp:=2.111111
# LRn:=0.600000
#ROCAUC:=0.671497
#PR_AUC:=0.689958

# Evaluate performance in the test set
y_pred = logreg.predict(X_test)
# Positive-class probabilities drive the ranking metrics below.
preds = logreg.predict_proba(X_test)[:, 1]
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)
#0.5922278413627895

performance(y_test, y_pred, preds)
#[[180  41] [ 20  14]]
              # precision    recall  f1-score   support

           # 0       0.90      0.81      0.86       221
           # 1       0.25      0.41      0.31        34

    # accuracy                           0.76       255
   # macro avg       0.58      0.61      0.58       255
# weighted avg       0.81      0.76      0.78       255

# accuracy:=0.760784
# balanced_accuracy:=0.613122
# Sensitivity:=0.411765
# Specificity:=0.814480
# PPV:=0.254545
# NPV:=0.900000
# LRp:=2.219512
# LRn:=0.722222
# ROCAUC:=0.592228
# PR_AUC:=0.193183

#### Identify optimal threshold based on Youden's index ####
# Youden's J = sensitivity + specificity - 1 = TPR - FPR; the ROC point
# maximising J gives the chosen probability cutoff.
test_probs = logreg.predict_proba(X_test)
# keep probabilities for the positive outcome only
test_preds = test_probs[:, 1]
# ROC curve over the test set, then pick the threshold maximising J.
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
youden = tpr - fpr
best_ix = youden.argmax()
best_thresh = thresholds[best_ix]
print('Best Threshold=%f' % (best_thresh))
# 0.4781117106813626
# 0.4781117106813626

# Obtain classifications based on optimal threshold cutoff
probs_opt = logreg.predict_proba(X_test)
X_test['preds'] = probs_opt[:, 1]
# Positive whenever the predicted probability reaches the Youden-optimal
# cutoff (vectorised equivalent of mapping 1 if p >= thresh else 0).
pred_opt = (X_test['preds'] >= best_thresh).astype(int)

# Check performance in test set
cm_test = confusion_matrix(y_test, pred_opt)
print (cm_test)
performance(y_test, pred_opt, X_test['preds'])
# [[177  44] [ 19  15]]
              # precision    recall  f1-score   support

           # 0       0.90      0.80      0.85       221
           # 1       0.25      0.44      0.32        34

    # accuracy                           0.75       255
   # macro avg       0.58      0.62      0.59       255
# weighted avg       0.82      0.75      0.78       255

# accuracy:=0.752941
# balanced_accuracy:=0.621041
# Sensitivity:=0.441176
# Specificity:=0.800905
# PPV:=0.254237
# NPV:=0.903061
# LRp:=2.215909
# LRn:=0.697740
# ROCAUC:=0.592228
# PR_AUC:=0.193183

# Obtain 95% confidence intervals for performance metrics
# configure bootstrap to get confidence intervals - 2000 bootstrapped samples of size 255 IDs, bootstrapped with replacement. 
# Confidence intervals extracted as the 2.5 and 97.5 percentiles 
n_iterations = 2000
n_size = int(len(test))

# One accumulator per metric; each bootstrap iteration appends its value.
stats_accuracy = []
stats_balanced_accuracy = []
stats_sensitivity = []
stats_specificity = []
stats_PPV = []
stats_NPV = []
stats_LRp = []
stats_LRn = []
stats_F1 = []
stats_ROCAUC = []
stats_PR_AUC = []

# Bug fix: `resample` is used below but was never imported anywhere in this
# file, so the loop raised NameError at runtime.
from sklearn.utils import resample

# Bootstrap the test set: draw n_iterations stratified samples (with
# replacement, same size and class balance as the original test set), score
# each with the fixed model + Youden cutoff, and collect every metric so
# percentile CIs can be read off afterwards.
for i in range(n_iterations):
	# Stratified bootstrap draw; random_state=i makes each draw reproducible.
	testset = resample(test, replace=True, n_samples=n_size, random_state=i, stratify=test.Asthma_10YR)
	X_test = testset.drop(['Asthma_10YR','Study_ID'], axis=1)
	y_test = testset['Asthma_10YR']
	probs_opt = logreg.predict_proba(X_test)
	X_test['preds'] = probs_opt[:,1]
	pred_opt = X_test['preds'].map(lambda x: 1 if x >= best_thresh else 0)
	# Get performance measures (same formulas as performance() above;
	# values must be kept, so they are recomputed here rather than printed
	# through the void-returning helper).
	cm_test = confusion_matrix(y_test, pred_opt)	
	test_report = classification_report(y_test, pred_opt)
	accuracy = accuracy_score(y_test, pred_opt)
	balanced_accuracy = balanced_accuracy_score(y_test, pred_opt)
	sensitivity =  cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])								
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, pred_opt)
	ROCAUC = roc_auc_score(y_test, X_test['preds'])
	PR_AUC = average_precision_score(y_test, X_test['preds'])
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	stats_accuracy.append(accuracy)
	stats_balanced_accuracy.append(balanced_accuracy)
	stats_sensitivity.append(sensitivity)
	stats_specificity.append(specificity)
	stats_PPV.append(PPV)
	stats_NPV.append(NPV)
	stats_LRp.append(LRp)	
	stats_LRn.append(LRn)	
	stats_F1.append(F1)
	stats_ROCAUC.append(ROCAUC)
	stats_PR_AUC.append(PR_AUC)

# confidence intervals
# Report the 2.5th and 97.5th percentiles of each bootstrapped metric as a
# 95% CI. The %-format multiplies by 100, so ratio-scale metrics (LRp/LRn)
# print as percentages too, matching the previously recorded output.
names = ['accuracy','balanced_accuracy','sensitivity','specificity','PPV','NPV','LRp', 'LRn','F1', 'ROCAUC', 'PR_AUC']
stats = [stats_accuracy,stats_balanced_accuracy,stats_sensitivity,stats_specificity,stats_PPV,stats_NPV,stats_LRp, stats_LRn, stats_F1, stats_ROCAUC, stats_PR_AUC]
alpha = 0.95
# Percentile bounds for a two-sided (1-alpha) interval: 2.5 and 97.5.
p_lower = ((1.0-alpha)/2.0) * 100
p_upper = (alpha+((1.0-alpha)/2.0)) * 100

# Fixes: iterate name/stat pairs directly instead of indexing through a
# list bound to the name `set` (which shadowed the builtin), and stop
# recomputing the constant percentile bounds on every iteration.
for name, stat in zip(names, stats):
	lower = max(0.0, np.percentile(stat, p_lower))
	upper = min(100.0, np.percentile(stat, p_upper))
	print(str(name)+" %.0f percent confidence interval:  %.2f%% and %.2f%%" % (alpha*100, lower*100, upper*100))
	
# accuracy 95 percent confidence interval:  70.20% and 80.39%
# balanced_accuracy 95 percent confidence interval:  53.50% and 70.59%
# sensitivity 95 percent confidence interval:  26.47% and 58.82%
# specificity 95 percent confidence interval:  74.66% and 85.07%
# PPV 95 percent confidence interval:  16.95% and 34.55%
# NPV 95 percent confidence interval:  87.68% and 92.96%
# LRp 95 percent confidence interval:  132.65% and 343.11%
# LRn 95 percent confidence interval:  49.19% and 91.29%
# F1 95 percent confidence interval:  21.18% and 42.86%
# ROCAUC 95 percent confidence interval:  47.92% and 69.92%
# PR_AUC 95 percent confidence interval:  14.56% and 29.60%