# This script develops a variation of the CAPP model which only includes the predictors identified to have a big contribution by SHAP
# The main driving features for the CAPP model identified by SHAP were: Cough_4YR, Atopy_4YR and Polysensitisation_4YR. 
# Best preschool model:
	# dataset = complete, oversampled 300%, undersampled
	# algorithm = linearsvm
	# hyper-parameters = 'C': 0.33
# Python version 3.6.8 was used 
	
# Imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from time import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score, roc_curve
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from numpy import argmax, arange

# Classifiers
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../.." resolves to the filesystem root — this looks like an
# anonymised/placeholder path; confirm and point it at the real project directory.
os.chdir("/../..")

#### Define function to extract performance measures ####
def performance(y_test, y_pred, y_probs):
	"""Print classification performance measures and return them as a dict.

	Parameters
	----------
	y_test : array-like of 0/1 true outcome labels.
	y_pred : array-like of 0/1 predicted class labels.
	y_probs : array-like of positive-class probabilities (used for the
		ranking-based metrics ROCAUC and PR_AUC).

	Returns
	-------
	dict mapping metric name -> value. The original returned None; returning
	the computed metrics is additive, and existing callers that ignore the
	return value are unaffected.

	NOTE(review): the ratio metrics divide by confusion-matrix marginals and
	will yield nan/inf (with a numpy warning) if a margin is zero — assumed
	not to occur with these datasets; confirm if reused elsewhere.
	"""
	# Confusion matrix layout: rows = true class, columns = predicted class.
	cm_test = confusion_matrix(y_test, y_pred)
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	sensitivity = cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	LRp = sensitivity/(1-specificity)    # positive likelihood ratio
	LRn = (1-sensitivity)/specificity    # negative likelihood ratio
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	# Return everything so callers can use the values programmatically
	# instead of scraping stdout.
	return {
		'confusion_matrix': cm_test,
		'report': test_report,
		'accuracy': accuracy,
		'balanced_accuracy': balanced_accuracy,
		'sensitivity': sensitivity,
		'specificity': specificity,
		'PPV': PPV,
		'NPV': NPV,
		'LRp': LRp,
		'LRn': LRn,
		'F1': F1,
		'ROCAUC': ROCAUC,
		'PR_AUC': PR_AUC,
	}
	
# Load training data which developed the best performing model- complete data, oversampled 300%, undersampled
data_300_O = pd.read_csv("Oversampled_preschool_dataset_300%.csv", index_col=False)
data_300_O = data_300_O.iloc[0:518,:]
print('Original dataset shape %s' % Counter(data_300_O.Asthma_10YR))
# Original dataset shape Counter({0: 314, 1: 204})

# Undersample the controls so cases and controls are balanced (204 each)
s1 = data_300_O.loc[data_300_O['Asthma_10YR'] == 1]
s0 = data_300_O.loc[data_300_O['Asthma_10YR'] == 0]
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:204,]
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0; the resulting frame is identical.
data_300_OU = pd.concat([s1, s0], ignore_index=True)
data_300_OU = shuffle(data_300_OU, random_state=123)
print('Original dataset shape %s' % Counter(data_300_OU.Asthma_10YR))
# Original dataset shape Counter({0: 204, 1: 204})

# Extract the features deemed important by SHAP
X_train = data_300_OU[['Cough_4YR','Atopy_4YR','Polysensitisation_4YR']]
# Assumes Asthma_10YR is the last column of the CSV — TODO confirm.
y_train = data_300_OU.iloc[:,-1]

# Import the standardised preschool test data - data found in IOWBC_training_test_data.xlsx, sheet: "Standardised preschool test set"
test = pd.read_csv("/scratch/dk2e18/Asthma_Prediction_Model/Oversampling/Final_models/Preschool_standardised_test_dataset_183IDs.csv", index_col=False)
# Split test data into features and outcome
X_test = test[['Cough_4YR','Atopy_4YR','Polysensitisation_4YR']]
y_test = test['Asthma_10YR']


# Define a linear svm classifier
clf = SVC(kernel='linear', probability=True, random_state=123)

#### Random search ####
# Candidate regularisation strengths: 100 points log-spaced over [1e-3, 1e2]
C_range = np.logspace(-3, 2, 100)
param_grid = {'C': C_range}

random_search = RandomizedSearchCV(
	clf,
	param_distributions=param_grid,
	scoring='balanced_accuracy',
	n_iter=100,
	n_jobs=-1,
	cv=StratifiedKFold(5),
	random_state=123,
)
start = time()
random_search.fit(X_train, y_train)
RStime = (time() - start)  # wall-clock seconds spent on the random search
best_parameters = random_search.best_params_
print(best_parameters)
# {'C': 0.13219411484660287}
best_score = random_search.best_score_
print(best_score)
# 0.7843137254901961

#### Grid search ####
clf = SVC(kernel='linear', probability=True, random_state=123)
# Exhaustive sweep of C over 0.01 .. 4.99 in steps of 0.01
C_range = np.arange(0.01, 5, 0.01)
param_grid = {'C': C_range}
grid_search = GridSearchCV(
	clf,
	param_grid=param_grid,
	scoring='balanced_accuracy',
	cv=StratifiedKFold(5),
	n_jobs=16,
)
start = time()
grid_search.fit(X_train, y_train)
GStime = (time() - start)  # wall-clock seconds spent on the grid search
# Get Grid search results: number of hyper-parameter candidates evaluated
Candidates = len(grid_search.cv_results_['params'])
print(Candidates)
# best parameters
best_parameters = grid_search.best_params_
print(best_parameters)
# {'C': 0.13}

best_score = grid_search.best_score_
print(best_score)
# 0.7843137254901961

# Define the classifier with the identified best hyperparameters
best_clf = SVC(C=0.13, kernel='linear', probability=True, random_state=123)

# Fit optimised model
best_clf.fit(X_train, y_train)

### Training set Performance
y_train_pred = best_clf.predict(X_train)
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)
# [157  47]  [ 41 163]

# ROC AUC on the training set, ranking by the positive-class probability
probs = best_clf.predict_proba(X_train)
preds = probs[:, 1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)
# 0.827073721645521

# Evaluate performance in the test set
y_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred)
print(cm_test)
# [116  42] [  5  20]

test_probs = best_clf.predict_proba(X_test)
test_preds = test_probs[:, 1]
ROCAUC_test = roc_auc_score(y_test, test_preds)
print(ROCAUC_test)
# 0.798860759493671

# Calculate Brier score (mean squared difference between predicted
# probability and outcome; lower is better)
clf_score = brier_score_loss(y_test, test_preds, pos_label=1)
# 0.17560061264842552

# save the model to file
import pickle
filename = 'CAPP_linearSVM_COU300_SHAPfeatures.sav'
# Use a context manager so the file handle is flushed and closed
# deterministically (the original open() call leaked the handle).
with open(filename, 'wb') as f:
	pickle.dump(best_clf, f)

#### Identify optimal threshold based on Youden's index ####
# Positive-class probabilities on the test set (single predict_proba call;
# the original recomputed these twice).
test_probs = best_clf.predict_proba(X_test)
# keep probabilities for the positive outcome only
test_preds = test_probs[:,1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
# Youden's J = sensitivity + specificity - 1 = TPR - FPR; the threshold
# maximising J balances sensitivity against specificity.
J = tpr - fpr
ix = argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))
# Best Threshold=0.7423336559682193

# Obtain classifications based on optimal threshold cutoff.
# Keep them in a standalone Series rather than writing a 'preds' column
# into X_test, which polluted the feature matrix and risked pandas'
# chained-assignment warning (X_test is a slice of `test`).
pred_opt = pd.Series(test_preds, index=X_test.index).map(lambda x: 1 if x >= best_thresh else 0)

# Check performance in test set
cm_test = confusion_matrix(y_test, pred_opt)
print (cm_test)
performance(y_test, pred_opt, test_preds)
# [[127  31] [  6  19]]
# [[127  31] [  6  19]]
              # precision    recall  f1-score   support

           # 0       0.95      0.80      0.87       158
           # 1       0.38      0.76      0.51        25

    # accuracy                           0.80       183
   # macro avg       0.67      0.78      0.69       183
# weighted avg       0.88      0.80      0.82       183

# accuracy:=0.797814
# balanced_accuracy:=0.781899
# Sensitivity:=0.760000
# Specificity:=0.803797
# PPV:=0.380000
# NPV:=0.954887
# LRp:=3.873548
# LRn:=0.298583
# F1:=0.506667
# ROCAUC:=0.798861
# PR_AUC:=0.401026