# This script develops a variation of the CAPE model which only includes the predictors identified to have a big contribution by SHAP
# The main driving features for the CAPE model identified by SHAP were: Cough_2YR and Wheeze_2YR. 
# Best early life model:
	# dataset = complete, oversampled 0%, undersampled
	# algorithm = rbfsvm
	# hyper-parameters = 'C': 45.1, 'gamma': 0.0054
# Python version 3.6.8 was used 
	
# Imports
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn import metrics
from time import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score, roc_curve
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from numpy import argmax, arange

# Classifiers
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../.." resolves to the filesystem root — this looks like a
# redacted/placeholder path; point it at the directory containing the
# Earlylife_standardised_*.csv input files before running.
os.chdir("/../..")

#### Define function to extract performance measures ####
def performance(y_test, y_pred, y_probs):
	"""Print and return a panel of binary-classification performance metrics.

	Parameters
	----------
	y_test : array-like of true 0/1 labels.
	y_pred : array-like of predicted 0/1 labels (already thresholded).
	y_probs : array-like of positive-class probabilities, used for the
		ranking metrics (ROC AUC and PR AUC).

	Returns
	-------
	dict
		Metric name -> value. The metrics are also printed, preserving the
		original report format. (The original returned None; returning the
		dict is backward-compatible since callers ignored the return value.)

	Notes
	-----
	LRp is undefined when specificity == 1 and LRn when specificity == 0;
	in those degenerate cases the division follows numpy semantics
	(inf/nan with a runtime warning), matching the original behaviour.
	"""
	cm_test = confusion_matrix(y_test, y_pred)
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	# Confusion-matrix layout: rows = true class, cols = predicted class.
	sensitivity = cm_test[1, 1] / (cm_test[1, 0] + cm_test[1, 1])
	specificity = cm_test[0, 0] / (cm_test[0, 0] + cm_test[0, 1])
	PPV = cm_test[1, 1] / (cm_test[1, 1] + cm_test[0, 1])
	NPV = cm_test[0, 0] / (cm_test[0, 0] + cm_test[1, 0])
	# Positive / negative likelihood ratios derived from sens/spec.
	LRp = sensitivity / (1 - specificity)
	LRn = (1 - sensitivity) / specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print(cm_test)
	print(test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return {
		'confusion_matrix': cm_test,
		'accuracy': accuracy,
		'balanced_accuracy': balanced_accuracy,
		'sensitivity': sensitivity,
		'specificity': specificity,
		'PPV': PPV,
		'NPV': NPV,
		'LRp': LRp,
		'LRn': LRn,
		'F1': F1,
		'ROCAUC': ROCAUC,
		'PR_AUC': PR_AUC,
	}
	
# Load training data which developed the best performing model - complete data,
# oversampled 0%, undersampled.
# Data found in IOWBC_training_test_data.xlsx, sheet: "Standardised earlylife training"
data_0 = pd.read_csv("Earlylife_standardised_training_dataset_510IDs.csv", index_col=False)
print('Original dataset shape %s' % Counter(data_0.Asthma_10YR))
# Original dataset shape Counter({0: 442, 1: 68})

# Undersample the controls to a 1:1 case/control ratio.
s1 = data_0.loc[data_0['Asthma_10YR'] == 1]  # cases
s0 = data_0.loc[data_0['Asthma_10YR'] == 0]  # controls
s0 = shuffle(s0, random_state=123)
# Keep as many controls as there are cases (68 in this dataset) —
# generalized from the previous hard-coded 68.
s0 = s0.iloc[:len(s1), ]
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
data_0_U = pd.concat([s1, s0], ignore_index=True)
data_0_U = shuffle(data_0_U, random_state=123)
print('Original dataset shape %s' % Counter(data_0_U.Asthma_10YR))
# Original dataset shape Counter({0: 68, 1: 68})

# Keep only wheeze and cough as predictors
# (assumes columns 6..-3 hold Cough_2YR/Wheeze_2YR and the last column is
#  Asthma_10YR — TODO confirm against the CSV layout)
X_train = data_0_U.iloc[:, 6:-2]
y_train = data_0_U.iloc[:, -1]

# Import test data.
# Data found in IOWBC_training_test_data.xlsx, sheet: "Standardised earlylife test set"
# (this provenance note was previously pasted onto the read_csv line without
#  a '#', which made the line a SyntaxError).
test = pd.read_csv("Earlylife_standardised_test_dataset_255IDs.csv", index_col=False)
# Split test data into features and outcome
X_test = test.iloc[:, 6:-2]
y_test = test['Asthma_10YR']

#Create a rbf svm classifier 
# probability=True enables predict_proba (needed for ROC/PR AUC later);
# random_state fixes the internal Platt-scaling CV for reproducibility.
clf = SVC(kernel='rbf', probability=True, random_state=123)

#### Random search ####
# Coarse log-spaced sweep over C and gamma (1e-3 .. 1e2, 100 points each).
C_range = np.logspace(-3,2,100)
gamma_range = np.logspace(-3, 2, 100)
param_grid = dict(gamma=gamma_range, C=C_range)

# 100 random candidates, 5-fold stratified CV, scored on balanced accuracy
# (appropriate given the class imbalance in the full data).
random_search = RandomizedSearchCV(clf, scoring='balanced_accuracy',param_distributions=param_grid,
									n_iter=100, n_jobs=-1, cv=StratifiedKFold(5), random_state=123)
start = time()
random_search.fit(X_train, y_train)
RStime = (time() - start)  # wall-clock duration of the random search
best_parameters = random_search.best_params_
print(best_parameters)
#'gamma': 0.02595024211399736, 'C': 10.974987654930567}
best_score = random_search.best_score_
print(best_score)
# 0.6544117647058824

#### Grid search ####
# Fine linear sweep around the region the random search identified.
clf = SVC(kernel='rbf', probability=True, random_state=123)
C_range = np.arange(0.1, 50, 0.1)
gamma_range = np.arange(0.01, 5, 0.01)
param_grid = dict(C=C_range, gamma=gamma_range)
# Exhaustive search (~499 x 499 candidates), same CV and scoring as above.
grid_search = GridSearchCV(clf, scoring='balanced_accuracy', param_grid=param_grid, cv=StratifiedKFold(5), n_jobs=16)
start = time()
grid_search.fit(X_train, y_train)
GStime = (time() - start)  # wall-clock duration of the grid search
# Get Grid search results
Candidates = len(grid_search.cv_results_['params'])
print(Candidates)
# best parameters
best_parameters = grid_search.best_params_
print(best_parameters)
#{'C': 4.0, 'gamma': 0.09}

best_score = grid_search.best_score_
print(best_score)
#0.6544117647058824

# Refit an RBF-SVM using the hyperparameters selected by the grid search.
best_clf = SVC(C=4.0, gamma=0.09, kernel='rbf', probability=True, random_state=123)
best_clf.fit(X_train, y_train)

### Training set performance
# Confusion matrix of hard predictions on the training data.
cm_train = confusion_matrix(y_train, best_clf.predict(X_train))
print(cm_train)
# [51 17] [30 38]

# ROC AUC on the training data from the positive-class probabilities.
train_pos_probs = best_clf.predict_proba(X_train)[:, 1]
ROCAUC_train = roc_auc_score(y_train, train_pos_probs)
print(ROCAUC_train)
# 0.6473832179930796

# Evaluate performance in the test set (default 0.5 decision threshold).
y_pred = best_clf.predict(X_test)
cm_test = confusion_matrix(y_test, y_pred)
print(cm_test)
# [181  40] [ 20  14]

# Ranking performance: ROC AUC from the positive-class probabilities.
test_preds = best_clf.predict_proba(X_test)[:, 1]
ROCAUC_test = roc_auc_score(y_test, test_preds)
print(ROCAUC_test)
# 0.6135879691243012

# Calculate Brier score (mean squared error of the probability forecasts).
clf_score = brier_score_loss(y_test, test_preds, pos_label=1)
# 0.2097081050491023

# Save the fitted model to disk so it can be reloaded without retraining.
import pickle
filename = 'CAPE_rbfSVM_COU0_SHAPfeatures.sav'
# Use a context manager so the file handle is always closed — the original
# passed an unclosed open() handle straight to pickle.dump (resource leak).
with open(filename, 'wb') as model_file:
	pickle.dump(best_clf, model_file)

#### Identify optimal threshold based on Youden's index (J = sensitivity + specificity - 1) ####
# Positive-class probabilities on the test set, computed ONCE and reused
# below (the original called predict_proba twice on the same data).
test_preds = best_clf.predict_proba(X_test)[:, 1]
# calculate roc curves: one (fpr, tpr) pair per candidate threshold
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
# get the best threshold: the one maximising Youden's J statistic
J = tpr - fpr
ix = argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))
# Best Threshold=0.6818801544280271

# Obtain classifications based on the optimal threshold cutoff.
# Probabilities are kept in a standalone Series (indexed like X_test) rather
# than written into X_test, so the feature matrix is no longer mutated.
pred_series = pd.Series(test_preds, index=X_test.index)
pred_opt = pred_series.map(lambda p: 1 if p >= best_thresh else 0)

# Check performance in test set
performance(y_test, pred_opt, pred_series)
# [[192  29] [ 21  13]]
              # precision    recall  f1-score   support

           # 0       0.90      0.87      0.88       221
           # 1       0.31      0.38      0.34        34

    # accuracy                           0.80       255
   # macro avg       0.61      0.63      0.61       255
# weighted avg       0.82      0.80      0.81       255

# accuracy:=0.803922
# balanced_accuracy:=0.625566
# Sensitivity:=0.382353
# Specificity:=0.868778
# PPV:=0.309524
# NPV:=0.901408
# LRp:=2.913793
# LRn:=0.710938
# F1:=0.342105
# ROCAUC:=0.613588
# PR_AUC:=0.203073


