# This script evaluates the effect of oversampling on model performance. 
# The best algorithm identified from the initial model development stage was used for this - a preschool model built with an SVM with a linear kernel
# Models were trained on each oversampled training dataset, optimising the hyperparameters each time using random and grid searches.
# The performance of all models were evaluated on the same validation dataset used during the initial model development stage.
# Python version 3.6.8 is used

# Imports
import os
from collections import Counter
from time import time

import matplotlib
matplotlib.use('Agg')  # select a non-interactive backend; must run before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../../../" resolves to the filesystem root — confirm the
# intended working directory before running.
os.chdir("/../../../")

# Import the test set which will be used to evaluate the performance of all
# models - data found in IOWBC_training_test_data.xlsx,
# sheet: "Standardised preschool test set"
test = pd.read_csv("Preschool_standardised_test_dataset_183IDs.csv", index_col=False)

# Separate the predictors from the 10-year asthma outcome column
X_test = test.drop(columns=['Study_ID', 'Asthma_10YR'])
y_test = test['Asthma_10YR']

################################################################################
# Import the oversampled training datasets. Then train and evaluate each model:

############
### x25% ###
############

def _tune_linear_svm(X, y, C_grid):
    """Tune the regularisation parameter C of a linear-kernel SVM.

    Runs a seeded random search over a log-spaced C range, then a finer
    grid search over `C_grid` (both scored on balanced accuracy with
    stratified 5-fold CV), printing timing and best results for each.

    Returns the grid search's `best_estimator_`, which sklearn has already
    refitted on all of (X, y) because refit=True by default — so the
    optimal C no longer has to be copied back into the script by hand.
    """
    base = SVC(kernel='linear', probability=True, random_state=123)

    # Random search over a log-spaced C range; seeded so reruns are reproducible.
    rand = RandomizedSearchCV(base, param_distributions=dict(C=np.logspace(-3, 2, 100)),
                              scoring='balanced_accuracy', n_iter=100, n_jobs=-1,
                              cv=StratifiedKFold(5), random_state=123)
    start = time()
    rand.fit(X, y)
    print('Random search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, rand.best_params_, rand.best_score_))

    # Finer, exhaustive grid search over the supplied C values.
    grid = GridSearchCV(base, param_grid=dict(C=C_grid), scoring='balanced_accuracy',
                        n_jobs=16, cv=StratifiedKFold(5))
    start = time()
    grid.fit(X, y)
    print('Grid search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, grid.best_params_, grid.best_score_))
    return grid.best_estimator_

def _report(model, X, y, label):
    """Print confusion matrix, classification report, clinical metrics
    (sensitivity, specificity, PPV, NPV, LR+, LR-) and AUCs for `model`
    evaluated on (X, y). Assumes a binary 0/1 outcome."""
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print('=== %s ===' % label)
    print(cm)
    print(classification_report(y, y_hat))
    print('Accuracy:', accuracy_score(y, y_hat))
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('PPV:', tp / (tp + fp))
    print('NPV:', tn / (tn + fn))
    print('LR+:', sensitivity / (1 - specificity))
    print('LR-:', (1 - sensitivity) / specificity)
    # roc_auc_score on hard 0/1 predictions equals balanced accuracy; the
    # probability-based value below is the conventional ROC AUC.
    print('AUC (hard labels):', roc_auc_score(y, y_hat))
    print('ROC AUC (probabilities):', roc_auc_score(y, model.predict_proba(X)[:, 1]))

# Import oversampled training data - 25%
data_25 = pd.read_csv("Oversampled_preschool_dataset_25%.csv", index_col=False)

# Remove extra synthetic cases produced
data_25 = data_25.iloc[0:378, :]

# Split training data into features and outcome
X_train = data_25.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_train = data_25['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 64})

### SVM - linear ###
# Tune C and keep the refitted optimum (recorded run: best C = 0.31,
# CV balanced accuracy 0.6596).
clf = _tune_linear_svm(X_train, y_train, np.arange(0.01, 5.01, 0.01))

# Recorded run, training set: accuracy 0.860, sensitivity 0.359,
# specificity 0.962, ROC AUC 0.850.
_report(clf, X_train, y_train, 'x25% - training set')

# Recorded run, test set: accuracy 0.852, sensitivity 0.40,
# specificity 0.924, ROC AUC 0.778.
_report(clf, X_test, y_test, 'x25% - test set')

############
### x50% ###
############

def _tune_linear_svm(X, y, C_grid):
    """Tune the regularisation parameter C of a linear-kernel SVM.

    Seeded random search over a log-spaced C range, then a finer grid
    search over `C_grid` (both balanced-accuracy scored, stratified
    5-fold CV). Returns the grid search's `best_estimator_`, already
    refitted on all of (X, y) because refit=True by default — avoids
    re-hardcoding the optimal C from a previous run.
    """
    base = SVC(kernel='linear', probability=True, random_state=123)

    rand = RandomizedSearchCV(base, param_distributions=dict(C=np.logspace(-3, 2, 100)),
                              scoring='balanced_accuracy', n_iter=100, n_jobs=-1,
                              cv=StratifiedKFold(5), random_state=123)
    start = time()
    rand.fit(X, y)
    print('Random search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, rand.best_params_, rand.best_score_))

    grid = GridSearchCV(base, param_grid=dict(C=C_grid), scoring='balanced_accuracy',
                        n_jobs=16, cv=StratifiedKFold(5))
    start = time()
    grid.fit(X, y)
    print('Grid search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, grid.best_params_, grid.best_score_))
    return grid.best_estimator_

def _report(model, X, y, label):
    """Print confusion matrix, classification report, clinical metrics
    (sensitivity, specificity, PPV, NPV, LR+, LR-) and AUCs for `model`
    evaluated on (X, y). Assumes a binary 0/1 outcome."""
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print('=== %s ===' % label)
    print(cm)
    print(classification_report(y, y_hat))
    print('Accuracy:', accuracy_score(y, y_hat))
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('PPV:', tp / (tp + fp))
    print('NPV:', tn / (tn + fn))
    print('LR+:', sensitivity / (1 - specificity))
    print('LR-:', (1 - sensitivity) / specificity)
    # roc_auc_score on hard 0/1 predictions equals balanced accuracy; the
    # probability-based value below is the conventional ROC AUC.
    print('AUC (hard labels):', roc_auc_score(y, y_hat))
    print('ROC AUC (probabilities):', roc_auc_score(y, model.predict_proba(X)[:, 1]))

# Import oversampled training data - 50%
data_50 = pd.read_csv("Oversampled_preschool_dataset_50%.csv", index_col=False)

# Remove extra synthetic cases produced
data_50 = data_50.iloc[0:391, :]

# Split training data into features and outcome
X_train = data_50.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_train = data_50['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 77})

### SVM - linear ###
# Tune C and keep the refitted optimum (recorded run: best C = 0.17,
# CV balanced accuracy 0.6807).
clf = _tune_linear_svm(X_train, y_train, np.arange(0.01, 5.01, 0.01))

# Recorded run, training set: accuracy 0.847, sensitivity 0.377,
# specificity 0.962, ROC AUC 0.830.
_report(clf, X_train, y_train, 'x50% - training set')

# Recorded run, test set: accuracy 0.852, sensitivity 0.40,
# specificity 0.924, ROC AUC 0.773.
_report(clf, X_test, y_test, 'x50% - test set')


#############
### x100% ###
#############
# Import oversampled training data - 100%
data_100 = pd.read_csv("Oversampled_preschool_dataset_100%.csv", index_col=False)

# Remove extra synthetic cases produced
data_100 = data_100.iloc[0:416,:]

# Split training data into features and outcome 
X_train = data_100.drop(['Study_ID','Asthma_10YR'], axis=1)
y_train = data_100['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 102})

### SVM - linear ###
# Define a svm Classifier
clf = SVC(kernel='linear', probability=True, random_state=123)

##### Random search #####
C_range = np.logspace(-3,2,100)
param_grid = dict(C=C_range)

# Run randomized search
random_search = RandomizedSearchCV(clf, scoring='balanced_accuracy',param_distributions=param_grid,
                                    n_iter=100, n_jobs=-1, cv=StratifiedKFold(5))
start = time()
random_search.fit(X_train, y_train)
RStime = (time() - start)
# 6.154069423675537

best_parameters = random_search.best_params_
print(best_parameters)
# {'C': 0.26560877829466867}

best_score = random_search.best_score_
print(best_score)
# 0.7122795858934972

##### Grid search #####
C_range = np.arange(0.01, 5.01, 0.01)
param_grid = dict(C=C_range)
grid_search = GridSearchCV(clf, param_grid=param_grid, scoring='balanced_accuracy', n_jobs=16, cv=StratifiedKFold(5))
start = time()
grid_search.fit(X_train, y_train)
GStime = (time() - start)
print(GStime)
#2.1083076000213623

# best parameters
best_parameters = grid_search.best_params_
print(best_parameters)
#{'C': 0.26}

best_score = grid_search.best_score_
print(best_score)
# 0.7122795858934972

# Fit optimised model
clf = SVC(C=0.26, kernel='linear', probability=True, random_state=123)

# Fit optimised model
clf.fit(X_train,y_train)

# Predicting the train set results
y_train_pred = clf.predict(X_train)
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)
# [300  14] [ 57  45]

train_report = classification_report(y_train, y_train_pred)
print (train_report)
#              precision    recall  f1-score   support
#
#           0       0.84      0.96      0.89       314
#           1       0.76      0.44      0.56       102
#
#    accuracy                           0.83       416
#   macro avg       0.80      0.70      0.73       416
#weighted avg       0.82      0.83      0.81       416

accuracy_score(y_train, y_train_pred)
#0.8293269230769231

sensitivity =  cm_train[1,1]/(cm_train[1,0]+cm_train[1,1])								
print(sensitivity)
#0.4411764705882353

specificity = cm_train[0,0]/(cm_train[0,0]+cm_train[0,1])									
print(specificity)
#0.9554140127388535

PPV = cm_train[1,1]/(cm_train[1,1]+cm_train[0,1])	
print(PPV)
#0.7627118644067796

NPV = cm_train[0,0]/(cm_train[0,0]+cm_train[1,0])
print(NPV)
#0.8403361344537815

LRp = sensitivity/(1-specificity)
print(LRp)
#9.89495798319328

LRn = (1-sensitivity)/specificity
print(LRn)
#0.5849019607843138

#  AUC: 
AUC_train = roc_auc_score(y_train, y_train_pred)
print(AUC_train)
#0.6982952416635444

probs = clf.predict_proba(X_train)
preds = probs[:,1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)
#0.8277132509054578

#Predict the response for test dataset
y_pred = clf.predict(X_test)

cm_test = confusion_matrix(y_test, y_pred)	
print (cm_test)
#[143  15] [ 15  10]

test_report = classification_report(y_test, y_pred)
print (test_report)
#              precision    recall  f1-score   support
#
#           0       0.91      0.91      0.91       158
#           1       0.40      0.40      0.40        25
#
#    accuracy                           0.84       183
#   macro avg       0.65      0.65      0.65       183
#weighted avg       0.84      0.84      0.84       183


accuracy_score(y_test, y_pred)
#0.8360655737704918

sensitivity =  cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])								
print(sensitivity)
#0.4

specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])									
print(specificity)
#0.9050632911392406

PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])	
print(PPV)
#0.4

NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
print(NPV)
#0.9050632911392406

LRp = sensitivity/(1-specificity)
print(LRp)
#4.213333333333336

LRn = (1-sensitivity)/specificity
print(LRn)
#0.6629370629370629

AUC_test = roc_auc_score(y_test, y_pred)
print(AUC_test)
#0.6525316455696202

probs = clf.predict_proba(X_test)
preds = probs[:,1]
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)
#0.78

#############
### x150% ###
#############

def _tune_linear_svm(X, y, C_grid):
    """Tune the regularisation parameter C of a linear-kernel SVM.

    Seeded random search over a log-spaced C range, then a finer grid
    search over `C_grid` (both balanced-accuracy scored, stratified
    5-fold CV). Returns the grid search's `best_estimator_`, already
    refitted on all of (X, y) because refit=True by default — avoids
    re-hardcoding the optimal C from a previous run.
    """
    base = SVC(kernel='linear', probability=True, random_state=123)

    rand = RandomizedSearchCV(base, param_distributions=dict(C=np.logspace(-3, 2, 100)),
                              scoring='balanced_accuracy', n_iter=100, n_jobs=-1,
                              cv=StratifiedKFold(5), random_state=123)
    start = time()
    rand.fit(X, y)
    print('Random search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, rand.best_params_, rand.best_score_))

    grid = GridSearchCV(base, param_grid=dict(C=C_grid), scoring='balanced_accuracy',
                        n_jobs=16, cv=StratifiedKFold(5))
    start = time()
    grid.fit(X, y)
    print('Grid search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, grid.best_params_, grid.best_score_))
    return grid.best_estimator_

def _report(model, X, y, label):
    """Print confusion matrix, classification report, clinical metrics
    (sensitivity, specificity, PPV, NPV, LR+, LR-) and AUCs for `model`
    evaluated on (X, y). Assumes a binary 0/1 outcome."""
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print('=== %s ===' % label)
    print(cm)
    print(classification_report(y, y_hat))
    print('Accuracy:', accuracy_score(y, y_hat))
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('PPV:', tp / (tp + fp))
    print('NPV:', tn / (tn + fn))
    print('LR+:', sensitivity / (1 - specificity))
    print('LR-:', (1 - sensitivity) / specificity)
    # roc_auc_score on hard 0/1 predictions equals balanced accuracy; the
    # probability-based value below is the conventional ROC AUC.
    print('AUC (hard labels):', roc_auc_score(y, y_hat))
    print('ROC AUC (probabilities):', roc_auc_score(y, model.predict_proba(X)[:, 1]))

# Import oversampled training data - 150%
data_150 = pd.read_csv("Oversampled_preschool_dataset_150%.csv", index_col=False)

# Remove extra synthetic cases produced
data_150 = data_150.iloc[0:442, :]

# Split training data into features and outcome
X_train = data_150.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_train = data_150['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 128})

### SVM - linear ###
# Tune C and keep the refitted optimum (recorded run: best C = 2.0,
# CV balanced accuracy 0.7337).
clf = _tune_linear_svm(X_train, y_train, np.arange(0.01, 5.01, 0.01))

# Recorded run, training set: accuracy 0.805, sensitivity 0.586,
# specificity 0.895, ROC AUC 0.831.
_report(clf, X_train, y_train, 'x150% - training set')

# Recorded run, test set: accuracy 0.781, sensitivity 0.40,
# specificity 0.842, ROC AUC 0.754.
_report(clf, X_test, y_test, 'x150% - test set')

#############
### x200% ###
#############

def _tune_linear_svm(X, y, C_grid):
    """Tune the regularisation parameter C of a linear-kernel SVM.

    Seeded random search over a log-spaced C range, then a finer grid
    search over `C_grid` (both balanced-accuracy scored, stratified
    5-fold CV). Returns the grid search's `best_estimator_`, already
    refitted on all of (X, y) because refit=True by default — avoids
    re-hardcoding the optimal C from a previous run.
    """
    base = SVC(kernel='linear', probability=True, random_state=123)

    rand = RandomizedSearchCV(base, param_distributions=dict(C=np.logspace(-3, 2, 100)),
                              scoring='balanced_accuracy', n_iter=100, n_jobs=-1,
                              cv=StratifiedKFold(5), random_state=123)
    start = time()
    rand.fit(X, y)
    print('Random search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, rand.best_params_, rand.best_score_))

    grid = GridSearchCV(base, param_grid=dict(C=C_grid), scoring='balanced_accuracy',
                        n_jobs=16, cv=StratifiedKFold(5))
    start = time()
    grid.fit(X, y)
    print('Grid search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, grid.best_params_, grid.best_score_))
    return grid.best_estimator_

def _report(model, X, y, label):
    """Print confusion matrix, classification report, clinical metrics
    (sensitivity, specificity, PPV, NPV, LR+, LR-) and AUCs for `model`
    evaluated on (X, y). Assumes a binary 0/1 outcome."""
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print('=== %s ===' % label)
    print(cm)
    print(classification_report(y, y_hat))
    print('Accuracy:', accuracy_score(y, y_hat))
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('PPV:', tp / (tp + fp))
    print('NPV:', tn / (tn + fn))
    print('LR+:', sensitivity / (1 - specificity))
    print('LR-:', (1 - sensitivity) / specificity)
    # roc_auc_score on hard 0/1 predictions equals balanced accuracy; the
    # probability-based value below is the conventional ROC AUC.
    print('AUC (hard labels):', roc_auc_score(y, y_hat))
    print('ROC AUC (probabilities):', roc_auc_score(y, model.predict_proba(X)[:, 1]))

# Import oversampled training data - 200%
data_200 = pd.read_csv("Oversampled_preschool_dataset_200%.csv", index_col=False)

# Remove extra synthetic cases produced
data_200 = data_200.iloc[0:467, :]

# Split training data into features and outcome
X_train = data_200.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_train = data_200['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 153})

### SVM - linear ###
# Tune C and keep the refitted optimum (recorded run: best C = 0.07,
# CV balanced accuracy 0.7546).
clf = _tune_linear_svm(X_train, y_train, np.arange(0.01, 5.01, 0.01))

# Recorded run, training set: accuracy 0.799, sensitivity 0.627,
# specificity 0.882, ROC AUC 0.838.
_report(clf, X_train, y_train, 'x200% - training set')

# Recorded run, test set: accuracy 0.825, sensitivity 0.64,
# specificity 0.854, ROC AUC 0.788.
_report(clf, X_test, y_test, 'x200% - test set')

#############
### x250% ###
#############

def _tune_linear_svm(X, y, C_grid):
    """Tune the regularisation parameter C of a linear-kernel SVM.

    Seeded random search over a log-spaced C range, then a finer grid
    search over `C_grid` (both balanced-accuracy scored, stratified
    5-fold CV). Returns the grid search's `best_estimator_`, already
    refitted on all of (X, y) because refit=True by default — avoids
    re-hardcoding the optimal C from a previous run.
    """
    base = SVC(kernel='linear', probability=True, random_state=123)

    rand = RandomizedSearchCV(base, param_distributions=dict(C=np.logspace(-3, 2, 100)),
                              scoring='balanced_accuracy', n_iter=100, n_jobs=-1,
                              cv=StratifiedKFold(5), random_state=123)
    start = time()
    rand.fit(X, y)
    print('Random search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, rand.best_params_, rand.best_score_))

    grid = GridSearchCV(base, param_grid=dict(C=C_grid), scoring='balanced_accuracy',
                        n_jobs=16, cv=StratifiedKFold(5))
    start = time()
    grid.fit(X, y)
    print('Grid search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, grid.best_params_, grid.best_score_))
    return grid.best_estimator_

def _report(model, X, y, label):
    """Print confusion matrix, classification report, clinical metrics
    (sensitivity, specificity, PPV, NPV, LR+, LR-) and AUCs for `model`
    evaluated on (X, y). Assumes a binary 0/1 outcome."""
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print('=== %s ===' % label)
    print(cm)
    print(classification_report(y, y_hat))
    print('Accuracy:', accuracy_score(y, y_hat))
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('PPV:', tp / (tp + fp))
    print('NPV:', tn / (tn + fn))
    print('LR+:', sensitivity / (1 - specificity))
    print('LR-:', (1 - sensitivity) / specificity)
    # roc_auc_score on hard 0/1 predictions equals balanced accuracy; the
    # probability-based value below is the conventional ROC AUC.
    print('AUC (hard labels):', roc_auc_score(y, y_hat))
    print('ROC AUC (probabilities):', roc_auc_score(y, model.predict_proba(X)[:, 1]))

# Import oversampled training data - 250%
data_250 = pd.read_csv("Oversampled_preschool_dataset_250%.csv", index_col=False)

# Remove extra synthetic cases produced
data_250 = data_250.iloc[0:493, :]

# Split training data into features and outcome
X_train = data_250.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_train = data_250['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 179})

### SVM - linear ###
# Tune C and keep the refitted optimum (recorded run: best C = 0.50,
# CV balanced accuracy 0.7780).
clf = _tune_linear_svm(X_train, y_train, np.arange(0.01, 5.01, 0.01))

# Recorded run, training set: accuracy 0.795, sensitivity 0.715,
# specificity 0.841, ROC AUC 0.858.
_report(clf, X_train, y_train, 'x250% - training set')

# Recorded run, test set: accuracy 0.798, sensitivity 0.76,
# specificity 0.804, ROC AUC 0.771.
_report(clf, X_test, y_test, 'x250% - test set')

#############
### x300% ###
#############

def _tune_linear_svm(X, y, C_grid):
    """Tune the regularisation parameter C of a linear-kernel SVM.

    Seeded random search over a log-spaced C range, then a finer grid
    search over `C_grid` (both balanced-accuracy scored, stratified
    5-fold CV). Returns the grid search's `best_estimator_`, already
    refitted on all of (X, y) because refit=True by default — avoids
    re-hardcoding the optimal C from a previous run.
    """
    base = SVC(kernel='linear', probability=True, random_state=123)

    rand = RandomizedSearchCV(base, param_distributions=dict(C=np.logspace(-3, 2, 100)),
                              scoring='balanced_accuracy', n_iter=100, n_jobs=-1,
                              cv=StratifiedKFold(5), random_state=123)
    start = time()
    rand.fit(X, y)
    print('Random search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, rand.best_params_, rand.best_score_))

    grid = GridSearchCV(base, param_grid=dict(C=C_grid), scoring='balanced_accuracy',
                        n_jobs=16, cv=StratifiedKFold(5))
    start = time()
    grid.fit(X, y)
    print('Grid search: %.2fs, best_params=%s, best_score=%s'
          % (time() - start, grid.best_params_, grid.best_score_))
    return grid.best_estimator_

def _report(model, X, y, label):
    """Print confusion matrix, classification report, clinical metrics
    (sensitivity, specificity, PPV, NPV, LR+, LR-) and AUCs for `model`
    evaluated on (X, y). Assumes a binary 0/1 outcome."""
    y_hat = model.predict(X)
    cm = confusion_matrix(y, y_hat)
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print('=== %s ===' % label)
    print(cm)
    print(classification_report(y, y_hat))
    print('Accuracy:', accuracy_score(y, y_hat))
    print('Sensitivity:', sensitivity)
    print('Specificity:', specificity)
    print('PPV:', tp / (tp + fp))
    print('NPV:', tn / (tn + fn))
    print('LR+:', sensitivity / (1 - specificity))
    print('LR-:', (1 - sensitivity) / specificity)
    # roc_auc_score on hard 0/1 predictions equals balanced accuracy; the
    # probability-based value below is the conventional ROC AUC.
    print('AUC (hard labels):', roc_auc_score(y, y_hat))
    print('ROC AUC (probabilities):', roc_auc_score(y, model.predict_proba(X)[:, 1]))

# Import oversampled training data - 300%
data_300 = pd.read_csv("Oversampled_preschool_dataset_300%.csv", index_col=False)

# Remove extra synthetic cases produced
data_300 = data_300.iloc[0:518, :]

# Split training data into features and outcome
X_train = data_300.drop(['Study_ID', 'Asthma_10YR'], axis=1)
y_train = data_300['Asthma_10YR']

print('Original dataset shape %s' % Counter(y_train))
# Original dataset shape Counter({0: 314, 1: 204})

### SVM - linear ###
# This section searches a wider, coarser C grid (0.1–50 in steps of 0.1)
# because the random search located the optimum well above the 0.01–5
# range used for the other oversampling levels.
# Tune C and keep the refitted optimum (recorded run: best C = 15.9,
# CV balanced accuracy 0.7817).
clf = _tune_linear_svm(X_train, y_train, np.arange(0.10, 50.1, 0.1))

# Recorded run, training set: accuracy 0.790, sensitivity 0.711,
# specificity 0.841, ROC AUC 0.836.
_report(clf, X_train, y_train, 'x300% - training set')

# Recorded run, test set: accuracy 0.798, sensitivity 0.76,
# specificity 0.804, ROC AUC 0.784.
_report(clf, X_test, y_test, 'x300% - test set')