# This script develops a preschool model using a SVM algorithm (polynomial kernel) and the initial complete preschool dataset (no missing data)
# The corresponding initial models developed on the complete training data using the other algorithms are incorporated in the Model_development_algorithm.txt code, using the dataset named 'data_0' from the Model_COU_training_test_data.txt scripts.
# Python version 3.6.8 is used

# Imports
import os
from time import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Set working directory
os.chdir("/../../..")

# Load the cleaned, unstandardised preschool dataset
# (data found in IOWBC_data.xlsx, sheet: "Preschool data")
data_4YR = pd.read_csv("Preschool_QC_1368IDs.csv", index_col=False)
# Drop the stray index column written by the original CSV export
data_4YR = data_4YR.drop(columns=['Unnamed: 0'])
# 1368 Ids, 59 columns

# Keep the 12 features selected by Balanced Random Forest RFE (5-fold CV),
# plus the participant ID and the 10-year asthma outcome
selected_columns = [
    'Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1',
    'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR',
    'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES',
    'Asthma_10YR',
]
selected_data = data_4YR[selected_columns]
# Complete-case analysis: keep only rows with no missing values
complete_data_4YR = selected_data.dropna()

# Create training and test sets
# Features: everything except the participant ID and the outcome
complete_subset_features = complete_data_4YR.drop(['Study_ID', 'Asthma_10YR'], axis=1)

# Outcome: asthma status at 10 years
complete_subset_outcome = complete_data_4YR['Asthma_10YR']

# Stratified split of the dataset into training and test sets.
# NOTE: test_size=0.333 gives a 2/3 training : 1/3 test split (an earlier
# comment here said 80/20, which did not match the code or the counts below).
X_train, X_test, y_train, y_test = train_test_split(
    complete_subset_features,
    complete_subset_outcome,
    test_size=0.333,
    stratify=complete_subset_outcome,
    shuffle=True,
    random_state=123,
)

# Training set (n=365, asthma=51, no asthma=314)	Test set (n=183, asthma=25, no asthma=158)

# Standardise the continuous features of the training and test sets.
# The scaler is fitted on the training data only and then applied to the test
# data, so no information leaks from the test set into the scaling.
# NOTE: columns 0-4 are selected positionally (iloc), which assumes the five
# continuous features come first in X_train/X_test (they do, per the column
# order chosen at subsetting time).
cont_cols = ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4']

scaler = StandardScaler()
cont_train = pd.DataFrame(scaler.fit_transform(X_train.iloc[:, 0:5]), columns=cont_cols)
cat_train = X_train.iloc[:, 5:]
# cont_train has a fresh RangeIndex, so reset the categorical index to align rows
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)

cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:, 0:5]), columns=cont_cols)
cat_test = X_test.iloc[:, 5:]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)


####################
### SVM - poly #####
####################
# Base SVM classifier with a polynomial kernel; probability=True enables
# predict_proba (needed for the probability-based ROC-AUC later on)
clf = SVC(kernel='poly', probability=True, random_state=123)

##### Random search #####
# Sample C and gamma log-uniformly over [1e-3, 1e2] and the polynomial
# degree from 1 to 6
gamma_range = np.logspace(-3, 2, 100)
C_range = np.logspace(-3, 2, 100)
param_grid = {'gamma': gamma_range, 'C': C_range, 'degree': [1, 2, 3, 4, 5, 6]}

# Run the randomized search: 100 candidates, 5-fold stratified CV,
# balanced accuracy as the tuning metric
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_grid,
    scoring='balanced_accuracy',
    n_iter=100,
    n_jobs=-1,
    cv=StratifiedKFold(5),
    random_state=123,
)
t0 = time()
random_search.fit(SX_train, y_train)
RStime = time() - t0
print(RStime)

best_parameters = random_search.best_params_
print(best_parameters)
#{'gamma': 0.298364724028334, 'degree': 3, 'C': 1.072267222010323}

best_score = random_search.best_score_
print(best_score)

# Persist the full CV results for later inspection
RSresults = pd.DataFrame(random_search.cv_results_)
RSresults.to_csv("PolySVM_initial_random_search_preschool_results.csv", index=False)

##### Grid search #####
# Exhaustive search on a fine grid around the region found by the random search
param_grid = {
    'gamma': np.arange(0.01, 1.01, 0.01),
    'C': np.arange(0.01, 5.01, 0.01),
    'degree': [1, 2, 3, 4, 5, 6],
}
grid_search = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    n_jobs=16,
    cv=StratifiedKFold(5),
)
t0 = time()
grid_search.fit(SX_train, y_train)
GStime = time() - t0

# Save the full grid-search CV results
GSresults = pd.DataFrame(grid_search.cv_results_)
GSresults.to_csv("PolySVM_grid_search_preschool_results.csv", index=False)

# Best hyperparameters found on the grid
best_parameters = grid_search.best_params_
print(best_parameters)
#{'C': 0.02, 'degree': 3, 'gamma': 1.0}

best_score = grid_search.best_score_
print(best_score)

# Classifier with the best hyperparameters identified by the grid search
best_clf = SVC(C=0.02, gamma=1.0, degree=3, kernel='poly', probability=True, random_state=123)

# 5-fold cross-validated accuracy of the tuned model on the training set
scores = cross_val_score(best_clf, SX_train, y_train, n_jobs=16, cv=StratifiedKFold(5))
accuracy = scores.mean()
sd = scores.std()

# Fit the tuned model on the full training set
best_clf.fit(SX_train, y_train)

### Training set Performance
y_train_pred = best_clf.predict(SX_train)

# sklearn confusion matrix layout: rows = true class, cols = predicted class,
# so cm[0,0]=TN, cm[0,1]=FP, cm[1,0]=FN, cm[1,1]=TP
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)

train_report = classification_report(y_train, y_train_pred)
print(train_report)
# Example of output
#              precision    recall  f1-score   support
#
#           0       0.98      1.00      0.99       314
#           1       1.00      0.84      0.91        51
#
#    accuracy                           0.98       365
#   macro avg       0.99      0.92      0.95       365
#weighted avg       0.98      0.98      0.98       365


# FIX: the accuracy was previously computed but discarded when run as a
# script (it only displayed in an interactive session) — print it
print(accuracy_score(y_train, y_train_pred))

# Sensitivity (recall for the asthma class): TP / (FN + TP)
sensitivity = cm_train[1, 1] / (cm_train[1, 0] + cm_train[1, 1])
print(sensitivity)

# Specificity: TN / (TN + FP)
specificity = cm_train[0, 0] / (cm_train[0, 0] + cm_train[0, 1])
print(specificity)

# Positive predictive value: TP / (TP + FP)
PPV = cm_train[1, 1] / (cm_train[1, 1] + cm_train[0, 1])
print(PPV)

# Negative predictive value: TN / (TN + FN)
NPV = cm_train[0, 0] / (cm_train[0, 0] + cm_train[1, 0])
print(NPV)

# Likelihood ratios; LRp is infinite when specificity == 1 (no false positives)
LRp = sensitivity / (1 - specificity)
print(LRp)

LRn = (1 - sensitivity) / specificity
print(LRn)

# NOTE: roc_auc_score on hard 0/1 predictions equals balanced accuracy, not a
# true ROC-AUC; the probability-based ROC-AUC is computed just below.
AUC_train = roc_auc_score(y_train, y_train_pred)
print(AUC_train)

probs = best_clf.predict_proba(SX_train)
preds = probs[:, 1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)

### Test set Performance
# Predict the response for the held-out test dataset
y_pred = best_clf.predict(SX_test)

# sklearn confusion matrix layout: rows = true class, cols = predicted class
cm_test = confusion_matrix(y_test, y_pred)
print(cm_test)

test_report = classification_report(y_test, y_pred)
print(test_report)
# Example of output:
#              precision    recall  f1-score   support
#
#           0       0.92      0.88      0.90       158
#           1       0.41      0.52      0.46        25
#
#    accuracy                           0.83       183
#   macro avg       0.66      0.70      0.68       183
#weighted avg       0.85      0.83      0.84       183


# FIX: the accuracy was previously computed but discarded when run as a
# script (it only displayed in an interactive session) — print it
print(accuracy_score(y_test, y_pred))

# Sensitivity (recall for the asthma class): TP / (FN + TP)
sensitivity = cm_test[1, 1] / (cm_test[1, 0] + cm_test[1, 1])
print(sensitivity)

# Specificity: TN / (TN + FP)
specificity = cm_test[0, 0] / (cm_test[0, 0] + cm_test[0, 1])
print(specificity)

# Positive predictive value: TP / (TP + FP)
PPV = cm_test[1, 1] / (cm_test[1, 1] + cm_test[0, 1])
print(PPV)

# Negative predictive value: TN / (TN + FN)
NPV = cm_test[0, 0] / (cm_test[0, 0] + cm_test[1, 0])
print(NPV)

# Likelihood ratios; LRp is infinite when specificity == 1 (no false positives)
LRp = sensitivity / (1 - specificity)
print(LRp)

LRn = (1 - sensitivity) / specificity
print(LRn)

# NOTE: roc_auc_score on hard 0/1 predictions equals balanced accuracy, not a
# true ROC-AUC; the probability-based ROC-AUC is computed just below.
AUC_test = roc_auc_score(y_test, y_pred)
print(AUC_test)

probs = best_clf.predict_proba(SX_test)
preds = probs[:, 1]
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)