# This script develops a preschool model using a SVM algorithm (polynomial kernel) and the initial complete preschool dataset (no missing data)
# The corresponding initial models developed on the complete training data using the other algorithms are incorporated in the Model_development_algorithm.txt code, using the dataset named 'data_0' from the Model_COU_training_test_data.txt scripts.
# Python version 3.6.8 is used

# Imports
import os
from time import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                             classification_report, confusion_matrix,
                             roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../../.." resolves to the filesystem root — looks like a
# placeholder; point it at the project data directory before running.
os.chdir("/../../..")

# Import cleaned, unstandardised early-life dataset
# (data found in IOWBC_data.xlsx, sheet: "Earlylife data")
# Fix: the original assigned this to `data_4YR` but the next statement
# indexed `data_2YR`, raising a NameError; use one consistent name.
data_2YR = pd.read_csv("Earlylife_QC_1368IDs.csv", index_col=False)

# Subset the 8 features selected from Balanced Random Forest RFE (5-fold CV),
# plus the participant ID and the 10-year asthma outcome
selected_data = data_2YR[['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES', 'Asthma_10YR']]
# Complete-case analysis: keep only rows with no missing values
complete_data_2YR = selected_data.dropna()
# n=765

# Create training and test sets
# Feature matrix: drop the ID column and the outcome
complete_subset_features = complete_data_2YR.drop(['Study_ID', 'Asthma_10YR'], axis=1)

# Outcome vector: asthma status at age 10
complete_subset_outcome = complete_data_2YR['Asthma_10YR']

# Stratified split: 2/3 training, 1/3 test
X_train, X_test, y_train, y_test = train_test_split(
    complete_subset_features, complete_subset_outcome,
    stratify=complete_subset_outcome,
    test_size=0.333, shuffle=True, random_state=123)

# Training set (n=510, asthma=68, no asthma=442)   Test set (n=255, asthma=34, no asthma=221)

# Standardise the four continuous features (first four columns); the
# remaining columns are categorical and are passed through unchanged.
# The scaler is fitted on the training set only, then re-used on the test
# set to avoid information leakage.
cont_cols = ('Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1')
scaler = StandardScaler()

scaled_train = scaler.fit_transform(X_train.iloc[:, :4])
SX_train = pd.concat(
    [pd.DataFrame(scaled_train, columns=cont_cols),
     X_train.iloc[:, 4:].reset_index(drop=True)],
    axis=1)

scaled_test = scaler.transform(X_test.iloc[:, :4])
SX_test = pd.concat(
    [pd.DataFrame(scaled_test, columns=cont_cols),
     X_test.iloc[:, 4:].reset_index(drop=True)],
    axis=1)

####################
### SVM - poly #####
####################
# Base SVM classifier with a polynomial kernel; probability=True enables
# predict_proba (needed later for ROC-AUC on predicted probabilities)
clf = SVC(kernel='poly', probability=True, random_state=123)

##### Random search #####
# Log-spaced candidate ranges for gamma and C, and polynomial degrees 1-6.
# Key order (gamma, C, degree) matters for reproducible random sampling.
param_grid = {
    'gamma': np.logspace(-3, 2, 100),
    'C': np.logspace(-3, 2, 100),
    'degree': [1, 2, 3, 4, 5, 6],
}

# Run randomized search: 100 draws, 5-fold stratified CV, balanced accuracy
random_search = RandomizedSearchCV(
    clf, scoring='balanced_accuracy', param_distributions=param_grid,
    n_iter=100, n_jobs=-1, cv=StratifiedKFold(5), random_state=123)

start = time()
random_search.fit(SX_train, y_train)
RStime = time() - start
print(RStime)

best_parameters = random_search.best_params_
print(best_parameters)
# 'gamma': 0.23644894126454072, 'degree': 4, 'C': 0.11768119524349979

best_score = random_search.best_score_
print(best_score)

##### Grid search #####
# Finer grids around the random-search optimum.
# Fix: C and gamma must be strictly positive for SVC, so the grids start at
# 0.01 rather than 0 — np.arange(0, ...) included C=0 / gamma=0, which make
# every fit for those candidates fail.
C_range = np.arange(0.01, 5, 0.01)
gamma_range = np.arange(0.01, 1, 0.01)
param_grid = dict(gamma=gamma_range, C=C_range, degree=[1, 2, 3, 4, 5, 6])
grid_search = GridSearchCV(clf, param_grid=param_grid, scoring='balanced_accuracy',
                           n_jobs=16, cv=StratifiedKFold(5))
start = time()
grid_search.fit(SX_train, y_train)
GStime = (time() - start)
print(GStime)

# Number of hyper-parameter combinations evaluated
Candidates = len(grid_search.cv_results_['params'])

# Save the full CV results for later inspection
GSresults = pd.DataFrame(grid_search.cv_results_)
GSresults.to_csv("PolySVM_grid_search_earlylife_results.csv", index=False)

best_parameters = grid_search.best_params_
print(best_parameters)
# {'C': 1.8299999999999996, 'degree': 4, 'gamma': 0.32999999999999996}

best_score = grid_search.best_score_
print(best_score)

# Create the final poly-kernel SVM using the grid-search optimum
# (C=1.83, gamma=0.33, degree=4). Note: this is an SVM — the original
# comment saying "Random forest" was a copy-paste leftover.
best_clf = SVC(C=1.83, gamma=0.33, degree=4, kernel='poly', probability=True,  random_state=123)

# Cross-validated performance of the optimised model on the training set.
# NOTE(review): cross_val_score uses its default scoring here, while the
# hyper-parameter searches optimised 'balanced_accuracy' — confirm this
# mismatch is intentional.
scores = cross_val_score(best_clf, SX_train, y_train, n_jobs=16, cv=StratifiedKFold(5))
accuracy = scores.mean()  # mean CV score
sd= (scores.std())  # CV score standard deviation

# Fit optimised model on the full training set
best_clf.fit(SX_train,y_train)

### Training set performance
y_train_pred = best_clf.predict(SX_train)
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)
# [442   0] [  9  59]

train_report = classification_report(y_train, y_train_pred)
print (train_report)
# Example of output
#                precision    recall  f1-score   support
#
#           0       0.98      1.00      0.99       442
#           1       1.00      0.87      0.93        68
#
#    accuracy                           0.98       510
#   macro avg       0.99      0.93      0.96       510
#weighted avg       0.98      0.98      0.98       510


# Fix: these two results were computed but silently discarded, and
# balanced_accuracy_score was never imported (NameError); the import is
# added at the top of the file and both values are now printed.
print(accuracy_score(y_train, y_train_pred))
print(balanced_accuracy_score(y_train, y_train_pred))

# Sensitivity (recall of the asthma class): TP / (TP + FN)
sensitivity = cm_train[1, 1] / (cm_train[1, 0] + cm_train[1, 1])
print(sensitivity)

# Specificity: TN / (TN + FP)
specificity = cm_train[0, 0] / (cm_train[0, 0] + cm_train[0, 1])
print(specificity)

# Positive predictive value: TP / (TP + FP)
PPV = cm_train[1, 1] / (cm_train[1, 1] + cm_train[0, 1])
print(PPV)

# Negative predictive value: TN / (TN + FN)
NPV = cm_train[0, 0] / (cm_train[0, 0] + cm_train[1, 0])
print(NPV)

# Positive likelihood ratio; evaluates to inf (with a numpy warning) when
# specificity == 1, as in the training confusion matrix reported above
LRp = sensitivity / (1 - specificity)
print(LRp)

# Negative likelihood ratio
LRn = (1 - sensitivity) / specificity
print(LRn)

# AUC from the hard class labels
AUC_train = roc_auc_score(y_train, y_train_pred)
print(AUC_train)

# ROC-AUC from the predicted probability of the positive class
probs = best_clf.predict_proba(SX_train)
preds = probs[:, 1]
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)


# Predict the response for the held-out test dataset
y_pred = best_clf.predict(SX_test)
cm_test = confusion_matrix(y_test, y_pred)
print (cm_test)
#[190  31] [ 25   9]

test_report = classification_report(y_test, y_pred)
print (test_report)
# Example of output:
#                precision    recall  f1-score   support
#
#           0       0.88      0.86      0.87       221
#           1       0.23      0.26      0.24        34
#
#    accuracy                           0.78       255
#   macro avg       0.55      0.56      0.56       255
#weighted avg       0.80      0.78      0.79       255


# Fix: these two results were computed but silently discarded, and
# balanced_accuracy_score was never imported (NameError); the import is
# added at the top of the file and both values are now printed.
print(accuracy_score(y_test, y_pred))
print(balanced_accuracy_score(y_test, y_pred))

# Sensitivity (recall of the asthma class): TP / (TP + FN)
sensitivity = cm_test[1, 1] / (cm_test[1, 0] + cm_test[1, 1])
print(sensitivity)

# Specificity: TN / (TN + FP)
specificity = cm_test[0, 0] / (cm_test[0, 0] + cm_test[0, 1])
print(specificity)

# Positive predictive value: TP / (TP + FP)
PPV = cm_test[1, 1] / (cm_test[1, 1] + cm_test[0, 1])
print(PPV)

# Negative predictive value: TN / (TN + FN)
NPV = cm_test[0, 0] / (cm_test[0, 0] + cm_test[1, 0])
print(NPV)

# Positive likelihood ratio (inf, with a numpy warning, if specificity == 1)
LRp = sensitivity / (1 - specificity)
print(LRp)

# Negative likelihood ratio
LRn = (1 - sensitivity) / specificity
print(LRn)

# AUC from the hard class labels
AUC_test = roc_auc_score(y_test, y_pred)
print(AUC_test)

# ROC-AUC from the predicted probability of the positive class
probs = best_clf.predict_proba(SX_test)
preds = probs[:, 1]
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)