# This script develops machine learning models using the SVM (RBF kernel) algorithm across all of the model training datasets
# The hyperparameter search ranges for the grid search performed in this script should be specified for each model as a reasonable window around the random search results obtained from the Model_development_rbfSVM_randomsearch.txt script.
# This script should immediately follow the relevant scripts named: "Model_optimisation-strategy_training_test_data.txt", where:
#	"model" - refers to either the early-life (CAPE) or preschool (CAPP) models, 
# 	"optimisation-strategy" - refers to whether the training datasets had undergone imputation and oversampling (IO), imputation, oversampling and undersampling (IOU), 
#				  or were complete datasets (no missing data) with oversampling and/or undersampling applied (COU).
# 	The data objects generated to run this script are objects called:
#		data - contains all the different training datasets on which to develop the models
#		set - attribute specifying how many training datasets are in the object 'data' for model development
# 		X_test and y_test objects created from the test dataset - refers to the standardised test dataset which corresponds to the standardised training datasets. 
# Python version 3.6.8 was used 

# Imports -- stdlib first, then third-party.
# NOTE: `os` must be imported before os.chdir() is called; the original
# script called os.chdir() before `import os`, which raises NameError.
import os
from collections import Counter
from time import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import (accuracy_score, average_precision_score,
                             balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Classifiers
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../../" resolves to the filesystem root -- replace with the
# actual project directory before running.
os.chdir("/../../")

# Input data - should have been generated already from files named "Model_optimisation-strategy_training_test_data.txt" as detailed above. 
# As mentioned, the grid search range for each model developed on each dataset should be specified based on the results from the random search results. 
# The following section of script illustrates the model development process for all models developed using the different complete training datasets with/without oversampling and undersampling:
#	i.e. following on from: "Model_COU_training_test_data.txt"

# Specify the algorithm and training dataset on which to develop the models - the grid search ranges were specified for each model and dataset.
# Model-name suffixes encode the dataset variant: the number is the degree of
# oversampling, "O" = oversampled, "U" = undersampled, "OU" = both.
models = ['RBF_SVM_0', 'RBF_SVM_25_O', 'RBF_SVM_50_O', 'RBF_SVM_100_O', 'RBF_SVM_150_O', 'RBF_SVM_200_O', 'RBF_SVM_250_O', 'RBF_SVM_300_O', 'RBF_SVM_0_U', 'RBF_SVM_25_OU', 'RBF_SVM_50_OU', 'RBF_SVM_100_OU', 'RBF_SVM_150_OU', 'RBF_SVM_200_OU', 'RBF_SVM_250_OU', 'RBF_SVM_300_OU']

# Define the place to store results: one row per model/dataset combination.
# (The original script first bound result_table to an empty list and then
# immediately rebound it to this DataFrame; the dead assignment is removed.)
result_table = pd.DataFrame(columns=['dataset', 'best_parameters', 'best_score', 'cv_scores', 'cv_accuracy', 'cv_sd', 'cm_train', 'train_report', 'tr_accuracy_score', 'tr_balanced_accuracy_score','tr_sensitivity', 'tr_specificity', 'tr_PPV', 'tr_NPV', 'tr_LRp', 'tr_LRn', 'tr_f1', 'AUC_train', 'ROCAUC_train', 'tr_PR_score', 'cm_test', 'test_report', 'accuracy_score', 'balanced_accuracy_score', 'sensitivity', 'specificity', 'PPV', 'NPV', 'LRp', 'LRn', 'f1_test', 'AUC_test', 'ROCAUC_test', 'PR_score_test'])

# ---------------------------------------------------------------------------
# Model development loop.
#
# Fixes relative to the original script:
#   * the 16 near-identical `if` branches are collapsed into a lookup table
#     (every branch built the same SVC and varied only the search window and
#     the dataset index);
#   * the duplicated `best_clf.fit(X_train, y_train)` call is removed (the
#     second fit simply retrained an identical model);
#   * result rows are collected in a list and turned into a DataFrame once at
#     the end, instead of calling the deprecated `DataFrame.append` per loop;
#   * the repeated confusion-matrix arithmetic is factored into a helper.
#
# Requires `models`, `data`, `X_test` and `y_test` to already exist, created
# by the preceding "Model_optimisation-strategy_training_test_data.txt" script.
# ---------------------------------------------------------------------------

# Per-model grid-search configuration:
#   model name -> ((C start, stop, step), (gamma start, stop, step), data index)
# Each tuple is passed straight to np.arange, so the values reproduce the
# original hand-tuned search windows exactly (including the uneven stops such
# as 60 vs 60.1 -- do not "tidy" these, they came from the random search).
GRID_CONFIG = {
    'RBF_SVM_0':      ((10.1, 60.1, 0.1),      (0.0001, 0.0101, 0.0001), 0),
    'RBF_SVM_25_O':   ((30.1, 80.1, 0.1),      (0.001, 0.101, 0.001),    1),
    'RBF_SVM_50_O':   ((30.1, 80.1, 0.1),      (0.001, 0.101, 0.001),    2),
    'RBF_SVM_100_O':  ((0.1, 50.1, 0.1),       (0.01, 1.01, 0.01),       3),
    'RBF_SVM_150_O':  ((0.1, 50.1, 0.1),       (0.01, 1.01, 0.01),       4),
    'RBF_SVM_200_O':  ((0.1, 50.1, 0.1),       (0.01, 1.01, 0.01),       5),
    'RBF_SVM_250_O':  ((0.1, 50.1, 0.1),       (0.01, 1.01, 0.01),       6),
    'RBF_SVM_300_O':  ((0.1, 50.1, 0.1),       (0.01, 1.01, 0.01),       7),
    'RBF_SVM_0_U':    ((0.0001, 0.05, 0.0001), (0.01, 1.01, 0.01),       8),
    'RBF_SVM_25_OU':  ((25.1, 75.1, 0.1),      (0.001, 0.101, 0.001),    9),
    'RBF_SVM_50_OU':  ((0.1, 50.1, 0.1),       (1.01, 2.01, 0.01),       10),
    'RBF_SVM_100_OU': ((0.1, 50.1, 0.1),       (0.01, 1.01, 0.01),       11),
    'RBF_SVM_150_OU': ((10.1, 60, 0.1),        (0.51, 1.51, 0.01),       12),
    'RBF_SVM_200_OU': ((10.1, 60, 0.1),        (0.51, 1.51, 0.01),       13),
    'RBF_SVM_250_OU': ((10.1, 60, 0.1),        (0.51, 1.51, 0.01),       14),
    'RBF_SVM_300_OU': ((0.01, 5, 0.01),        (0.51, 1.51, 0.01),       15),
}

# Column order of the exported results table (one row per model).
RESULT_COLUMNS = ['dataset', 'best_parameters', 'best_score', 'cv_scores',
                  'cv_accuracy', 'cv_sd', 'cm_train', 'train_report',
                  'tr_accuracy_score', 'tr_balanced_accuracy_score',
                  'tr_sensitivity', 'tr_specificity', 'tr_PPV', 'tr_NPV',
                  'tr_LRp', 'tr_LRn', 'tr_f1', 'AUC_train', 'ROCAUC_train',
                  'tr_PR_score', 'cm_test', 'test_report', 'accuracy_score',
                  'balanced_accuracy_score', 'sensitivity', 'specificity',
                  'PPV', 'NPV', 'LRp', 'LRn', 'f1_test', 'AUC_test',
                  'ROCAUC_test', 'PR_score_test']


def _confusion_measures(cm):
    """Derive clinical performance measures from a 2x2 confusion matrix.

    `cm` follows the sklearn convention: rows are true classes, columns are
    predicted classes, so cm[1, 1] is the true-positive count.  Returns
    (sensitivity, specificity, PPV, NPV, LR+, LR-).  Division by zero follows
    numpy semantics (inf/nan plus a RuntimeWarning), matching the behaviour
    of the original inline calculations.
    """
    sensitivity = cm[1, 1] / (cm[1, 0] + cm[1, 1])
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
    ppv = cm[1, 1] / (cm[1, 1] + cm[0, 1])
    npv = cm[0, 0] / (cm[0, 0] + cm[1, 0])
    lr_pos = sensitivity / (1 - specificity)
    lr_neg = (1 - sensitivity) / specificity
    return sensitivity, specificity, ppv, npv, lr_pos, lr_neg


result_rows = []  # one result dict per model, assembled into a DataFrame below
for i in models:
    C_args, gamma_args, data_index = GRID_CONFIG[i]
    clf = SVC(kernel='rbf', probability=True, random_state=123)
    param_grid = dict(C=np.arange(*C_args), gamma=np.arange(*gamma_args))
    dataset = data[data_index]

    # Split training data into features and outcome.
    X_train = dataset.drop(['Study_ID', 'Asthma_10YR'], axis=1)
    y_train = dataset['Asthma_10YR']
    print('Original dataset shape %s' % Counter(y_train))

    # Grid search over the model-specific window, scored on balanced accuracy.
    grid_search = GridSearchCV(clf, scoring='balanced_accuracy',
                               param_grid=param_grid,
                               cv=StratifiedKFold(5), n_jobs=16)
    start = time()
    grid_search.fit(X_train, y_train)
    GStime = time() - start  # wall-clock time of the search, for reference

    # Grid-search summary.
    Candidates = len(grid_search.cv_results_['params'])
    print(Candidates)
    best_parameters = grid_search.best_params_
    print(best_parameters)
    best_score = grid_search.best_score_
    print(best_score)

    # Persist the full grid-search results for this model/dataset.
    results = pd.DataFrame(grid_search.cv_results_)
    filename = "/../../" + str(i) + "_model_training_dataset_grid_search_results.csv"
    results.to_csv(filename, index=False)

    # Rebuild the best model and cross-validate it on the training data.
    best_clf = SVC(kernel='rbf', C=best_parameters['C'],
                   gamma=best_parameters['gamma'], probability=True,
                   random_state=123)
    cv_scores = cross_val_score(best_clf, X_train, y_train, n_jobs=16,
                                cv=StratifiedKFold(5))
    cv_accuracy = cv_scores.mean()
    cv_sd = cv_scores.std()

    # Fit the optimised model once (the original script fitted it twice;
    # the second fit produced an identical model and is dropped).
    best_clf.fit(X_train, y_train)

    ### Training-set performance
    y_train_pred = best_clf.predict(X_train)
    cm_train = confusion_matrix(y_train, y_train_pred)
    print(cm_train)
    train_report = classification_report(y_train, y_train_pred)
    print(train_report)
    tr_accuracy_score = accuracy_score(y_train, y_train_pred)
    tr_balanced_accuracy_score = balanced_accuracy_score(y_train, y_train_pred)
    (tr_sensitivity, tr_specificity, tr_PPV, tr_NPV,
     tr_LRp, tr_LRn) = _confusion_measures(cm_train)
    print(tr_sensitivity)
    print(tr_specificity)
    print(tr_PPV)
    print(tr_NPV)
    print(tr_LRp)
    print(tr_LRn)
    tr_f1 = f1_score(y_train, y_train_pred)
    print(tr_f1)
    # AUC from hard class labels ...
    AUC_train = roc_auc_score(y_train, y_train_pred)
    print(AUC_train)
    # ... and from the predicted probability of the positive class.
    tr_preds = best_clf.predict_proba(X_train)[:, 1]
    ROCAUC_train = roc_auc_score(y_train, tr_preds)
    print(ROCAUC_train)
    tr_PR_score = average_precision_score(y_train, tr_preds)
    print(tr_PR_score)

    ### Test-set performance (X_test/y_test come from the predecessor script)
    y_pred = best_clf.predict(X_test)
    cm_test = confusion_matrix(y_test, y_pred)
    print(cm_test)
    test_report = classification_report(y_test, y_pred)
    print(test_report)
    accuracy_test = accuracy_score(y_test, y_pred)
    balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred)
    (sensitivity, specificity, PPV, NPV,
     LRp, LRn) = _confusion_measures(cm_test)
    print(sensitivity)
    print(specificity)
    print(PPV)
    print(NPV)
    print(LRp)
    print(LRn)
    f1 = f1_score(y_test, y_pred)
    print(f1)
    AUC_test = roc_auc_score(y_test, y_pred)
    print(AUC_test)
    preds = best_clf.predict_proba(X_test)[:, 1]
    ROCAUC_test = roc_auc_score(y_test, preds)
    print(ROCAUC_test)
    PR_score_test = average_precision_score(y_test, preds)
    print(PR_score_test)

    # Record one result row for this model.
    result_rows.append({'dataset': Counter(y_train),
                        'best_parameters': best_parameters,
                        'best_score': best_score,
                        'cv_scores': cv_scores,
                        'cv_accuracy': cv_accuracy,
                        'cv_sd': cv_sd,
                        'cm_train': cm_train,
                        'train_report': train_report,
                        'tr_accuracy_score': tr_accuracy_score,
                        'tr_balanced_accuracy_score': tr_balanced_accuracy_score,
                        'tr_sensitivity': tr_sensitivity,
                        'tr_specificity': tr_specificity,
                        'tr_PPV': tr_PPV,
                        'tr_NPV': tr_NPV,
                        'tr_LRp': tr_LRp,
                        'tr_LRn': tr_LRn,
                        'tr_f1': tr_f1,
                        'AUC_train': AUC_train,
                        'ROCAUC_train': ROCAUC_train,
                        'tr_PR_score': tr_PR_score,
                        'cm_test': cm_test,
                        'test_report': test_report,
                        'accuracy_score': accuracy_test,
                        'balanced_accuracy_score': balanced_accuracy_test,
                        'sensitivity': sensitivity,
                        'specificity': specificity,
                        'PPV': PPV,
                        'NPV': NPV,
                        'LRp': LRp,
                        'LRn': LRn,
                        'f1_test': f1,
                        'AUC_test': AUC_test,
                        'ROCAUC_test': ROCAUC_test,
                        'PR_score_test': PR_score_test})

# Assemble all per-model rows into one table and export it.
result_table = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
result_table.to_csv("Model_optimisation-strategy_rbfSVM_GS_results.csv")
