# This script performs a random hyperparameter search for the linear SVM algorithm for all of the model training datasets
# This script should immediately follow the relevant scripts named: "Model_optimisation-strategy_training_test_data.txt", where:
#	"model" - refers to either the early-life (CAPE) or preschool (CAPP) models, 
# 	"optimisation-strategy" - refers to whether the training datasets had undergone imputation and oversampling (IO), imputation, oversampling and undersampling (IOU), 
#				  or were complete datasets (no missing data) with oversampling and/or undersampling applied (COU).
# 	The data objects generated to run this script are objects called:
#		data - contains all the different training datasets on which to develop the models
#		set - attribute specifying how many training datasets are in the object 'data' for model development
# 		X_test and y_test objects created from the test dataset - refers to the standardised test dataset which corresponds to the standardised training datasets. 
# Python version 3.6.8 was used 

# Set working directory.
# BUG FIX: os.chdir was called before `import os` (the import block appears
# further down the file), which raises NameError at runtime. Import os here
# so this line is self-contained; the later `import os` is a harmless no-op.
# NOTE(review): "/../../" looks like a placeholder path — replace with the
# real project directory before running.
import os
os.chdir("/../../")

# Imports
# Standard library
import os
from collections import Counter
from time import time

# Third-party
import numpy as np
import pandas as pd

# scikit-learn
from sklearn import metrics
from sklearn.metrics import (accuracy_score, average_precision_score,
                             balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV,
                                     StratifiedKFold, cross_val_score,
                                     train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Classifiers
from sklearn.svm import SVC

# Input data - should have been generated already from files named "Model_optimisation-strategy_training_test_data.txt" as detailed above. 

# Specify which algorithm to develop the models with
models = ['Linear_SVM']

# Define place to store results: one row per (classifier, training dataset)
RS_results = pd.DataFrame(columns=['Classifier', 'dataset', 'parameters', 'score'])


for i in models:
    if i == 'Linear_SVM':
        # Linear-kernel SVM; probability=True enables predict_proba,
        # fixed random_state for reproducibility.
        clf = SVC(kernel='linear', probability=True, random_state=123)
        # Candidate values for the regularisation strength C, spaced
        # log-uniformly over 1e-3 .. 1e2.
        C_range = np.logspace(-3, 2, 100)
        param_grid = dict(C=C_range)
    # Inner loop: run the randomised search over each training dataset.
    # NOTE(review): `data` and `set` are created by the preceding
    # "Model_optimisation-strategy_training_test_data" script; `set` shadows
    # the builtin but cannot be renamed here without changing that script.
    for j in set:
        dataset = data[j]
        # Split training data into features and outcome
        X_train = dataset.drop(['Study_ID', 'Asthma_10YR'], axis=1)
        y_train = dataset['Asthma_10YR']
        print('Original dataset shape %s' % Counter(y_train))
        # Perform randomized search, scored on balanced accuracy with
        # stratified 5-fold cross-validation.
        random_search = RandomizedSearchCV(clf, scoring='balanced_accuracy',
                                           param_distributions=param_grid,
                                           n_iter=100, n_jobs=-1,
                                           cv=StratifiedKFold(5), random_state=123)
        start = time()
        random_search.fit(X_train, y_train)
        RStime = (time() - start)
        best_parameters = random_search.best_params_
        print(best_parameters)
        best_score = random_search.best_score_
        print(best_score)
        # Append the best result for this dataset to the summary table.
        # BUG FIX: the key was 'Classifiers', which did not match the
        # 'Classifier' column defined above — pandas silently added an extra
        # 'Classifiers' column and left 'Classifier' as NaN in the saved CSV.
        RS_results = RS_results.append({'Classifier': i,
                                        'dataset': Counter(y_train),
                                        'parameters': best_parameters,
                                        'score': best_score}, ignore_index=True)
        # Save the full cross-validation results for this dataset.
        results = pd.DataFrame(random_search.cv_results_)
        filename = "/../../" + str(i) + "_random_search_results_Model_optimisation-strategy_training_dataset_" + str(j) + ".csv"
        results.to_csv(filename, index=False)

RS_results.to_csv("Model_optimisation-strategy_lSVM_RSresults.csv")
									