# This script develops machine learning models using the MLP, decision tree, random forest or KNN algorithms (need to specify in code) across all of the model training datasets
# This script should immediately follow the relevant scripts named: "Model_optimisation-strategy_training_test_data.txt", where:
#	"model" - refers to either the early-life (CAPE) or preschool (CAPP) models, 
# 	"optimisation-strategy" - refers to whether the training datasets had undergone imputation and oversampling (IO), imputation, oversampling and undersampling (IOU), 
#				  or were complete datasets (no missing data) with oversampling and/or undersampling applied (COU).
# 	The data objects generated to run this script are objects called:
#		data - contains all the different training datasets on which to develop the models
#		set - attribute specifying how many training datasets are in the object 'data' for model development
# 		X_test and y_test objects created from the test dataset - refers to the standardised test dataset which corresponds to the standardised training datasets. 
# Python version 3.6.8 was used 

# Set working directory
# BUG FIX: the original called os.chdir() before the "import os" line below
# in the import block had run, which raises NameError at startup. Import os
# here so the call works; the repeated import further down is harmless.
import os

# NOTE(review): "/../../" resolves to the filesystem root — presumably a
# placeholder to be replaced with the project directory; confirm before use.
os.chdir("/../../")

# Imports
import os
from collections import Counter
from time import time

# BUG FIX: matplotlib.use('Agg') was previously called without ever importing
# the top-level matplotlib package (only matplotlib.pyplot was imported, and
# only *after* the .use call), which raises NameError. The backend must be
# selected after "import matplotlib" and before importing pyplot.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Input data - should have been generated already from files named "Model_optimisation-strategy_training_test_data.txt" as detailed above. 

# Specify which algorithm to develop the models with.
# Valid values: 'MLP', 'DecisionTree', 'RandomForest', 'KNN' (these are the
# only values the loop below recognises; anything else leaves clf undefined).
models = ['KNN']

# Define places to store results: one row per training dataset, holding the
# grid-search outcome plus training- and test-set performance measures.
# (The original also assigned result_table=[] first — a dead assignment that
# was immediately overwritten; removed.)
result_table = pd.DataFrame(columns=['dataset','best_parameters', 'best_score', 'cv_scores', 'cv_accuracy', 'cv_sd', 'cm_train', 'train_report', 'tr_accuracy_score', 'tr_balanced_accuracy_score','tr_sensitivity', 'tr_specificity', 'tr_PPV', 'tr_NPV', 'tr_LRp', 'tr_LRn', 'tr_f1', 'AUC_train', 'ROCAUC_train', 'tr_PR_score', 'cm_test', 'test_report', 'accuracy_score', 'balanced_accuracy_score', 'sensitivity', 'specificity', 'PPV', 'NPV', 'LRp', 'LRn', 'f1_test', 'AUC_test', 'ROCAUC_test', 'PR_score_test'])

	
for i in models:
	if i=='MLP':
		clf = MLPClassifier(max_iter=1000, random_state=123),
		hidden_layers=[(1,),(2,),(3,),(4,),(5,),(6,),(7,),(8,),(9,),(10,),(11,),(1,1),(2,2),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(9,9),(10,10),(11,11)]
		activation=['relu', 'identity', 'tanh', 'logistic']
		solver=['lbfgs', 'sgd', 'adam'] 
		alpha=[0.0000001,0.000001, 0.00001,0.0001,0.001,0.01,0.1,1,10]
		learning_rate=['constant', 'invscaling', 'adaptive'] 
		learning_rate_init=np.arange(0.1,1,0.1) 
		param_grid=dict(hidden_layer_sizes=hidden_layers, activation=activation, solver=solver, alpha=alpha, learning_rate=learning_rate, learning_rate_init=learning_rate_init)
	if i=='DecisionTree':
		clf = DecisionTreeClassifier(random_state=123)
		sample_split=np.arange(2,12,1)
		param_grid = {"max_depth": [1,2,3,4,5,6,7,8,7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,None],
              "splitter": ['best', 'random'],
              "max_features": ['log2', 'sqrt', 'auto', None],
              "min_samples_split": sample_split,
              "criterion": ["gini","entropy"]}
	if i=='RandomForest':
		clf = RandomForestClassifier(random_state=123)	
		n_estimators=np.arange(1,100,2)
		sample_split=np.arange(2,12,1)
		param_grid = {"max_depth": [2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,None],
              "n_estimators":n_estimators,
              "max_features": ['log2', 'sqrt', None],
              "min_samples_split": sample_split,
              "bootstrap": [True, False],
              "criterion": ["gini","entropy"]}			  
	if i=='KNN':
		clf = KNeighborsClassifier()
		k_range = np.arange(1, 80, 1)
		p_range=[1,2]
		weights=['uniform', 'distance']
		param_grid=dict(n_neighbors=k_range, p=p_range, weights=weights)
	# Define inner loop to run grid search over each oversampled dataset
	for j in set:
		dataset = data[j]
				# Split training data into features and outcome 
		X_train = dataset.drop(['Study_ID','Asthma_10YR'], axis=1)
		y_train = dataset['Asthma_10YR']
		print('Original dataset shape %s' % Counter(y_train))
		dataset_dimensions = Counter(y_train)
		# Perform grid search 
		grid_search = GridSearchCV(clf, scoring='balanced_accuracy', param_grid=param_grid, cv=StratifiedKFold(5), n_jobs=16)
		start = time()
		grid_search.fit(X_train, y_train)
		GStime = (time() - start)
		# Get Grid search results
		Candidates = len(grid_search.cv_results_['params'])
		print(Candidates)
		# best parameters
		best_parameters = grid_search.best_params_
		print(best_parameters)
		best_score = grid_search.best_score_
		print(best_score)
		results=pd.DataFrame(grid_search.cv_results_)
		filename = "/../../"+str(i)+"_grid_search_results_optimisation-strategy_model_training_dataset_"+str(j)+".csv"
		results.to_csv(filename,index=False)
		# Build best model
		if i=='MLP':
			best_clf = MLPClassifier(activation=best_parameters['activation'], alpha=best_parameters['alpha'], hidden_layer_sizes=best_parameters['hidden_layer_sizes'], learning_rate=best_parameters['learning_rate'], learning_rate_init=best_parameters['learning_rate_init'], solver=best_parameters['solver'], max_iter=1000, random_state=123)
		if i=='DecisionTree':
			best_clf = DecisionTreeClassifier(criterion=best_parameters['criterion'], max_depth=best_parameters['max_depth'], max_features=best_parameters['max_features'], min_samples_split=best_parameters['min_samples_split'], splitter=best_parameters['splitter'], random_state=123)
		if i=='RandomForest':
			best_clf = RandomForestClassifier(bootstrap=best_parameters['bootstrap'], criterion=best_parameters['criterion'], max_depth=best_parameters['max_depth'], max_features=best_parameters['max_features'], min_samples_split=best_parameters['min_samples_split'], n_estimators=best_parameters['n_estimators'], random_state=123)
		if i=='KNN':
			best_clf = KNeighborsClassifier(n_neighbors=best_parameters['n_neighbors'], p=best_parameters['p'], weights=best_parameters['weights'])
		# Print performance measures
		# Train the model using the training sets
		cv_scores = cross_val_score(best_clf, X_train, y_train, n_jobs=16, cv=StratifiedKFold(5))
		cv_accuracy = cv_scores.mean()
		cv_sd= (cv_scores.std())     
		# Fit optimised model
		best_clf.fit(X_train,y_train)
		# Fit optimised model
		best_clf.fit(X_train,y_train)
		### Training set Performance
		y_train_pred = best_clf.predict(X_train)
		cm_train = confusion_matrix(y_train, y_train_pred)
		print(cm_train)
		train_report = classification_report(y_train, y_train_pred)
		print (train_report)
		tr_accuracy_score = accuracy_score(y_train, y_train_pred)
		tr_balanced_accuracy_score = balanced_accuracy_score(y_train, y_train_pred)
		tr_sensitivity =  cm_train[1,1]/(cm_train[1,0]+cm_train[1,1])								
		print(tr_sensitivity)
		tr_specificity = cm_train[0,0]/(cm_train[0,0]+cm_train[0,1])									
		print(tr_specificity)
		tr_PPV = cm_train[1,1]/(cm_train[1,1]+cm_train[0,1])	
		print(tr_PPV)
		tr_NPV = cm_train[0,0]/(cm_train[0,0]+cm_train[1,0])
		print(tr_NPV)
		tr_LRp = tr_sensitivity/(1-tr_specificity)
		print(tr_LRp)
		tr_LRn = (1-tr_sensitivity)/tr_specificity
		print(tr_LRn)
		tr_f1 = f1_score(y_train, y_train_pred)
		print(tr_f1)
		#  AUC: 
		AUC_train = roc_auc_score(y_train, y_train_pred)
		print(AUC_train)
		# Test model
		tr_probs = best_clf.predict_proba(X_train)
		tr_preds = tr_probs[:,1]
		ROCAUC_train = roc_auc_score(y_train, tr_preds)
		print(ROCAUC_train)
		tr_PR_score = average_precision_score(y_train, tr_preds)
		print(tr_PR_score)
		#Predict the response for test dataset
		y_pred = best_clf.predict(X_test)
		cm_test = confusion_matrix(y_test, y_pred)	
		print (cm_test)
		test_report = classification_report(y_test, y_pred)
		print (test_report)
		accuracy_test = accuracy_score(y_test, y_pred)
		balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred)
		sensitivity =  cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])								
		print(sensitivity)
		specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])									
		print(specificity)
		PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])	
		print(PPV)
		NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
		print(NPV)
		LRp = sensitivity/(1-specificity)
		print(LRp)
		LRn = (1-sensitivity)/specificity
		print(LRn)
		f1 = f1_score(y_test, y_pred)
		print(f1)
		AUC_test = roc_auc_score(y_test, y_pred)
		print(AUC_test)
		probs = best_clf.predict_proba(X_test)
		preds = probs[:,1]
		ROCAUC_test = roc_auc_score(y_test, preds)
		print(ROCAUC_test)
		PR_score_test = average_precision_score(y_test, preds)
		print(PR_score_test)
		# Define a result table as a DataFrame
		result_table = result_table.append({'dataset':Counter(y_train),
											'best_parameters':best_parameters,
											'best_score': best_score,
											'cv_scores':cv_scores,
											'cv_accuracy':cv_accuracy,
											'cv_sd':cv_sd,
											'cm_train':cm_train,
											'train_report':train_report, 
											'tr_accuracy_score':tr_accuracy_score,
											'tr_balanced_accuracy_score':tr_balanced_accuracy_score,
											'tr_sensitivity':tr_sensitivity,
											'tr_specificity':tr_specificity, 
											'tr_PPV':tr_PPV, 
											'tr_NPV':tr_NPV, 
											'tr_LRp':tr_LRp, 
											'tr_LRn':tr_LRn, 
											'tr_f1':tr_f1,
											'AUC_train':AUC_train, 
											'ROCAUC_train':ROCAUC_train,
											'tr_PR_score':tr_PR_score,
											'cm_test':cm_test, 
											'test_report':test_report, 
											'accuracy_score':accuracy_test, 
											'balanced_accuracy_score':balanced_accuracy_test, 
											'sensitivity':sensitivity, 
											'specificity':specificity, 
											'PPV':PPV, 
											'NPV':NPV, 
											'LRp':LRp, 
											'LRn':LRn, 
											'f1_test':f1,
											'AUC_test':AUC_test, 
											'ROCAUC_test':ROCAUC_test,
											'PR_score_test':PR_score_test}, ignore_index=True)
											
# Write the accumulated results (one row per training dataset) to disk
result_table.to_csv("Model_optimisation-strategy_results.csv")