# This script develops machine learning models using the Mixed Naive Bayes algorithm across all of the model training datasets
# This script should immediately follow the relevant scripts named: "Model_optimisation-strategy_training_test_data.txt", where:
#	"model" - refers to either the early-life (CAPE) or preschool (CAPP) models, 
# 	"optimisation-strategy" - refers to whether the training datasets had undergone imputation and oversampling (IO), imputation, oversampling and undersampling (IOU), 
#				  or were complete datasets (no missing data) with oversampling and/or undersampling applied (COU).
# 	The data objects generated to run this script are objects called:
#		data - contains all the different training datasets on which to develop the models
#		set - attribute specifying how many training datasets are in the object 'data' for model development
# 		X_test and y_test objects created from the test dataset - refers to the standardised test dataset which corresponds to the standardised training datasets. 
# Python version 3.6.8 was used 

# Imports
# NOTE(review): in the original script os.chdir() ran before "import os" and
# matplotlib.use('Agg') ran before matplotlib was imported — both raised
# NameError. Imports now come first; the working directory is set afterwards.
import os
from collections import Counter
from time import time

import matplotlib
matplotlib.use('Agg')  # select the non-interactive backend before pyplot is imported
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import (accuracy_score, average_precision_score,
                             balanced_accuracy_score, classification_report,
                             confusion_matrix, f1_score, roc_auc_score)
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_val_score, train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Classifiers
from mixed_naive_bayes import MixedNB

# Set working directory
# NOTE(review): "/../../" resolves to the filesystem root — presumably a
# placeholder to be replaced with the project directory; confirm before running.
os.chdir("/../../")

# Input data - should have been generated already from files named "Model_optimisation-strategy_training_test_data.txt" as detailed above. 

# Define place to store results — one row of metrics per training dataset.
# (The original also assigned result_table=[] first; that value was immediately
# overwritten, so the dead assignment has been removed.)
result_table = pd.DataFrame(columns=['dataset','best_parameters', 'best_score', 'cv_scores', 'cv_accuracy', 'cv_sd', 'cm_train', 'train_report', 'tr_accuracy_score', 'tr_balanced_accuracy_score','tr_sensitivity', 'tr_specificity', 'tr_PPV', 'tr_NPV', 'tr_LRp', 'tr_LRn', 'tr_f1', 'AUC_train', 'ROCAUC_train', 'tr_PR_score', 'cm_test', 'test_report', 'accuracy_score', 'balanced_accuracy_score', 'sensitivity', 'specificity', 'PPV', 'NPV', 'LRp', 'LRn', 'f1_test', 'AUC_test', 'ROCAUC_test', 'PR_score_test'])

# Main loop: for each training dataset, fit a Mixed Naive Bayes model and
# record cross-validation, training-set, and test-set performance.
# NOTE(review): 'set' and 'data' are created by the preceding
# "Model_optimisation-strategy_training_test_data" script; 'set' shadows the
# builtin of the same name and is assumed to be an iterable of dataset
# indices — confirm against that script.
for j in set:
    dataset = data[j]
    # Split training data into features and outcome
    X_train = dataset.drop(['Study_ID','Asthma_10YR'], axis=1)
    y_train = dataset['Asthma_10YR']
    print('Original dataset shape %s' % Counter(y_train))
    # Build best model. The indices list the categorical feature columns for
    # the preschool (CAPP) model; for the early-life (CAPE) model this line
    # should be: best_clf = MixedNB(categorical_features=[4,5,6,7])
    best_clf = MixedNB(categorical_features=[5,6,7,8,9,10,11])
    # 5-fold stratified cross-validation on the training set
    cv_scores = cross_val_score(best_clf, X_train, y_train, n_jobs=16, cv=StratifiedKFold(5))
    cv_accuracy = cv_scores.mean()
    cv_sd = cv_scores.std()
    # Fit the model on the full training set.
    # (The original script called .fit() twice in a row; one fit is
    # sufficient and produces the identical model.)
    best_clf.fit(X_train, y_train)

    ### Training set performance
    y_train_pred = best_clf.predict(X_train)
    # Confusion matrix layout: rows = true class, columns = predicted class
    cm_train = confusion_matrix(y_train, y_train_pred)
    print(cm_train)
    train_report = classification_report(y_train, y_train_pred)
    print(train_report)
    tr_accuracy_score = accuracy_score(y_train, y_train_pred)
    tr_balanced_accuracy_score = balanced_accuracy_score(y_train, y_train_pred)
    tr_sensitivity = cm_train[1,1]/(cm_train[1,0]+cm_train[1,1])
    print(tr_sensitivity)
    tr_specificity = cm_train[0,0]/(cm_train[0,0]+cm_train[0,1])
    print(tr_specificity)
    tr_PPV = cm_train[1,1]/(cm_train[1,1]+cm_train[0,1])
    print(tr_PPV)
    tr_NPV = cm_train[0,0]/(cm_train[0,0]+cm_train[1,0])
    print(tr_NPV)
    # Likelihood ratios. NOTE(review): LR+ divides by (1 - specificity) and
    # LR- by specificity, so a perfect or degenerate classifier raises
    # ZeroDivisionError here — that signals a problem worth inspecting.
    tr_LRp = tr_sensitivity/(1-tr_specificity)
    print(tr_LRp)
    tr_LRn = (1-tr_sensitivity)/tr_specificity
    print(tr_LRn)
    # F1 on the training set (the 'tr_f1' column existed in the results table
    # but was never filled in the original script)
    tr_f1 = f1_score(y_train, y_train_pred)
    print(tr_f1)
    # AUC computed from hard class labels
    AUC_train = roc_auc_score(y_train, y_train_pred)
    print(AUC_train)
    # ROC AUC / average precision from predicted probabilities of class 1
    tr_probs = best_clf.predict_proba(X_train)
    tr_preds = tr_probs[:,1]
    ROCAUC_train = roc_auc_score(y_train, tr_preds)
    print(ROCAUC_train)
    tr_PR_score = average_precision_score(y_train, tr_preds)
    print(tr_PR_score)

    ### Test set performance (X_test / y_test come from the preceding script)
    y_pred = best_clf.predict(X_test)
    cm_test = confusion_matrix(y_test, y_pred)
    print(cm_test)
    test_report = classification_report(y_test, y_pred)
    print(test_report)
    accuracy_test = accuracy_score(y_test, y_pred)
    balanced_accuracy_test = balanced_accuracy_score(y_test, y_pred)
    sensitivity = cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
    print(sensitivity)
    specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
    print(specificity)
    PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
    print(PPV)
    NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
    print(NPV)
    LRp = sensitivity/(1-specificity)
    print(LRp)
    LRn = (1-sensitivity)/specificity
    print(LRn)
    # F1 on the test set (fills the previously-empty 'f1_test' column)
    f1_test = f1_score(y_test, y_pred)
    print(f1_test)
    AUC_test = roc_auc_score(y_test, y_pred)
    print(AUC_test)
    probs = best_clf.predict_proba(X_test)
    preds = probs[:,1]
    ROCAUC_test = roc_auc_score(y_test, preds)
    print(ROCAUC_test)
    PR_score_test = average_precision_score(y_test, preds)
    print(PR_score_test)

    # Collect all metrics for this dataset into one row of the results table.
    # ('best_parameters' and 'best_score' stay empty: no grid search is run in
    # this script — the MixedNB configuration is fixed above.)
    row = {'dataset': Counter(y_train),
           'cv_scores': cv_scores,
           'cv_accuracy': cv_accuracy,
           'cv_sd': cv_sd,
           'cm_train': cm_train,
           'train_report': train_report,
           'tr_accuracy_score': tr_accuracy_score,
           'tr_balanced_accuracy_score': tr_balanced_accuracy_score,
           'tr_sensitivity': tr_sensitivity,
           'tr_specificity': tr_specificity,
           'tr_PPV': tr_PPV,
           'tr_NPV': tr_NPV,
           'tr_LRp': tr_LRp,
           'tr_LRn': tr_LRn,
           'tr_f1': tr_f1,
           'AUC_train': AUC_train,
           'ROCAUC_train': ROCAUC_train,
           'tr_PR_score': tr_PR_score,
           'cm_test': cm_test,
           'test_report': test_report,
           'accuracy_score': accuracy_test,
           'balanced_accuracy_score': balanced_accuracy_test,
           'sensitivity': sensitivity,
           'specificity': specificity,
           'PPV': PPV,
           'NPV': NPV,
           'LRp': LRp,
           'LRn': LRn,
           'f1_test': f1_test,
           'AUC_test': AUC_test,
           'ROCAUC_test': ROCAUC_test,
           'PR_score_test': PR_score_test}
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # behaviourally-equivalent, forward-compatible replacement.
    result_table = pd.concat([result_table, pd.DataFrame([row])], ignore_index=True)

# Persist the full results table once all datasets have been processed
result_table.to_csv("Model_optimisation-strategy_MixedNB_results.csv")