# This script performs feature selection using RFE and Boruta to identify the features to include in the CAPE and CAPP models
# Python version 3.6.8 is used

# Imports
import os
from collections import Counter

import pandas as pd
import numpy as np
# matplotlib itself must be imported before the backend can be selected, and the
# backend must be selected before pyplot is imported.
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import imblearn
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler,FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from boruta import BorutaPy


# Set working directory
# NOTE(review): "/../../.." looks like a redacted placeholder path - TODO confirm and
# point this at the directory that holds the input CSV files before running.
# Every read_csv / to_csv / savefig call in this script uses a bare file name, so
# all inputs and outputs are resolved relative to this directory.
os.chdir("/../../..")

###################################################
### Feature Selection - Early life (CAPE) Model ###
###################################################

###########
### RFE ###
###########
# Load the cleaned, unstandardised early life dataset
# (data found in IOWBC_data.xlsx, sheet: "Early life data")
data_2YR = pd.read_csv("Early_life_QC_1368IDs.csv", index_col=False)
# Discard the unnamed column that the CSV carries alongside the real variables
data_2YR = data_2YR.drop(columns=['Unnamed: 0'])
# 1368 Ids, 44 columns

# Complete-case analysis: keep only individuals without any missing value
complete_data_2YR = data_2YR.dropna()

# Candidate features in the order used for feature selection. The four variables
# that are standardised later in the script are listed first.
early_life_features = [
    'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1',
    'Mat_smoking_birth', 'Pat_smoking_birth',
    'Dog_birth', 'Cat_birth', 'Furry_pet_birth',
    'Mat_asthma', 'Mat_eczema', 'Mat_hayfever',
    'Pat_asthma', 'Pat_eczema', 'Pat_hayfever',
    'Sex', 'Prematurity', 'Delivery',
    'Total.Bf.duration', 'Exclusive.Bf.duration', 'Parity',
    'Season_autumn', 'Season_winter', 'Season_spring', 'Season_summer',
    'Wheeze_2YR', 'Wheeze_without_cold_2YR', 'Cough_2YR', 'Nasal_symp_2YR',
    'Chest_infection_2YR', 'Noct_symp_2YR', 'Eczema_2YR', 'Hayfever_2YR',
    'Atopy_2YR', 'Monosensitisation_2YR', 'Polysensitisation_2YR',
    'Smoking_2YR', 'Dog_2YR', 'Cat_2YR', 'Furry_pet_2YR',
    'Farm_early_life', 'SES',
]

# Separate features and outcome for feature selection: the outcome is the last
# column; the features come from the columns between the first (presumably the
# participant ID - TODO confirm) and the last.
outcome_position = complete_data_2YR.shape[1] - 1
Y1 = complete_data_2YR.iloc[:, outcome_position]
X1 = complete_data_2YR.iloc[:, 1:outcome_position]
X1 = X1[early_life_features]
	   
# Define parameters for RFE - default settings.
# These must exist before the classifier below is built; the same values are
# defined again for the preschool model later in the script.
best_param1 = {'bootstrap': True, 'criterion': 'gini', 'max_depth': None,
               'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}

# Define RFE model: a balanced random forest used as the ranking estimator
bclf = BalancedRandomForestClassifier(
    n_estimators=best_param1["n_estimators"],
    max_depth=best_param1["max_depth"],
    min_samples_split=best_param1["min_samples_split"],
    max_features=best_param1["max_features"],
    random_state=123,
)

# Recursive feature elimination, removing one feature per step and scoring each
# subset size by 5-fold stratified cross-validated balanced accuracy.
# random_state is deliberately NOT passed to StratifiedKFold: without
# shuffle=True it has no effect on the folds, and recent scikit-learn releases
# raise a ValueError for that combination. The folds stay deterministic.
rfecv = RFECV(bclf, step=1, cv=StratifiedKFold(n_splits=5), scoring='balanced_accuracy')

# Standardise the four continuous variables and pass every other column through
# unchanged. ColumnTransformer outputs the scaled columns first, followed by the
# passthrough columns in their original order.
pipeline = Pipeline([
    ('standardising', Pipeline([
        ('select', ColumnTransformer(
            [('scale', StandardScaler(), ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1'])],
            remainder='passthrough',
        )),
    ])),
    ('bclf', rfecv),
])


# apply RFE to data
fit = pipeline.fit(X1, Y1)

# Features kept by RFECV are the ones ranked 1. ranking_ follows the column order
# seen by RFECV (scaled columns first, then the passthrough columns), which
# matches X1 because X1 lists the four scaled columns first.
# Iterating over the actual columns avoids a hard-coded feature count.
list2 = [name for name, rank in zip(X1.columns.values, rfecv.ranking_) if rank == 1]

print("Optimal number of features : %d" % rfecv.n_features_)
# 8 features

# With step=1, grid_scores_[k - 1] is the cross-validated score for k features,
# so the score of the selected subset is looked up from n_features_ rather than
# from a hard-coded index.
# NOTE(review): grid_scores_ was removed in scikit-learn 1.2 (replaced by
# cv_results_) - this script targets the older API.
print("Accuracy: \n", rfecv.grid_scores_[rfecv.n_features_ - 1])
# accuracy = 0.6488095238095237

print("Feature Selected: \n",list2)
#['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']

# Plot number of features against cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation balanced accuracy score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.savefig('Feature_selection_balancedRFE_complete_early_life.pdf')

# Feature importances
# Standardise the first four (continuous) columns of X1 and re-attach the
# remaining columns so the scaled data keeps its column names.
scaler = StandardScaler()
cont = pd.DataFrame(scaler.fit_transform(X1.iloc[:, 0:4]), columns=X1.columns[:4])
cat = X1.iloc[:, 4:]
# cont has a fresh 0..n-1 index, so cat's index is reset to line the rows up
SX1 = pd.concat([cont, cat.reset_index(drop=True)], axis=1)

# Importances of the features retained by RFECV. The feature names are read from
# the support mask of this very fit, so names and importances always have the
# same length and order (rather than relying on a list built from an earlier fit).
rfecv.fit(SX1, Y1)
importances = rfecv.estimator_.feature_importances_
FI = pd.DataFrame(
    {'Feature': SX1.columns[rfecv.support_], 'Feature importance (Gini)': importances},
    columns=['Feature', 'Feature importance (Gini)'],
)
FI.to_csv("Feature_selection_balancedRFECV_final_feature_importances_early_life.csv")

# Importances of all candidate features from the balanced random forest alone
bclf.fit(SX1, Y1)
importances = bclf.feature_importances_
FI = pd.DataFrame(
    {'Feature': SX1.columns, 'Feature importance (Gini)': importances},
    columns=['Feature', 'Feature importance (Gini)'],
)
FI.to_csv("Feature_selection_balancedRF_all_feature_importances_early_life.csv")

##############
### boruta ###
##############
# Boruta wrapped around the balanced random forest defined above
boruta = BorutaPy(bclf, n_estimators=100, verbose=2, random_state=123)

# Scale the four continuous variables; every other column is passed through as-is
scale_continuous = ColumnTransformer(
    [('scale', StandardScaler(), ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1'])],
    remainder='passthrough',
)
standardising = Pipeline([('select', scale_continuous)])

# Standardisation followed by Boruta as the final step
pipeline = Pipeline([
    ('standardising', standardising),
    ('bclf', boruta),
])

# apply Boruta to data
fit = pipeline.fit(X1, Y1)

# No confirmed or tentative features came up

##################################################
### Feature Selection - Preschool (CAPP) Model ###
##################################################

###########
### RFE ###
###########

# Load the cleaned, unstandardised preschool dataset
# (data found in IOWBC_data.xlsx, sheet: "Preschool data")
data_4YR = pd.read_csv("Preschool_QC_1368IDs.csv", index_col=False)
# Discard the unnamed column that the CSV carries alongside the real variables
data_4YR = data_4YR.drop(columns=['Unnamed: 0'])
# 1368 Ids, 59 columns

# Complete-case analysis: only individuals with complete data for all candidate
# features were included in the feature selection
complete_data_4YR = data_4YR.dropna()

# Candidate features, ordered so continuous variables are listed first
preschool_features = [
    'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4',
    'Mat_smoking_birth', 'Pat_smoking_birth',
    'Dog_birth', 'Cat_birth', 'Furry_pet_birth',
    'Mat_asthma', 'Mat_eczema', 'Mat_hayfever',
    'Pat_asthma', 'Pat_eczema', 'Pat_hayfever',
    'Sex', 'Prematurity', 'Delivery',
    'Total.Bf.duration', 'Exclusive.Bf.duration', 'Parity',
    'Season_autumn', 'Season_winter', 'Season_spring', 'Season_summer',
    'Wheeze_2YR', 'Wheeze_without_cold_2YR', 'Cough_2YR', 'Nasal_symp_2YR',
    'Chest_infection_2YR', 'Noct_symp_2YR', 'Eczema_2YR', 'Hayfever_2YR',
    'Atopy_2YR', 'Monosensitisation_2YR', 'Polysensitisation_2YR',
    'Smoking_2YR', 'Dog_2YR', 'Cat_2YR', 'Furry_pet_2YR',
    'Wheeze_4YR', 'Wheeze_without_cold_4YR', 'Cough_4YR', 'Nasal_symp_4YR',
    'Noct_Symp_4YR', 'Eczema_4YR', 'Hayfever_4YR', 'Atopy_4YR',
    'Monosensitisation_4YR', 'Polysensitisation_4YR', 'Smoking_4YR',
    'Dog_4YR', 'Cat_4YR', 'Furry_pet_4YR',
    'Farm_early_life', 'SES',
]

# Separate features and outcome for feature selection: the outcome is the last
# column; the features come from the columns between the first (presumably the
# participant ID - TODO confirm) and the last.
outcome_position = complete_data_4YR.shape[1] - 1
Y = complete_data_4YR.iloc[:, outcome_position]
X = complete_data_4YR.iloc[:, 1:outcome_position]
X = X[preschool_features]
	   

# Define parameters for RFE - used default settings
best_param1 = {'bootstrap': True, 'criterion': 'gini', 'max_depth': None,
               'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}

# Define RFE model: a balanced random forest used as the ranking estimator
bclf = BalancedRandomForestClassifier(
    n_estimators=best_param1["n_estimators"],
    max_depth=best_param1["max_depth"],
    min_samples_split=best_param1["min_samples_split"],
    max_features=best_param1["max_features"],
    random_state=123,
)

# Recursive feature elimination, removing one feature per step and scoring each
# subset size by 5-fold stratified cross-validated balanced accuracy.
# - The data are supplied at fit time; RFECV's constructor has no X / y
#   parameters, so passing them there raises a TypeError.
# - random_state is deliberately NOT passed to StratifiedKFold: without
#   shuffle=True it has no effect on the folds, and recent scikit-learn releases
#   raise a ValueError for that combination. The folds stay deterministic.
rfecv = RFECV(bclf, step=1, cv=StratifiedKFold(n_splits=5), scoring='balanced_accuracy')

# Standardise the five continuous variables and pass every other column through
# unchanged. ColumnTransformer outputs the scaled columns first, followed by the
# passthrough columns in their original order.
estimators = Pipeline([
    ('standardising', Pipeline([
        ('select', ColumnTransformer(
            [('scale', StandardScaler(), ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4'])],
            remainder='passthrough',
        )),
    ])),
    ('bclf', rfecv),
])


			  
# apply RFE to data
fit = estimators.fit(X, Y)

# Features kept by RFECV are the ones ranked 1. ranking_ follows the column order
# seen by RFECV (scaled columns first, then the passthrough columns), which
# matches X because X lists the five scaled columns first.
# The result is stored under a descriptive name so the builtin `list` is not
# shadowed, and the loop runs over the actual columns instead of a fixed count.
selected_features = [name for name, rank in zip(X.columns.values, rfecv.ranking_) if rank == 1]

print("Optimal number of features : %d" % rfecv.n_features_)
# 12 features

# With step=1, grid_scores_[k - 1] is the cross-validated score for k features,
# so the score of the selected subset is looked up from n_features_ rather than
# from a hard-coded index.
# NOTE(review): grid_scores_ was removed in scikit-learn 1.2 (replaced by
# cv_results_) - this script targets the older API.
print("Accuracy: \n", rfecv.grid_scores_[rfecv.n_features_ - 1])
# accuracy = 0.7493123196248196

print("Feature Selected: \n",selected_features)
#'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES'

# Plot number of features against cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validation balanced accuracy score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.savefig('Feature_selection_balancedRFE_complete_preschool.pdf')


# Feature importances
# Standardise the first five (continuous) columns of X and re-attach the
# remaining columns so the scaled data keeps its column names.
scaler = StandardScaler()
cont = pd.DataFrame(scaler.fit_transform(X.iloc[:, 0:5]), columns=X.columns[:5])
cat = X.iloc[:, 5:]
# cont has a fresh 0..n-1 index, so cat's index is reset to line the rows up
SX = pd.concat([cont, cat.reset_index(drop=True)], axis=1)

# Importances of the features retained by RFECV. The feature names are read from
# the support mask of this very fit, so names and importances always have the
# same length and order.
rfecv.fit(SX, Y)
importances = rfecv.estimator_.feature_importances_
FI = pd.DataFrame(
    {'Feature': SX.columns[rfecv.support_], 'Feature importance (Gini)': importances},
    columns=['Feature', 'Feature importance (Gini)'],
)
FI.to_csv("Feature_selection_balancedRFECV_final_feature_importances_preschool.csv")

# Importances of all candidate features from the balanced random forest alone
bclf.fit(SX, Y)
importances = bclf.feature_importances_
FI = pd.DataFrame(
    {'Feature': SX.columns, 'Feature importance (Gini)': importances},
    columns=['Feature', 'Feature importance (Gini)'],
)
FI.to_csv("Feature_selection_balancedRF_all_feature_importances_preschool.csv")

##############
### boruta ###
##############
# Boruta wrapped around the balanced random forest defined above
boruta = BorutaPy(bclf, n_estimators=100, verbose=2, random_state=123)

# Standardise the five continuous variables and pass every other column through
# unchanged; Boruta is the final step of the pipeline.
pipeline = Pipeline([
    ('standardising', Pipeline([
        ('select', ColumnTransformer(
            [('scale', StandardScaler(), ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4'])],
            remainder='passthrough',
        )),
    ])),
    ('bclf', boruta),
])

# apply Boruta to data
fit = pipeline.fit(X, Y)

# Check selected features =1 (only displayed when run interactively)
boruta.support_

# check ranking of features
# Boruta was fitted on the reordered feature matrix X (scaled columns first, then
# the passthrough columns, i.e. the column order of X), so the ranking must be
# labelled with X.columns. Labelling it with the original column order of
# complete_data_4YR would attach ranks to the wrong feature names whenever the
# CSV column order differs from the order of X.
boruta_ranking = pd.DataFrame(
    {'Feature': X.columns, 'Boruta_ranking': boruta.ranking_},
    columns=['Feature', 'Boruta_ranking'],
)

# call transform() on X to filter it down to selected features
# (transform only selects columns; it is given a plain array and kept under its
# own name so it is not overwritten by the ranking table below)
X_selected = boruta.transform(X.values)

# Ranking table restricted to the confirmed features (rank 1)
X_filtered = boruta_ranking.loc[boruta_ranking['Boruta_ranking'] <= 1]
print(X_filtered)
# Cough_4YR
# NOTE(review): the result above was recorded when names were taken from the
# original column order - re-run to confirm it with the aligned names.