# This script creates multiple datasets for training the CAPP model, with different degrees of oversampling applied to the complete dataset (individuals with missing data excluded).
# Oversampling performed using ADASYN
# Python version 3.6.8 is used

# Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import ADASYN

# Set working directory (all file names below are relative to it)
os.chdir("/../../..")

### PRESCHOOL MODEL DEVELOPMENT ###
# Load the cleaned, unstandardised 4YR dataset - data found in IOWBC_data.xlsx, sheet: "Preschool data"
data_4YR = pd.read_csv("Preschool_QC_1368IDs.csv", index_col=False)
# Discard the 'Unnamed: 0' column read from the CSV
# (presumably a row index written when the file was saved - TODO confirm)
data_4YR = data_4YR.drop('Unnamed: 0', axis=1)
# 1368 Ids, 59 columns

# Columns to keep: Study_ID, the 12 features selected from
# Balanced Random Forest RFE (5-fold CV), and the outcome Asthma_10YR
selected_columns = [
    'Study_ID',
    'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4',
    'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR',
    'Atopy_4YR', 'Polysensitisation_4YR', 'SES',
    'Asthma_10YR',
]
selected_data = data_4YR[selected_columns]

# Complete cases only: drop every row with a missing value in any selected column
complete_data_4YR = selected_data.dropna(axis=0, how='any')
# n=548

# Create training and test sets
# Outcome = Asthma_10YR; features = every other selected column
# (Study_ID is still part of the features at this point and is removed below)
complete_subset_outcome = complete_data_4YR['Asthma_10YR']
complete_subset_features = complete_data_4YR.drop('Asthma_10YR', axis='columns')

# Stratified, shuffled split with a fixed seed.
# test_size=0.333 holds out roughly one third of the individuals as the test set
# (about two thirds training / one third test, matching the counts recorded below).
split_settings = dict(test_size=0.333, shuffle=True, random_state=123,
                      stratify=complete_subset_outcome)
X_train, X_test, y_train, y_test = train_test_split(complete_subset_features,
                                                    complete_subset_outcome,
                                                    **split_settings)

# Training set (n=365, asthma=51, no asthma=314)	Test set (n=183, asthma=25, no asthma=158)

# Save the original train/test set IDs and delete the Study_ID column from both
# feature sets: pop() removes the column in place and returns it as a Series,
# which is then kept as a one-column dataframe
Train_IDs = X_train.pop('Study_ID').to_frame()
Test_IDs = X_test.pop('Study_ID').to_frame()

# Standardise the continuous variables in the training and test sets.
# The continuous variables are the first columns of X_train / X_test; the
# remaining columns are treated as categorical and left unchanged.
continuous_cols = ('Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4')
n_continuous = len(continuous_cols)

# The scaler is fitted on the training set only and then re-used for the test set
scaler = StandardScaler()

# --- Training set ---
cont_train = pd.DataFrame(scaler.fit_transform(X_train.iloc[:, :n_continuous]),
                          columns=continuous_cols)
cat_train = X_train.iloc[:, n_continuous:]
# cont_train has a fresh 0..n-1 index, so the other pieces are re-indexed to line up row by row
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)

# Save the standardised training set: IDs + standardised features + outcome
train = pd.concat([Train_IDs.reset_index(drop=True),
                   SX_train,
                   y_train.reset_index(drop=True)], axis=1)
train.to_csv("Preschool_standardised_initial_training_dataset_365IDs.csv", index=False)

# --- Test set (transformed with the scaler fitted on the training set) ---
cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:, :n_continuous]),
                         columns=continuous_cols)
cat_test = X_test.iloc[:, n_continuous:]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)

# Save the standardised test set: IDs + standardised features + outcome
test = pd.concat([Test_IDs.reset_index(drop=True),
                  SX_test,
                  y_test.reset_index(drop=True)], axis=1)
test.to_csv("Preschool_standardised_test_dataset_183IDs.csv", index=False)

##########################
### Oversampling - 25% ###
##########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 70 = requested no. of cases)
# Oversample cases by 25%
OSX_train, Oy_train = ADASYN(sampling_strategy=(70/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 70})

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_25_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_25_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_25 = pd.concat([Oversampled_25_train, Oy_train_df], axis=1)

Oversampled_25.to_csv("Oversampled_preschool_dataset_25%.csv", index=False)

##########################
### Oversampling - 50% ###
##########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 77 = requested no. of cases)
# Oversample cases by 50%
OSX_train, Oy_train = ADASYN(sampling_strategy=(77/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 85})
# (the achieved no. of cases differs from the 77 requested)

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_50_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_50_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_50 = pd.concat([Oversampled_50_train, Oy_train_df], axis=1)

Oversampled_50.to_csv("Oversampled_preschool_dataset_50%.csv", index=False)


###########################
### Oversampling - 100% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 108 = requested no. of cases)
# Oversample cases by 100%
OSX_train, Oy_train = ADASYN(sampling_strategy=(108/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 117})
# (the achieved no. of cases differs from the 108 requested)

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_100_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_100_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_100 = pd.concat([Oversampled_100_train, Oy_train_df], axis=1)

Oversampled_100.to_csv("Oversampled_preschool_dataset_100%.csv", index=False)


###########################
### Oversampling - 150% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 128 = requested no. of cases)
# Oversample cases by 150%
OSX_train, Oy_train = ADASYN(sampling_strategy=(128/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 132})
# (the achieved no. of cases differs from the 128 requested)

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_150_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_150_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_150 = pd.concat([Oversampled_150_train, Oy_train_df], axis=1)

Oversampled_150.to_csv("Oversampled_preschool_dataset_150%.csv", index=False)


###########################
### Oversampling - 200% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 153 = requested no. of cases)
# Oversample cases by 200%
OSX_train, Oy_train = ADASYN(sampling_strategy=(153/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 158})
# (the achieved no. of cases differs from the 153 requested)

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_200_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_200_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_200 = pd.concat([Oversampled_200_train, Oy_train_df], axis=1)

Oversampled_200.to_csv("Oversampled_preschool_dataset_200%.csv", index=False)


###########################
### Oversampling - 250% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 183 = requested no. of cases)
# Oversample cases by 250%
OSX_train, Oy_train = ADASYN(sampling_strategy=(183/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 192})
# (the achieved no. of cases differs from the 183 requested)

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_250_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_250_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_250 = pd.concat([Oversampled_250_train, Oy_train_df], axis=1)

Oversampled_250.to_csv("Oversampled_preschool_dataset_250%.csv", index=False)


###########################
### Oversampling - 300% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# (314 = no. of controls in the training set, 208 = requested no. of cases)
# Oversample cases by 300%
OSX_train, Oy_train = ADASYN(sampling_strategy=(208/314), random_state=123).fit_resample(SX_train, y_train)

# Report the class counts of the resampled outcome (not the original one)
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded class counts after resampling: Counter({0: 314, 1: 205})
# (the achieved no. of cases differs from the 208 requested)

# Convert the resampled features/outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# Continuous variables (first 5 columns) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:5].round(6)

# Categorical variables (remaining columns) rounded to integers
OSX_cat = OSX_train_df.iloc[:, 5:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has one row per original training individual, so the extra
# (synthetic) rows get a missing Study_ID.
# NOTE(review): this assumes fit_resample returns the original rows first, in
# their input order, followed by the synthetic rows - confirm for the installed imblearn version.
Oversampled_300_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_300_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES']
Oversampled_300 = pd.concat([Oversampled_300_train, Oy_train_df], axis=1)

Oversampled_300.to_csv("Oversampled_preschool_dataset_300%.csv", index=False)