# This script creates multiple datasets for training the CAPE model, with different degrees of oversampling applied to the complete dataset (individuals with missing data excluded).
# Oversampling performed using ADASYN
# Python version 3.6.8 is used

# Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import ADASYN

# Set working directory
# NOTE(review): "/../../.." normalises to the filesystem root "/" — this looks
# like a placeholder / anonymised path; confirm the intended working directory
# before running (relative output paths below depend on it).
os.chdir("/../../..")

### EARLY LIFE MODEL DEVELOPMENT ###
# Import cleaned, unstandardised 2YR dataset - data found in IOWBC_data.xlsx, sheet: "Early life data"
data_2YR = pd.read_csv("/scratch/dk2e18/Asthma_Prediction_Model/Oversampling/Early_life_QC_1368IDs.csv", index_col=False)
# Drop the stray index column written by a previous to_csv call.
# errors='ignore' makes the load robust if the file was saved without it
# (the original `del data_2YR['Unnamed: 0']` would raise KeyError in that case).
data_2YR = data_2YR.drop(columns=['Unnamed: 0'], errors='ignore')
# 1368 Ids, 44 columns

# Subset the 8 features selected from Balanced Random Forest RFE - 5-fold CV
# (Study_ID and the Asthma_10YR outcome are carried along with the 8 predictors).
selected_data = data_2YR[['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1',
                          'Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES', 'Asthma_10YR']]
# Keep only individuals with complete data for the selected columns.
complete_data_2YR = selected_data.dropna()
# n=765

# Create training and test set
# Features = everything except the outcome column (Study_ID is kept for now).
complete_subset_features = complete_data_2YR.drop(['Asthma_10YR'], axis=1)

# Outcome vector.
complete_subset_outcome = complete_data_2YR['Asthma_10YR']

# Split dataset into training set and test set: 2/3 training and 1/3 test,
# stratified on the outcome so asthma prevalence is preserved in both sets.
X_train, X_test, y_train, y_test = train_test_split(complete_subset_features, complete_subset_outcome,
                                                    stratify=complete_subset_outcome,
                                                    test_size=0.333, shuffle=True, random_state=123)

# Training set (n=510, asthma=68, no asthma=442)	Test set (n=255, asthma=34, no asthma=221)

# Save the original train/test set IDs as single-column DataFrames.
# Select by name rather than position (`iloc[:, 0]`) so this does not silently
# break if the upstream file's column order ever changes.
Train_IDs = X_train[['Study_ID']]
Test_IDs = X_test[['Study_ID']]

# Remove the identifier from the model feature matrices.
X_train = X_train.drop(columns='Study_ID')
X_test = X_test.drop(columns='Study_ID')


# Standardise the continuous variables in the training and test sets.
# The scaler is fitted on the training data only, then the same fitted
# transform is applied to the test data (no test-set leakage).
cont_cols = ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1']
scaler = StandardScaler()

# Training set: scale the first 4 (continuous) columns, keep the rest as-is.
scaled_train = scaler.fit_transform(X_train.iloc[:, 0:4])
cont_train = pd.DataFrame(scaled_train, columns=cont_cols)
cat_train = X_train.iloc[:, 4:]
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)

# Save the standardised training set (Study_ID + features + outcome).
train = pd.concat(
    [Train_IDs.reset_index(drop=True), SX_train, y_train.reset_index(drop=True)],
    axis=1,
)
train.to_csv("/scratch/dk2e18/Asthma_Prediction_Model/Oversampling/Early_life_standardised_initial_training_dataset_510IDs.csv", index=False)

# Test set: apply the training-set scaling.
scaled_test = scaler.transform(X_test.iloc[:, 0:4])
cont_test = pd.DataFrame(scaled_test, columns=cont_cols)
cat_test = X_test.iloc[:, 4:]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)

# Save the standardised test set (Study_ID + features + outcome).
test = pd.concat(
    [Test_IDs.reset_index(drop=True), SX_test, y_test.reset_index(drop=True)],
    axis=1,
)
test.to_csv("/scratch/dk2e18/Asthma_Prediction_Model/Oversampling/Early_life_standardised_test_dataset_255IDs.csv", index=False)


#############################################
### Oversampling - 25% to 300% via ADASYN ###
#############################################
# All seven oversampling levels share exactly the same pipeline, so it is
# factored into a single helper instead of seven copy-pasted sections.
# Sampling strategy = no. of cases after oversampling / no. of controls before
# oversampling (the training set has 442 controls).

# Column layout of SX_train: first 4 columns continuous, remainder categorical.
CONT_COL_COUNT = 4
OVERSAMPLED_COLS = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1',
                    'Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
N_CONTROLS = 442


def _oversample_and_save(n_cases_after, label):
    """Oversample the minority class with ADASYN, format, and write to CSV.

    n_cases_after: target number of minority-class cases — the numerator of
        the ADASYN sampling strategy (denominator is the 442 controls).
        ADASYN only approximates this target; the achieved counts are printed.
    label: percentage label used in the output filename (e.g. '25').

    Returns the formatted, oversampled DataFrame (features + outcome).
    """
    osx, oy = ADASYN(sampling_strategy=(n_cases_after / N_CONTROLS),
                     random_state=123).fit_resample(SX_train, y_train)
    print('Original dataset shape %s' % Counter(oy))

    # Convert resampled output into DataFrames for formatting.
    osx_df = pd.DataFrame(data=osx)
    oy_df = pd.DataFrame(data=oy)

    # Format synthetic data: continuous variables rounded to 6 dp,
    # categorical variables rounded back to integers.
    cont = osx_df.iloc[:, 0:CONT_COL_COUNT].round(6)
    cat = osx_df.iloc[:, CONT_COL_COUNT:].round()

    combined = pd.concat([cont, cat], axis=1)
    # NOTE(review): the oversampled frame has MORE rows than Train_IDs, so the
    # synthetic records receive NaN Study_IDs in this concat — confirm this is
    # the intended way to mark synthetic individuals.
    combined = pd.concat([Train_IDs.reset_index(drop=True), combined], axis=1)
    combined.columns = OVERSAMPLED_COLS
    oy_df.columns = ['Asthma_10YR']

    out = pd.concat([combined, oy_df], axis=1)
    out.to_csv(f"Oversampled_earlylife_dataset_{label}%.csv", index=False)
    return out


# One call per oversampling level; the trailing comments record the achieved
# minority-class counts previously observed (ADASYN does not hit the target exactly).
Oversampled_25 = _oversample_and_save(96, '25')     # Counter({0: 442, 1: 98})
Oversampled_50 = _oversample_and_save(103, '50')    # Counter({0: 442, 1: 118})
Oversampled_100 = _oversample_and_save(152, '100')  # Counter({0: 442, 1: 165})
Oversampled_150 = _oversample_and_save(173, '150')  # Counter({0: 442, 1: 185})
Oversampled_200 = _oversample_and_save(207, '200')  # Counter({0: 442, 1: 229})
Oversampled_250 = _oversample_and_save(242, '250')  # Counter({0: 442, 1: 249})
Oversampled_300 = _oversample_and_save(272, '300')  # Counter({0: 442, 1: 279})