# This script creates multiple datasets for training the CAPE model, with different degrees of oversampling applied to the MICE imputed early life training dataset (n=1113).
# Oversampling performed using ADASYN
# Python version 3.6.8 is used

# Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.over_sampling import ADASYN

# Move to the directory that holds the input CSV files
os.chdir("/../../")

# Load the cleaned, unstandardised, imputed early life training dataset
# (data found in IOWBC_imputed_data.xlsx, sheet: "MICE imputed earlylife training")
# 1113 IDs, 10 columns
train = pd.read_csv("MICE_imputed_earlylife_training_dataset_1113ID.csv", index_col=False)

# Load the cleaned, unstandardised early life test data used during the initial model development stage
# (data found in IOWBC_training_test_data.xlsx, sheet: "Early life test set")
test = pd.read_csv("Earlylife_test_dataset_255IDs.csv", index_col=False)

# Keep the original train/test set IDs as single-column dataframes.
# The ID is taken by position: first column of the training file, second column of the test file.
Train_IDs = train.iloc[:, [0]]
Test_IDs = test.iloc[:, [1]]

# Names assigned to the four standardised continuous feature columns
cont_cols = ('Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1')

# Training data: outcome and features
y_train = train['Asthma_10YR']
X_train = train.drop(['Study_ID', 'Asthma_10YR'], axis=1)

# Standardise the training set: the scaler is fitted on the first four feature columns only;
# the remaining columns are carried over unchanged.
scaler = StandardScaler()
cont_train = pd.DataFrame(scaler.fit_transform(X_train.iloc[:, :4]), columns=cont_cols)
cat_train = X_train.iloc[:, 4:]
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)

# Test data: outcome and features
y_test = test['Asthma_10YR']
X_test = test.drop(['Unnamed: 0', 'Study_ID', 'Asthma_10YR'], axis=1)

# Standardise the test set with the scaler fitted on the training data (transform only, no refit)
cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:, :4]), columns=cont_cols)
cat_test = X_test.iloc[:, 4:]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)

##########################
### Oversampling - 25% ###
##########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 25%
# NOTE(review): n=1113 with 946 controls implies 167 original cases, so a 25% increase
# would be ~209 cases, yet the strategy below requests 235 - confirm the intended value.
OSX_train, Oy_train = ADASYN(sampling_strategy=(235/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 235})

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_25_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_25_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_25 = pd.concat([Oversampled_25_train, Oy_train_df], axis=1)

Oversampled_25.to_csv("MICE_imputed_oversampled_earlylife_dataset_25%.csv", index=False)

##########################
### Oversampling - 50% ###
##########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 50%
OSX_train, Oy_train = ADASYN(sampling_strategy=(252/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 293})
# (the recorded run produced 293 cases rather than the 252 requested by the ratio)

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_50_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_50_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_50 = pd.concat([Oversampled_50_train, Oy_train_df], axis=1)

Oversampled_50.to_csv("MICE_imputed_oversampled_earlylife_dataset_50%.csv", index=False)


###########################
### Oversampling - 100% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 100%
# NOTE(review): n=1113 with 946 controls implies 167 original cases, so a 100% increase
# would be ~334 cases, yet the strategy below requests 372 - confirm the intended value.
OSX_train, Oy_train = ADASYN(sampling_strategy=(372/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 398})
# (the recorded run produced 398 cases rather than the 372 requested by the ratio)

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_100_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_100_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_100 = pd.concat([Oversampled_100_train, Oy_train_df], axis=1)

Oversampled_100.to_csv("MICE_imputed_oversampled_earlylife_dataset_100%.csv", index=False)

###########################
### Oversampling - 150% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 150%
OSX_train, Oy_train = ADASYN(sampling_strategy=(422/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 456})
# (the recorded run produced 456 cases rather than the 422 requested by the ratio)

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_150_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_150_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_150 = pd.concat([Oversampled_150_train, Oy_train_df], axis=1)

Oversampled_150.to_csv("MICE_imputed_oversampled_earlylife_dataset_150%.csv", index=False)


###########################
### Oversampling - 200% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 200%
OSX_train, Oy_train = ADASYN(sampling_strategy=(507/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 555})
# (the recorded run produced 555 cases rather than the 507 requested by the ratio)

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_200_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_200_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_200 = pd.concat([Oversampled_200_train, Oy_train_df], axis=1)

Oversampled_200.to_csv("MICE_imputed_oversampled_earlylife_dataset_200%.csv", index=False)


###########################
### Oversampling - 250% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 250%
OSX_train, Oy_train = ADASYN(sampling_strategy=(591/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 613})
# (the recorded run produced 613 cases rather than the 591 requested by the ratio)

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_250_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_250_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_250 = pd.concat([Oversampled_250_train, Oy_train_df], axis=1)

Oversampled_250.to_csv("MICE_imputed_oversampled_earlylife_dataset_250%.csv", index=False)


###########################
### Oversampling - 300% ###
###########################
# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 300%
OSX_train, Oy_train = ADASYN(sampling_strategy=(668/946), random_state=123).fit_resample(SX_train, y_train)
# Class counts after resampling
print('Resampled dataset shape %s' % Counter(Oy_train))
# Recorded output: Counter({0: 946, 1: 681})
# (the recorded run produced 681 cases rather than the 668 requested by the ratio)

# Convert the resampled features and outcome into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
Oy_train_df.columns = ['Asthma_10YR']

# Format synthetic data
# First four columns (continuous variables) rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, :4].round(6)

# Remaining columns (categorical variables) rounded to whole numbers
OSX_cat = OSX_train_df.iloc[:, 4:].round()

# Combine into one oversampled, formatted dataset.
# Train_IDs has fewer rows than the resampled data, so the column-wise concat leaves
# Study_ID as NaN for the extra rows.
# NOTE(review): this assumes fit_resample returns the original rows first, followed by
# the synthetic rows - confirm for the installed imbalanced-learn version.
Oversampled_300_train = pd.concat([Train_IDs.reset_index(drop=True), OSX_cont, OSX_cat], axis=1)
Oversampled_300_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES']
Oversampled_300 = pd.concat([Oversampled_300_train, Oy_train_df], axis=1)

Oversampled_300.to_csv("MICE_imputed_oversampled_earlylife_dataset_300%.csv", index=False)
