# notes
# 

# changes
# corrected an error that caused October samples to be assigned to the winter season instead of autumn


if (!require("pacman")) install.packages("pacman")

pacman::p_load(tidyverse, data.table, broom, janitor)


###############################################################
##################                           ##################
##################      ORGANIC ACIDS        ##################
##################                           ##################
###############################################################

# Load the custom processing function used to process the HPLC data.
source("code/R/acids_processing_func_v0.0.8.R")

# Process the HPLC organic-acid results for the supermarket-sampling (SS)
# experiment, reading raw results and extraction weights and writing to out_dir.
# NOTE(review): the next step of this script reads a global `acids_process`
# object that is not created anywhere in this file -- presumably organic_acids()
# assigns it as a side effect; confirm against the helper script.
# (A stray bare path string that used to sit above this call was removed: it
# was not assigned or used, and was only echoed when run via Rscript.)
organic_acids(
  hplc_data = "~/chapter-04_strawberry_analytical_survey/data/primary/hplc/results/",
  dry_weight = "~/chapter-04_strawberry_analytical_survey/data/primary/hplc/extraction_weights/",
  experiment = "SS",
  out_dir = "~/chapter-04_strawberry_analytical_survey/data/processed/2023_2024_supermarket_sampling/"
)

# Reshape the processed acid measurements from long to wide: the `date` column
# is dropped, each distinct `measure_var` becomes its own column, and repeated
# values for the same cell are collapsed to their mean.
acids_process <- pivot_wider(
  select(acids_process, -date),
  names_from = "measure_var",
  values_from = "value",
  values_fn = mean
)

###############################################################
##################                           ##################
##################        SAMPLE INFO        ##################
##################                           ##################
###############################################################

# Read the raw sample information sheet for the supermarket sampling.
sample_info <- read_csv(
  "~/chapter-04_strawberry_analytical_survey/data/primary/20240513_SS_SampleInformation.csv"
)

# Derive per-sample descriptors (berry size, % dry weight, cost per kg, shelf
# life, product quality tier, season), split the sample ID into its parts, keep
# the columns needed downstream and average the brix readings per sample.
sample_info_process <- sample_info %>%
  mutate(
    # unique experiment ID (SS = Supermarket Sampling) for identification in
    # the final thesis dataset; the scalar is recycled to every row
    experiment = "SS",
    avg_berry_size_grams = sample_freshweight_grams / number_of_berries_sampled,
    # % dry weight needed to convert acids values to fresh weight
    perc_dw = corrected_subsample_dryweight_grams / corrected_subsample_freshweight_grams,
    cost_per_kg = cost_gbp / (volume_purchased_grams / 1000),
    # Days between purchase and best-before date. Computed by subtracting the
    # dates themselves rather than their day-of-year numbers, so that a
    # December purchase with a January best-before date gives a small positive
    # number instead of roughly -360.
    shelf_life = as.integer(
      lubridate::as_date(best_before_date) - lubridate::as_date(purchase_date)
    ),
    # grouping product categories, as described in thesis section 4.2.1
    product_quality = case_when(
      str_detect(product_brand, "Finest|Taste|Specially") ~ "premium",
      str_detect(product_brand, "Rosdene|Imperfect|Wonky|Stamford") ~ "budget",
      str_detect(product_brand, "Organic") ~ "organic",
      TRUE ~ "standard"
    ),
    # Season from the date identifier embedded in the sample ID. Samples that
    # match none of the patterns get NA.
    # NOTE(review): the original comment described the code as %m%d, but the
    # values (1223, 0124, 0224, ...) look like %m%y -- confirm.
    # NOTE(review): "032" is shorter than the other codes, so it matches any ID
    # containing "032" (e.g. both "0323" and "0324"); confirm this is intended
    # and not a typo for "0323".
    season = case_when(
      str_detect(sample, "1223|0124|0224") ~ "winter",
      str_detect(sample, "032|0423|0523") ~ "spring",
      str_detect(sample, "0623|0723|0823") ~ "summer",
      str_detect(sample, "0923|1023|1123") ~ "autumn"
    )
  ) %>%
  rename(
    sample_name = sample
  ) %>%
  # extracting information from sample identifiers. see
  # ~/chapter-04_strawberry_analytical_survey/README.txt for more info on sample ID's
  separate(
    sample_name,
    into = c("month", "sampling_point", "store", "punnet"),
    sep = "-",
    remove = FALSE
  ) %>%
  select(
    purchase_date,
    month,
    experiment,
    sample_name,
    sampling_point,
    store,
    season,
    product_retailer,
    product_retailer_store,
    product_brand,
    product_quality,
    product_origin_country:grower_details,
    shelf_life,
    cost_gbp,
    cost_per_kg,
    avg_berry_size_grams,
    perc_dw,
    brix_1:brix_3
  ) %>%
  # stack the individual brix readings so they can be averaged per sample
  pivot_longer(
    starts_with("brix"),
    names_to = "var",
    values_to = "brix"
  ) %>%
  # Mean brix per sample. Grouping is by every column except the reading label
  # and the reading itself, selected by name instead of by position (was
  # `c(1:20)`) so that adding or removing a sample-sheet column cannot silently
  # change the grouping.
  summarise(
    brix = mean(brix),
    .by = !c(var, brix)
  )


###############################################################
##################                           ##################
##################        COMPILATION        ##################
##################                           ##################
###############################################################


# Combine the sample information with the wide acid measurements, derive the
# fresh-weight and acidity variables, tidy the categorical columns and return
# the result in long format (one row per sample x numeric variable).
SS_compiled_data <- sample_info_process %>%
  left_join(acids_process, by = c("experiment", "sample_name")) %>%
  mutate(
    # dry-weight measurements scaled by the dry-weight fraction to express
    # them on a fresh-weight basis
    ascorbic_acid_mg_100g_fw = ascorbic_acid * perc_dw * 100,
    malic_acid_mg_per_g_fw = malic_acid * perc_dw,
    citric_acid_mg_per_g_fw = citric_acid * perc_dw,
    # see thesis section 4.2.2.2 for info on this calculation
    theoretical_titratable_acidity =
      (citric_acid_mg_per_g_fw / 10 + malic_acid_mg_per_g_fw / 10) / (6.4 / 6.71),
    tss_tta_ratio = brix / theoretical_titratable_acidity
  ) %>%
  # correction of typo in variety name
  mutate(
    variety = if_else(variety == "LinValnera", "Limvalnera", variety)
  ) %>%
  # re-code missing values so that NA's won't be dropped from these columns
  mutate(
    across(
      product_origin_region:grower_details,
      ~ replace_na(.x, "unknown")
    )
  ) %>%
  # store the descriptive columns as factors
  mutate(
    across(
      c(month:experiment, sampling_point:grower_details),
      as.factor
    )
  ) %>%
  # remove every column whose name ends in "acid" (the dry-weight inputs)
  select(-ends_with("acid")) %>%
  pivot_longer(
    cols = where(is.numeric),
    names_to = "measure_var",
    values_to = "value"
  ) %>%
  # drop any missing values in the numeric variables
  drop_na(value)


# Remove the intermediate objects that are no longer needed once the compiled
# dataset has been built.
rm(acids_process, sample_info, sample_info_process)


# Date-stamped path stem (YYYYMMDD_SS_compiled_dataset) shared by both export
# formats; built once so the .Rdata and .csv files always carry the same date.
ss_export_stem <- paste0(
  "~/chapter-04_strawberry_analytical_survey/data/processed/",
  format(Sys.Date(), "%Y%m%d"),
  "_SS_compiled_dataset"
)

# Save the compiled dataset as an R data file. save() is used instead of
# save.image(): save.image() takes no objects to save (its second positional
# argument is `version`), so passing the data frame to it was a misuse, and it
# would write the whole workspace rather than just this object.
save(
  SS_compiled_data,
  file = paste0(ss_export_stem, ".Rdata")
)

# Write the same dataset as CSV.
write_csv(
  SS_compiled_data,
  paste0(ss_export_stem, ".csv")
)
