# notes
# 

# changes
# initial preparation of figures to meet examiner's suggested amendments

if (!require("pacman")) install.packages("pacman")

pacman::p_load(tidyverse, janitor, readxl, ggstatsplot, ggpubr, viridis, ggsci)

##### UK STRAWBERRY PRODUCTION #####

# dataset obtained from https://www.gov.uk/government/statistics/latest-horticulture-statistics
  # accessed on 28/05/2025
# Gov data is published under the Open Government Licence (OGL)
  # https://www.gov.uk/help/terms-conditions
    # Contains public sector information licensed under the Open Government Licence v3.0. 
  # https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
    # copy, publish, distribute and transmit the Information;
    # adapt the Information;
    # exploit the Information commercially and non-commercially for example, by combining it with other Information, or by including it in your own product or application.

# path to the downloaded horticulture statistics workbook
hort_path <- "data/chap-01/hort-dataset-23.xlsx"

# read every worksheet into a list of data frames named after their sheets;
# the first three rows of each sheet are skipped on import
hort_stats <- map(
  set_names(excel_sheets(hort_path)),
  function(sheet) read_excel(path = hort_path, sheet = sheet, skip = 3)
)

# keep only the sheets relevant to UK strawberry production and value,
# selected by their position in the workbook
hort_fruit <- lapply(
  c(7, 8), # 7 = fruit production, 8 = fruit value
  function(idx) hort_stats[[idx]]
)

# function to clean the production and value dataframes
# converting to long-format -> standardised variable names -> standardised coding of missing values
#
# input_list: one sheet (data frame) whose first column holds the food group,
#   whose second column is named "CALENDAR YEAR" and holds the variety, and
#   whose remaining columns hold one value per year
# returns: a data frame with one row per food/year combination and the summed
#   `value`; rows whose variety contains "total" are excluded before summing
hort_clean <- function(input_list) {
  input_list %>%
    rename(
      food = 1,
      variety = "CALENDAR YEAR"
    ) %>%
    pivot_longer(
      # every column after the first two; unlike 3:ncol(.), this cannot count
      # backwards when the sheet has only two columns
      cols = -(1:2),
      names_to = "year",
      values_to = "value",
      # year columns may be imported as a mix of text and numbers; pivoting
      # them as text avoids a type-combination error and suits the string
      # handling of the missing-value marker below
      values_transform = list(value = as.character)
    ) %>%
    # the food group is only given on the first row of each group
    fill(food) %>%
    drop_na(food, value) %>%
    mutate(
      food = tolower(str_replace(food, " -", "")),
      variety = tolower(variety),
      # soft fruit rows are relabelled with their individual variety
      food = if_else(food == "soft fruit", variety, food),
      # ". ." is matched literally; as a regular expression each "." would
      # match any character
      value = as.numeric(str_replace_all(value, fixed(". ."), "0")),
      # NOTE(review): as.Date() with format "%Y" fills the missing month and
      # day from the current date, so these dates shift with the day the
      # script is run; the x-axis limits of the value plot are built the same
      # way, so change both together
      year = as.Date(year, format = "%Y")
    ) %>%
    filter(
      str_detect(variety, "total", negate = TRUE)
    ) %>%
    summarise(
      value = sum(value),
      .by = c(food, year)
    )
}

# run the cleaning function over each sheet in the list (production, then value)
hort_fruit_clean <- purrr::map(hort_fruit, hort_clean)

# shared theme applied to all figures to keep them visually consistent.
# starts from theme_ggstatsplot() and then: removes axis ticks and all grid
# lines, draws grey axis lines and a grey panel border, makes facet strip text
# bold on a transparent strip, and adds a 0.5 cm margin above the x-axis title
# and to the right of the y-axis title
theme_thesis <- function() {
  frame_colour <- "grey50"
  x_title_margin <- margin(t = 0.5, r = 0, b = 0, l = 0, unit = "cm")
  y_title_margin <- margin(t = 0, r = 0.5, b = 0, l = 0, unit = "cm")

  thesis_overrides <- theme(
    # axes
    axis.ticks = element_blank(),
    axis.line = element_line(colour = frame_colour),
    axis.title.x = element_text(margin = x_title_margin),
    axis.title.y = element_text(margin = y_title_margin),
    # panel
    panel.grid = element_line(colour = "#b4aea9"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.border = element_rect(colour = frame_colour, fill = NA, linewidth = 0.5),
    # facet strips
    strip.text = element_text(face = "bold"),
    strip.background = element_rect(fill = NA, colour = NA),
    strip.placement = "inside",
    # plot tag
    plot.tag.position = c(0.12, 1)
  )

  theme_ggstatsplot() + thesis_overrides
}

# plotting time series of UK strawberry production volume
strawb_vol <- hort_fruit_clean[[1]] %>%
  filter(
    # select on the calendar year itself: `year` is built in hort_clean() with
    # as.Date(format = "%Y"), which takes its month and day from the date the
    # script is run, so comparing against a fixed day such as '1989-05-28'
    # would let 1989 through whenever the script is run later in the year
    as.integer(format(year, "%Y")) >= 1990L,
    food == "strawberries"
  ) %>%
  ggplot(
    aes(
      year, value
    )
  ) +
  geom_line(
    linewidth = 1,
    colour = "#16367C"
  ) +
  scale_x_date(
    "Year",
    date_labels = "%Y",
    date_breaks = "2 years",
    expand = c(0.01, 0)
  ) +
  scale_y_continuous(
    "Volume\n (Thousand Tonnes)",
    limits = c(20, 150),
    breaks = seq(30, 150, by = 15),
    expand = c(0.01, 0)
  ) +
  theme_thesis() +
  theme(
    # removing x axis information in preparation for combining the two figures
    axis.text.x = element_blank(),
    axis.title.x = element_blank()
  )

# plotting time series of UK strawberry production value
strawb_val <- hort_fruit_clean[[2]] %>%
  filter(
    # select on the calendar year itself: `year` is built in hort_clean() with
    # as.Date(format = "%Y"), which takes its month and day from the date the
    # script is run, so comparing against a fixed day such as '1989-05-28'
    # would let 1989 through whenever the script is run later in the year
    as.integer(format(year, "%Y")) >= 1990L,
    food == "strawberries"
  ) %>%
  ggplot(
    aes(
      year, value
    )
  ) +
  geom_line(
    linewidth = 1,
    colour = "#F39200FF"
  ) +
  scale_x_date(
    "Year",
    # parsed in the same way as `year` in hort_clean(), so the limits carry
    # the same month and day as the plotted points
    limits = as.Date(c("1990", "2023"), format = "%Y"),
    date_labels = "%Y",
    date_breaks = "3 years",
    expand = c(0.01, 0)
  ) +
  scale_y_continuous(
    "Value\n (Thousand GBP)",
    limits = c(50, 450),
    breaks = seq(50, 450, by = 50),
    expand = c(0.01, 0)
  ) +
  theme_thesis() +
  theme(
    # slanted year labels so they do not overlap
    axis.text.x = element_text(
      angle = 45,
      hjust = 1,
      vjust = 1
    )
  )

# combining volume and value plots into a single figure:
# volume on top, value underneath (slightly taller to fit its x-axis labels)
strawb_prod <- ggarrange(
  strawb_vol, strawb_val,
  nrow = 2,
  ncol = 1,
  heights = c(1, 1.2)
)

# save the combined plot to ".png" file, prefixed with today's date.
# The specified dimensions have been found to maintain good image quality when inserting figures in .docx documents
# the output folder is created first so that saving does not fail on a fresh
# checkout; nothing happens if the folder already exists
dir.create("./figures/chap-01", recursive = TRUE, showWarnings = FALSE)
ggsave(
  filename = paste0("./figures/chap-01/", format(Sys.Date(), "%Y%m%d"), "_uk_strawberry_production_timeseries.png"),
  plot = strawb_prod,
  width = 5,
  height = 4,
  units = "in",
  dpi = 900,
  limitsize = FALSE
)

##### Genotypic variation in strawberry vitamin C #####

# dataset obtained from https://www.tandfonline.com/doi/full/10.1080/15538362.2016.1250695#d1e311
  # accessed on 28/05/2025
  # full bibliographic ref: 
    # Mezzetti, B. et al. (2016) ‘Breeding Strawberry for Higher Phytochemicals Content and Claim It: Is It Possible?’, 
    # International Journal of Fruit Science, 16(sup1), pp. 194–206. doi: 10.1080/15538362.2016.1250695.
# data were downloaded directly from the online publication (Table 2) using the "download csv" option.
# the original data have not been manipulated in any way, other than to visualise them in a graph instead of a table.

# import the downloaded table: first column as a factor, the other six as text
mezzetti2016 <- read_csv(
  file = "data/chap-01/mezzetti2016_asa_genotypes.csv",
  col_types = "fcccccc"
)

# keep only the ascorbic acid / vitamin C column, split its "mean ± sd" text
# into two numeric columns, and tidy the cultivar names
# cultivars are given an origin code according to their commercial status,
# as documented in Mezzetti et al. (2016)
mezzetti_vitc <- mezzetti2016 %>%
  clean_names() %>%
  select(cultivar, vit_c_mg_100_g_fw) %>%
  separate(
    col = vit_c_mg_100_g_fw,
    into = c("mean_vit_c", "sd_vit_c"),
    sep = "±"
  ) %>%
  mutate(
    cultivar = str_replace_all(cultivar, ",", "-"),
    across(c(mean_vit_c, sd_vit_c), as.numeric),
    # cultivars matching none of the patterns are left as NA
    origin = case_when(
      str_detect(cultivar, "Adria|Cristina|Romina") ~ "b",
      str_detect(cultivar, "Clery|Monterey") ~ "a",
      str_detect(cultivar, "AN0") ~ "c"
    )
  )

# bar chart of mean vitamin C per cultivar with +/- 1 SD error bars;
# bars are ordered and filled by origin code
mezzetti_vitc_bars <- mezzetti_vitc %>%
  arrange(origin) %>%
  mutate(cultivar = fct_reorder(cultivar, origin)) %>%
  ggplot(aes(x = cultivar, y = mean_vit_c, fill = origin)) +
  # bar heights are taken directly from the data
  geom_col(colour = "black") +
  geom_errorbar(
    aes(
      ymin = mean_vit_c - sd_vit_c,
      ymax = mean_vit_c + sd_vit_c
    ),
    width = 0.5
  ) +
  scale_x_discrete(
    name = "Genotype",
    guide = guide_axis(angle = 45),
    expand = c(0.04, 0)
  ) +
  scale_y_continuous(
    name = "Total Ascorbic Acid\n (mg/100g FW)",
    limits = c(0, 100),
    breaks = seq(0, 110, 10),
    expand = c(0, 0)
  ) +
  # legend text supplied for the origin codes, in the order the scale lists them
  scale_fill_viridis(
    name = "",
    labels = c("Commercialised", "Advanced Selection", "Backcrossed Selection"),
    option = "G",
    begin = 0,
    discrete = TRUE
  ) +
  theme_thesis() +
  theme(legend.position = "bottom")


# save the vitamin C bar chart to ".png" file, prefixed with today's date.
# The specified dimensions have been found to maintain good image quality when inserting figures in .docx documents
# the output folder is created first so that saving does not fail on a fresh
# checkout; nothing happens if the folder already exists
dir.create("./figures/chap-01", recursive = TRUE, showWarnings = FALSE)
ggsave(
  filename = paste0("./figures/chap-01/", format(Sys.Date(), "%Y%m%d"), "_mezzetti2016_genotypes_asa.png"),
  plot = mezzetti_vitc_bars,
  width = 5,
  height = 4,
  units = "in",
  dpi = 900,
  limitsize = FALSE
)
