# notes
# please note that retailer names are not anonymised in the original dataset
# retailer names must be anonymised in any publication utilising these data to adhere to Kantar's (now known as Worldpanel by Numerator, circa 2025) publication guidance


# changes
# pseudonymised retailer names
# saved pseudonymised data to file for re-upload to Pure data repository



if (!require("pacman")) install.packages("pacman")

pacman::p_load(tidyverse, janitor, readxl, viridis, ggsci)

##### Read and Tidy Kantar Dataset #####

# See '~/chapter-02_UK_fruit_and_vegetable_purchases/README.txt' for details of
# how to access the original Kantar dataset.

k_path <- "~/chapter-02_UK_fruit_and_vegetable_purchases/data/secondary/kantar/Fruit & Veg 4we 5yrs.xlsx"

# Read every sheet of the workbook into a list named by sheet, skipping the
# first two rows of each sheet.
kantar <- map(
  set_names(excel_sheets(k_path)),
  function(sheet_name) {
    read_excel(path = k_path, sheet = sheet_name, skip = 2)
  }
)

# Names given to the measure columns when the first two sheets are reshaped:
# sheet 1 becomes `value_1000gbp`, sheet 2 becomes `volume_tonnes`.
newnames <- c("value_1000gbp", "volume_tonnes")

# Build one long table with value (GBP), volume (kg) and unit price (GBP/kg)
# for every food (`k_food`) and period end date (`month_end`).
valume <- list(kantar[[1]], kantar[[2]]) %>%
  # Stack the period columns (positions 2 to 66) into rows; the original
  # column header is kept in `date` and the cell value in the named measure.
  map2(newnames, ~ {
    pivot_longer(.x, c(2:66), names_to = "date", values_to = .y)
  }) %>%
  # The first column holds the food label.
  map(
    rename, k_food = 1
  ) %>%
  # Keep only the first row per food/period so the join below cannot fan out.
  map(
    distinct, k_food, date, .keep_all = TRUE
  ) %>%
  # Combine the value and volume tables side by side.
  reduce(
    left_join, by = c("k_food", "date")
  ) %>%
  # Split the column header on "e" followed by whitespace; the leading part
  # is discarded and the remainder is parsed as a day-month-year date below.
  separate(
    date, into = c("trash", "month_end"), sep = "e\\s"
  ) %>%
  select(-trash) %>%
  mutate(
    k_food = tolower(k_food),
    month_end = lubridate::dmy(str_replace_all(month_end, "\\s", "-")),
    # convert from tonnes / thousands of GBP to kg / GBP
    volume_kg = volume_tonnes * 1000,
    value_gbp = value_1000gbp * 1000,
    gbp_per_kg = value_gbp / volume_kg
  ) %>%
  select(
    k_food, month_end, value_gbp, volume_kg, gbp_per_kg
  )


# Strawberry sales per retailer and period, with retailer names replaced by
# letters. The fourth sheet only supplies sales value, so volume is estimated
# from the per-food unit price held in `valume`.
valume_strawb_retailer <- kantar[[4]] %>%
  # Stack the period columns (positions 2 to 66) into rows.
  pivot_longer(
    2:66, names_to = "date", values_to = "value_1000gbp"
  ) %>%
  # The first (unnamed) column holds retailer labels and food headings.
  rename(
    retailer = "...1"
  ) %>%
  # Split the column header on "e" followed by whitespace; the leading part
  # is discarded and the remainder is parsed as a day-month-year date below.
  separate(
    date, into = c("trash", "month_end"), sep = "e\\s"
  ) %>%
  select(-trash) %>%
  mutate(
    # Rows with a missing value are treated as food-name headings; their label
    # is carried down onto the retailer rows beneath them by fill().
    # NOTE(review): a retailer row with a genuinely missing value would also be
    # treated as a heading - confirm none exist in the sheet.
    k_food = ifelse(is.na(value_1000gbp), retailer, NA),
    k_food = tolower(k_food),
    month_end = lubridate::dmy(str_replace_all(month_end, "\\s", "-")),
    value_gbp = value_1000gbp * 1000
  ) %>%
  fill(
    k_food
  ) %>%
  # Drop the heading rows (and any other rows without a value).
  drop_na(
    retailer, value_1000gbp
  ) %>%
  # Attach the unit price for the same food and period. This is the price
  # computed in `valume`, not a retailer-specific price.
  left_join(
    select(
      valume, c(k_food, month_end, gbp_per_kg)
    ),
    by = c("k_food", "month_end")
  ) %>%
  mutate(
    # volume (kg) = value (GBP) / unit price (GBP per kg)
    volume_kg = value_gbp / gbp_per_kg,
    retailer = str_remove_all(retailer, "^Total\\s")
  ) %>%
  select(
    retailer, k_food, month_end, value_gbp, gbp_per_kg, volume_kg
  ) %>%
  filter(
    k_food == "strawberry"
  ) %>%
  # Rank retailers by their total estimated volume over all periods.
  mutate(
    total_volume = sum(volume_kg),
    .by = retailer
  ) %>%
  arrange(
    desc(
      total_volume
    )
  ) %>%
  # Groups created with `.by` are numbered in order of first appearance, so the
  # letters follow the descending-volume order set by arrange() above.
  # LETTERS has 26 entries; more than 26 retailers would give NA ids.
  mutate(
    retailer_id = LETTERS[cur_group_id()],
    .by = retailer
  ) %>%  # Anonymise retailer names
  select(
    retailer_id, k_food, month_end, value_gbp, gbp_per_kg, volume_kg
  )

# Drop the intermediate objects (file path, raw workbook sheets and column
# names) so they are not part of the environment from here on.
rm(k_path, kantar, newnames)

# Save the current R workspace to a date-stamped .Rdata file so it can be
# loaded directly later.
# Adjust the directory below to match the folder structure on your system.
save.image(
  file = paste0(
    "~/chapter-04_strawberry_analytical_survey/data/processed/",
    format(Sys.Date(), "%Y%m%d"),
    "_SS_kantar_retailers.Rdata"
  )
)

# Write the pseudonymised strawberry sales table to a date-stamped .csv file.
# Adjust the directory below to match the folder structure on your system.
write_csv(
  valume_strawb_retailer,
  paste0(
    "~/chapter-04_strawberry_analytical_survey/data/processed/",
    format(Sys.Date(), "%Y%m%d"),
    "_SS_kantar_retailers_strawberries.csv"
  )
)
