##Exploratory Data Analysis
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(maps)
library(countrycode)
library(rworldmap)
library(CoordinateCleaner)
library(zoo)
library(scales)
# Register the Calibri family with the Windows graphics devices.
# windowsFonts()/windowsFont() only exist in Windows builds of R, so the call is
# skipped on other platforms instead of aborting the whole script.
if (.Platform$OS.type == "windows") {
  windowsFonts(Calibri = windowsFont("Calibri"))
}

#Distribution of parity by cohort (Figure 3.2)
# Group the selected women (l_ind) by cohort and by reversed parity class
# (4 - parityc); the unnamed list keeps aggregate()'s default Group.1/Group.2
# column names.
parity_groups <- list(coh[l_ind], as.factor(4 - parityc[l_ind]))
# Number of women per cell
data <- aggregate(weights[l_ind], by = parity_groups, FUN = length)
# Sum of weights per cell
dataw <- aggregate(weights[l_ind], by = parity_groups, FUN = sum)

# Write a stacked proportion bar chart of parity by cohort to a PNG file.
#
# Args:
#   filename: path of the PNG to create (20 x 12 cm, 400 dpi).
#   dat:      data frame with columns Group.1 (cohort, on a continuous x axis),
#             Group.2 (fill factor; its levels are labelled "3+", "2", "1", "0"
#             in level order) and x (bar height before rescaling to proportions).
#
# Returns: `filename`, invisibly.
plotfunc <- function(filename, dat) {
  fig <- ggplot(dat, aes(x = Group.1, y = x, fill = Group.2)) +
    # position = "fill" rescales every cohort's bars to sum to 1
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_brewer(type = "seq", palette = "YlOrRd", guide = guide_legend(reverse = TRUE), labels = c("3+", "2", "1", "0")) +
    labs(x = "Cohort", y = "Proportion", fill = "Parity") +
    scale_y_continuous(breaks = seq(0, 1, 0.2)) +
    scale_x_continuous(breaks = seq(1945, 1990, 5), minor_breaks = setdiff(1945:1992, seq(1945, 1990, 5)), expand = c(0.02, 0.02)) +
    theme_gray() + theme(legend.position = "bottom", text = element_text(family = "Calibri"))

  png(filename = filename, width = 20, height = 12, units = "cm", res = 400)
  # Close the device even when rendering fails, so an error does not leave a
  # graphics device open.
  on.exit(dev.off(), add = TRUE)
  print(fig)
  invisible(filename)
}

# Figure 3.2: once from the counts, once from the summed weights
plotfunc(filename = "chap3/plots/fig2_uw.png", dat = data)   # unweighted
plotfunc(filename = "chap3/plots/fig2_w.png", dat = dataw)   # weighted

#Observed fertility rates by parity (Figure 3.3)
# Observed rate per age-by-cohort cell: FUN over the records with a birth
# divided by FUN over all records in the cell.
#
# Args:
#   w:     values passed to FUN (e.g. survey weights), one per record.
#   c:     cohort of each record.
#   a:     age of each record.
#   b:     birth indicator; records with b == 1 count towards the numerator.
#   sixyc: if TRUE, cohorts are first collapsed to six-year bands labelled by
#          their midpoint (1945-1950 -> 1947.5, ..., 1987-1992 -> 1989.5);
#          assumes cohorts lie within 1945-1992.
#   FUN:   aggregation function, e.g. length (unweighted) or sum (weighted).
#
# Returns: data frame with one row per observed (age, cohort) cell and columns
#   Group.1 (age), Group.2 (cohort), x.x (denominator), x.y (numerator, 0 where
#   the cell has no births) and x (= x.y / x.x).
wratesfunc <- function(w, c, a, b, sixyc = FALSE, FUN) {
  if (sixyc) {
    c <- seq(1947.5, 1989.5, by = 6)[ceiling((c - 1944) / 6)]
  }

  rates <- aggregate(w, by = list(a, c), FUN = FUN)
  names(rates)[names(rates) == "x"] <- "x.x"

  # aggregate() stops with "no rows to aggregate" on an empty subset, so only
  # aggregate the births when there is at least one; otherwise every cell simply
  # has no numerator.
  is_birth <- !is.na(b) & b == 1
  if (any(is_birth)) {
    births <- aggregate(w[is_birth], by = list(a[is_birth], c[is_birth]), FUN = FUN)
    cell_key <- function(d) paste(d$Group.1, d$Group.2, sep = "\r")
    # Left join on (age, cohort) that keeps the row order of `rates`.
    rates$x.y <- births$x[match(cell_key(rates), cell_key(births))]
  } else {
    rates$x.y <- rep(NA, nrow(rates))
  }

  # Missing values (cells without births) become 0 in every column.
  rates[] <- lapply(rates, function(col) replace(col, which(is.na(col)), 0))
  rates$x <- rates$x.y / rates$x.x
  rates
}

# Inputs behind each facet column, in the order par = 0, 1, 2, 3, 4.
# Every entry is list(weights, cohort, age, birth indicator).
rate_inputs <- list(
  list(weights, coh, age, birth.bin),
  list(weights_0, coh_0, age_0, birth.bin_0),
  list(weights_1, coh_1, age_1, birth.bin_1),
  list(weights_2, coh_2, age_2, birth.bin_2),
  list(weights_3, coh_3, age_3, birth.bin_3)
)

# Stack the wratesfunc() tables of all five inputs, first with single-year
# cohorts (yr = 1) and then with six-year cohorts (yr = 5), tagging each table
# with its `par` and `yr` values.
stack_rates <- function(FUN) {
  # expand.grid() varies `par` fastest: par 0..4 for yr = 1, then for yr = 5
  panels <- expand.grid(par = c(0, 1, 2, 3, 4), six_year = c(FALSE, TRUE))
  tables <- lapply(seq_len(nrow(panels)), function(i) {
    src <- rate_inputs[[panels$par[i] + 1]]
    cbind(
      par = panels$par[i],
      yr = if (panels$six_year[i]) 5 else 1,
      wratesfunc(src[[1]], src[[2]], src[[3]], src[[4]], panels$six_year[i], FUN)
    )
  })
  do.call(rbind, tables)
}

data <- stack_rates(length)  # unweighted: record counts
dataw <- stack_rates(sum)    # weighted: sums of weights

# Facet strip labels, keyed by the `par` and `yr` values of the rate tables
supp.labs1 <- c(
  "0" = "All",
  "1" = "Parity 0",
  "2" = "Parity 1",
  "3" = "Parity 2",
  "4" = "Parity 3+"
)
supp.labs2 <- c("1" = "Single-year cohorts", "5" = "Six-year cohorts")

# Write the observed-rate scatter plots (rate by age, coloured by cohort,
# faceted by `par` and `yr`) to a PNG file.
#
# Args:
#   filename: path of the PNG to create (30 x 15 cm, 400 dpi).
#   dat:      data frame with columns Group.1 (age), Group.2 (cohort, mapped to
#             a continuous colour scale), x (observed rate), par and yr (facet
#             variables, labelled through the globals supp.labs1 / supp.labs2).
#
# Returns: `filename`, invisibly.
plotfunc <- function(filename, dat) {
  fig <- ggplot(dat, aes(x = Group.1, y = x, color = Group.2)) +
    geom_point(size = 1) +
    labs(x = "Age", y = "Observed rate", color = "Cohort") +
    scale_color_gradientn(colours = rainbow(100, start = 0.3, end = 1), guide = guide_colorbar(barheight = 20, frame.colour = "black", ticks.colour = "black"), breaks = seq(1945, 2000, 5)) +
    scale_y_continuous(breaks = seq(0, 0.5, 0.1)) +
    # x limits drop ages outside 15-44; the y range is only zoomed
    # (coord_cartesian), so rates above 0.5 are clipped from view, not removed
    scale_x_continuous(limits = c(15, 44), breaks = seq(15, 40, 5), minor_breaks = setdiff(15:44, seq(15, 40, 5)), expand = c(0.03, 0.03)) +
    coord_cartesian(ylim = c(0, 0.5)) +
    theme_bw() + theme(legend.position = "right", text = element_text(family = "Calibri")) +
    facet_grid(cols = vars(par), rows = vars(yr), labeller = labeller(par = supp.labs1, yr = supp.labs2))

  png(filename = filename, width = 30, height = 15, units = "cm", res = 400)
  # Close the device even when rendering fails, so an error does not leave a
  # graphics device open.
  on.exit(dev.off(), add = TRUE)
  print(fig)
  invisible(filename)
}

# Figure 3.3: once from the counts, once from the summed weights
plotfunc(filename = "chap3/plots/fig3_uw.png", dat = data)   # unweighted
plotfunc(filename = "chap3/plots/fig3_w.png", dat = dataw)   # weighted

#World map with points indicating countries of birth (Figure 3.4)
# Tabulate the reported countries of birth and attach an ISO3 code to each.
HDItab <- data.frame(table(factor(a_plbornc_all[index])))
colnames(HDItab)[1] <- "Country_org"
HDItab$ISO3 <- countrycode(HDItab$Country_org, "country.name", "iso3c")
# Interactive check: responses that countrycode() could not resolve.
HDItab[which(is.na(HDItab$ISO3)), ]

# Manual ISO3 codes for responses that countrycode() does not resolve.
HDItab$ISO3[HDItab$Country_org %in% c("inapplicable", "england")] <- "GBR"
HDItab$ISO3[HDItab$Country_org == "aden"] <- "YEM"
HDItab$ISO3[HDItab$Country_org == "slovinia"] <- "SVN"
HDItab$ISO3[HDItab$Country_org == "gaudeloupe"] <- "GLP"
HDItab$ISO3[HDItab$Country_org == "union of soviet socialist states"] <- "RUS"
HDItab$ISO3[HDItab$Country_org == "czechoslovakia"] <- "CZE"

# Logical indexing instead of df[-which(cond), ]: when nothing matches,
# -which() is an empty index and would drop every row rather than none.
HDItab <- HDItab[!is.na(HDItab$ISO3), ] # Remove missing/vague responses (all small counts - 1-7 women per response, 24 total)

# Fold historical/alternative names into the row of the current country.
# NOTE(review): the "inapplicable" row updated here is deleted again a few lines
# below, so this first assignment has no effect on the final table - confirm
# whether a combined UK row was intended.
HDItab[HDItab$Country_org == "inapplicable", "Freq"] <- sum(HDItab[HDItab$Country_org %in% c("inapplicable", "england"), "Freq"])
HDItab[HDItab$Country_org == "yemen", "Freq"] <- sum(HDItab[HDItab$Country_org %in% c("yemen", "aden"), "Freq"])
HDItab[HDItab$Country_org == "russia", "Freq"] <- sum(HDItab[HDItab$Country_org %in% c("russia", "union of soviet socialist states"), "Freq"])
HDItab[HDItab$Country_org == "czech republic", "Freq"] <- sum(HDItab[HDItab$Country_org %in% c("czech republic", "czechoslovakia"), "Freq"])
HDItab <- HDItab[!(HDItab$Country_org %in% c("inapplicable", "england", "aden", "union of soviet socialist states", "czechoslovakia")), ]

# Centroid of each country: first matching row of countryref, NA if none.
centroid_row <- match(HDItab$ISO3, countryref$iso3)
HDItab$long <- countryref$centroid.lon[centroid_row]
HDItab$long[HDItab$Country_org == "canada"] <- -110 # manual longitude for Canada's point
HDItab$lat <- countryref$centroid.lat[centroid_row]

# HDI values per country from HDIsurv. match() yields NA for a country missing
# from HDIsurv, where sapply() would silently turn the column into a list.
hdi_row <- match(HDItab$Country_org, HDIsurv$Country_org)
HDItab$HDIr <- HDIsurv$HDIr[hdi_row]
HDItab$HDIc5 <- as.factor(HDIsurv$HDIc5[hdi_row])

# Write a world map with one point per country of birth to a PNG file.
#
# Args:
#   filename: path of the PNG to create (30 x 15 cm, 400 dpi).
#   dat:      data frame with columns long, lat (point position), Freq (number
#             of women; point size is Freq / 20, fixed at 15 above 300) and
#             HDIc5 (colour factor; its levels are labelled "Low", "Medium",
#             "High", "Very high" in level order).
#
# Returns: `filename`, invisibly.
plotfunc <- function(filename, dat) {
  fig <- ggplot(map_data("world"), aes(x = long, y = lat, group = group)) +
    geom_polygon(fill = "white", colour = "darkgray") +
    geom_point(data = dat, aes(x = long, y = lat, group = Freq, color = HDIc5, size = ifelse(Freq > 300, 15, Freq / 20))) +
    # breaks are on the Freq / 20 scale used for the size aesthetic
    scale_size(name = "Number of women", guide = guide_legend(order = 1, nrow = 1), breaks = c(0, 1, 25, 50, 100, 150, 300) / 20, labels = c(0, 1, 25, 50, 100, 150, "300+")) +
    scale_color_discrete(name = "Level of human development", labels = c("Low", "Medium", "High", "Very high"), guide = guide_legend(order = 2)) +
    coord_cartesian(ylim = c(-55, 80)) +
    labs(x = "Longitude", y = "Latitude") +
    theme_gray() + theme(legend.position = "bottom", text = element_text(family = "Calibri"))

  png(filename = filename, width = 30, height = 15, units = "cm", res = 400)
  # Close the device even when rendering fails, so an error does not leave a
  # graphics device open.
  on.exit(dev.off(), add = TRUE)
  print(fig)
  invisible(filename)
}

# Figure 3.4: map of countries of birth
plotfunc(filename = "chap3/plots/fig4.png", dat = HDItab)

#Distribution of HDI by cohort (Figure 3.5)
# Group the selected women (l_ind) by cohort and HDI class; the unnamed list
# keeps aggregate()'s default Group.1/Group.2 column names.
hdi_groups <- list(coh[l_ind], as.factor(HDIc5[l_ind]))
# Number of women per cell
data <- aggregate(weights[l_ind], by = hdi_groups, FUN = length)
# Sum of weights per cell
dataw <- aggregate(weights[l_ind], by = hdi_groups, FUN = sum)

# Write a stacked proportion bar chart of human-development level by cohort to
# a PNG file.
#
# Args:
#   filename: path of the PNG to create (20 x 12 cm, 400 dpi).
#   dat:      data frame with columns Group.1 (cohort, on a continuous x axis),
#             Group.2 (fill factor; its levels are labelled "Low", "Medium",
#             "High", "Very high", "UK-born" in level order) and x (bar height
#             before rescaling to proportions).
#
# Returns: `filename`, invisibly.
plotfunc <- function(filename, dat) {
  fig <- ggplot(dat, aes(x = Group.1, y = x, fill = Group.2)) +
    # position = "fill" rescales every cohort's bars to sum to 1
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_brewer(type = "seq", palette = "YlOrRd", guide = guide_legend(reverse = TRUE, nrow = 2, byrow = TRUE), labels = c("Low", "Medium", "High", "Very high", "UK-born")) +
    labs(x = "Cohort", y = "Proportion", fill = "Level of human development") +
    scale_y_continuous(breaks = seq(0, 1, 0.2)) +
    scale_x_continuous(breaks = seq(1945, 1990, 5), minor_breaks = setdiff(1945:1992, seq(1945, 1990, 5)), expand = c(0.02, 0.02)) +
    theme_gray() + theme(legend.position = "bottom", text = element_text(family = "Calibri"))

  png(filename = filename, width = 20, height = 12, units = "cm", res = 400)
  # Close the device even when rendering fails, so an error does not leave a
  # graphics device open.
  on.exit(dev.off(), add = TRUE)
  print(fig)
  invisible(filename)
}

# Figure 3.5: once from the counts, once from the summed weights
plotfunc(filename = "chap3/plots/fig5_uw.png", dat = data)   # unweighted
plotfunc(filename = "chap3/plots/fig5_w.png", dat = dataw)   # weighted

#Distribution of qualification by cohort (Figure 3.6)
# Group the selected women (l_ind) by cohort and by reversed qualification
# class (4 - qualf4); the unnamed list keeps aggregate()'s default
# Group.1/Group.2 column names.
qual_groups <- list(coh[l_ind], as.factor(4 - qualf4[l_ind]))
# Number of women per cell
data <- aggregate(weights[l_ind], by = qual_groups, FUN = length)
# Sum of weights per cell
dataw <- aggregate(weights[l_ind], by = qual_groups, FUN = sum)

# Write a stacked proportion bar chart of highest educational qualification by
# cohort to a PNG file.
#
# Args:
#   filename: path of the PNG to create (20 x 12 cm, 400 dpi).
#   dat:      data frame with columns Group.1 (cohort, on a continuous x axis),
#             Group.2 (fill factor; its levels are labelled "Degree", "A Level",
#             "GCSE", "< GCSE" in level order) and x (bar height before
#             rescaling to proportions).
#
# Returns: `filename`, invisibly.
plotfunc <- function(filename, dat) {
  fig <- ggplot(dat, aes(x = Group.1, y = x, fill = Group.2)) +
    # position = "fill" rescales every cohort's bars to sum to 1
    geom_bar(stat = "identity", position = "fill") +
    scale_fill_brewer(type = "seq", palette = "YlOrRd", guide = guide_legend(reverse = TRUE), labels = c("Degree", "A Level", "GCSE", "< GCSE")) +
    labs(x = "Cohort", y = "Proportion", fill = "Highest educational qualification") +
    scale_y_continuous(breaks = seq(0, 1, 0.2)) +
    scale_x_continuous(breaks = seq(1945, 1990, 5), minor_breaks = setdiff(1945:1992, seq(1945, 1990, 5)), expand = c(0.02, 0.02)) +
    theme_gray() + theme(legend.position = "bottom", text = element_text(family = "Calibri"))

  png(filename = filename, width = 20, height = 12, units = "cm", res = 400)
  # Close the device even when rendering fails, so an error does not leave a
  # graphics device open.
  on.exit(dev.off(), add = TRUE)
  print(fig)
  invisible(filename)
}

# Figure 3.6: once from the counts, once from the summed weights
plotfunc(filename = "chap3/plots/fig6_uw.png", dat = data)   # unweighted
plotfunc(filename = "chap3/plots/fig6_w.png", dat = dataw)   # weighted
