##Imputing Qualification
library(ggplot2)
library(dplyr)
library(mgcv)
windowsFonts(Calibri=windowsFont("Calibri"))

#Set-up
dat <- data.frame(y=4-qualf4[l_ind],coh=coh[l_ind],
                  HDIc2a=as.factor(HDIc2a[l_ind]),HDIc2b=as.factor(HDIc2b[l_ind]),HDIc2c=as.factor(HDIc2c[l_ind]),HDIc2d=as.factor(HDIc2d[l_ind]),
                  HDIc3a=as.factor(HDIc3a[l_ind]),HDIc3b=as.factor(HDIc3b[l_ind]),HDIc3c=as.factor(HDIc3c[l_ind]),HDIc3d=as.factor(HDIc3d[l_ind]),HDIc3e=as.factor(HDIc3e[l_ind]),HDIc3f=as.factor(HDIc3f[l_ind]),
                  HDIc4a=as.factor(HDIc4a[l_ind]),HDIc4b=as.factor(HDIc4b[l_ind]),HDIc4c=as.factor(HDIc4c[l_ind]),HDIc4d=as.factor(HDIc4d[l_ind]),
                  HDIc5=as.factor(HDIc5[l_ind]),weight=weights_st[l_ind])

#Imputation
cutoff <- 1982
gam4.3e <- gam(list(y~1+coh+HDIc3e+coh:HDIc3e,~1+coh+HDIc3e+coh:HDIc3e,~1+coh+HDIc3e+coh:HDIc3e),family=multinom(K=3),data=dat,subset=coh<=cutoff)
countfunc <- function (x) c(sum(x==1),sum(x==2),sum(x==3),sum(x==4))
set.seed(1)
unic <- coh[l_ind]
unih3 <- HDIc3e[l_ind]
uids <- id[l_ind]
uniq <- qualf4[l_ind]
uniq2 <- uniq
for (cohort in c((cutoff+1):1992)) {
  for (i in 1:3) {
    fit <- predict(gam4.3e,newdata=data.frame(coh=cohort,HDIc3e=i),type="response")[4:1]
    tot <- sum(unic==cohort & unih3==i)
    obsd <- tot*fit
    obsr <- round(obsd)
    if (sum(obsr)-tot==1) {
      obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]-1
    }
    if (sum(obsr)-tot==-1) {
      obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]+1
    }
    icoh <- which(unic==cohort & unih3==i)
    cids <- uids[icoh]
    oriq <- uniq[icoh]
    newq <- uniq[icoh]
    #Move < O Level to O Level
    size <- countfunc(newq)[1]-obsr[1]
    if (size<0) {
      obsd <- c(countfunc(newq)[1],(sum(countfunc(newq)[2:4])*fit[2:4])/sum(fit[2:4]))
      obsr <- round(obsd)
    } 
    if (sum(obsr)-tot==1) {
      obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]-1
    }
    if (sum(obsr)-tot==-1) {
      obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]+1
    }
    size <- countfunc(newq)[1]-obsr[1]
    if (size>0) {
      x <- cids[newq==1]
      if (length(x)==1) ind1 <- x
      if (length(x)>1) ind1 <- sample(x,size)
      newq[cids %in% ind1] <- 2
    }
    #Move O Level to A Level
    size <- countfunc(newq)[2]-obsr[2]
    if (size<0) {
      obsd <- c(countfunc(newq)[1:2],(sum(countfunc(newq)[3:4])*fit[3:4])/sum(fit[3:4]))
      obsr <- round(obsd)
    } 
    if (sum(obsr)-tot==1) {
      obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]-1
    }
    if (sum(obsr)-tot==-1) {
      obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]+1
    }
    size <- countfunc(newq)[2]-obsr[2]
    if (size>0) {
      x <- cids[newq==2]
      if (length(x)==1) ind2 <- x
      if (length(x)>1) ind2 <- sample(x,size)
      newq[cids %in% ind2] <- 3
    }
    #Move A Level to Degree
    size <- countfunc(newq)[3]-obsr[3]
    if (size<0) {
      obsr <- countfunc(newq)
    }
    if (size>0) {
      x <- cids[newq==3]
      if (length(x)==1) ind3 <- x
      if (length(x)>1) ind3 <- sample(x,size)
      newq[cids %in% ind3] <- 4  
    }
    uniq2[icoh] <- newq   
  }
}

#Update Q
idimp <- uids[which(uniq2 != uniq)]
iddat <- data.frame(id=1:length(uids))
rownames(iddat) <- uids

qualf4imp <- qualf4
qualf4imp[id %in% idimp] <- uniq2[iddat[paste(id[id %in% idimp]),"id"]]
qualf2aimp <- ifelse(qualf4imp < 2, 1, 2)
qualf2bimp <- ifelse(qualf4imp < 3, 1, 2)
qualf2cimp <- ifelse(qualf4imp < 4, 1, 2)
qualf3aimp <- ifelse(qualf4imp > 3, qualf4imp-1, qualf4imp)
qualf3bimp <- ifelse(qualf4imp > 2, qualf4imp-1, qualf4imp)
qualf3cimp <- ifelse(qualf4imp > 1, qualf4imp-1, qualf4imp)

qualf2aimp_0 <- qualf2aimp[parityc == 0]
qualf2aimp_1 <- qualf2aimp[parityc == 1]
qualf2aimp_2 <- qualf2aimp[parityc == 2]
qualf2aimp_3 <- qualf2aimp[parityc == 3]

qualf2bimp_0 <- qualf2bimp[parityc == 0]
qualf2bimp_1 <- qualf2bimp[parityc == 1]
qualf2bimp_2 <- qualf2bimp[parityc == 2]
qualf2bimp_3 <- qualf2bimp[parityc == 3]

qualf2cimp_0 <- qualf2cimp[parityc == 0]
qualf2cimp_1 <- qualf2cimp[parityc == 1]
qualf2cimp_2 <- qualf2cimp[parityc == 2]
qualf2cimp_3 <- qualf2cimp[parityc == 3]

qualf3aimp_0 <- qualf3aimp[parityc == 0]
qualf3aimp_1 <- qualf3aimp[parityc == 1]
qualf3aimp_2 <- qualf3aimp[parityc == 2]
qualf3aimp_3 <- qualf3aimp[parityc == 3]

qualf3bimp_0 <- qualf3bimp[parityc == 0]
qualf3bimp_1 <- qualf3bimp[parityc == 1]
qualf3bimp_2 <- qualf3bimp[parityc == 2]
qualf3bimp_3 <- qualf3bimp[parityc == 3]

qualf3cimp_0 <- qualf3cimp[parityc == 0]
qualf3cimp_1 <- qualf3cimp[parityc == 1]
qualf3cimp_2 <- qualf3cimp[parityc == 2]
qualf3cimp_3 <- qualf3cimp[parityc == 3]

qualf4imp_0 <- qualf4imp[parityc == 0]
qualf4imp_1 <- qualf4imp[parityc == 1]
qualf4imp_2 <- qualf4imp[parityc == 2]
qualf4imp_3 <- qualf4imp[parityc == 3]

#Plots
#Labels
supp.labs1 <- c("Low/Medium HDI","High/Very high HDI","UK-born")
names(supp.labs1) <- c(1,2,3)
supp.labst <- c("Unweighted","Weighted")
names(supp.labst) <- c(1,2)

#Distribution of qualification by cohort - scatter plot (Figure 3.12)
scatterfunc <- function(w,c,q,ind,FUN) {
  data <- aggregate(w[ind], by=list(c[ind], as.factor(4-q[ind])), FUN = FUN)
  data <- right_join(data,expand.grid(Group.1=1945:1992, Group.2=as.factor(0:3)),by=c("Group.1","Group.2")) %>% mutate_each(function(x) (replace(x, which(is.na(x)), 0)))
  data <- right_join(data, aggregate(w[ind], by=list(c[ind]), FUN = FUN), by = "Group.1")
  data$x <- data$x.x/data$x.y
  data
}

data <- scatterfunc(weights, coh, qualf4, l_ind, length)
data <- data[order(data$Group.2,data$Group.1),]
dataw <- scatterfunc(weights, coh, qualf4, l_ind, sum)
dataw <- dataw[order(dataw$Group.2,dataw$Group.1),]
ymax <- max(data$x)

plotfunc <- function(filename,dat) {
  png(file=filename,width=20,height=12,units="cm",res=400)
  print({
    ggplot(dat, aes(x=Group.1, y=x, color = Group.2)) +
      geom_point() +
      geom_line(linetype = 2) +
      geom_line(aes(x=Group.1[order(Group.2)],y=unlist(tapply(x,Group.2,rollmean,k=3,fill=NA,align="right")),color=sort(Group.2)), size=1, show.legend = F) +
      labs(x = "Cohort", y = "Proportion", color = "Highest educational qualification") +
      scale_color_discrete(guide = guide_legend(reverse=TRUE), labels = c("Degree","A Level","GCSE","< GCSE")) +
      scale_y_continuous(breaks=seq(0,1,0.2)) +
      scale_x_continuous(limits=c(1945,1992), breaks=seq(1945,1990,5), minor_breaks=setdiff(1945:1992,seq(1945,1990,5)), expand=c(0.02,0.02)) +
      coord_cartesian(ylim=c(0,ymax)) +
      theme_gray() + theme(legend.position = "bottom", text = element_text("Calibri"))
  })
  dev.off()  
}

plotfunc("chap3/plots/fig12_uw.png", data)  # unweighted
plotfunc("chap3/plots/fig12_w.png", dataw)  # weighted

#Distribution of imputed qualification by cohort and HDI (Figure 3.14)
scatterfunc2 <- function(w,c,q,h,ind,FUN) {
  data <- aggregate(w[ind], by=list(c[ind], as.factor(h[ind]), as.factor(4-q[ind])), FUN = FUN)
  data <- right_join(data,expand.grid(Group.1=1945:1992, Group.2=as.factor(1:3), Group.3=as.factor(0:3)),by=c("Group.1","Group.2","Group.3")) %>% mutate(across(.cols=everything(), function(x) (replace(x, which(is.na(x)), 0))))
  data <- right_join(data, aggregate(w[ind], by=list(c[ind], as.factor(h[ind])), FUN = FUN), by = c("Group.1","Group.2"))
  data$x <- data$x.x/data$x.y
  data
}

data2 <- scatterfunc2(weights_st, coh, qualf4, HDIc3e, l_ind, length)
data2 <- data2[order(data2$Group.3,data2$Group.2,data2$Group.1),]
data2a <- scatterfunc2(weights_st, coh, qualf4imp, HDIc3e, l_ind, length)
data2a <- data2a[order(data2a$Group.3,data2a$Group.2,data2a$Group.1),]
dataa <- scatterfunc(weights_st, coh, qualf4imp, l_ind, length)
dataa <- dataa[order(dataa$Group.2,dataa$Group.1),]
dataaw <- scatterfunc(weights_st, coh, qualf4imp, l_ind, sum)
dataaw <- dataaw[order(dataaw$Group.2,dataaw$Group.1),]
newdata <- data.frame(expand.grid(coh=1945:1992,HDIc3e=as.factor(1:3)))
data2a$pred <- as.vector(predict(gam4.3e,newdata=newdata,type="response"))
data2a$cutoff <- dataa$cutoff <- dataaw$cutoff <- cutoff
data2a$xorg <- data2$x
dataa$xorg <- data$x
dataaw$xorg <- dataw$x
ymax2 <- max(data2$x)

plotfunc <- function(filename,dat) {
  png(file=filename,width=20,height=9,units="cm",res=400)
  print({
    ggplot(dat, aes(x=Group.1, y=x, color = Group.3)) +
      geom_line(aes(y=xorg), linetype = 2) +
      geom_point(aes(y=xorg), shape = 1) +
      geom_point() +
      geom_line() +
      geom_line(aes(y=pred),size=1,show.legend=F) +
      geom_vline(aes(xintercept=cutoff)) + 
      labs(x = "Cohort", y = "Proportion", color = "Highest educational qualification") +
      scale_color_discrete(guide = guide_legend(reverse=TRUE), labels = c("Degree","A Level","GCSE","< GCSE")) +
      scale_y_continuous(breaks=seq(0,1,0.2)) +
      scale_x_continuous(limits=c(1945,1992), breaks=seq(1945,1990,5), minor_breaks=setdiff(1945:1992,seq(1945,1990,5)), expand=c(0.02,0.02)) +
      coord_cartesian(ylim=c(0,ymax2)) +
      theme_gray() + theme(axis.text.x = element_text(angle = 90, vjust=0.45), legend.position = "bottom", text = element_text("Calibri")) +
      facet_grid(cols=vars(Group.2), labeller = labeller(Group.2=supp.labs1))
  })
  dev.off()  
}

plotfunc("chap3/plots/fig14.png", data2a)

#Distribution of imputed qualification by cohort, unweighted and weighted (Figure 3.15)
dataa2 <- rbind(data.frame(type=1,dataa),
                data.frame(type=2,dataaw))

plotfunc <- function(filename,dat) {
  png(file=filename,width=20,height=9,units="cm",res=400)
  print({
    ggplot(dat, aes(x=Group.1, y=x, color = Group.2)) +
      geom_line(aes(y=xorg), linetype = 2) +
      geom_point(aes(y=xorg), shape = 1) +
      geom_point() +
      geom_line() +
      geom_vline(aes(xintercept=cutoff)) + 
      labs(x = "Cohort", y = "Proportion", color = "Highest educational qualification") +
      scale_color_discrete(guide = guide_legend(reverse=TRUE), labels = c("Degree","A Level","GCSE","< GCSE")) +
      scale_y_continuous(breaks=seq(0,1,0.2)) +
      scale_x_continuous(limits=c(1945,1992), breaks=seq(1945,1990,5), minor_breaks=setdiff(1945:1992,seq(1945,1990,5)), expand=c(0.02,0.02)) +
      coord_cartesian(ylim=c(0,ymax)) +
      theme_gray() + theme(legend.position = "bottom", text = element_text("Calibri")) +
      facet_wrap(~type,labeller=labeller(type=supp.labst))
  })
  dev.off()  
}

plotfunc("chap3/plots/fig15.png", dataa2)

save(qualf2aimp, qualf2aimp_0, qualf2aimp_1, qualf2aimp_2, qualf2aimp_3,
     qualf2bimp, qualf2bimp_0, qualf2bimp_1, qualf2bimp_2, qualf2bimp_3,
     qualf2cimp, qualf2cimp_0, qualf2cimp_1, qualf2cimp_2, qualf2cimp_3,
     qualf3aimp, qualf3aimp_0, qualf3aimp_1, qualf3aimp_2, qualf3aimp_3,
     qualf3bimp, qualf3bimp_0, qualf3bimp_1, qualf3bimp_2, qualf3bimp_3,
     qualf3cimp, qualf3cimp_0, qualf3cimp_1, qualf3cimp_2, qualf3cimp_3,
     qualf4imp, qualf4imp_0, qualf4imp_1, qualf4imp_2, qualf4imp_3,
     file="chap3/results/Qmeanimp.RData")
