##Multiple imputation of qualification
library(rstan)
library(dplyr)
library(ggplot2)
library(mgcv)

#Fit original imputation model
cutoff <- 1982
datm <- data.frame(y=4-qualf4[l_ind],coh=coh[l_ind],HDIc3e=as.factor(HDIc3e[l_ind]))
gam4.3e <- gam(list(y~1+coh+HDIc3e+coh:HDIc3e,~1+coh+HDIc3e+coh:HDIc3e,~1+coh+HDIc3e+coh:HDIc3e),family=multinom(K=3),data=datm,subset=coh<=cutoff)

#Fit Bayesian version of imputation model
dat <- aggregate(qualf4 ~ coh + HDIc3e, FUN = function(x)
  c(y=length(x), q1=sum(x==1), q2=sum(x==2), q3=sum(x==3), q4=sum(x==4)),
  subset=intersect(l_ind,which(coh<=cutoff)))
dat <- data.frame(dat$coh,dat$HDIc3e,dat$qualf4)
colnames(dat) <- c("c","h","y",paste0(1:4))
y <- dat[,paste0(1:4)]
N <- nrow(y)
cc <- dat$c - median(1945:1982)
h <- dat$h

standata <- list(N=N,y=y,cc=cc,h=h)
stanout <- stan(file="chap4/stan/Qimp.stan",data=standata,chains=1,iter=2000)
save(stanout, file="chap4/results/Qimp.RData")

#Forecast
datf <- expand.grid(c=1945:1992, h=1:3)
datf$cc <- datf$c - median(1945:cutoff)
beta0 <- extract(stanout,"beta0",permuted=F)
betaC <- extract(stanout,"betaC",permuted=F)
betaH1 <- extract(stanout,"betaH1",permuted=F)
betaH2 <- extract(stanout,"betaH2",permuted=F)
betaH3 <- extract(stanout,"betaH3",permuted=F)
betaCH1 <- extract(stanout,"betaCH1",permuted=F)
betaCH2 <- extract(stanout,"betaCH2",permuted=F)
betaCH3 <- extract(stanout,"betaCH3",permuted=F)

Yhat <- list()
for (i in 1:1000) {
  Yhat[[i]] <- matrix(0,nrow(datf),4)
  Yhat[[i]][,2] <- beta0[i,1,1] + betaC[i,1,1]*datf$cc + betaH1[i,1,datf$h] + betaCH1[i,1,datf$h]*datf$cc
  Yhat[[i]][,3] <- beta0[i,1,2] + betaC[i,1,2]*datf$cc + betaH2[i,1,datf$h] + betaCH2[i,1,datf$h]*datf$cc
  Yhat[[i]][,4] <- beta0[i,1,3] + betaC[i,1,3]*datf$cc + betaH3[i,1,datf$h] + betaCH3[i,1,datf$h]*datf$cc
  Yhat[[i]] <- exp(Yhat[[i]])/apply(exp(Yhat[[i]]),1,sum)
}

#Imputation set-up
dat <- aggregate(qualf4 ~ coh + HDIc3e, FUN = function(x) 
  c(y=length(x), q1=sum(x==1), q2=sum(x==2), q3=sum(x==3), q4=sum(x==4)),
  subset=l_ind)
dat <- data.frame(dat$coh,dat$HDIc3e,dat$qualf4)
colnames(dat) <- c("c","h","y",paste0(1:4))

#Imputation
Nimp <- 10
set.seed(1)
imp.ind <- sample(1000,10)
theta_red <- Yhat[imp.ind]

countfunc <- function (x) c(sum(x==1),sum(x==2),sum(x==3),sum(x==4))
set.seed(1)
unic <- coh[l_ind]
unih3 <- HDIc3e[l_ind]
uids <- id[l_ind]
uniq <- qualf4[l_ind]
uniqimp <- list()
uniqreq <- list()
for (n in 1:Nimp) {
  uniqimp[[n]] <- uniq
  uniqreq[[n]] <- list()
  uniqreq[[n]][[1]] <- uniqreq[[n]][[2]] <- uniqreq[[n]][[3]] <- matrix(0,10,4)
  for (cohort in c((cutoff+1):1992)) {
    for (i in 1:3) {
      fit <- theta_red[[n]][((i-1)*length(1945:1992))+(cohort-1944),]
      tot <- sum(unic==cohort & unih3==i)
      obsr <- as.numeric(rmultinom(1,tot,fit))
      fitn <- obsr/tot
      icoh <- which(unic==cohort & unih3==i)
      cids <- uids[icoh]
      oriq <- uniq[icoh]
      newq <- uniq[icoh]
      #Move < O Level to O Level
      size <- countfunc(newq)[1]-obsr[1]
      if (size<0) {
        obsd <- c(countfunc(newq)[1],(sum(countfunc(newq)[2:4])*fitn[2:4])/sum(fitn[2:4]))
        obsr <- round(obsd)
      } 
      if (sum(obsr)-tot==1) {
        obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]-1
      }
      if (sum(obsr)-tot==-1) {
        obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]+1
      }
      size <- countfunc(newq)[1]-obsr[1]
      if (size>0) {
        x <- cids[newq==1]
        if (length(x)==1) ind1 <- x
        if (length(x)>1) ind1 <- sample(x,size)
        newq[cids %in% ind1] <- 2
      }
      #Move O Level to A Level
      size <- countfunc(newq)[2]-obsr[2]
      if (size<0) {
        obsd <- c(countfunc(newq)[1:2],(sum(countfunc(newq)[3:4])*fitn[3:4])/sum(fitn[3:4]))
        obsr <- round(obsd)
      } 
      if (sum(obsr)-tot==1) {
        obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]-1
      }
      if (sum(obsr)-tot==-1) {
        obsr[which.min(abs(obsd%%1-0.5))] <- obsr[which.min(abs(obsd%%1-0.5))]+1
      }
      size <- countfunc(newq)[2]-obsr[2]
      if (size>0) {
        x <- cids[newq==2]
        if (length(x)==1) ind2 <- x
        if (length(x)>1) ind2 <- sample(x,size)
        newq[cids %in% ind2] <- 3
      }
      #Move A Level to Degree
      size <- countfunc(newq)[3]-obsr[3]
      if (size<0) {
        obsr <- countfunc(newq)
      }
      if (size>0) {
        x <- cids[newq==3]
        if (length(x)==1) ind3 <- x
        if (length(x)>1) ind3 <- sample(x,size)
        newq[cids %in% ind3] <- 4  
      }
      uniqimp[[n]][icoh] <- newq
      uniqreq[[n]][[i]][cohort-cutoff,] <- fitn
    }
  } 
}

#Update Q
iddat <- data.frame(id=1:length(uids))
rownames(iddat) <- uids
qualf4MI <- list()
qualf2bMI <- list()
qualf3bMI <- list()
for (n in 1:Nimp) {
  idimp <- uids[which(uniqimp[[n]] != uniq)]
  qualf4MI[[n]] <- qualf4
  qualf4MI[[n]][id %in% idimp] <- uniqimp[[n]][iddat[paste(id[id %in% idimp]),"id"]]
  qualf2bMI[[n]] <- ifelse(qualf4MI[[n]] < 3, 1, 2)
  qualf3bMI[[n]] <- ifelse(qualf4MI[[n]] > 2, qualf4MI[[n]]-1, qualf4MI[[n]])
}

qualf4MI_0 <- qualf2bMI_1 <- qualf3bMI_2 <- list()
for (n in 1:Nimp) {
  qualf4MI_0[[n]] <- qualf4MI[[n]][parityc == 0]
  qualf2bMI_1[[n]] <- qualf2bMI[[n]][parityc == 1]
  qualf3bMI_2[[n]] <- qualf3bMI[[n]][parityc == 2]
}

#Plots
#Labels
supp.labs1 <- c("Low/Medium HDI","High/Very high HDI","UK-born")
names(supp.labs1) <- c(1,2,3)
supp.labsi <- paste0("Imputation ",1:10)
names(supp.labsi) <- c(1:10)
supp.labst <- c("Without additional multinomial variation","With additional multinomial variation")
names(supp.labst) <- c(1,2)

#Probability uncertainty
scatterfunc2 <- function(w,c,q,h,ind,FUN) {
  data <- aggregate(w[ind], by=list(c[ind], as.factor(h[ind]), as.factor(4-q[ind])), FUN = FUN)
  data <- right_join(data,expand.grid(Group.1=1945:1992, Group.2=as.factor(1:3), Group.3=as.factor(0:3)),by=c("Group.1","Group.2","Group.3")) %>% mutate_each(function(x) (replace(x, which(is.na(x)), 0)))
  data <- right_join(data, aggregate(w[ind], by=list(c[ind], as.factor(h[ind])), FUN = FUN), by = c("Group.1","Group.2"))
  data$x <- data$x.x/data$x.y
  data
}

thetafm <- Reduce("+",Yhat)/1000
thetaflower <- thetafupper <- thetafm
for (i in 1:nrow(datf)) {
  for (j in 1:4) {
    thetax <- numeric()
    for (k in 1:1000) {
      thetax[k] <- Yhat[[k]][i,j]
    }
    thetaflower[i,j] <- quantile(thetax,p=0.025)
    thetafupper[i,j] <- quantile(thetax,p=0.975)
  }
}

data2 <- scatterfunc2(weights_st, coh, qualf4imp, HDIc3e, l_ind, length)
data2 <- data2[order(data2$Group.3,data2$Group.2,data2$Group.1),]
ymax <- max(data2$x)
data2$mean <- c(thetafm[,4],thetafm[,3],thetafm[,2],thetafm[,1])
data2$lower <- c(thetaflower[,4],thetaflower[,3],thetaflower[,2],thetaflower[,1])
data2$upper <- c(thetafupper[,4],thetafupper[,3],thetafupper[,2],thetafupper[,1])
data2p <- data2
data2p[which(data2p$Group.1>cutoff),] <- NA
data2f <- data2
data2f[which(data2f$Group.1<=cutoff),] <- NA

#Proportion uncertainty
theta_add <- Yhat
set.seed(1)
for (n in 1:1000) {
  for (cohort in 1945:1992) {
    for (i in 1:3) {
      fit <- Yhat[[n]][((i-1)*length(1945:1992))+(cohort-1944),]
      tot <- sum(unic==cohort & unih3==i)
      obsr <- as.numeric(rmultinom(1,tot,fit))
      fitn <- obsr/tot
      theta_add[[n]][((i-1)*length(1945:1992))+(cohort-1944),] <- fitn
    }
  }
}

theta_lower <- theta_upper <- theta_mean <- theta_add[[1]]
for (i in 1:nrow(theta_lower)) {
  for (j in 1:4) {
    a <- numeric()
    for (n in 1:1000) a[n] <- theta_add[[n]][i,j]
    theta_lower[i,j] <- quantile(a,p=0.025)
    theta_upper[i,j] <- quantile(a,p=0.975)
    theta_mean[i,j] <- mean(a)
  }
}
data2$mean2 <- c(theta_mean[,4],theta_mean[,3],theta_mean[,2],theta_mean[,1])
data2$lower2 <- c(theta_lower[,4],theta_lower[,3],theta_lower[,2],theta_lower[,1])
data2$upper2 <- c(theta_upper[,4],theta_upper[,3],theta_upper[,2],theta_upper[,1])
data2p <- data2
data2p[which(data2p$Group.1>cutoff),] <- NA
data2f <- data2
data2f[which(data2f$Group.1<=cutoff),] <- NA

#Plot probability and proportion uncertainty (Figure 4.14)
data2b <- rbind(data.frame(type=1,data2[,1:9]),
                data.frame(type=2,data2[,1:6],mean=data2$mean2,lower=data2$lower2,upper=data2$upper2))
data2bp <- data2b
data2bp[which(data2bp$Group.1>cutoff),] <- NA
data2bf <- data2b
data2bf[which(data2bf$Group.1<=cutoff),] <- NA

png(file="chap4/plots/fig14.png",width=20,height=14.62,units="cm",res=400)
print({ggplot(data2b, aes(x=Group.1, y=x, color = Group.3)) +
  geom_point() +
  geom_line() +
  geom_ribbon(aes(ymin=lower,ymax=upper,fill=Group.3),color=NA,alpha=0.15,show.legend=F) +
  geom_line(aes(x=data2bp$Group.1,y=data2bp$mean,color=data2bp$Group.3),size=1,show.legend=F) +
  geom_line(aes(x=data2bf$Group.1,y=data2bf$mean,color=data2bf$Group.3),size=1,linetype=2,show.legend=F) +
  geom_line(aes(x=data2bp$Group.1,y=data2bp$lower,color=data2bp$Group.3),size=0.75,show.legend=F) +
  geom_line(aes(x=data2bf$Group.1,y=data2bf$lower,color=data2bf$Group.3),size=0.5,linetype=2,show.legend=F) +
  geom_line(aes(x=data2bp$Group.1,y=data2bp$upper,color=data2bp$Group.3),size=0.75,show.legend=F) +
  geom_line(aes(x=data2bf$Group.1,y=data2bf$upper,color=data2bf$Group.3),size=0.5,linetype=2,show.legend=F) +
  geom_vline(aes(xintercept=cutoff)) + 
  labs(x = "Cohort", y = "Proportion", color = "Highest educational qualification") +
  scale_color_discrete(guide = guide_legend(reverse=TRUE), labels = c("Degree","A Level","GCSE","< GCSE")) +
  scale_y_continuous(breaks=seq(0,1,0.2)) +
  scale_x_continuous(limits=c(1945,1992), breaks=seq(1945,1990,5), minor_breaks=setdiff(1945:1992,seq(1945,1990,5)), expand=c(0.02,0.02)) +
  coord_cartesian(ylim=c(0,1)) +
  theme_gray() + theme(axis.text.x = element_text(angle = 90, vjust=0.45), legend.position = "bottom", text = element_text("Calibri")) +
  facet_grid(cols=vars(Group.2), rows=vars(type), labeller = labeller(Group.2=supp.labs1,type=supp.labst))
})
dev.off()

#Plot imputations (Figures 4.15-4.16)
data3 <- data.frame()
for (n in 1:Nimp) {
  data2b <- scatterfunc2(weights_st, coh, qualf4MI[[n]], HDIc3e, l_ind, length)
  data2b <- data2b[order(data2b$Group.3,data2b$Group.2,data2b$Group.1),]
  tmp <- cbind(data2,ximp=data2b$x,imp=n,req=NA)
  for (j in 1:3) {
    tmp$req[tmp$Group.1 %in% (cutoff+1):1992 & tmp$Group.2==j & tmp$Group.3==0] <- uniqreq[[n]][[j]][,4]
    tmp$req[tmp$Group.1 %in% (cutoff+1):1992 & tmp$Group.2==j & tmp$Group.3==1] <- uniqreq[[n]][[j]][,3]
    tmp$req[tmp$Group.1 %in% (cutoff+1):1992 & tmp$Group.2==j & tmp$Group.3==2] <- uniqreq[[n]][[j]][,2]
    tmp$req[tmp$Group.1 %in% (cutoff+1):1992 & tmp$Group.2==j & tmp$Group.3==3] <- uniqreq[[n]][[j]][,1]
  }
  data3 <- rbind(data3,tmp)
}
data3$mean2 <- rep(data2$mean2,Nimp)
data3$lower2 <- rep(data2$lower2,Nimp)
data3$upper2 <- rep(data2$upper2,Nimp)
data3p <- data3
data3p[which(data3p$Group.1>cutoff),] <- NA
data3f <- data3
data3f[which(data3f$Group.1<=cutoff),] <- NA

plotfunc <- function(impn,filename) {
  png(file=filename,width=30,height=15,units="cm",res=400)
  print({ggplot(data3[impn,], aes(x=Group.1, y=x, color = Group.3)) +
      geom_point(aes(y=ximp)) +
      geom_line(aes(y=ximp)) +
      geom_point(aes(y=req),pch=1) +
      geom_line(aes(y=req),linetype=2) +
      geom_ribbon(aes(ymin=lower2,ymax=upper2,fill=Group.3),color=NA,alpha=0.15,show.legend=F) +
      geom_line(aes(x=data3p$Group.1[impn],y=data3p$mean2[impn],color=data3p$Group.3[impn]),size=1,show.legend=F) +
      geom_line(aes(x=data3f$Group.1[impn],y=data3f$mean2[impn],color=data3f$Group.3[impn]),size=1,linetype=2,show.legend=F) +
      geom_line(aes(x=data3p$Group.1[impn],y=data3p$lower2[impn],color=data3p$Group.3[impn]),size=0.75,show.legend=F) +
      geom_line(aes(x=data3f$Group.1[impn],y=data3f$lower2[impn],color=data3f$Group.3[impn]),size=0.5,linetype=2,show.legend=F) +
      geom_line(aes(x=data3p$Group.1[impn],y=data3p$upper2[impn],color=data3p$Group.3[impn]),size=0.75,show.legend=F) +
      geom_line(aes(x=data3f$Group.1[impn],y=data3f$upper2[impn],color=data3f$Group.3[impn]),size=0.5,linetype=2,show.legend=F) +
      geom_vline(aes(xintercept=cutoff)) + 
      labs(x = "Cohort", y = "Proportion", color = "Highest educational qualification") +
      scale_color_discrete(guide = guide_legend(reverse=TRUE), labels = c("Degree","A Level","GCSE","< GCSE")) +
      scale_y_continuous(breaks=seq(0,1,0.2)) +
      scale_x_continuous(limits=c(1945,1992), breaks=seq(1945,1990,5), minor_breaks=setdiff(1945:1992,seq(1945,1990,5)), expand=c(0.02,0.02)) +
      coord_cartesian(ylim=c(0,1)) +
      theme_gray() + theme(axis.text.x = element_text(angle = 90, vjust=0.45), legend.position = "bottom", text = element_text("Calibri")) +
      facet_grid(rows=vars(Group.2), cols=vars(imp), labeller = labeller(imp=supp.labsi,Group.2=supp.labs1))
  })
  dev.off()  
}

imp1to5 <-  data3$imp<=5
imp6to10 <- data3$imp>5
plotfunc(imp1to5,  "chap4/plots/fig15.png")
plotfunc(imp6to10, "chap4/plots/fig16.png")

save(qualf4MI,qualf2bMI,qualf3bMI,qualf4MI_0,qualf2bMI_1,qualf3bMI_2,
     file="chap4/results/QMI.RData")
