# ERGM FOR P3 FOR CHINA 

# Select from global table all P3 data that have a CN country of residence code
# IMPORTANT: relevant node attributes go into the Nodes list. 
# Everything to the right of the first column in a node list is an attribute of the nodes. 
# Information about the edges goes into the edge list. Everything to the right 
# of the first two columns in the edge list covers edge attributes 

# Also important: all the nodes that appear in the edge list must appear in the node list also! 

# Load packages.
# Run this script in a fresh R session: it does not clear the global
# environment itself, so sourcing it never destroys unrelated objects.
pacman::p_load(pacman, tidyverse, ggplot2)

#load the packages
library("igraph")
#library("plyr")
# library("RColorBrewer")

# Read the P3 China edge list. Only the first two columns are used to build
# the graph; any further columns are edge attributes.
edgelist <- read.csv(
  "data/ERGM2/ERGM2_CN/P3/ERGM2_CN_P3_edgelist_ID.csv",
  header = TRUE
)

# Read the P3 China node list (one row per node, attributes in the columns)
nodelist <- read.csv(
  "data/ERGM2/ERGM2_CN/P3/ERGM2_CN_P3_nodelist_NO_ID_FINAL.csv",
  header = TRUE
)

# Add a running ID column to the node list. seq_len() is used instead of
# 1:nrow() so that an empty node list yields an empty ID vector rather than
# c(1, 0).
nodelist$ID <- seq_len(nrow(nodelist))

# The edge list must be two columns only, so the edge attributes are ignored
# for now. igraph requires a matrix rather than a data frame.
el_no_weight <- as.matrix(edgelist[, 1:2])

# Convert the IDs to character so they are preserved as vertex names
el_no_weight[, 1] <- as.character(el_no_weight[, 1])
el_no_weight[, 2] <- as.character(el_no_weight[, 2])

# Build the directed graph from the two-column character edge matrix.
# graph_from_edgelist() replaces the deprecated graph.edgelist().
library(igraph)
CNtest <- graph_from_edgelist(el_no_weight, directed = TRUE)
CNtest

# Finally, add attributes.
# First link each vertex name to its row in the node attribute table.
linked_ids <- match(V(CNtest)$name, nodelist$ID)
linked_ids

# Every node in the edge list must also appear in the node list; otherwise
# the attributes assigned below are NA for that vertex.
if (anyNA(linked_ids)) {
  warning(
    sum(is.na(linked_ids)),
    " vertices in the edge list have no matching row in the node list; ",
    "their attributes will be NA",
    call. = FALSE
  )
}

# Then use that mapping to assign each attribute to the vertices
V(CNtest)$CountryOfPublicationCode <- nodelist$CountryOfPublicationCode[linked_ids]
V(CNtest)$ApplicantTypeCode <- nodelist$ApplicantTypeCode[linked_ids]
V(CNtest)$IPCNumberCode <- nodelist$IPCNumberCode[linked_ids]
V(CNtest)$CountryOfPublicationCode

# NAs are not allowed. The commented-out lines below drop vertices whose attribute is NA or 0.
# Update the column names if this step is still needed.
#CNtest <- delete.vertices(CNtest, which(is.na(V(CNtest)$Region) | V(CNtest)$Region == 0))
#CNtest <- delete.vertices(CNtest, which(is.na(V(CNtest)$Organisation) | V(CNtest)$Organisation == 0))
#CNtest <- delete.vertices(CNtest, which(is.na(V(CNtest)$Recency) | V(CNtest)$Recency == 0))

# Calculate the assortativity of the network, separately for each vertex
# attribute. The attribute values are passed positionally because the second
# argument was renamed in igraph (`types1` is deprecated in favour of
# `values`); the positional form works with both signatures.
assortativity(CNtest, as.numeric(V(CNtest)$CountryOfPublicationCode))
assortativity(CNtest, as.numeric(V(CNtest)$ApplicantTypeCode))
assortativity(CNtest, as.numeric(V(CNtest)$IPCNumberCode))


# Now over to the ERGM
library(statnet)
library(intergraph)

# Convert the igraph graph into a network object that statnet can work with
statnetCN <- asNetwork(CNtest)
statnetCN

# Draw the network in a single-panel layout
par(mfrow = c(1, 1))
plot(statnetCN, vertex.cex = 1, vertex.col = "tomato")

# Build an ERGM, starting with the edges term only.
# Are the edges significant relative to a random graph?
# This edges-only model serves as the random-graph baseline.

random_graph <- ergm(
  statnetCN ~ edges,
  control = control.ergm(seed = 1234)
)
warnings()
random_graph

# Convert a coefficient from log-odds back to a probability (inverse logit).
# For the edges-only model the result is the probability of an edge in the
# graph.
#
# Args:
#   logit: numeric vector of log-odds.
# Returns:
#   Numeric vector of probabilities, the same length as `logit`.
#
# stats::plogis() computes exp(x) / (1 + exp(x)) in a numerically stable way.
# The hand-written formula returns NaN for large log-odds because exp()
# overflows to Inf and Inf / Inf is NaN.
inv.logit <- function(logit) {
  stats::plogis(logit)
}

theta <- coef(random_graph)
theta

# Take the edges coefficient straight from the fitted model instead of
# copying a rounded value by hand, so it cannot go stale when the data change.
theta2 <- theta[["edges"]]
inv.logit(theta2)

# The back-transformed edges coefficient should match the density of the
# graph. Double check this:
network.density(statnetCN)

# Provide summary statistics of the random graph
summary(random_graph)

# Let's compare our observed graph to some simulations 
#set.seed(1234)
# hundred_simulations <- simulate(random_graph, 
                                #coef = theta,
                                #nsim = 100,
                                #control = control.simulate.ergm(MCMC.burnin = 1000,
                                                                # MCMC.interval = 1000))
# ... and let's examine the first nine of them and plot them: 
# This takes ages and is not really needed. 
# par(mfrow = c(3, 3))
# sapply(hundred_simulations[1:9], plot, vertex.cex = 1, vertex.col = "tomato")

# We can compare the number of edges our observed graph has to the average of the simulated networks:
# net_densities <- unlist(lapply(hundred_simulations, network.density))

# par(mfrow = c(1, 1))
# hist(net_densities, xlab = "Density", main = "", col = "lightgray")
# abline(v = network.density(statnetCN), col = "red", lwd = 3, lty = 2)
# abline(v = mean(net_densities), col = "blue", lwd = 3, lty = 1)

# Plot some Goodness of Fit statistics. This is to demonstrate that in- and outdegrees are way off. 
# gof_stats <- gof(random_graph)

# par(mfrow = c(3, 2))
# plot(gof_stats, main = '')

# The edges-only model is not enough. Add a homophily (nodematch) term for
# each of the three vertex attributes.
model1 <- ergm(
  statnetCN ~ edges +
    nodematch("CountryOfPublicationCode") +
    nodematch("ApplicantTypeCode") +
    nodematch("IPCNumberCode")
)
summary(model1)


# Goodness-of-fit diagnostics for model1, plotted on a 3 x 2 grid
ergm_stats <- gof(model1)

par(mfrow = c(3, 2))
plot(ergm_stats, main = "")

# Finally, a model that adds a term for reciprocity (mutual) and a term for
# triadic closure, using the gwesp term. Look this up.
# Perhaps leave this for some other time.
#
# The seed matches the one used for the edges-only fit so that the MCMC-based
# estimates are reproducible between runs.
model4 <- ergm(
  statnetCN ~ edges +
    nodematch("CountryOfPublicationCode") +
    nodematch("ApplicantTypeCode") +
    nodematch("IPCNumberCode") +
    mutual +
    gwesp(0.4, fixed = TRUE),
  control = control.ergm(MCMLE.maxit = 40, seed = 1234)
)

summary(model4)

# Goodness-of-fit diagnostics for model4, plotted on a 3 x 2 grid
ergm_stats2 <- gof(model4)

par(mfrow = c(3, 2))
plot(ergm_stats2, main = "")

# Get better insight into the nodematch terms
# Double check if helpful at all.
#
# This model is disabled. The whole call is commented out: leaving only its
# first line commented produces unbalanced parentheses, and the resulting
# parse error prevents the script from being sourced.

# model5 <- ergm(
#   statnetCN ~ edges +
#     nodematch("CountryOfPublicationCode", diff = TRUE,
#               levels = c("2", "4", "5", "6", "8", "9")) +
#     nodematch("ApplicantTypeCode", diff = TRUE,
#               levels = c("15", "16", "17", "18", "19")) +
#     nodematch("IPCNumberCode", diff = TRUE,
#               levels = c("21", "22", "23", "24", "25", "26", "27", "28",
#                          "29", "30", "31", "32", "34", "37", "40")) +
#     mutual +
#     gwesp(0.4, fixed = TRUE),
#   control = control.ergm(MCMLE.maxit = 40)
# )

# summary(model5)


