# THIS IS THE ONE THAT'S WORKING SO KEEP
# This is code to generate the summary statistics of the structural properties of the US patent networks. 


# Clear the workspace and load the packages used by this script.
# NOTE(review): rm(list = ls()) only removes objects from the global
# environment; attached packages and options are left as they are, so this is
# not a full reset. Restarting the R session gives a cleaner starting state.
rm(list = ls())
# NOTE(review): party and rio are not referenced in the visible part of this
# script, and ggplot2 is normally attached by tidyverse already -- TODO confirm
# which of these are actually needed.
pacman::p_load(pacman, party, rio, tidyverse, ggplot2)

# Attach the remaining packages; igraph supplies the graph functions used below.
# NOTE(review): plyr is attached after tidyverse, which presumably masks the
# same-named dplyr verbs -- confirm this ordering is intended. HiveR and
# RColorBrewer are not used in the visible part of the script.
library("igraph")
library("plyr")
library("HiveR")
library("RColorBrewer")

# Read the US edge list. The file must contain exactly two columns (citing and
# cited patent); attribute columns needed for HiveR plotting have to be removed
# beforehand.
# Every column is read as character so that patent identifiers are always
# treated as vertex *names*. Without this, an all-numeric column would make
# as.matrix() return a numeric matrix (interpreted by igraph as vertex indices)
# or pad numbers with blanks when mixed with a character column.
USNetworkEL <- read.csv(
  "data/USA/USPatentDataEdgeListFINALNOATTRIBUTES.csv",
  header = TRUE,
  colClasses = "character"
)
class(USNetworkEL)

# graph_from_edgelist() needs a two-column matrix; fail early with a clear
# message if the file still carries attribute columns.
stopifnot(ncol(USNetworkEL) == 2L)

# Turn the edge list into a (character) matrix
Network_Matrix <- as.matrix(USNetworkEL)
class(Network_Matrix)

# Turn into a directed graph object
g <- graph_from_edgelist(Network_Matrix, directed = TRUE)

# Show the network summary and its edges
g

# Degree-based centrality measures, all reported as raw edge counts so that
# total, in- and out-degree are directly comparable.
# (Previously only the out-degree was normalised, which put it on a different
# scale from the in-degree. Pass normalized = TRUE to every call if scaled
# values are wanted instead.)
Degree.Directed <- degree(g, mode = "all")
Indegree <- degree(g, mode = "in", normalized = FALSE)
Outdegree <- degree(g, mode = "out", normalized = FALSE)

Degree.Directed
Indegree

# Count edges and vertices
ecount(g)
vcount(g)

# Density: the proportion of present edges out of all possible edges in the
# network. For a directed graph without self-loops there are n * (n - 1)
# possible edges. The manual formula is kept as a cross-check of the igraph
# result.
ecount(g) / (vcount(g) * (vcount(g) - 1))
# edge_density() is the current name of the deprecated graph.density().
edge_density(g)

# Reciprocity: The proportion of reciprocated ties (for a directed network).
# reciprocity(g)
# dyad_census(g) # Mutual, asymmetric, and null node pairs
# 2*dyad_census(g)$mut/ecount(g) # Calculating reciprocity

# Transitivity (global clustering coefficient of the whole graph).
# See ?transitivity for the available types.
# NOTE(review): igraph's documentation indicates edge direction is ignored for
# this measure -- confirm that is acceptable for this analysis.
transitivity(g, type = "global")

# Diameter: the longest geodesic distance in the network
# (a geodesic is the shortest path between two nodes).
# weights = NA makes every edge count as one step. See ?diameter.
diameter(g, directed = TRUE, weights = NA)

# Node degrees (in + out) and their histogram. Adjust xlim/ylim to rescale.
deg <- degree(g, mode = "all") # for all nodes
# One bin per integer degree. The breaks run up to the largest observed degree
# so they always span the data (total degree can exceed vcount(g) - 1 in a
# directed graph, which would make hist() fail with the old breaks).
hist(deg,
     breaks = 0:max(deg),
     main = "Histogram of node degree \n distribution: USA",
     xlim = c(1, 20),
     ylim = c(0, 4000))

# Find and plot the degree distribution.
# degree_distribution() returns one value per degree from 0 to max(deg), which
# is why the x axis is 0:max(deg).
deg.dist <- degree_distribution(g, cumulative = TRUE, mode = "all")
plot(x = 0:max(deg), y = 1 - deg.dist, pch = 19, cex = 1.9, col = "orange",
     xlab = "Degree", ylab = "Cumulative Frequency",
     main = c("Degree distribution of the US",
              "quantum internet patent network"))

# The following are measures of the centralization of the entire graph.
# More details here: https://igraph.org/r/doc/centralize.html

# Degree of every node, followed by the graph-level degree centralization
degree(g, mode = "all")
centr_degree(g, mode = "all", normalized = TRUE)

# Average number of edges to/from a node
mean(degree(g, mode = "all"))

# Eigenvector centrality per node (weights = NA: edge weights are not used),
# followed by the graph-level eigenvector centralization
eigen_centrality(g, directed = TRUE, weights = NA)
centr_eigen(g, directed = TRUE, normalized = TRUE)

# Mean distance: average path length (the mean of the shortest distances
# between pairs of nodes). See ?mean_distance.
mean_distance(g, directed = TRUE)

# Split the graph into its weakly connected components, i.e. components found
# when edge direction is ignored.
# igraph:: is spelled out because stats also has a decompose() function, and
# decompose() is the current name of the deprecated decompose.graph().
component_list <- igraph::decompose(g, mode = "weak")
component_list

# Size (number of vertices) of each weakly connected component.
# components() is the current name of the deprecated clusters().
size_components_g <- igraph::components(g, mode = "weak")$csize
size_components_g

# Size of the biggest component in that list
max_size <- max(size_components_g)
max_size

# Identify the most cited patent(s), i.e. the vertices with the largest
# in-degree. The in-degree is computed once and reused; ties are all reported.
in_deg <- degree(g, mode = "in")
V(g)$name[in_deg == max(in_deg)]


###### Undirected version of the network (required by the steps below)

# Load the same two-column edge list once more
USNetworkELund <- read.csv(
  file = "data/USA/USPatentDataEdgeListFINALNOATTRIBUTES.csv",
  header = TRUE
)

# Edge list as a matrix, then as an undirected graph object
Network_Matrix_und <- as.matrix(x = USNetworkELund)
gund <- graph_from_edgelist(el = Network_Matrix_und, directed = FALSE)

# k-coreness of every vertex
kc <- coreness(graph = gund, mode = "all")
# plot(g, vertex.size=kc*6, vertex.label=kc, vertex.color = "yellow", edge.width=0, edge.labels = NA, layout=layout.kamada.kawai)
kc

# Kernel density estimate of the coreness values, and a count per coreness level
kc_density <- density(kc)
plot(kc_density)
table(kc)

###################################

# Moving forward, work on a copy of the undirected graph
g2 <- gund

# To collect individual core levels in separate steps (starting with the
# innermost core and working outwards), select vertices by coreness, e.g.:
# g3 <- which(kc == 5)
# g4 <- which(kc == 4)

# Find the innermost k-core of the network and plot it.
# The variable keeps its original name `coreness`; the function is therefore
# called with an explicit igraph:: prefix to keep the two apart.
# (coreness() is the current name of the deprecated graph.coreness().)
coreness <- igraph::coreness(g2, mode = "all")
maxCoreness <- max(coreness)

# If only the vertices are needed (not the subgraph), the index vector below is
# enough. Specific levels can be selected the same way, e.g.:
# verticesHavingMaxCoreness1 <- which(coreness == 4)
# verticesHavingMaxCoreness2 <- which(coreness == 5)
verticesHavingMaxCoreness <- which(coreness == maxCoreness)

# Subgraph induced by the vertices with the highest coreness
kcore <- induced_subgraph(graph = g2, vids = verticesHavingMaxCoreness)
kcore
# print(kcore, n = 100)

# Label vertices with their `name` attribute. The previous lookup used a
# 'vert.names' attribute that is never set in the visible part of this script,
# so labels only appeared through igraph's defaults.
# layout_with_dh is the current name of the Davidson-Harel layout.
plot(kcore,
     vertex.label = V(kcore)$name,
     layout = layout_with_dh,
     edge.arrow.size = .5,
     vertex.color = "gold",
     # To highlight one patent instead:
     # vertex.color = ifelse(V(kcore)$name == "<patent id>", "blue", "pink"),
     vertex.size = 20,
     vertex.frame.color = "gray", vertex.label.color = "black",
     vertex.label.cex = 0.5, vertex.label.dist = 2, edge.curved = 0.2)


# Export the k-core as a plain-text edge list file
# write_graph(kcore, "/tmp/kcoreCN.txt", "edgelist")


