#
# This code complements 
#
# install.packages("rdist")
#
# For dist
library("stats")
# For agnes
library("cluster")
library("rdist")
library("arules")
library("randomForest")
library("apcluster")

library("ggplot2")

library("usedist")

# https://cran.r-project.org/web/packages/BayesBinMix/index.html
library("BayesBinMix")

library(factoextra)

library(NbClust)

library(reshape)

library(tidyverse)

##############
# Load the tactics data (binary vectors, one row per entry in the first CSV column)
#
# BinVecFile used to be assigned several times in a row; only the final
# assignment was ever read, so the superseded locations are kept as comments.
#BinVecFile <- "C:/Users/cjm1e17/OneDrive - University of Southampton/ATTACK_DB_TEST/EntTacBin.csv"
#BinVecFile <- "C:/Users/cjm1e17/OneDrive - University of Southampton/ATTACKTest7/MAFpt_ATTACK_ENT_TACTIC_BIN.csv"
#BinVecFile <- "C:/Users/chris/OneDrive - University of Southampton/ATTACKTest7/MAFpt_ATTACK_ENT_TACTIC_BIN_INDEX.csv"
BinVecFile <- "C:/Users/chris/OneDrive - University of Southampton/ATTACK_DB_TEST/MAFpt_ENT_TACTICS_BIN_122023.csv"
# The first CSV column supplies the row names.
BinVecData <- read.csv(file = BinVecFile, header = TRUE, row.names = 1, sep = ",")
BinVecDataCols <- ncol(BinVecData)
print(BinVecDataCols)
# Numeric matrix copy of the data frame (used as binaryData for
# coupledMetropolis further down this script).
BVDArr <- data.matrix(BinVecData, rownames.force = NA)

# Quick look at potential 'clusterability'
# https://www.datanovia.com/en/lessons/assessing-clustering-tendency/#why-assessing-clustering-tendency
# library(factoextra)
# Compute the Hopkins statistic for the tactics data, sampling all but one row
res <- get_clust_tendency(
  BinVecData,
  n = nrow(BinVecData) - 1,
  graph = FALSE
)
res[["hopkins_stat"]]

# TTP binary vectors. The cjm1e17 location was immediately overwritten by the
# one below, so it is kept only as a comment.
#BinVecEntTTPFile <- "C:/Users/cjm1e17/OneDrive - University of Southampton/ATTACK_DB_TEST/MAFpt_ATTACK_ENT_TTP_BIN.csv"
BinVecEntTTPFile <- "C:/Users/chris/OneDrive - University of Southampton/ATTACK_DB_TEST/MAFpt_ATTACK_ENT_TTP_BIN.csv"
BinVecEntTTPData <- read.csv(file = BinVecEntTTPFile, header = TRUE, row.names = 1, sep = ",")

# GroupName	ClusterNum
# Same pattern: only the last path assigned was ever read.
#GroupClusterFile <- "C:/Users/cjm1e17/OneDrive - University of Southampton/ATTACK_DB_TEST/MAFpt_ATTACK_ENT_TACTIC_GROUP_CLUSTER.csv"
GroupClusterFile <- "C:/Users/chris/OneDrive - University of Southampton/ATTACK_DB_TEST/MAFpt_ATTACK_ENT_TACTIC_GROUP_CLUSTER.csv"
# NOTE(review): row.names = 1 turns the first CSV column into row names; the
# per-cluster loop later in this script reads a GroupName column, so confirm
# the file carries an index column ahead of GroupName.
GroupClusterFileData <- read.csv(file = GroupClusterFile, header = TRUE, row.names = 1, sep = ",")

###############
### Column totals of the tactics data, drawn as a labelled bar chart
# colSums() is the direct idiom for per-column totals of an all-numeric data
# frame (apply(df, 2, sum) goes through the same matrix coercion, more slowly).
sumdata <- data.frame(value = colSums(BinVecData))
sumdata$key <- rownames(sumdata)
# geom_col() is geom_bar(stat = "identity"); labels sit 3 units above each bar.
ggplot(data = sumdata, aes(x = key, y = value, fill = key)) +
  geom_col(colour = "black") +
  geom_text(aes(key, value + 3, label = value, fill = NULL), data = sumdata)

################
# Create a hamming and RForest (unsupervised) distance matrix
# For tactics data
#
# https://cran.r-project.org/web/packages/ClusterR/vignettes/the_clusterR_package.html
# https://www.datanovia.com/en/lessons/k-medoids-in-r-algorithm-and-practical-examples/
#

# https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/dist
# The dist(method = "binary") matrix that used to be computed here was
# overwritten by the next statement before anything read it, so it is kept
# only as a commented-out alternative.
# DistanceMatrix <- dist(BinVecData, method = "binary")
DistanceMatrix <- rdist(BinVecData, metric = "hamming")
fviz_dist(DistanceMatrix)

# Count the positions at which v1 and v2 differ (elementwise `!=`, then sum).
ham_distance <- function(v1, v2) {
  differs <- v1 != v2
  sum(differs)
}
# Pairwise ham_distance between the rows of the tactics data
NewDistanceMatrix <- dist_make(BinVecData, ham_distance)

# https://www.rdocumentation.org/packages/randomForest/versions/4.6-14/topics/randomForest
# https://nishanthu.github.io/articles/ClusteringUsingRandomForest.html
# Unsupervised random forest (y = NULL). The forest is grown from random
# draws, so the seed is fixed here; without it the proximity matrix, and every
# dendrogram built from DistanceMatrix.rf, differs from run to run.
set.seed(20231201)
rf.fit <- randomForest(x = BinVecData, y = NULL, ntree = 10000, proximity = TRUE, oob.prox = TRUE)
# Proximity is a similarity; 1 - proximity is used as the dissimilarity.
DistanceMatrix.rf <- as.dist(1 - rf.fit$proximity)
# https://www.rdocumentation.org/packages/rpart/versions/4.1-15/topics/rpart
# https://www.gormanalysis.com/blog/decision-trees-in-r-using-rpart/

# Ward (ward.D2) clustering of the Hamming distances, plus a silhouette plot
# for the two-cluster cut.
hc <- hclust(DistanceMatrix, method = "ward.D2", members = NULL)
# silhouette() takes no `title` argument; the title(main = 'Good') call that
# was passed here did not label anything and has been dropped.
sil_cl <- silhouette(cutree(hc, k = 2), DistanceMatrix)
plot(sil_cl)

# Reuse the tree fitted above instead of clustering the same input twice.
hcd <- as.dendrogram(hc)
# heatmap(DistanceMatrix,cluster_rows=hcd)
# S3 method for hclust
plot(hc)

# Agglomerative (Ward) clustering on the raw tactics data, then one tile plot
# per cluster showing which columns are set for each member.
hc <- agnes(BinVecData, metric = "manhattan", method = "ward")

NumberOfClusts <- 4
# cutree() names its result from tree$labels, which an agnes object does not
# have; converting with as.hclust() first gives a labelled membership vector.
clusters <- cutree(as.hclust(hc), k = NumberOfClusts)
for (x in seq_len(NumberOfClusts)) {
  # Select members by position (cutree returns one entry per input row, in
  # input order) rather than by a hard-coded 1:133 range and name lookup.
  InCluster <- clusters == x
  Clust1List <- rownames(BinVecData)[InCluster]
  Clust1Rows <- BinVecData[InCluster, , drop = FALSE]
  # https://r-charts.com/correlation/heat-map-ggplot2/?utm_content=cmp-true
  # Put the row index name in as a column so it survives the melt
  RInd <- row.names(Clust1Rows)
  Clust1Rows$RInd <- RInd

  # Long format: one row per (RInd, variable) pair with its 0/1 value
  Clust1Melt <- melt(Clust1Rows, id.vars = "RInd")
  # value 0 -> "red", value 1 -> "blue"
  print(ggplot(Clust1Melt, aes(variable, RInd, fill = c("red", "blue")[value + 1])) +
    geom_tile() +
    scale_fill_identity() +
    theme_minimal() +
    ggtitle(paste("Cluster ", x)) +
    theme(axis.text.x = element_text(angle = 90)))

  flush.console()
}


# https://www.datanovia.com/en/lessons/determining-the-optimal-number-of-clusters-3-must-know-methods/
NbClust(data = BinVecData, diss = DistanceMatrix, distance = NULL,
        min.nc = 2, max.nc = 15, method = "ward.D2")
# https://towardsdatascience.com/10-tips-for-choosing-the-optimal-number-of-clusters-277e93d72d92


# rect.hclust() draws on the dendrogram that is currently plotted and expects
# an hclust tree. Convert whatever hc holds (as.hclust() handles both hclust
# and agnes objects) and plot it first so the rectangles land on the right
# figure.
hc_rect <- as.hclust(hc)
plot(hc_rect)
rect.hclust(hc_rect, k = 4, border = 2:6)
#abline(h = 0.87, col = 'red')
abline(h = 1, col = 'red')

# Ward (ward.D2) tree on the ham_distance-based matrix, also kept as a
# dendrogram object, then plotted.
hc_test <- hclust(NewDistanceMatrix, method = "ward.D2", members = NULL)
hcd_test <- as.dendrogram(hc_test)
plot(hc_test)

################
# Cluster analysis of tactics data using hclust
#      https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust
#      Hierarchical cluster analysis on a set of dissimilarities and methods for analyzing it.
# Random-forest dissimilarity this time.
hc <- hclust(DistanceMatrix.rf, method = "ward.D2", members = NULL)
# The dendrogram used to be rebuilt from DistanceMatrix (the Hamming matrix),
# which did not match the tree fitted and plotted in this section; derive it
# from the RF-based tree instead.
hcd <- as.dendrogram(hc)
# S3 method for hclust
plot(hc)

################
# Cluster analysis of tactics data using BayesBinMix
# Number of heated parallel chains and the heat assigned to each.
# Earlier trial settings were overwritten before use and are kept as comments:
#   nChains <- 2
#   heats <- seq(1, 0.8, length.out = nChains)
nChains <- 12
# length.out spelled in full rather than relying on partial matching of `length`
heats <- seq(1, 0.3, length.out = nChains)
cm <- coupledMetropolis(Kmax = 10, nChains = nChains, heats = heats,
                        binaryData = BVDArr, outPrefix = 'BayesBinMixExample_CM',
                        ClusterPrior = 'poisson', m = 1100, burn = 100)
# print summary using:
print(cm)


################
# Cluster analysis of tactics data using agnes
#       Computes agglomerative hierarchical clustering of the dataset.
#       https://www.rdocumentation.org/packages/cluster/versions/2.1.0/topics/agnes
#
#       Agglomerative clustering
#       See also https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust
#       https://www.r-bloggers.com/2017/12/how-to-perform-hierarchical-clustering-using-r/
#       https://stat.ethz.ch/R-manual/R-devel/library/cluster/html/agnes.object.html
#
#
#  Using hamming distance
# The four linkage methods used to be fitted one after another with each
# result overwriting the last, so nothing about the first three was ever
# seen. Report the agglomerative coefficient of each fit so they can be
# compared; "ward" stays last so hc ends up holding the Ward tree as before.
agnes_methods <- c("average", "single", "complete", "ward")
for (agnes_method in agnes_methods) {
  hc <- agnes(DistanceMatrix, diss = TRUE, method = agnes_method)
  cat("agnes method", agnes_method, "- agglomerative coefficient:", hc$ac, "\n")
}

pltree(hc, cex = 0.6, hang = -1, main = "Dendrogram of agnes")

#   using RF distance
hcf <- agnes(DistanceMatrix.rf, diss = TRUE, method = "ward")
pltree(hcf, cex = 0.6, hang = -1, main = "Dendrogram of agnes RF dist")

################
# Cluster analysis of tactics data using diana
#       Divisive hierarchical clustering; each call returns an object of
#       class diana.
#       https://www.rdocumentation.org/packages/cluster/versions/2.1.0/topics/diana
#

# Hamming-distance input. Note that hc is reassigned here.
hc <- diana(DistanceMatrix, diss = TRUE)
pltree(
  hc,
  main = "Dendrogram of diana",
  cex = 0.6,
  hang = -1
)

# Random-forest dissimilarity input. Note that hcf is reassigned here.
hcf <- diana(DistanceMatrix.rf, diss = TRUE)
pltree(
  hcf,
  main = "Dendrogram of dianaRF dist",
  cex = 0.6,
  hang = -1
)

################
# Cluster analysis of tactics data using apcluster
#
# https://www.researchgate.net/post/How_can_I_test_the_performance_of_a_clustering_algorithm
# https://towardsdatascience.com/clustering-evaluation-strategies-98a4006fcfc
# http://rstudio-pubs-static.s3.amazonaws.com/63645_81729350c5d647a9954b4b2132ab5963.html

# Affinity propagation with a negDistMat(r = 2) similarity on the tactics data
apc <- apcluster(
  negDistMat(r = 2),
  BinVecData,
  details = TRUE
)
length(apc)
# https://www.datanovia.com/en/lessons/heatmap-in-r-static-and-interactive-visualization/
# https://www.r-graph-gallery.com/215-the-heatmap-function.html
# https://jokergoo.github.io/ComplexHeatmap-reference/book/a-single-heatmap.html
# http://girke.bioinformatics.ucr.edu/GEN242/pages/mydoc/Rclustering.html
heatmap(apc)


################
# Use agnes cluster analysis output to find 7 clusters
#
# By this point hc and hcf have been reassigned to diana trees, and hc.rf was
# never created, so the agnes (Ward) trees are fitted explicitly here instead
# of relying on whatever the variables last held.
NumTacticClusters <- 7
hc <- agnes(DistanceMatrix, diss = TRUE, method = "ward")
#mycl <- cutree(hc, h=max(hc$height/1.2))
# as.hclust() gives cutree() the hclust-style tree it is documented to take.
mycl <- cutree(as.hclust(hc), k = NumTacticClusters)
#heatmap(as.dendrogram(mycl))

hc.rf <- agnes(DistanceMatrix.rf, diss = TRUE, method = "ward")
mycl.rf <- cutree(as.hclust(hc.rf), k = 3)

# List the row names that fall in each Hamming-based cluster
BinVecDataList <- row.names(BinVecData)
for (val in seq_len(NumTacticClusters)) {
  # Trailing newline keeps the header from running into the printed vector
  cat("Val is ", val, "\n")
  ThisList <- BinVecDataList[which(mycl == val)]
  print(ThisList)
}

################
# Look at dendograms for groups in each cluster (based on their TTP usage)
#

# First get a list of group names in a cluster
# Using the saved file (but could be done directly from above as well)

# $ GroupName : Factor w/ 107 levels "admin@338","APT-C-36",..: 1 4 21 29 32 39 48 65 93 97 ...
# $ ClusterNum: int  1 1 1 1 1 1 1 1 1 1 ...

# Iterate over the cluster numbers actually present in the file instead of a
# fixed 1:7, so an absent cluster cannot produce an empty subset.
for (val in sort(unique(GroupClusterFileData$ClusterNum))) {
  InCluster <- which(GroupClusterFileData$ClusterNum == val)
  GroupListDF <- GroupClusterFileData[InCluster, , drop = FALSE]

  # Pull the names out as a plain character vector. The previous
  # as.vector(df)[,] form depends on as.vector() returning the data frame
  # unchanged (not the case on R >= 4.3), and a factor column would index
  # rows by integer code rather than by name.
  GroupList <- as.character(GroupListDF[["GroupName"]])
  if (length(GroupList) == 0) {
    # NOTE(review): no GroupName column - presumably read.csv(row.names = 1)
    # turned it into the row names; confirm against the CSV layout.
    GroupList <- rownames(GroupListDF)
  }

  # Names absent from the TTP data would yield all-NA rows; drop them loudly.
  MissingGroups <- setdiff(GroupList, rownames(BinVecEntTTPData))
  if (length(MissingGroups) > 0) {
    warning("Tactic cluster ", val, ": no TTP vector for ",
            paste(MissingGroups, collapse = ", "), call. = FALSE)
    GroupList <- intersect(GroupList, rownames(BinVecEntTTPData))
  }
  # A dendrogram needs at least two observations.
  if (length(GroupList) < 2) {
    message("Skipping tactic cluster ", val, ": fewer than two groups with TTP data")
    next
  }

  # Get the binary vectors for this cluster
  BinVecEntTTPDataSubset <- BinVecEntTTPData[GroupList, , drop = FALSE]
  # Create a distance matrix
  DistanceMatrixEntTTP <- rdist(BinVecEntTTPDataSubset, metric = "hamming")

  hc <- agnes(DistanceMatrixEntTTP, diss = TRUE, method = "ward")

  TitleString <- paste("Dendrogram of agnes, for Tactic Cluster ", val)
  pltree(hc, cex = 0.6, hang = -1, main = TitleString)

  apc <- apcluster(negDistMat(r = 2), BinVecEntTTPDataSubset, details = TRUE)
  print(paste("apcluster results for ", val))
  print(length(apc))
  print(apc)
}


#######################################################################################


# Refit the Ward agnes tree on the Hamming distances (hc was reused by the
# per-cluster loop) and draw it.
hc <- agnes(DistanceMatrix, diss = TRUE, method = "ward")

pltree(
  hc,
  main = "Dendrogram of agnes",
  cex = 0.6,
  hang = -1
)


# Hamming distances over the full TTP data, clustered first with hclust
DistanceMatrixEntTTP <- rdist(BinVecEntTTPData, metric = "hamming")
hcEntTTP <- hclust(
  DistanceMatrixEntTTP,
  method = "ward.D2",
  members = NULL
)
# S3 method for hclust
plot(hcEntTTP)

# Ward still creates best Agglomerative coefficient:  0.8334743
#agnes(DistanceMatrixEntTTP,diss=TRUE, method="average")
#agnes(DistanceMatrixEntTTP,diss=TRUE, method="single")
#agnes(DistanceMatrixEntTTP,diss=TRUE, method="complete")
#agnes(DistanceMatrixEntTTP,diss=TRUE, method="ward")

# hcEntTTP is replaced by the agnes (Ward) tree, which is what gets cut below
hcEntTTP <- agnes(DistanceMatrixEntTTP, diss = TRUE, method = "ward")
pltree(
  hcEntTTP,
  main = "Dendrogram of agnes",
  cex = 0.6,
  hang = -1
)

# Affinity propagation on the full TTP data
apc <- apcluster(
  negDistMat(r = 2),
  BinVecEntTTPData,
  details = TRUE
)
length(apc)

# List the members of each TTP cluster, first for a 14-way cut and then for a
# 3-way cut (the two passes used to be copy-pasted loops). mycl ends up
# holding the 3-way membership, as before.
BinVecEntTTPDataList <- row.names(BinVecEntTTPData)
for (NumTTPClusters in c(14, 3)) {
  # as.hclust() gives cutree() the hclust-style tree it is documented to take.
  mycl <- cutree(as.hclust(hcEntTTP), k = NumTTPClusters)
  for (val in seq_len(NumTTPClusters)) {
    # Trailing newline keeps the header from running into the printed vector
    cat("Val is ", val, "\n")
    ThisList <- BinVecEntTTPDataList[which(mycl == val)]
    print(ThisList)
  }
}

## Mine association rules.
# https://www.kdnuggets.com/2016/04/association-rules-apriori-algorithm-tutorial.html
# Support -
# Confidence -
# Build the basket as a logical matrix (TRUE where the source value is 1) and
# hand it to apriori(). The earlier as(BinVecData, "transactions") coercion
# and the `basket[] <- ...` assignment into that object were both overwritten
# before use, and could stop the script before this point, so they are gone.
basket <- BinVecData == 1
rules <- apriori(basket,
                 parameter = list(supp = 0.6, conf = 0.6, target = "rules"))
summary(rules)
#inspect(head(sort(rules, by = "confidence"), 30))
# Visualizing Association Rules: Introduction to the R-extension Package arulesViz
# https://www.google.co.uk/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&ved=2ahUKEwjj_82d4q_sAhUTesAKHZS3CN8QFjAAegQIBBAC&url=https%3A%2F%2Fcran.r-project.org%2Fweb%2Fpackages%2FarulesViz%2Fvignettes%2FarulesViz.pdf&usg=AOvVaw3GU7wKQdvzVmxI01bXgxtX
# Building the "transactions" Class for Association Rule Mining in R using arules and apriori
# https://blog.aptitive.com/building-the-transactions-class-for-association-rule-mining-in-r-using-arules-and-apriori-c6be64268bc4
# Visualizing association rules in hierarchical groups
# https://link.springer.com/article/10.1007/s11573-016-0822-8
inspect(sort(rules, by = "confidence"))


library(tidyverse)

# Scratch data: 200 random 0/1 values (1 when the uniform draw is >= 0.8)
# laid out against a 20-wide x, 10-high y grid.
dabble <- as.numeric(runif(200) >= 0.8)

df <- data.frame(
  z = dabble,
  x = rep(1:20, times = 10),
  y = rep(10:1, each = 20)
)

head(df)