Introduction

This is a first effort to implement “feature space diagrams” in R, inspired by https://towardsdatascience.com/escape-the-correlation-matrix-into-feature-space-4d71c51f25e5

library(ggbiplot)
library(lessR)
library(Hmisc)
library(corrplot)
library(gplots)
library(igraph)
library(tidyverse)
library(ggrepel)

Workflow

Our workflow is generally as described in the original post:

Data Load

# TODO: User uploads data
# Load data 
#boston <- read.csv("boston.csv", header = TRUE, fileEncoding="latin1")
WDBC <- read.csv("WDBC.csv", header = TRUE, fileEncoding="latin1")

# Ensure we have a matrix
#mydata <- na.omit(as.matrix(boston))
mydata <- na.omit(as.matrix(WDBC[,3:32]))
#mydata <- as.matrix(mtcars[,c(1:7,10,11)])

Initial Correlation Matrix

# Simple correlation matrix calculation
# TODO: user chooses method
mydata.cor <- cor(mydata, method = c("pearson"), use = "complete.obs")
#mydata.cor <- cor(mydata, method = c("kendall"), use = "complete.obs")
#mydata.cor <- cor(mydata, method = c("spearman"), use = "complete.obs")

# # Optional
# corrplot(mydata.cor)
# 
# palette <- colorRampPalette(c("green", "white", "red")) (20)
# heatmap.2(x = mydata.cor, col = palette, symm = TRUE)

# Reorder correlation matrix based on: https://rdrr.io/cran/lessR/man/corReorder.html 
mydata.cor.ro <- corReorder(mydata.cor)

Distance Matrix Calculation

Correlation can often be used as a distance metric for hierarchical clustering; see e.g. this discussion.

We’re using the dist() function to provide a number of different distance options (default is euclidean distance).

# # Simplistic distance matrix (Absolute value et.al.)
# mydata.cor.ro.1 <- abs(mydata.cor.ro) - 1

# Distance Matrix via: https://stat.ethz.ch/R-manual/R-devel/library/stats/html/dist.html
mydata.dist <- dist(mydata.cor.ro, upper = TRUE, method = "manhattan") 
mydata.dist.sym <- as.matrix(mydata.dist )

Hiearchical/Agglomerative Clustering (NEW)

# See eg: 
# * https://metudatascience.github.io/datascience/clustering.html
# * https://uc-r.github.io/hc_clustering

# Distance calculated above
mydata.hclust <- hclust(mydata.dist)
plot(mydata.hclust)

# NOTE! These clusters are in numerical order, not feature order! 
mydata.hclust.groups <- cutree(mydata.hclust, k=10) # Usually 3 is used for mtcars demos
rect.hclust(mydata.hclust, k=10, border="red")

PCA on Distance Matrix

# PCA 
#mydata.dist.sym.pca <- prcomp(mydata.dist.sym,scale=TRUE)
mydata.dist.sym.pca <- prcomp(mydata.dist.sym) # Un-scaled gives results closer to source article

# Plottable version:
mydata.dist.sym.pca.plot <- as.data.frame(mydata.dist.sym.pca$x[,1:2])

Finding the Clusters

Write the cluster assignments and the feature names into the PCA data frame.

# Pull out our clusters (calculated)
mydata.dist.sym.pca.plot$cluster <- as.factor(mydata.hclust.groups) 
mydata.dist.sym.pca.plot$name <- rownames(mydata.cor.ro)

Finding the Graph!

This is the hard part…

# Determine connectivity!
# This selects the nodes (features) we want to connect
threshold <- 5.5
selector <- ((abs(mydata.dist.sym) <= threshold ) * 1) # Filter out lines greater than threshold distance
for (i in 1:length(mydata.dist.sym.pca.plot$cluster)) {selector[,i] <- selector[,i] * as.numeric(mydata.dist.sym.pca.plot$cluster)}
diag(selector) <- 0 

filtered <- abs(mydata.dist.sym)

filtered[ filtered <= threshold ] <- 0

# Get the function for re-scaling the line weights
# Line thicknesses will automatically range from 1-10
range <- c(1,10)
domain <- c(min(filtered[filtered > 0]),max(filtered))
line.lm <- lm(range~domain)

adjust <- function(x){ 
  # stupid lm trick to scale
    line.lm$coefficients[[2]] * x + line.lm$coefficients[[1]]
  }
  
# Repeat selector matrix, this time for strength of relationships
# Select the nodes (features) we want to connect
thickness <- ((abs(mydata.dist.sym) <= threshold ) * 1) # Filter out lines greater than threshold distance
diag(thickness) <- 0  # remove diagonal

# Use distances as thicknesses for the arcs we're keeping
for (i in 1:length(mydata.dist.sym.pca.plot$cluster)) {
  for (j in 1:length(thickness[,i])) {
    if (thickness[j,i] != 0 ) {
#      thickness[j,i] <- adjust(thickness[j,i] * abs(mydata.dist.sym[j,i]))
      thickness[j,i] <- thickness[j,i] * abs(mydata.dist.sym[j,i])
    } else { thickness[j,i] <- 0}
  }
}

diag(thickness) <- 0 

# Switching to networks
# Create igraph structure
network <- graph_from_adjacency_matrix(selector, weighted = TRUE)
network.t <- graph_from_adjacency_matrix(thickness, weighted = TRUE)

# Gets us our edges!
mydata.edges <- get.data.frame(network) %>% # this is where "weight" is introduced
  dplyr::rename(Cluster = weight)

mydata.edges.t <- get.data.frame(network.t) %>% # this is where "weight" is introduced
  dplyr::rename(thickness = weight) %>%
#  mutate(thickness = as.integer(thickness))
  mutate(thickness = thickness)


# replace `from` with X1 and Y1, and `to` with X2 and Y2
# These will be our line segments!
mydata.segments <- mydata.edges %>% 
  left_join(mydata.dist.sym.pca.plot, by=c("from"="name")) %>%
  dplyr::select(-cluster) %>%
  mutate(X1=PC1, Y1=PC2) %>%
  dplyr::select(-PC1, -PC2)

mydata.segments <- mydata.segments %>%
  left_join(mydata.dist.sym.pca.plot, by=c("to"="name")) %>%
  dplyr::select(-cluster) %>%
  mutate(X2=PC1, Y2=PC2) %>%
  dplyr::select(-PC1, -PC2)

mydata.segments <- mydata.segments %>%
  dplyr::rename(cluster = Cluster)

mydata.dist.sym.pca.plot$cluster <- as.factor(mydata.dist.sym.pca.plot$cluster)

# NEW: cluster used for coloring 
mydata.segments$cluster <- as.factor(mydata.segments$cluster)
#mydata.segments$thickness <- as.integer(mydata.edges.t$thickness)
mydata.segments$thickness <- mydata.edges.t$thickness

Building the Plot

# Adding line segments to PCA plot
p <- ggplot(mydata.dist.sym.pca.plot, aes(x=PC1, y=PC2)) + 
  geom_point(aes(color=cluster, size=6), show.legend = FALSE) +
  geom_curve(aes(x=X1, y=Y1, xend=X2, yend=Y2, color=cluster, size=thickness, alpha=0.4), curvature=0.2, data=mydata.segments) +
  guides(alpha = FALSE) +
  scale_size(range = c(0.1, 2), guide = guide_none()) +
  geom_text_repel(aes(label=name),hjust=1, vjust=0, size=3) +
#  labs(title="Feature Space Diagram for the Boston Housing data set") +
  labs(title="Feature Space Diagram for the Wisconsin Breast Cancer data set") + 
#  labs(title="Feature Space Diagram for the classic `mtcars` data set") + 
  labs(caption = "See: https://github.rpi.edu/DataINCITE/FeatureSpaceDiagram/")

p 

ggsave(filename="WDBC_fsd.png", plot = p)
## Saving 10 x 10 in image

Sanity check: Biplot on the original dataset

Usually we review the feature space using a PCA biplot. Let’s see how it compares!

mydata.pca <- prcomp(na.omit(mydata), center = TRUE, scale. = TRUE)

ggbiplot(mydata.pca) +
    xlim(-2.5,2.5) + 
  ylim(-2.5,2.5)