Found in a list of 100+ interesting data sets, the Texas Department of Criminal Justice provides different collections of information on death row inmates both present and past. Prior occupations for each current offender are listed here.
Dependencies
We’ll need some packages to scrape, cluster, and chart our findings.
- rvest: “Wrappers around the ‘xml2’ and ‘httr’ packages to make it easy to download, then manipulate, HTML and XML.”
- wordcloud: “Pretty word clouds.”
- tm: “A framework for text mining applications within R.”
- apcluster: “The ‘apcluster’ package implements Frey’s and Dueck’s Affinity Propagation clustering in R.”
library(rvest)
library(wordcloud)
library(tm)
library(apcluster)
Collection
The data we wish to look at is not in the form we need it. We’ll need to capture the HTML and parse it looking for the prior occupation information. Using rvest
, we will grep for all <p>
tags that contain the occupation information. Not every page is consistent so we will need some logic to hunt around for the prior occupation field. Once we’ve parsed every page, all of the raw information will be saved to a CSV file for later pre-processing. Each line corresponds to each death row inmate prior occupations collected.
# -----------------------
# Scraping
# -----------------------
base_link <- 'https://www.tdcj.state.tx.us/death_row/'
main_link <- 'https://www.tdcj.state.tx.us/death_row/dr_offenders_on_dr.html'
csv_filename <- 'prior_occupations_dirty.csv'
if (!file.exists(csv_filename)) {
page <- read_html(main_link)
page
links <- html_attr(html_nodes(page, 'a'), 'href')
links
links <- grep('^dr_info.*html$', links, value=TRUE)
links
prior_occupations_dirty <- c()
for (link in links) {
texts <- read_html(paste(base_link, link, sep='')) %>% html_nodes('p') %>% html_text()
if (length(grep('Prior Prison', texts[3])) >= 1) {
prior_occupations_dirty <- c(prior_occupations_dirty, texts[2])
} else {
prior_occupations_dirty <- c(prior_occupations_dirty, texts[3])
}
}
prior_occupations_dirty_df = as.data.frame(prior_occupations_dirty)
write.csv(prior_occupations_dirty_df, file=csv_filename)
}
Pre-processing
With the CSV file opened, we need to clean each entry for unwanted white-space and punctuation. For those with multiple prior occupations, we’ll split on the commas. Once cleaned, we’ll add each prior occupation per inmate to the inmates_occupations
list.
# -----------------------
# Cleaning
# -----------------------
csv_file <- read.csv(file=csv_filename)
prior_occupations_dirty <- as.vector(t(csv_file$prior_occupations_dirty))
prior_occupations_cleaned = c()
inmates_prior_occupations = list()
i <- 1
for (prior_occupation_dirty in prior_occupations_dirty) {
prior_occupation <- gsub('Prior Occupation', '', prior_occupation_dirty)
prior_occupation <- tolower(prior_occupation)
prior_occupation <- gsub('^\\W+|^\\s+', ' ', prior_occupation)
prior_occupation <- gsub('`', '', prior_occupation)
prior_occupation <- gsub(':', ' ', prior_occupation)
prior_occupation <- gsub('\\.', ' ', prior_occupation)
prior_occupation <- gsub('\r', ' ', prior_occupation)
prior_occupation <- gsub('\n', ' ', prior_occupation)
prior_occupation <- gsub('\\s+', ' ', prior_occupation)
if (length(grep('n/a', prior_occupation)) == 0) {
prior_occupation <- gsub('/', ', ', prior_occupation)
}
prior_occupation <- gsub(' and ', ', ', prior_occupation)
prior_occupation <- gsub('&', ', ', prior_occupation)
prior_occupation <- gsub(';', ', ', prior_occupation)
prior_occupation <- trimws(prior_occupation)
prior_occupations <- strsplit(prior_occupation, ', ')
prior_occupations <- lapply(prior_occupations, function (x) return(gsub('\\s+', ' ', x)))
prior_occupations <- lapply(prior_occupations, function (x) return(trimws(x)))
if (length(prior_occupations[[1]]) > 0) {
prior_occupations_cleaned <- c(prior_occupations_cleaned, prior_occupations, recursive=T)
inmates_prior_occupations[[toString(i)]] <- prior_occupations[[1]]
i <- i + 1
}
}
total_inmate_count = length(inmates_prior_occupations)
Looking over the dataset, you’ll notice some redundancies such as mechanic
versus auto mechanic
. With care, we will condense the prior occupations to the sub-strings of the redundancies. We’ll also need to hard-code some corrections for irregularities such as fork lift
versus forklift
. Once the duplicates have been merged, we’ll generate a unique list of the prior occupations for our axis labels.
# -----------------------
# Duplication Removal
# -----------------------
prior_occupations_merged = c()
prior_occupations_merged_map = list()
for (prior_occupation_cleaned1 in prior_occupations_cleaned) {
prior_occupation <- prior_occupation_cleaned1
if (is.null(prior_occupations_merged_map[[prior_occupation]])) {
if (nchar(prior_occupation) >= 4) {
for (prior_occupation_cleaned2 in prior_occupations_cleaned) {
if (nchar(prior_occupation_cleaned2) >= 4) {
if (length(grep(prior_occupation_cleaned2, prior_occupation)) > 0) {
if (nchar(prior_occupation) >= nchar(prior_occupation_cleaned2)) {
prior_occupation <- prior_occupation_cleaned2
}
}
}
}
}
if (prior_occupation == 'n/a') {
prior_occupation <- 'none'
}
if (length(grep('air condition', prior_occupation)) > 0) {
prior_occupation <- 'ac'
}
if (length(grep('fork lift', prior_occupation)) > 0) {
prior_occupation <- 'forklift'
}
if (length(grep('forklift', prior_occupation)) > 0) {
prior_occupation <- 'forklift'
}
if (length(grep('clerical', prior_occupation)) > 0) {
prior_occupation <- 'clerk'
}
if (length(grep('welding', prior_occupation)) > 0) {
prior_occupation <- 'welder'
}
prior_occupations_merged_map[[prior_occupation_cleaned1]] <- prior_occupation
} else {
prior_occupation <- prior_occupations_merged_map[[prior_occupation]]
}
prior_occupations_merged <- c(prior_occupations_merged, prior_occupation)
}
total_prior_occupation_count = length(prior_occupations_merged)
i <- 1
for (inmate_prior_occupations in inmates_prior_occupations) {
temp = c()
for (prior_occupation in inmate_prior_occupations) {
temp <- c(temp, prior_occupations_merged_map[[prior_occupation]])
}
inmates_prior_occupations[[toString(i)]] <- temp
i <- i + 1
}
unique_prior_occupations = unique(sort(prior_occupations_merged))
unique_prior_occupations_length <- length(unique_prior_occupations)
Mining
We have collected, cleaned, and removed erroneous variations in our dataset. Now we’ll move on to mining and plotting/charting our data.
Total Prior Occupation Relative Frequency Distribution
Let us begin by charting the relative frequency of prior occupations listed across all sampled inmates.
# -----------------------
# Prior Occupation Relative Frequency Chart
# -----------------------
reverse_table <- function(t) { t[sort(names(t), decreasing=T)] }
prior_occupations_table <- reverse_table(table(prior_occupations_merged))
prior_occupations_table <- prior_occupations_table / sum(prior_occupations_table) * 100
png(file='prior_occupation_relative_freq_dist.png', width=1500, height=1500)
par(
mai=c(2.0, 3.0, 1.0, 1.0)
)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
las=1,
xlim=c(0, 100),
axes=T,
border=NA
)
axis(1, at=seq(0,100,5))
grid(nx=100, ny=NA)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
border=NA,
las=1,
xlim=c(0, 100),
axes=T,
add=T,
xlab=paste('Percentage of', total_prior_occupation_count, 'listed Prior Occupations'),
main=paste('Prior Occupation Relative Frequency Distribution of', total_inmate_count, 'Texas Death Row Inmates')
)
mtext('Prior Occupation', side=2, line=13)
The unique prior occupations are in alphabetical order along the x-axis. The y-axis is the relative frequency distribution. laborer
accounts for nearly 41% of the 291 listed prior occupations among the 182 death row inmates sampled. Attributing to its large percentage was the merging of duplicates, for example, general laborer
and assembly line laborer
into just laborer
. One could argue that they are distinct labels. contruction
and none
are the second and third largest respectively.
We can explore the dataset further by clustering the sampled inmates by the prior occupations they had. We’ll treat each inmate as a document with their prior occupations making up the document. By clustering the inmates via their prior occupations, we can partition the dataset into different prior occupation profiles.
Inmate Prior Occupations Matrix
To begin clustering, we will need to vectorize each inmates’ prior occupations. We’ll assemble these vectors, into the inmate_matrix_count
where [i][j]
is >= 1
if the ith inmate had the jth occupation and 0 otherwise. No inmate had the same prior occupation listed more than once, so no count will be greater than one.
# -----------------------
# Inmate x Prior Occupations
# -----------------------
inmate_matrix_count <- c()
for (row in 1:length(inmates_prior_occupations)) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(row)]]
for (unique_prior_occupation in unique_prior_occupations) {
inmate_matrix_count <- c(
inmate_matrix_count,
sum(unique_prior_occupation == inmate_prior_occupations)
)
}
}
inmate_matrix_count = matrix(
inmate_matrix_count,
nrow=total_inmate_count,
ncol=unique_prior_occupations_length,
byrow=T
)
Prior Occupation Weighting (TF-IDF)
Since we are dealing with text and that nearly half of all sampled inmates were a laborer at some point, we’ll perform a term weighting technique known as TF-IDF. Given that an inmate was both a laborer
and welder
at some point, TF-IDF will weight laborer
less since welder
appears less frequently in the text corpus. Knowing that an inmate was a laborer
does not say as much as knowing they were also a welder
.
# -----------------------
# TF-IDF
# -----------------------
inmate_matrix_tfidf <- matrix(
c(
1:(total_inmate_count * unique_prior_occupations_length)
) * 0,
nrow=total_inmate_count,
ncol=unique_prior_occupations_length,
byrow=T
)
nrow_count <- nrow(inmate_matrix_count)
for (row in 1:nrow(inmate_matrix_count)) {
for (col in 1:ncol(inmate_matrix_count)) {
tf <- inmate_matrix_count[row,col]
df <- length(which(inmate_matrix_count[,col] > 0))
idf <- log(nrow_count / (df + 1))
tfidf <- tf * idf
inmate_matrix_tfidf[row,col] <- tfidf
}
}
Multidimensional Scaling (MDS)
To visualize the inmate vectors in two dimensions we’ll employ multidimensional scaling.
An MDS algorithm aims to place each object in N-dimensional space such that the between-object distances are preserved as well as possible.
# -----------------------
# MDS
# -----------------------
plot_scale <- 1.0
scatter_plot_matrix <- function(prefix, input_matrix) {
png(file=paste(tolower(prefix), 'scatter_plot.png', sep='_'), width=1500, height=1500)
plot.new()
input_matrix[,1]
input_matrix[,2]
textplot(
input_matrix[, 1],
input_matrix[, 2],
seq(1:length(input_matrix[,1])),
xlim=c(
min(input_matrix[, 1]) * plot_scale,
max(input_matrix[, 1]) * plot_scale
),
ylim=c(
min(input_matrix[, 2]) * plot_scale,
max(input_matrix[, 2]) * plot_scale
),
main=paste('Inmate Prior Occupations 2D Visualization', paste0('(', prefix, ')'))
)
}
dist_matrix <- dist(inmate_matrix_count, method='euclidean')
inmate_matrix_count_mds <- cmdscale(dist_matrix)
scatter_plot_matrix('COUNT-MDS', inmate_matrix_count_mds)
dist_matrix <- dist(inmate_matrix_tfidf, method='euclidean')
inmate_matrix_tfidf_mds <- cmdscale(dist_matrix)
scatter_plot_matrix('TFIDF-MDS', inmate_matrix_tfidf_mds)
We can see a large mass around the origin. Another distinct mass is seen at (-0.8951273 -0.702121).
Affinity Propagation (AP) Clustering
Due to the concave shapes and variable density of the MDS scatter plot, we’ll employ Affinity Propagation Clustering. This has the added benefit of not having to specify the amount of clusters ahead of time (K
).
# -----------------------
# AP CLUSTERING
# -----------------------
cluster_matrix <- function(prefix, input_matrix, inmates_prior_occupations) {
apresult <- apcluster(negDistMat(r=2), input_matrix)
input_cluster_labels = labels(apresult, type='enum')
inmate_clusters_flt <- list()
inmate_clusters_str <- list()
i <- 1
for (input_cluster_label in input_cluster_labels) {
input_cluster_label <- toString(input_cluster_label)
if (is.null(inmate_clusters_flt[[input_cluster_label]])) {
inmate_clusters_flt[[input_cluster_label]] <- c()
inmate_clusters_str[[input_cluster_label]] <- list()
}
inmate_clusters_flt[[input_cluster_label]] <- c(
inmate_clusters_flt[[input_cluster_label]],
input_matrix[i,]
)
i_str <- toString(i)
temp = list()
temp[[i_str]] <- inmates_prior_occupations[[i_str]]
inmate_clusters_str[[input_cluster_label]] <- append(
inmate_clusters_str[[input_cluster_label]],
temp
)
i <- i + 1
}
for (cluster_i in 1:length(inmate_clusters_flt)) {
cluster_i_str <- toString(cluster_i)
cluster <- matrix(inmate_clusters_flt[[cluster_i_str]], ncol=2, byrow=T)
inmate_clusters_flt[[cluster_i_str]] <- cluster
}
inmate_clusters_colors = rainbow(length(inmate_clusters_flt), v=0.8, s=0.5)
write(
sapply(
sort(as.integer(names(inmate_clusters_str))),
function(x) paste(
'Cluster',
toString(x),
'\n ',
paste('Inmate', names(inmate_clusters_str[[toString(x)]]), inmate_clusters_str[[toString(x)]], collapse='\n ')
)),
paste(tolower(prefix), 'inmate_clusters.txt', sep='_')
)
return(list(flt=inmate_clusters_flt, str=inmate_clusters_str, colors=inmate_clusters_colors, labels=input_cluster_labels))
}
result <- cluster_matrix('TFIDF-MDS-AP', inmate_matrix_tfidf_mds, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
inmate_cluster_labels <- result$labels
With the inmate_matrix_tfidf_mds
clustered, we can now plot the clusters.
plot_clustering <- function(prefix, input_matrix, inmate_cluster_flt, inmate_clusters_colors, inmate_cluster_labels) {
png(file=paste(tolower(prefix), 'inmate_clusters.png', sep='_'), width=1500, height=1500)
plot.new()
plot(
input_matrix[,1],
input_matrix[,2],
main=paste('Death Row Inmate Clustering by Prior Occupations', paste0('(', prefix, ')'))
)
for (cluster_i in 1:length(inmate_clusters_flt)) {
cluster_i_str <- toString(cluster_i)
points(
inmate_clusters_flt[[cluster_i_str]],
col=inmate_clusters_colors[cluster_i],
bg=inmate_clusters_colors[cluster_i],
pch=19,
cex=5,
lwd=2
)
}
text(
input_matrix[, 1],
input_matrix[, 2],
inmate_cluster_labels,
xlim=c(
min(input_matrix[, 1]) * plot_scale,
max(input_matrix[, 1]) * plot_scale
),
ylim=c(
min(input_matrix[, 2]) * plot_scale,
max(input_matrix[, 2]) * plot_scale
),
)
legend(
'topright',
legend=lapply(c(1:length(inmate_clusters_colors)), function (x) return(toString(x))),
fill=inmate_clusters_colors,
cex=1.3,
pt.cex=1
)
}
plot_clustering('TFIDF-MDS-AP', inmate_matrix_tfidf_mds, inmate_clusters_flt, inmate_clusters_colors, inmate_cluster_labels)
In total, the algorithm generated 17 clusters. Cluster membership sizes range from one to 92 inmates.
Inmate Clusters, Prior Occupations Relative Frequency Distributions
Now let us generate a relative frequency distribution bar-chart for each TF-IDF MDS AP cluster.
plot_clusters_occupation_distributions <- function(prefix, inmate_clusters_str, unique_prior_occupations) {
for (i in 1:length(inmate_clusters_str)) {
cluster <- inmate_clusters_str[[toString(i)]]
cluster_prior_occupations <- c()
for (inmate_prior_occupations in cluster) {
cluster_prior_occupations <- c(cluster_prior_occupations, inmate_prior_occupations)
}
prior_occupations_table <- reverse_table(table(factor(cluster_prior_occupations, levels=unique_prior_occupations)))
prior_occupations_table <- prior_occupations_table / sum(prior_occupations_table) * 100
png(file=paste(tolower(prefix), toString(i), 'prior_occupation_distribution.png', sep='_'), width=1500, height=1500)
plot.new()
par(
mai=c(2.0, 3.0, 1.0, 1.0)
)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
las=1,
xlim=c(0, 100),
axes=T,
border=NA
)
axis(1, at=seq(0,100,5))
grid(nx=100, ny=NA)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
las=1,
xlim=c(0, 100),
axes=T,
add=T,
ylab='',
xlab=paste('Percentage of', length(cluster_prior_occupations), 'listed Prior Occupations'),
main=paste(
'Prior Occupation Relative Frequency Distribution of',
length(cluster),
'Texas Death Row Inmate(s) for Cluster',
toString(i),
paste0(
'(',
prefix,
')'
)
),
border=NA
)
mtext('Prior Occupation', side=2, line=13)
dev.off()
}
}
plot_clusters_occupation_distributions('TFIDF-MDS-AP', inmate_clusters_str, unique_prior_occupations)
Cluster 1, seen at (-0.8951273 -0.702121), is solely comprised of none
which agrees with the dataset. Intuitively, if an inmate had no prior occupations, you wouldn’t expect to see other prior occupation terms clustered with none
.
The following bar-charts are the clusters making up the large mass centered around the origin.
Cluster 5 accounts for the inmates that were mainly laborers with only a very few having one other prior occupation. This cluster also contains a large portion of the unique prior occupations found in the raw data set such as computer software
.
For cluster 10, there is a large mixture of both laborer
and warehouse
.
Cluster 17 is more difficult to interpret and could have been likely clustered with cluster 5, however, it is the only cluster that accounts for the inmates that had unknown
prior occupations. unknown
did not collocate with any other prior occupation so it is surprising to find it clustered with other prior occupations. This is likely due to the MDS.
Automatic Cluster Labels
Using differential cluster labeling and more specifically pointwise mutual information (PMI), we can label each cluster according to its prior occupation with the highest association.
pc <- function(cn, b) {
if (b == 1) {
return(sum(input_cluster_labels == cn) / length(input_cluster_labels))
} else {
return(sum(input_cluster_labels != cn) / length(input_cluster_labels))
}
}
pt <- function(ct, b) {
if (b == 1) {
in_d <- 0
for (inmate_prior_occupations in inmates_prior_occupations) {
count <- sum(ct == inmate_prior_occupations)
if (count > 0) {
in_d <- in_d + 1
}
}
return(in_d / length(inmates_prior_occupations))
} else {
n_in_d <- 0
for (inmate_prior_occupations in inmates_prior_occupations) {
count <- sum(ct == inmate_prior_occupations)
if (count == 0) {
n_in_d <- n_in_d + 1
}
}
return(n_in_d / length(inmates_prior_occupations))
}
}
pct <- function(cn, ct, b) {
if (all(b == c(0, 0))) {
n_in_d <- 0
inmate_indexes <- which(!input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count == 0) {
n_in_d <- n_in_d + 1
}
}
return(n_in_d / length(inmate_indexes))
} else if (all(b == c(1, 1))) {
in_d <- 0
inmate_indexes <- which(input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count > 0) {
in_d <- in_d + 1
}
}
return(in_d / length(inmate_indexes))
} else if (all(b == c(1, 0))) {
n_in_d <- 0
inmate_indexes <- which(input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count == 0) {
n_in_d <- n_in_d + 1
}
}
return(n_in_d / length(inmate_indexes))
} else if (all(b == c(0, 1))) {
in_d <- 0
inmate_indexes <- which(!input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count > 0) {
in_d <- in_d + 1
}
}
return(in_d / length(inmate_indexes))
}
}
pmi <- function(cn, ct) {
pmi_score <- 0
for (in_c in 0:1) {
for (has_t in 0:1) {
jpct_r <- pct(cn, ct, c(in_c, has_t))
pc_r <- pc(cn, in_c)
pt_r <- pt(ct, has_t)
result <- jpct_r * log2(jpct_r / (pc_r * pt_r))
if (is.nan(result)) {
result <- 0
}
pmi_score <- pmi_score + result
}
}
return(pmi_score)
}
output_cluster_labels <- list()
for(i in 1:length(inmate_clusters_str)) {
cluster <- inmate_clusters_str[[toString(i)]]
label <- ''
max_pmi_score <- -1.0
seen <- c()
for (inmate in cluster) {
for (prior_occupation in inmate) {
if (sum(prior_occupation == seen) == 0) {
pmi_score <- pmi(i, prior_occupation)
if (pmi_score > max_pmi_score) {
output_cluster_labels[[toString(i)]] <- prior_occupation
max_pmi_score <- pmi_score
}
seen <- c(seen, prior_occupation)
}
}
}
}
- Cluster 1:
none
- Cluster 2:
mechanic
- Cluster 3:
mechanic
- Cluster 4:
construction
- Cluster 5:
laborer
- Cluster 6:
cook
- Cluster 7:
welder
- Cluster 8:
welder
- Cluster 9:
ac
- Cluster 10:
warehouse
- Cluster 11:
construction
- Cluster 12:
clerk
- Cluster 13:
shipping
- Cluster 14:
clerk
- Cluster 15:
welder
- Cluster 16:
mechanic
- Cluster 17:
unknown
Condensing these even further:
none
: cluster 1mechanic
: cluster 2, 3, 16construction
: cluster 4, 11laborer
: cluster 5cook
: cluster 6welder
: cluster 7, 8, 15ac
: cluster 9warehouse
: cluster 10clerk
: cluster 12, 14shipping
: cluster 13unknown
: cluster 17
This effectively condenses down the original 60 prior occupations down to the 11 most prominent in the sampled data. These 11 prior occupations could all be described as blue-collar occupations with the exception of clerk
.
TF-IDF AP Clustering Only
Comparing the TF-IDF MDS AP clusters to TF-IDF AP clusters, using only the inmate_matrix_tfidf
, the Affinity Propagation clustering algorithm generated 48 clusters using the original 60 dimensions versus the 17 clusters generated using the two dimensions found by MDS.
These 48 clusters are more granular with a clearer partitioning of inmates with unique prior occupations. 30 out of the 48 clusters contain only one inmate due to their unique combination of prior occupations. Cluster membership ranges from one to 71. Like with the MDS clusters, none
is its own cluster. However, unlike the MDS clusters, all of the unknown
prior occupations are clustered together alone.
Latent Semantic Indexing (LSI), Singular Value Decomposition (SVD)
We can take the TF-IDF normalized inmate-prior-occupation matrix and perform SVD on it.
LSI can be viewed as soft clustering by interpreting each dimension of the reduced space as a cluster and the value that a document has on that dimension as its fractional membership in that cluster.
This will give us three matrices U
, D
, and V
. U
contains our inmate-concept vectors in the new orthogonal basis.
# -----------------------
# SVD
# -----------------------
inmate_matrix_tfidf_svd <- svd(inmate_matrix_tfidf)
k <- 38
inmate_k_basis_vectors = inmate_matrix_tfidf_svd$u[, 1:k]
k_singular_values_matrix = diag(inmate_matrix_tfidf_svd$d)[1:k, 1:k]
inmate_matrix_tfidf_svd_truncated <- inmate_k_basis_vectors %*% k_singular_values_matrix %*% t(inmate_matrix_tfidf_svd$v[, 1:k])
png(file='svd_explained_variance.png', width=1500, height=1500)
plot.new()
cumulativeVarianceExplained = cumsum(inmate_matrix_tfidf_svd$d^2 / sum(inmate_matrix_tfidf_svd$d^2)) * 100
plot(
cumulativeVarianceExplained,
type='o',
ylim=c(0,100),
xlab='Index',
ylab='Percent of Variability Explained',
main='Singular Value Decomposition'
)
axis(1, at=seq(1,length(inmate_matrix_tfidf_svd$d)))
abline(v=k)
abline(h=90.0)
bar_colors <- c()
for (i in 1:length(inmate_matrix_tfidf_svd$d)) {
if (i == k) {
bar_colors <- c(bar_colors, 'brown2')
} else {
bar_colors <- c(bar_colors, 'chartreuse3')
}
}
png(
file='svd_singular_values.png',
width=1500,
height=1500,
)
plot.new()
barplot(
inmate_matrix_tfidf_svd$d,
names.arg=1:length(inmate_matrix_tfidf_svd$d),
col=bar_colors,
ylim=c(0, max(inmate_matrix_tfidf_svd$d) + 1),
border=NA,
xlab='Index',
ylab='Singular Value',
main='Singular Value Decomposition'
)
We’ll need to choose a K <= R
(R
being the rank) where R=60
. At K=38
, 90% of the variability of the original matrix is explained.
Now let us cluster and plot the prior occupation relative frequency distribution for each resulting cluster.
result <- cluster_matrix('TFIDF-SVD-AP', inmate_k_basis_vectors, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
inmate_cluster_labels <- result$in_labels
inmate_clusters_labels <- result$out_labels
if (k == 2) {
plot_clustering(
'TFIDF-SVD-AP',
inmate_k_basis_vectors,
inmate_clusters_flt,
inmate_clusters_colors,
inmate_cluster_labels
)
}
plot_clusters_occupation_distributions('TFIDF-SVD-AP', inmate_clusters_str, unique_prior_occupations)
After clustering, 40 clusters were found. Contrast this with the 48 found by TF-IDF alone which used 60 features instead the 38 used after applying SVD. Cluster membership ranges from one to 71.
Below are the cluster labels found by PMI:
- Cluster 1:
computer operator
- Cluster 2:
none
- Cluster 3:
kitchen worker
- Cluster 4:
computer programmer
- Cluster 5:
construction
- Cluster 6:
warehouse
- Cluster 7:
landscaper
- Cluster 8:
cabinet maker
- Cluster 9:
sales
- Cluster 10:
pipe fitter
- Cluster 11:
laborer
- Cluster 12:
barber
- Cluster 13:
machine operator
- Cluster 14:
truck driver
- Cluster 15:
oil field
- Cluster 16:
food service
- Cluster 17:
roofer
- Cluster 18:
fabricator
- Cluster 19:
cook
- Cluster 20:
computer technician
- Cluster 21:
janitor
- Cluster 22:
wrecker driver
- Cluster 23:
painter
- Cluster 24:
hydro-water blaster
- Cluster 25:
ranch hand
- Cluster 26:
heavy equipment operator
- Cluster 27:
iron worker
- Cluster 28:
shipping
- Cluster 29:
computer software
- Cluster 30:
clerk
- Cluster 31:
plumber's helper
- Cluster 32:
welder
- Cluster 33:
mechanic
- Cluster 34:
carpenter
- Cluster 35:
forklift
- Cluster 36:
heating
- Cluster 37:
unkown
- Cluster 38:
food service
- Cluster 39:
jewelry designer
- Cluster 40:
ac
Recap
We collected, parsed, and mined the prior occupations of current death row inmates. Interesting patterns discovered were the large portion of blue-collar occupations (most notably laborer
) and more rare prior occupations such as computer operator
and jewelry designer
. An interesting hypothesis test would be the correlation between being on death row and having been a laborer
. The SVD computation could be used for information retrieval allowing one to search for similar current inmates by some prior occupation query. Further analysis could include plotting the amount of each prior occupation seen per year.
Appendix
Full SVD AP Clustering
Computer operator Cluster 1
Inmate 1 computer operator
None Cluster 2
Inmate 3 none
Inmate 4 none
Inmate 8 none
Inmate 13 none
Inmate 18 none
Inmate 41 none
Inmate 42 none
Inmate 44 none
Inmate 50 none
Inmate 61 none
Inmate 63 none
Inmate 65 none
Inmate 66 none
Inmate 68 none
Inmate 177 none
Kitchen worker Cluster 3
Inmate 10 c("kitchen worker", "construction")
Inmate 48 kitchen worker
Computer programmer Cluster 4
Inmate 11 computer programmer
Inmate 156 c("computer programmer", "repair", "laborer")
Construction Cluster 5
Inmate 7 c("construction", "plumbing")
Inmate 15 construction
Inmate 16 construction
Inmate 17 construction
Inmate 45 construction
Inmate 47 construction
Inmate 54 construction
Inmate 116 c("construction", "laborer")
Warehouse Cluster 6
Inmate 5 c("warehouse", "construction")
Inmate 12 warehouse
Inmate 22 c("warehouse", "laborer")
Inmate 109 c("warehouse", "laborer")
Inmate 112 c("warehouse", "electrician", "laborer")
Landscaper Cluster 7
Inmate 25 c("landscaper", "cook")
Inmate 69 c("landscaper", "laborer")
Cabinet maker Cluster 8
Inmate 28 cabinet maker
Sales Cluster 9
Inmate 29 sales
Inmate 147 c("sound engineer", "laborer")
Pipe fitter Cluster 10
Inmate 33 pipe fitter
Inmate 60 c("truck driver", "backhoe operator", "welder", "pipe fitter")
Laborer Cluster 11
Inmate 6 repair
Inmate 21 laborer
Inmate 23 laborer
Inmate 26 laborer
Inmate 30 laborer
Inmate 32 laborer
Inmate 34 laborer
Inmate 35 laborer
Inmate 36 laborer
Inmate 38 laborer
Inmate 40 laborer
Inmate 43 laborer
Inmate 46 laborer
Inmate 51 laborer
Inmate 70 laborer
Inmate 71 laborer
Inmate 74 laborer
Inmate 77 c("electrician", "laborer")
Inmate 78 laborer
Inmate 79 laborer
Inmate 80 laborer
Inmate 82 laborer
Inmate 85 laborer
Inmate 86 laborer
Inmate 87 laborer
Inmate 89 c("maintenance", "laborer")
Inmate 90 laborer
Inmate 91 laborer
Inmate 94 laborer
Inmate 95 laborer
Inmate 96 laborer
Inmate 97 laborer
Inmate 98 laborer
Inmate 99 laborer
Inmate 100 c("fast food", "laborer")
Inmate 104 laborer
Inmate 107 laborer
Inmate 110 laborer
Inmate 111 c("fast food", "laborer")
Inmate 113 c("maintenance", "laborer")
Inmate 119 c("maintenance", "laborer")
Inmate 121 laborer
Inmate 122 laborer
Inmate 127 laborer
Inmate 128 laborer
Inmate 129 laborer
Inmate 130 laborer
Inmate 132 laborer
Inmate 133 laborer
Inmate 134 laborer
Inmate 136 laborer
Inmate 138 laborer
Inmate 139 laborer
Inmate 140 laborer
Inmate 142 laborer
Inmate 144 laborer
Inmate 145 laborer
Inmate 149 laborer
Inmate 150 laborer
Inmate 151 laborer
Inmate 152 laborer
Inmate 154 laborer
Inmate 157 laborer
Inmate 163 laborer
Inmate 166 laborer
Inmate 171 laborer
Inmate 174 laborer
Inmate 175 laborer
Inmate 176 laborer
Inmate 179 laborer
Inmate 181 laborer
Barber Cluster 12
Inmate 39 c("barber", "laborer")
Machine operator Cluster 13
Inmate 53 machine operator
Truck driver Cluster 14
Inmate 20 truck driver
Inmate 56 truck driver
Inmate 57 truck driver
Inmate 62 c("truck driver", "warehouse")
Inmate 88 c("truck driver", "laborer")
Oil field Cluster 15
Inmate 58 c("welder", "oil field", "architectural design", "construction")
Food service Cluster 16
Inmate 59 c("food service", "clerk")
Roofer Cluster 17
Inmate 64 roofer
Inmate 72 c("mechanic", "roofer", "concrete")
Fabricator Cluster 18
Inmate 75 c("fabricator", "refrigeration services")
Cook Cluster 19
Inmate 37 cook
Inmate 67 c("construction", "cook")
Inmate 81 c("cook", "laborer")
Inmate 83 c("cook", "laborer")
Inmate 118 c("cook", "construction", "welder", "laborer")
Inmate 124 c("cook", "welder", "laborer")
Inmate 137 c("cook", "mechanic", "laborer")
Computer technician Cluster 20
Inmate 84 c("computer technician", "laborer")
Janitor Cluster 21
Inmate 92 c("janitor", "laborer")
Wrecker driver Cluster 22
Inmate 93 c("wrecker driver", "construction", "lineman", "laborer")
Painter Cluster 23
Inmate 73 painter
Inmate 101 c("painter", "laborer")
Inmate 120 c("painter", "construction", "laborer")
Hydro-water blaster Cluster 24
Inmate 103 c("hydro-water blaster", "laborer")
Ranch hand Cluster 25
Inmate 105 c("repair", "ranch hand", "laborer")
Heavy equipment operator Cluster 26
Inmate 14 c("mechanic", "warehouse", "heavy equipment operator")
Inmate 114 c("heavy equipment operator", "laborer")
Inmate 162 c("carpenter", "heavy equipment operator", "laborer")
Iron worker Cluster 27
Inmate 115 c("iron worker", "construction", "carpet layer", "laborer")
Shipping Cluster 28
Inmate 123 c("shipping", "receiving", "clerk", "mechanic", "laborer")
Computer software Cluster 29
Inmate 126 c("computer software", "accounting", "laborer")
Clerk Cluster 30
Inmate 2 clerk
Inmate 27 clerk
Inmate 117 c("clerk", "welder", "laborer")
Inmate 131 c("clerk", "laborer")
Inmate 153 c("clerk", "orderly", "laborer")
"Plumber's helper" Cluster 31
Inmate 141 c("electrician", "plumber's helper", "laborer")
Welder Cluster 32
Inmate 19 welder
Inmate 31 c("welder", "lawn scaping", "laborer")
Inmate 55 welder
Inmate 76 c("welder", "fence builder")
Inmate 135 c("welder", "contractor", "laborer")
Inmate 143 c("welder", "laborer")
Mechanic Cluster 33
Inmate 9 c("mechanic", "welder")
Inmate 52 c("mechanic", "welder", "laborer")
Inmate 146 c("mechanic", "laborer")
Inmate 148 c("electrician", "mechanic", "laborer")
Inmate 158 c("mechanic", "laborer")
Inmate 161 c("auto", "mechanic", "laborer")
Inmate 182 mechanic
Carpenter Cluster 34
Inmate 106 c("warehouse", "carpenter", "laborer")
Inmate 108 c("maintenance", "carpenter", "laborer")
Inmate 125 c("welder", "carpenter", "laborer")
Inmate 155 c("carpenter", "laborer")
Inmate 164 c("carpenter", "laborer")
Forklift Cluster 35
Inmate 24 c("forklift", "warehouse")
Inmate 49 c("laborer", "forklift")
Inmate 159 c("forklift", "laborer")
Heating Cluster 36
Inmate 160 c("heating", "ac", "electronics", "exterminator", "laborer")
Unknown Cluster 37
Inmate 165 unknown
Inmate 167 unknown
Inmate 168 unknown
Inmate 169 unknown
Inmate 172 unknown
Inmate 173 unknown
Food service Cluster 38
Inmate 170 c("warehouse", "food service", "laborer")
Jewelry designer Cluster 39
Inmate 178 jewelry designer
Ac Cluster 40
Inmate 102 c("ac", "repair", "electrician", "laborer")
Inmate 180 ac
Full Source Code
# -----------------------
# (C) 2016 David lettier
# http://www.lettier.com/
# -----------------------
library(rvest)
library(wordcloud)
library(tm)
library(apcluster)
# -----------------------
# Scraping
# -----------------------
base_link <- 'https://www.tdcj.state.tx.us/death_row/'
main_link <- 'https://www.tdcj.state.tx.us/death_row/dr_offenders_on_dr.html'
csv_filename <- 'prior_occupations_dirty.csv'
if (!file.exists(csv_filename)) {
page <- read_html(main_link)
links <- html_attr(html_nodes(page, 'a'), 'href')
links <- grep('^dr_info.*html$', links, value=TRUE)
prior_occupations_dirty <- c()
for (link in links) {
texts <- read_html(paste(base_link, link, sep='')) %>% html_nodes('p') %>% html_text()
if (length(grep('Prior Prison', texts[3])) >= 1) {
prior_occupations_dirty <- c(prior_occupations_dirty, texts[2])
} else {
prior_occupations_dirty <- c(prior_occupations_dirty, texts[3])
}
}
prior_occupations_dirty_df = as.data.frame(prior_occupations_dirty)
write.csv(prior_occupations_dirty_df, file=csv_filename)
}
# -----------------------
# Cleaning
# -----------------------
csv_file <- read.csv(file=csv_filename)
prior_occupations_dirty <- as.vector(t(csv_file$prior_occupations_dirty))
prior_occupations_cleaned = c()
inmates_prior_occupations = list()
i <- 1
for (prior_occupation_dirty in prior_occupations_dirty) {
prior_occupation <- gsub('Prior Occupation', '', prior_occupation_dirty)
prior_occupation <- tolower(prior_occupation)
prior_occupation <- gsub('^\\W+|^\\s+', ' ', prior_occupation)
prior_occupation <- gsub('`', '', prior_occupation)
prior_occupation <- gsub(':', ' ', prior_occupation)
prior_occupation <- gsub('\\.', ' ', prior_occupation)
prior_occupation <- gsub('\r', ' ', prior_occupation)
prior_occupation <- gsub('\n', ' ', prior_occupation)
prior_occupation <- gsub('\\s+', ' ', prior_occupation)
if (length(grep('n/a', prior_occupation)) == 0) {
prior_occupation <- gsub('/', ', ', prior_occupation)
}
prior_occupation <- gsub(' and ', ', ', prior_occupation)
prior_occupation <- gsub('&', ', ', prior_occupation)
prior_occupation <- gsub(';', ', ', prior_occupation)
prior_occupation <- trimws(prior_occupation)
prior_occupations <- strsplit(prior_occupation, ', ')
prior_occupations <- lapply(prior_occupations, function (x) return(gsub('\\s+', ' ', x)))
prior_occupations <- lapply(prior_occupations, function (x) return(trimws(x)))
if (length(prior_occupations[[1]]) > 0) {
prior_occupations_cleaned <- c(prior_occupations_cleaned, prior_occupations, recursive=T)
inmates_prior_occupations[[toString(i)]] <- prior_occupations[[1]]
i <- i + 1
}
}
total_inmate_count = length(inmates_prior_occupations)
# -----------------------
# Duplication Removal
# -----------------------
prior_occupations_merged = c()
prior_occupations_merged_map = list()
for (prior_occupation_cleaned1 in prior_occupations_cleaned) {
prior_occupation <- prior_occupation_cleaned1
if (is.null(prior_occupations_merged_map[[prior_occupation]])) {
if (nchar(prior_occupation) >= 4) {
for (prior_occupation_cleaned2 in prior_occupations_cleaned) {
if (nchar(prior_occupation_cleaned2) >= 4) {
if (length(grep(prior_occupation_cleaned2, prior_occupation)) > 0) {
if (nchar(prior_occupation) >= nchar(prior_occupation_cleaned2)) {
prior_occupation <- prior_occupation_cleaned2
}
}
}
}
}
if (prior_occupation == 'n/a') {
prior_occupation <- 'none'
}
if (length(grep('air condition', prior_occupation)) > 0) {
prior_occupation <- 'ac'
}
if (length(grep('fork lift', prior_occupation)) > 0) {
prior_occupation <- 'forklift'
}
if (length(grep('forklift', prior_occupation)) > 0) {
prior_occupation <- 'forklift'
}
if (length(grep('clerical', prior_occupation)) > 0) {
prior_occupation <- 'clerk'
}
if (length(grep('welding', prior_occupation)) > 0) {
prior_occupation <- 'welder'
}
prior_occupations_merged_map[[prior_occupation_cleaned1]] <- prior_occupation
} else {
prior_occupation <- prior_occupations_merged_map[[prior_occupation]]
}
prior_occupations_merged <- c(prior_occupations_merged, prior_occupation)
}
total_prior_occupation_count = length(prior_occupations_merged)
i <- 1
for (inmate_prior_occupations in inmates_prior_occupations) {
temp = c()
for (prior_occupation in inmate_prior_occupations) {
temp <- c(temp, prior_occupations_merged_map[[prior_occupation]])
}
inmates_prior_occupations[[toString(i)]] <- temp
i <- i + 1
}
unique_prior_occupations = sort(unique(prior_occupations_merged))
unique_prior_occupations_length <- length(unique_prior_occupations)
# -----------------------
# Prior Occupation Relative Frequency Chart
# -----------------------
reverse_table <- function(t) { t[sort(names(t), decreasing=T)] }
prior_occupations_table <- reverse_table(table(prior_occupations_merged))
prior_occupations_table <- prior_occupations_table / sum(prior_occupations_table) * 100
png(file='prior_occupation_relative_freq_dist.png', width=1500, height=1500)
par(
mai=c(2.0, 3.0, 1.0, 1.0)
)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
las=1,
xlim=c(0, 100),
axes=T,
border=NA
)
axis(1, at=seq(0,100,5))
grid(nx=100, ny=NA)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
border=NA,
las=1,
xlim=c(0, 100),
axes=T,
add=T,
xlab=paste('Percentage of', total_prior_occupation_count, 'listed Prior Occupations'),
main=paste('Prior Occupation Relative Frequency Distribution of', total_inmate_count, 'Texas Death Row Inmates')
)
mtext('Prior Occupation', side=2, line=13)
# -----------------------
# Inmate x Prior Occupations
# -----------------------
inmate_matrix_count <- c()
for (row in 1:length(inmates_prior_occupations)) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(row)]]
for (unique_prior_occupation in unique_prior_occupations) {
inmate_matrix_count <- c(
inmate_matrix_count,
sum(unique_prior_occupation == inmate_prior_occupations)
)
}
}
inmate_matrix_count = matrix(
inmate_matrix_count,
nrow=total_inmate_count,
ncol=unique_prior_occupations_length,
byrow=T
)
# -----------------------
# TF-IDF
# -----------------------
inmate_matrix_tfidf <- matrix(
c(
1:(total_inmate_count * unique_prior_occupations_length)
) * 0,
nrow=total_inmate_count,
ncol=unique_prior_occupations_length,
byrow=T
)
nrow_count <- nrow(inmate_matrix_count)
for (row in 1:nrow(inmate_matrix_count)) {
for (col in 1:ncol(inmate_matrix_count)) {
tf <- inmate_matrix_count[row,col]
df <- length(which(inmate_matrix_count[,col] > 0))
idf <- log(nrow_count / (df + 1))
tfidf <- tf * idf
inmate_matrix_tfidf[row,col] <- tfidf
}
}
inmate_matrix_tfidf_l2_norm = c()
for (row in 1:nrow(inmate_matrix_tfidf)) {
row_vector <- inmate_matrix_tfidf[row,]
row_vector_norm <- row_vector/(sqrt(sum(row_vector^2)))
inmate_matrix_tfidf_l2_norm = c(
inmate_matrix_tfidf_l2_norm,
row_vector_norm
)
}
inmate_matrix_tfidf_l2_norm <- matrix(
inmate_matrix_tfidf_l2_norm,
nrow=total_inmate_count,
ncol=unique_prior_occupations_length,
byrow=T
)
# Uncomment to use the L2 norm version of TF-IDF.
# inmate_matrix_tfidf <- inmate_matrix_tfidf_l2_norm
# -----------------------
# MDS
# -----------------------
plot_scale <- 1.0
scatter_plot_matrix <- function(prefix, input_matrix) {
png(file=paste(tolower(prefix), 'scatter_plot.png', sep='_'), width=1500, height=1500)
plot.new()
input_matrix[,1]
input_matrix[,2]
textplot(
input_matrix[, 1],
input_matrix[, 2],
seq(1:length(input_matrix[,1])),
xlim=c(
min(input_matrix[, 1]) * plot_scale,
max(input_matrix[, 1]) * plot_scale
),
ylim=c(
min(input_matrix[, 2]) * plot_scale,
max(input_matrix[, 2]) * plot_scale
),
main=paste('Inmate Prior Occupations 2D Visualization', paste0('(', prefix, ')'))
)
}
dist_matrix <- dist(inmate_matrix_count, method='euclidean')
inmate_matrix_count_mds <- cmdscale(dist_matrix)
scatter_plot_matrix('COUNT-MDS', inmate_matrix_count_mds)
dist_matrix <- dist(inmate_matrix_tfidf, method='euclidean')
inmate_matrix_tfidf_mds <- cmdscale(dist_matrix)
scatter_plot_matrix('TFIDF-MDS', inmate_matrix_tfidf_mds)
# -----------------------
# AP CLUSTERING
# -----------------------
cluster_matrix <- function(prefix, input_matrix, inmates_prior_occupations) {
apresult <- apcluster(negDistMat(r=2), input_matrix)
input_cluster_labels = labels(apresult, type='enum')
inmate_clusters_flt <- list()
inmate_clusters_str <- list()
i <- 1
for (input_cluster_label in input_cluster_labels) {
input_cluster_label <- toString(input_cluster_label)
if (is.null(inmate_clusters_flt[[input_cluster_label]])) {
inmate_clusters_flt[[input_cluster_label]] <- c()
inmate_clusters_str[[input_cluster_label]] <- list()
}
inmate_clusters_flt[[input_cluster_label]] <- c(
inmate_clusters_flt[[input_cluster_label]],
input_matrix[i,]
)
i_str <- toString(i)
temp = list()
temp[[i_str]] <- inmates_prior_occupations[[i_str]]
inmate_clusters_str[[input_cluster_label]] <- append(
inmate_clusters_str[[input_cluster_label]],
temp
)
i <- i + 1
}
for (cluster_i in 1:length(inmate_clusters_flt)) {
cluster_i_str <- toString(cluster_i)
cluster <- matrix(inmate_clusters_flt[[cluster_i_str]], ncol=2, byrow=T)
inmate_clusters_flt[[cluster_i_str]] <- cluster
}
inmate_clusters_colors = rainbow(length(inmate_clusters_flt), v=0.8, s=0.5)
pc <- function(cn, b) {
if (b == 1) {
return(sum(input_cluster_labels == cn) / length(input_cluster_labels))
} else {
return(sum(input_cluster_labels != cn) / length(input_cluster_labels))
}
}
pt <- function(ct, b) {
if (b == 1) {
in_d <- 0
for (inmate_prior_occupations in inmates_prior_occupations) {
count <- sum(ct == inmate_prior_occupations)
if (count > 0) {
in_d <- in_d + 1
}
}
return(in_d / length(inmates_prior_occupations))
} else {
n_in_d <- 0
for (inmate_prior_occupations in inmates_prior_occupations) {
count <- sum(ct == inmate_prior_occupations)
if (count == 0) {
n_in_d <- n_in_d + 1
}
}
return(n_in_d / length(inmates_prior_occupations))
}
}
pct <- function(cn, ct, b) {
if (all(b == c(0, 0))) {
n_in_d <- 0
inmate_indexes <- which(!input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count == 0) {
n_in_d <- n_in_d + 1
}
}
return(n_in_d / length(inmate_indexes))
} else if (all(b == c(1, 1))) {
in_d <- 0
inmate_indexes <- which(input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count > 0) {
in_d <- in_d + 1
}
}
return(in_d / length(inmate_indexes))
} else if (all(b == c(1, 0))) {
n_in_d <- 0
inmate_indexes <- which(input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count == 0) {
n_in_d <- n_in_d + 1
}
}
return(n_in_d / length(inmate_indexes))
} else if (all(b == c(0, 1))) {
in_d <- 0
inmate_indexes <- which(!input_cluster_labels %in% cn)
for (inmate_index in inmate_indexes) {
inmate_prior_occupations <- inmates_prior_occupations[[toString(inmate_index)]]
count <- sum(ct == inmate_prior_occupations)
if (count > 0) {
in_d <- in_d + 1
}
}
return(in_d / length(inmate_indexes))
}
}
pmi <- function(cn, ct) {
pmi_score <- 0
for (in_c in 0:1) {
for (has_t in 0:1) {
jpct_r <- pct(cn, ct, c(in_c, has_t))
pc_r <- pc(cn, in_c)
pt_r <- pt(ct, has_t)
result <- jpct_r * log2(jpct_r / (pc_r * pt_r))
if (is.nan(result)) {
result <- 0
}
pmi_score <- pmi_score + result
}
}
return(pmi_score)
}
output_cluster_labels <- list()
for(i in 1:length(inmate_clusters_str)) {
cluster <- inmate_clusters_str[[toString(i)]]
label <- ''
max_pmi_score <- -1.0
seen <- c()
for (inmate in cluster) {
for (prior_occupation in inmate) {
if (sum(prior_occupation == seen) == 0) {
pmi_score <- pmi(i, prior_occupation)
if (pmi_score > max_pmi_score) {
output_cluster_labels[[toString(i)]] <- prior_occupation
max_pmi_score <- pmi_score
}
seen <- c(seen, prior_occupation)
}
}
}
}
write(
sapply(
sort(as.integer(names(inmate_clusters_str))),
function(x) paste(
paste0(
toupper(substring(output_cluster_labels[[toString(x)]], 1, 1)),
substring(
output_cluster_labels[[toString(x)]],
2,
nchar(output_cluster_labels[[toString(x)]])
)
),
'Cluster',
toString(x),
'\n ',
paste('Inmate', names(inmate_clusters_str[[toString(x)]]), inmate_clusters_str[[toString(x)]], collapse='\n ')
)),
paste(tolower(prefix), 'inmate_clusters.txt', sep='_')
)
return(
list(
flt=inmate_clusters_flt,
str=inmate_clusters_str,
colors=inmate_clusters_colors,
in_labels=input_cluster_labels,
out_labels=output_cluster_labels
)
)
}
plot_clustering <- function(prefix, input_matrix, inmate_cluster_flt, inmate_clusters_colors, inmate_cluster_labels) {
png(file=paste(tolower(prefix), 'inmate_clusters.png', sep='_'), width=1500, height=1500)
plot.new()
plot(
input_matrix[,1],
input_matrix[,2],
main=paste('Death Row Inmate Clustering by Prior Occupations', paste0('(', prefix, ')'))
)
for (cluster_i in 1:length(inmate_clusters_flt)) {
cluster_i_str <- toString(cluster_i)
points(
inmate_clusters_flt[[cluster_i_str]],
col=inmate_clusters_colors[cluster_i],
bg=inmate_clusters_colors[cluster_i],
pch=19,
cex=5,
lwd=2
)
}
text(
input_matrix[, 1],
input_matrix[, 2],
inmate_cluster_labels,
xlim=c(
min(input_matrix[, 1]) * plot_scale,
max(input_matrix[, 1]) * plot_scale
),
ylim=c(
min(input_matrix[, 2]) * plot_scale,
max(input_matrix[, 2]) * plot_scale
),
)
legend(
'topleft',
legend=lapply(c(1:length(inmate_clusters_colors)), function (x) return(toString(x))),
fill=inmate_clusters_colors,
cex=1.3,
pt.cex=1
)
}
plot_clusters_occupation_distributions <- function(prefix, inmate_clusters_str, unique_prior_occupations) {
for (i in 1:length(inmate_clusters_str)) {
cluster <- inmate_clusters_str[[toString(i)]]
cluster_prior_occupations <- c()
for (inmate_prior_occupations in cluster) {
cluster_prior_occupations <- c(cluster_prior_occupations, inmate_prior_occupations)
}
prior_occupations_table <- reverse_table(table(factor(cluster_prior_occupations, levels=unique_prior_occupations)))
prior_occupations_table <- prior_occupations_table / sum(prior_occupations_table) * 100
png(file=paste(tolower(prefix), toString(i), 'prior_occupation_distribution.png', sep='_'), width=1500, height=1500)
plot.new()
par(
mai=c(2.0, 3.0, 1.0, 1.0)
)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
las=1,
xlim=c(0, 100),
axes=T,
border=NA
)
axis(1, at=seq(0,100,5))
grid(nx=100, ny=NA)
barplot(
prior_occupations_table,
horiz=T,
col=rainbow(length(prior_occupations_table)),
cex.names=1.1,
las=1,
xlim=c(0, 100),
axes=T,
add=T,
ylab='',
xlab=paste('Percentage of', length(cluster_prior_occupations), 'listed Prior Occupations'),
main=paste(
'Prior Occupation Relative Frequency Distribution of',
length(cluster),
'Texas Death Row Inmate(s) for Cluster',
toString(i),
paste0(
'(',
prefix,
')'
)
),
border=NA
)
mtext('Prior Occupation', side=2, line=13)
dev.off()
}
}
result <- cluster_matrix('COUNT-AP', inmate_matrix_count, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
plot_clusters_occupation_distributions('COUNT-AP', inmate_clusters_str, unique_prior_occupations)
result <- cluster_matrix('TFIDF-AP', inmate_matrix_tfidf, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
plot_clusters_occupation_distributions('TFIDF-AP', inmate_clusters_str, unique_prior_occupations)
result <- cluster_matrix('COUNT-MDS-AP', inmate_matrix_count_mds, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
inmate_cluster_labels <- result$in_labels
inmate_clusters_labels <- result$out_labels
plot_clustering('COUNT-MDS-AP', inmate_matrix_count_mds, inmate_clusters_flt, inmate_clusters_colors, inmate_cluster_labels)
plot_clusters_occupation_distributions('COUNT-MDS-AP', inmate_clusters_str, unique_prior_occupations)
result <- cluster_matrix('TFIDF-MDS-AP', inmate_matrix_tfidf_mds, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
inmate_cluster_labels <- result$in_labels
inmate_clusters_labels <- result$out_labels
plot_clustering('TFIDF-MDS-AP', inmate_matrix_tfidf_mds, inmate_clusters_flt, inmate_clusters_colors, inmate_cluster_labels)
plot_clusters_occupation_distributions('TFIDF-MDS-AP', inmate_clusters_str, unique_prior_occupations)
# -----------------------
# SVD
# -----------------------
inmate_matrix_tfidf_svd <- svd(inmate_matrix_tfidf)
k <- 38
inmate_k_basis_vectors = inmate_matrix_tfidf_svd$u[, 1:k]
k_singular_values_matrix = diag(inmate_matrix_tfidf_svd$d)[1:k, 1:k]
inmate_matrix_tfidf_svd_truncated <- inmate_k_basis_vectors %*% k_singular_values_matrix %*% t(inmate_matrix_tfidf_svd$v[, 1:k])
png(file='svd_explained_variance.png', width=1500, height=1500)
plot.new()
cumulativeVarianceExplained = cumsum(inmate_matrix_tfidf_svd$d^2 / sum(inmate_matrix_tfidf_svd$d^2)) * 100
plot(
cumulativeVarianceExplained,
type='o',
ylim=c(0,100),
xlab='Index',
ylab='Percent of Variability Explained',
main='Singular Value Decomposition'
)
axis(1, at=seq(1,length(inmate_matrix_tfidf_svd$d)))
abline(v=k)
abline(h=90.0)
bar_colors <- c()
for (i in 1:length(inmate_matrix_tfidf_svd$d)) {
if (i == k) {
bar_colors <- c(bar_colors, 'brown2')
} else {
bar_colors <- c(bar_colors, 'chartreuse3')
}
}
png(
file='svd_singular_values.png',
width=1500,
height=1500,
)
plot.new()
barplot(
inmate_matrix_tfidf_svd$d,
names.arg=1:length(inmate_matrix_tfidf_svd$d),
col=bar_colors,
ylim=c(0, max(inmate_matrix_tfidf_svd$d) + 1),
border=NA,
xlab='Index',
ylab='Singular Value',
main='Singular Value Decomposition'
)
result <- cluster_matrix('TFIDF-SVD-AP', inmate_k_basis_vectors, inmates_prior_occupations)
inmate_clusters_flt <- result$flt
inmate_clusters_str <- result$str
inmate_clusters_colors <- result$colors
inmate_cluster_labels <- result$in_labels
inmate_clusters_labels <- result$out_labels
if (k == 2) {
plot_clustering(
'TFIDF-SVD-AP',
inmate_k_basis_vectors,
inmate_clusters_flt,
inmate_clusters_colors,
inmate_cluster_labels
)
}
plot_clusters_occupation_distributions('TFIDF-SVD-AP', inmate_clusters_str, unique_prior_occupations)