Saturday, October 18, 2025

Cluster analysis of Pyu corpus


Get the inventory of Pyu inscriptions

Retrieve the inventory of inscriptions from the xlsx file: (https://zenodo.org/records/1009598)

library(readxl)

# Load the inventory spreadsheet and, in the same step, replace its column
# headers with short, syntactic names (assigned positionally, left to right).
pyu_inventory <- setNames(
  read_excel("pyu_inventory_1009598.xlsx"),
  c(
    "Serial_no", "Support", "Faces_Lines", "Dimensions_cm", "Language_s",
    "Original_locality", "Present_locality", "Substance_of_Inscription",
    "Available_reproductions", "References", "Remarks"
  )
)
Inventory of Pyu inscriptions(4 out of 172 rows)
Inventory of Pyu inscriptions(4 out of 172 rows)

Remove editorial marks from inscription text in dataframe

In the tibble x1_df.2 (a copy of x1_df.1), remove all editorial marks from x1_df.2$combined_values before the cluster analysis.

library(magrittr)
library(stringr)

# Work on a copy so the plaintext corpus held in x1_df.1 is left untouched.
x1_df.2 <- x1_df.1

# One regex alternative per editorial mark, joined into a single pattern.
patt <- paste0(
  c("\\#", "\\[|\\]", "\\(|\\)", "〈|〉", "〈〈|〉〉", "\\{\\{|\\}\\}", "\\?", "C", "V", "\\+", "◊"),
  collapse = "|"
)

# Delete every mark, then collapse the whitespace the deletions leave behind.
x1_df.2$combined_values <- str_squish(gsub(patt, "", x1_df.2$combined_values))

# Sanity check: indices of texts that still contain a mark (none expected).
grep(patt, x1_df.2$combined_values)
integer(0)

Merge dataframe of Pyu inscriptions with the dataframe of the inventory of Pyu inscriptions

Merging is done in order to have access to metadata.

# Numeric join key: the inscription number converted with as.numeric(), to be
# matched against the inventory's Serial_no column.
x1_df.2$Serial_no <- as.numeric(x1_df.2$InscriptionNumber)

# Rename "combined_values" by name rather than by column position, so the
# rename still targets the right column if the column order ever changes.
names(x1_df.2)[names(x1_df.2) == "combined_values"] <- "Inscription_text"

# Left join: keep every inscription in x1_df.2 and attach inventory metadata
# where a matching Serial_no exists (unmatched rows get NA metadata).
merged_pyu <- merge(x1_df.2, pyu_inventory, by = "Serial_no", all.x = TRUE)
Pyu inscriptions with metadata(4 out of 196 rows)
Pyu inscriptions with metadata(4 out of 196 rows)

Create quanteda corpus

Create a corpus from the df. First remove “@||” and “/” from the inscription texts: “@||” likely indicates a gap in transcription or an incomplete word, and “/” likely signifies the end of a line within the inscription.

library(quanteda)

# Strip every "@", "|" and "/" character from the inscription texts, then
# normalise the remaining whitespace.
merged_pyu$Inscription_text <- str_squish(
  gsub("[\\@\\|\\/]", "", merged_pyu$Inscription_text)
)

# Build the corpus from the merged data frame: documents are named after
# InscriptionNumber and their text comes from Inscription_text.
pyu_corp_Wdocvars <- merged_pyu %>%
  corpus(docid_field = "InscriptionNumber", text_field = "Inscription_text")

Compute document similarity

The textstat_dist() function calculates distances between documents or features for various measures. Its output can be converted with as.dist() to the format of R’s dist(), so hierarchical clustering can be performed without any further transformation.

For this analysis we select, quite arbitrarily, inscriptions with seven or more tokens (feature occurrences). This gives us 58 inscriptions out of the 196.

library(quanteda)
library(quanteda.textstats)

# Tokenise the corpus with the "fasterword" tokeniser and turn the tokens
# into a document-feature matrix.
pyu_dfm <- pyu_corp_Wdocvars %>%
  tokens(what = "fasterword") %>%
  dfm()

# Keep only the documents that have at least 7 tokens.
# (To list them: docnames(sub_pyu_dfm))
sub_pyu_dfm <- dfm_subset(pyu_dfm, ntoken(pyu_dfm) >= 7)

The following is the list of inscriptions excluded from analysis:

library(tibble)
library(kableExtra) # provides kbl(), column_spec() and kable_styling()

# Names of the documents that fall below the 7-token threshold and are
# therefore left out of the cluster analysis.
xn <- docnames(dfm_subset(pyu_dfm, ntoken(pyu_dfm) < 7))

# Select the two display columns by name instead of by position (2:3), so the
# table does not silently change if the column order of merged_pyu changes.
xins <- merged_pyu[
  merged_pyu$InscriptionNumber %in% xn,
  c("InscriptionNumber", "Inscription_text")
]

t_xins <- as_tibble(xins)
# Show a literal "NA" for inscriptions whose text is an empty string.
t_xins$Inscription_text <- ifelse(
  t_xins$Inscription_text == "", "NA", t_xins$Inscription_text
)

# Lay the rows out as up to three side-by-side (number, text) column pairs.
# The rows per pair are derived from the row count instead of being
# hard-coded as 1:46 / 47:92 / 93:138 (for 138 rows the split is identical).
n_excluded <- nrow(t_xins)
rows_per_block <- max(1, ceiling(n_excluded / 3))
block_rows <- split(
  seq_len(n_excluded),
  ceiling(seq_len(n_excluded) / rows_per_block)
)
# Flat, unnamed list of one-column tibbles: number, text, number, text, ...
table_parts <- do.call(
  c,
  unname(lapply(block_rows, function(rows) {
    list(t_xins[rows, 1], t_xins[rows, 2])
  }))
)

# Odd columns hold inscription numbers, even columns hold inscription texts.
# The caption's bold tag is now closed properly (was "<b>...<b>").
table_parts %>%
  kbl(caption = "<b>Inscriptions excluded from cluster analysis</b>") %>%
  column_spec(c(1, 3, 5), width = "2.5cm", color = "blue") %>%
  column_spec(c(2, 4, 6), width = "5cm") %>%
  kable_styling(
    font_size = 10, full_width = FALSE, position = "left",
    bootstrap_options = c("striped", "condensed", "bordered")
  )

Distance measures of Pyu inscriptions are computed. We use the default option of Euclidean distance.

# Pairwise document distances (textstat_dist() with its default method),
# converted to a "dist" object for use with hclust().
tstat_dist <- sub_pyu_dfm %>%
  textstat_dist(margin = "documents") %>%
  as.dist()

Here it is interesting to see that even the two copies of the well-known Myazedi inscription (007 and 008) are not identical, as shown by the distance of 10.148892 between them.

Computed distance among Pyu inscriptions
Computed distance among Pyu inscriptions

Test plot the dendrogram with dendextend package

library(dendextend)

# Hierarchical clustering of the distance object with hclust()'s defaults.
hc <- tstat_dist %>% hclust()

# Convert to a dendrogram and shrink the leaf labels.
dend <- hc %>%
  as.dendrogram() %>%
  dendextend::set("labels_cex", 0.6)

# Plot margins and a smaller axis annotation size for the plot below.
par(mar = c(5, 4, 4, 2) + 0.1, cex.axis = 0.7)
plot(dend, main = "Test plot", cex.main = 0.8)

Create more informative labels for the leaves of the dendrogram

As seen above, inscription numbers are used to label the leaves of the dendrogram by default. We’ll try creating more informative labels by using some metadata from the inventory of inscriptions retrieved earlier. Unfortunately, the inventory does not cover all of the inscriptions we retrieved from the plaintext corpus of Pyu inscriptions published on Zenodo.

Plot dendrograms with new labels

You can see the inscriptions, and more of their metadata, in the inscription texts and the inventory of inscriptions retrieved earlier in this post.

library(quanteda)
library(quanteda.textstats)
library(dendextend)

# Rebuild the document-feature matrix from the corpus.
pyu_dfm <- dfm(tokens(pyu_corp_Wdocvars, what = "fasterword"))
# Subset documents with at least 7 tokens
sub_pyu_dfm <- dfm_subset(pyu_dfm, ntoken(pyu_dfm) >= 7)

# Pairwise document distances, then hierarchical clustering.
tstat_dist <- as.dist(textstat_dist(sub_pyu_dfm, margin = "documents"))

hc <- hclust(tstat_dist)
dend <- as.dendrogram(hc)

# df_ordered is built elsewhere in the post.
# NOTE(review): assumes its rows follow the dendrogram's leaf order - confirm.
# Fail loudly on a length mismatch instead of letting labels be recycled.
stopifnot(length(df_ordered$new_labels) == length(labels(dend)))
labels(dend) <- df_ordered$new_labels

labels_cex(dend) <- 0.6
# set y-axis label size; the larger bottom margin leaves room for long labels
par(mar = c(9, 4, 4, 2) + 0.1, cex.axis = 0.6)

# Draw the plot, then add the title as a separate call: piping plot() into
# title() only passed plot()'s NULL return value along as a stray argument.
plot(dend)
title(main = "Hierarchical Clustering of Pyu inscriptions\n (for 58 inscriptions with 7 or more features)", cex.main = .8)

To view the structure of the dendrogram more clearly, a rank_branches plot can be used. This sets the vertical distance between any two connected nodes (a parent and its child branch) to 1, so all merges happen at regular, incremental heights. The y-axis of the regular dendrogram is then no longer meaningful and is therefore removed in the following plot.

# Shrink the leaf labels.
dend <- dendextend::set(dend, "labels_cex", 0.6)

# Plot margins (wide bottom margin) and a smaller axis annotation size.
par(mar = c(9, 4, 4, 2), cex.axis = 0.6)

# Draw the dendrogram with rank-based branch heights and no y-axis.
dend %>%
  rank_branches() %>%
  plot(yaxt = "n")
title(main = "Hierarchical Clustering (rank_branches) of Pyu inscriptions\n (for 58 inscriptions with 7 or more features)", cex.main = .8)

No comments:

Post a Comment