Saturday, October 18, 2025

Cluster analysis of Pyu corpus


Get the inventory of Pyu inscriptions

Retrieve the inventory of inscriptions from the xlsx file: (https://zenodo.org/records/1009598)

library(readxl)
# Load the inventory of Pyu inscriptions (Zenodo record 1009598)
pyu_inventory <- read_excel("pyu_inventory_1009598.xlsx")

# Assign shorter, syntactic column names for convenient downstream handling
inventory_cols <- c(
  "Serial_no", "Support", "Faces_Lines", "Dimensions_cm", "Language_s",
  "Original_locality", "Present_locality", "Substance_of_Inscription",
  "Available_reproductions", "References", "Remarks"
)
names(pyu_inventory) <- inventory_cols
Inventory of Pyu inscriptions(4 out of 172 rows)
Inventory of Pyu inscriptions(4 out of 172 rows)

Remove editorial marks from inscription text in dataframe

In tibble x1_df.2 (copy of x1_df.1), remove all editorial marks from x1_df.2$combined_values for cluster analysis

library(magrittr)
library(stringr)
# Work on a copy of the plain-text Pyu corpus so the original stays intact
x1_df.2 <- x1_df.1

# Editorial conventions to strip before cluster analysis: brackets,
# angle/curly marks, illegible-aksara symbols (?, C, V, +), punctuation space
edit_marks <- c(
  "\\#", "\\[|\\]", "\\(|\\)", "〈|〉", "〈〈|〉〉",
  "\\{\\{|\\}\\}", "\\?", "C", "V", "\\+", "◊"
)
patt <- paste0(edit_marks, collapse = "|")

x1_df.2$combined_values <- x1_df.2$combined_values %>%
  gsub(patt, "", .) %>%
  str_squish()

# Verify that no editorial marks remain (expect integer(0))
grep(patt, x1_df.2$combined_values)
integer(0)

Merge dataframe of Pyu inscriptions with the dataframe of the inventory of Pyu inscriptions

Merging is done in order to have access to metadata.

# Add a numeric key matching the inventory's Serial_no column
x1_df.2$Serial_no <- as.numeric(x1_df.2$InscriptionNumber)

# Rename the second column ("combined_values") to "Inscription_text"
colnames(x1_df.2)[2] <- "Inscription_text"

# Left join: keep every inscription row, attach inventory metadata when present
merged_pyu <- merge(x1_df.2, pyu_inventory, by = "Serial_no", all.x = TRUE)
Pyu inscriptions with metadata(4 out of 196 rows)
Pyu inscriptions with metadata(4 out of 196 rows)

Create quanteda corpus

Create corpus from df. In the df, first remove “@||”, which likely indicates a gap in transcription or an incomplete word, and “/”, which likely signifies the end of a line within the inscription.

# Strip the "@", "|" and "/" transcription symbols, then collapse extra spaces
merged_pyu$Inscription_text <- merged_pyu$Inscription_text %>%
  gsub("[\\@\\|\\/]", "", .) %>%
  str_squish()

# Build a quanteda corpus; remaining data-frame columns become docvars
library(quanteda)
pyu_corp_Wdocvars <- corpus(
  merged_pyu,
  docid_field = "InscriptionNumber",
  text_field = "Inscription_text"
)

Compute document similarity

The textstat_dist() function calculates similarities of documents or features for various measures. The output is compatible with R’s dist(), so hierarchical clustering can be performed without any transformation.

For this analysis we select, quite arbitrarily, inscriptions with seven or more features. This gives us 58 inscriptions out of the 196.

library(quanteda)
library(quanteda.textstats)

# Tokenise on whitespace only ("fasterword") and build the
# document-feature matrix of Pyu inscriptions
pyu_toks <- tokens(pyu_corp_Wdocvars, what = "fasterword")
pyu_dfm <- dfm(pyu_toks)

# Keep only inscriptions with at least 7 features (58 of the 196 documents)
sub_pyu_dfm <- dfm_subset(pyu_dfm, ntoken(pyu_dfm) >= 7)
# dfm_subset(pyu_dfm, ntoken(pyu_dfm) >= 7) %>% docnames()

The following is the list of inscriptions excluded from analysis:

# view inscriptions to be excluded from cluster analysis
library(kableExtra)  # FIX: kbl()/column_spec()/kable_styling() were used
                     # below without kableExtra being loaded in this session
library(tibble)

xn <- dfm_subset(pyu_dfm, ntoken(pyu_dfm) < 7) %>% docnames()
xins <- merged_pyu[merged_pyu$InscriptionNumber %in% xn, 2:3]

t_xins <- as_tibble(xins)
# Show "NA" for inscriptions whose text is empty after mark removal
t_xins$Inscription_text <- ifelse(t_xins$Inscription_text == "", "NA",
                                  t_xins$Inscription_text)

# Split the excluded rows into three 46-row panels of two columns each so the
# table fits side by side on the page.
# NOTE(review): assumes exactly 138 excluded rows — confirm if the >= 7
# threshold changes.
chunks <- list(t_xins[1:46, ], t_xins[47:92, ], t_xins[93:138, ])
panels <- unlist(lapply(chunks, function(ch) list(ch[, 1], ch[, 2])),
                 recursive = FALSE)

# FIX: caption's bold tag was unclosed ("<b>...<b>")
panels %>%
  kbl(caption = "<b>Inscriptions excluded from cluster analysis</b>") %>%
  column_spec(c(1, 3, 5), width = "2.5cm", color = "blue") %>%
  column_spec(c(2, 4, 6), width = "5cm") %>%
  kable_styling(font_size = 10, full_width = FALSE, position = "left",
                bootstrap_options = c("striped", "condensed", "bordered"))

Distance measures of Pyu inscriptions are computed. We use the default option of Euclidean distance.

# Compute pairwise (Euclidean, the default) distances between inscription
# documents, and convert to a base-R "dist" object for hclust()
dist_mat <- textstat_dist(sub_pyu_dfm, margin = "documents")
tstat_dist <- as.dist(dist_mat)

Here it is interesting to see that even the two copies of the same well-known Myazedi inscription (007 and 008) are not identical, as shown by a distance of 10.148892.

Computed distance among Pyu inscriptions
Computed distance among Pyu inscriptions

Test plot the dendrogram with dendextend package

library(dendextend)

# Hierarchical clustering (complete linkage by default) of the distances
hc <- hclust(tstat_dist)
dend <- as.dendrogram(hc)

# Shrink the leaf labels so all 58 inscription numbers remain legible
labels_cex(dend) <- 0.6

# Default margins plus slightly smaller axis annotation
par(mar = c(5, 4, 4, 2) + 0.1, cex.axis = 0.7)
plot(dend, main = "Test plot", cex.main = 0.8)

Create more informative labels for the leaves of the dendrogram

As seen above, inscription numbers are used to label the leaves of the dendrogram by default. We’ll try creating more informative labels by using some metadata from the inventory of inscriptions retrieved earlier. Unfortunately, the inventory does not cover all of the inscriptions we retrieved from the plaintext corpus of Pyu inscriptions published on Zenodo.

Plot dendrograms with new labels

You can see inscriptions or more of their metadata from the inscription texts and inventory of inscriptions retrieved earlier on this post.

library(quanteda)
library(quanteda.textstats)
library(dendextend)

# Rebuild the dfm and distance matrix (same pipeline as above)
pyu_dfm <- dfm(tokens(pyu_corp_Wdocvars, what = "fasterword"))
# docnames(pyu_dfm) <- x1_df.1$InscriptionNumber
# Subset documents with at least 7 tokens
# (FIX: comment previously said 10, but the code uses 7)
sub_pyu_dfm <- dfm_subset(pyu_dfm, ntoken(pyu_dfm) >= 7)

tstat_dist <- as.dist(textstat_dist(sub_pyu_dfm, margin = "documents"))

hc <- hclust(tstat_dist)
dend <- as.dendrogram(hc)
# Attach the metadata-based labels.
# NOTE(review): df_ordered is built elsewhere; its new_labels must be in
# dendrogram leaf order — confirm against the code that creates it.
labels(dend) <- df_ordered$new_labels

labels_cex(dend) <- 0.6
# Wider bottom margin for the long rotated leaf labels
par(mar = c(9, 4, 4, 2) + 0.1, cex.axis = 0.6)

# BUG FIX: `plot(dend) %>% title(main = ...)` pipes plot()'s invisible NULL
# return into title()'s first argument (main), which then clashes with the
# named main= argument. plot() and title() must be called sequentially.
plot(dend)
title(main = "Hierarchical Clustering of Pyu inscriptions\n (for 58 inscriptions with 7 or more features)", cex.main = .8)

To view the structure of the dendrogram more clearly, a rank_branches plot can be used. This sets the vertical distance between any two connected nodes (a parent and its child branch) to 1, so that all merges happen at regular, incremental heights. The y axis of the regular dendrogram is then no longer meaningful, and it is therefore removed in the following plot.

labels_cex(dend) <- 0.6

# Wider bottom margin; the y axis is suppressed below because ranked branch
# heights carry no distance information
par(mar = c(9, 4, 4, 2) + 0, cex.axis = 0.6)
ranked_dend <- rank_branches(dend)
plot(ranked_dend, yaxt = "n")
title(main = "Hierarchical Clustering (rank_branches) of Pyu inscriptions\n (for 58 inscriptions with 7 or more features)", cex.main = .8)

Saturday, October 4, 2025

 Frequency analysis of Pyu corpus text


The simplest quantitative analysis of the Pyu corpus would be the frequency analysis of features. We will use the dfm created from the previous analysis.

library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(RColorBrewer)

# Replace the default docnames (text1, text2, ...) with the Pyu inscription
# numbers so tables and plots refer to inscriptions directly.
# NOTE(review): assumes rows of x1_df.1 are in the same order as the
# documents of pyu_dfm — confirm, since pyu_dfm was built from merged_pyu.
docnames(pyu_dfm) <- x1_df.1$InscriptionNumber
print(pyu_dfm)
Document-feature matrix of: 196 documents, 1,893 features (99.01% sparse) and 0 docvars.
     features
docs  @|| ḅay·ṁḥ dak·ṃ viy·ṃṁ tim·ṁ mlik· °o saḥ tgaṃ knon·
  001   1      2     1      1     1     1  3   1    1     1
  002   0      0     0      0     0     0  0   0    0     0
  003   1      0     0      0     0     0  4   0    0     0
  004   1      0     0      0     0     0  3   0    0     0
  005   1      0     0      0     0     0  3   0    0     0
  006   1      0     0      0     0     0  3   0    0     0
[ reached max_ndoc ... 190 more documents, reached max_nfeat ... 1,883 more features ]
# Frequency of every feature across the whole corpus
tstat_freq <- textstat_frequency(pyu_dfm)

# View the 30 most frequent features as a styled HTML table
library(kableExtra)
tstat_freq[1:30, ] %>%
  kbl() %>%
  # FIX: column_spec() width needs a CSS unit; a bare "2" is invalid CSS
  column_spec(1, width = "2cm") %>%
  row_spec(0, background = "lightgrey") %>%
  row_spec(1:30, background = "lightblue") %>%
  # merged the two kable_styling() calls; F spelled out as FALSE
  kable_styling(full_width = FALSE, font_size = 10,
                bootstrap_options = "condensed")
featurefrequencyrankdocfreqgroup
°o320148all
tiṁ190232all
ḅaṁḥ93327all
ta93324all
yaṁ85535all
ḅiṁḥ81616all
tin·ṁ73711all
ḅay·ṁḥ61811all
//5893all
||551035all
tar·551010all
///531218all
ḅin·ṁḥ471310all
ḅa451419all
saḥ441519all
gi42166all
tim·ṁ411711all
ma391813all
pau37196all
tdav·ṃḥ342010all
ḅaḥ332111all
kdaṅ·332112all
dav·ṃḥ29238all
mra282416all
ḅiṁ28248all
tir·ṁ27266all
priṅ·ḥ26278all
traḥ25283all
pay·ṁḥ25289all
/24302all
 
# Word cloud of the 200 most frequent features; fixed seed for a
# reproducible layout
set.seed(132)
spectral_cols <- rev(RColorBrewer::brewer.pal(10, "Spectral"))
textplot_wordcloud(pyu_dfm, max_size = 14, max_words = 200, color = spectral_cols)

Thursday, October 2, 2025

 Feature co-occurrence in Pyu corpus

In NLP, feature co-occurrence in text analysis is useful for exploring semantic relationships and patterns by identifying words or phrases that appear together frequently. It therefore seems like a good idea to study the feature (syllable/word/phrase) co-occurrence in the Pyu corpus.

To do so, I took the Pyu corpus I had used for the exercise reported in my previous post and remove all the editorial marks. The editorial marks used there were described in Studies in Pyu Epigraphy, I : State of the Field, Edition and Analysis of the Kan Wet Khaung Mound Inscription, and Inventory of the Corpus. To quote:

2.2 Diplomatic edition
We use bold-cum-italic typeface to highlight the Sanskrit phrases in the
text and indicate the faces (A, b, C, d) over which the lines are spread in
superscript, while we assign numbers to the Pyu glosses, also in superscript,
with the sign #. We use the following editorial conventions:
[ ] uncertain reading
( ) editorial restoration of lost text
〈 〉 editorial addition of omitted text
〈〈 〉〉 scribal insertion
{{ }} scribal deletion
? illegible akṣara
C illegible consonant element of an akṣara
V illegible vowel element of an akṣara
+ lost akṣara
◊ punctuation space

Modifying the corpus

Remove all edit marks, and white spaces:

library(magrittr)
library(stringr)  # FIX: str_squish() was used below without loading stringr

# Remove all editorial marks (brackets, angle/curly marks, illegible-aksara
# symbols ?, C, V, +, and the punctuation space)
patt <- c("\\#","\\[|\\]","\\(|\\)","〈|〉","〈〈|〉〉","\\{\\{|\\}\\}","\\?","C","V","\\+","◊") %>% paste0(., collapse = "|")
pyuDoc <- gsub(patt, "", x1_df.1$combined_values)
# collapse the runs of whitespace left behind by the removals
pyuDoc <- str_squish(pyuDoc)

# check that all removals were done (expect integer(0)):
grep(patt, pyuDoc)
integer(0)
# view line 73 old and new
x1_df.1$combined_values[73]
[1] "siddha[m·] 2 || ◊ ḅay·ṁḥ kmak· [ḅa]y·ṁḥ toṅ· tṅav· tiṁ psiṁ ◊ ḅay·ṁḥ saḥ ḅay·ṁḥ goṃḥ ◊ °o saḥ ḅay·ṁḥ luṅ· hi[p]· ◊ ḅay·ṁḥ luṅ· ti[n·]ṁ droḥ kdiṃ ◊ ḅay·ṁḥ luṅ· tdav·ṃḥ ◊ daṅ·ṃṁ °oy· tsaṁḥ ḅuddha daṅ·ḥ tim·ṁ [m]l[i]y·ṁ kdaṅ· nhoḥ yaṁ ◊ ||@"
pyuDoc[73]
[1] "siddham· 2 || ḅay·ṁḥ kmak· ḅay·ṁḥ toṅ· tṅav· tiṁ psiṁ ḅay·ṁḥ saḥ ḅay·ṁḥ goṃḥ °o saḥ ḅay·ṁḥ luṅ· hip· ḅay·ṁḥ luṅ· tin·ṁ droḥ kdiṃ ḅay·ṁḥ luṅ· tdav·ṃḥ daṅ·ṃṁ °oy· tsaṁḥ ḅuddha daṅ·ḥ tim·ṁ mliy·ṁ kdaṅ· nhoḥ yaṁ ||@"

Run feature co-occurence analysis for a subset of features

Without modification I took the pieces of text separated by space that comes with the corpus to be the “features”.

library(quanteda)
library(quanteda.textplots)

# get names of most frequent features

# Tokenise on whitespace only ("fasterword") and build the
# document-feature matrix
pyu_toks <- tokens(pyuDoc, what = "fasterword")
pyu_dfm <- dfm(pyu_toks)
head(pyu_dfm)
Document-feature matrix of: 6 documents, 1,893 features (98.94% sparse) and 0 docvars.
       features
docs    @|| ḅay·ṁḥ dak·ṃ viy·ṃṁ tim·ṁ mlik· °o saḥ tgaṃ knon·
  text1   1      2     1      1     1     1  3   1    1     1
  text2   0      0     0      0     0     0  0   0    0     0
  text3   1      0     0      0     0     0  4   0    0     0
  text4   1      0     0      0     0     0  3   0    0     0
  text5   1      0     0      0     0     0  3   0    0     0
  text6   1      0     0      0     0     0  3   0    0     0
[ reached max_nfeat ... 1,883 more features ]
# Feature names ordered by descending total frequency
names.2 <- dfm_sort(pyu_dfm, margin = "features") %>%
  featfreq() %>%
  names()

# test: compare default sorting (documents) against feature sorting
x <- dfm_sort(pyu_dfm) %>% featfreq()
y <- dfm_sort(pyu_dfm, margin = "features") %>% featfreq()

# get names of most frequently co-occurring features

# Feature co-occurrence matrix
pyu_fcm <- fcm(pyu_dfm)

# Total co-occurrence count per feature (row sums of the dense matrix),
# sorted in descending order
cooc_totals <- rowSums(as.matrix(pyu_fcm))
ordered_feature_names <- names(sort(cooc_totals, decreasing = TRUE))

# Reorder both rows and columns of the FCM by that ranking
sorted_pyu_fcm <- pyu_fcm[ordered_feature_names, ordered_feature_names]

# Display the sorted FCM (optional)
print(sorted_pyu_fcm[1:10, 1:10])
Feature co-occurrence matrix of: 10 by 10 features.
        features
features   °o  tiṁ ḅay·ṁḥ   ta tin·ṁ ḅiṁḥ  yaṁ ḅaṁḥ  gi   ḅa
  °o     3953 6186      0 1977  2202 2065 1473 1553 940 1147
  tiṁ       0 3391      0    0  1934  399    0    0 827    0
  ḅay·ṁḥ 2309 2708    628  484   648   39  121  111   6  394
  ta        0 1751      0  357   604  411    0    0 468    0
  tin·ṁ     0    0      0    0   415    0    0    0   0    0
  ḅiṁḥ      0    0      0    0    84 1026    0    0 201    0
  yaṁ       0  540      0  351   280  843  237  577 242    0
  ḅaṁḥ      0  440      0  290    63 1412    0  482 111    0
  gi        0    0      0    0   442    0    0    0 502    0
  ḅa        0 1109      0  340   379  175  132  128 125   93
# extract feature names for analysis

# top 30 features by total co-occurrence count
tokeep.1 <- head(ordered_feature_names, 30)
toplot.1 <- fcm_keep(sorted_pyu_fcm, tokeep.1)

# top 30 features by raw corpus frequency
tokeep.2 <- head(names.2, 30)
toplot.2 <- fcm_keep(sorted_pyu_fcm, tokeep.2)

Plot a network of feature co-occurrences

An fcm object can be plotted as a network, where edges show co-occurrences of features. Currently the size of the network is limited to 1000, because of the computationally intensive nature of network formation for larger matrices. Here, we have opted to use 30 features. Besides, a plot with too many features would be too crowded to be comprehensible.

library(ggplot2)

# Shared black panel border for both network plots (was duplicated verbatim).
# NOTE(review): `size` in element_rect() is deprecated since ggplot2 3.4 in
# favour of `linewidth`; kept here for compatibility with older ggplot2.
framed <- theme(panel.border = element_rect(color = "black", fill = NA, size = 1))

# plot with 30 most frequently co-occurring features
p.1 <- textplot_network(toplot.1)
p.1 + framed

# plot with 30 most frequently found features
p.2 <- textplot_network(toplot.2)
p.2 + framed

Besides, a plot with too many features would be too crowded to view or comprehend, as seen below for 900 features:

Plot with 900 features
Plot with 900 features