In my previous analysis of OCR errors in the extracted headwords for volume 1 of the Abridged Myanmar Dictionary, I broke both the raw erroneous headwords and the corrected headwords into characters (Unicode code points) and used the setdiff() function to find the character differences for each pair. This approach seems standard for a word-based language like English. Mechanically, it also works for a syllable-based language like our Myanmar language, but sometimes the result looked somewhat strange to me. So I tried another way to look at the differences — and therefore the corrections made — in terms of syllables, as shown in the examples below:
Extract raw OCR’ed headwords and corrected headwords
To create syllables for the original OCR’ed headwords and the corrected headwords, we extract these words first:
library(stringr)

# Classify each element of Z0 (proofreading marks over raw OCR'ed headwords):
#   errList.rd1    - "raw ==> corrected" headword pairs
#   NElist.rd1     - new entries (corrections with no matching raw headword)
#   DRlist.rd1     - dropped raw headwords (marked * but never corrected)
#   errChrDiff.rd1 - per-pair character-level diff (extra / missed code points)
#   missErr.rd1    - code points missing from the raw headword
#   rawHW1/corrHW1 - parallel vectors of raw and corrected headwords
errList.rd1 <- c()
NElist.rd1 <- c()
DRlist.rd1 <- c()
errChrDiff.rd1 <- c()
missErr.rd1 <- c()
# Nov.2, 2020
rawHW1 <- c()
corrHW1 <- c()
j <- 1      # index into errList.rd1 / errChrDiff.rd1 / rawHW1 / corrHW1
n <- 1      # index into NElist.rd1
Q <- 0      # position of the most recent *-marked (erroneous) element; 0 = none
r <- 1      # index into DRlist.rd1
tmp <- ""   # pending *-marked raw headword awaiting its correction
# seq_along() is safe when Z0 is empty, unlike 1:length(Z0)
for (i in seq_along(Z0)) {
  # if marked for error *
  if (grepl("[*]", Z0[i])) {
    # two *-marked elements in a row: the earlier one was never corrected
    if (tmp != "") {
      p <- paste0("<", i - 1, ">")
      DRlist.rd1[r] <- paste(p, tmp)
      r <- r + 1
    }
    tmp <- Z0[i]
    Q <- i
    # not marked for error (= correct or corrected + new entry) with - or none
  } else {
    # if corrected + new entry (none): element ends with a Myanmar character
    if (grepl("[\u1000-\u104f]$", Z0[i])) {
      # NOTE(review): in R, `&` binds tighter than `|`, so the original
      # `Q == 0|Q>1 & i != Q+1` already meant Q == 0 | (Q > 1 & i != Q + 1);
      # the parentheses below only make that explicit. The original comment
      # flags this test as provisional ("test"); whether Q > 1 (rather than
      # Q > 0) is intended should be confirmed against the data.
      if (Q == 0 | (Q > 1 & i != Q + 1)) { # new entry, test
        m <- paste0("<", i, ">")
        NElist.rd1[n] <- paste(m, Z0[i])
        n <- n + 1
      } else { # Q > 0, current not marked at all, last element marked
        if (tmp != "") { # corrected
          k <- paste0("<", i - 1, ">", tmp)
          # utf8::utf8_print(paste(k, Z0[i], sep = " ==> "))
          errList.rd1[j] <- paste(k, Z0[i], sep = " ==> ")
          # keep only code points inside the Myanmar block before diffing
          tmp0 <- gsub("[^\u1000-\u104f]", "", tmp)
          # character-level set differences, both directions
          eD1 <- intToUtf8(setdiff(utf8ToInt(tmp0), utf8ToInt(Z0[i])))
          eD2 <- intToUtf8(setdiff(utf8ToInt(Z0[i]), utf8ToInt(tmp0)))
          u <- paste0("<", i - 1, ">")
          v <- paste0("<", i, ">")
          errChrDiff.rd1[j] <- paste0(u, "Raw= ", tmp0, " (Extra= ", eD1, ", ", "Missed= ", eD2, "); ", v, "Correct= ", Z0[i], collapse = "")
          missErr.rd1[j] <- eD2
          rawHW1[j] <- tmp0
          corrHW1[j] <- Z0[i]
          tmp <- ""
          j <- j + 1
          tmp0 <- ""
        }
        Q <- 0
      }
    } else { # (1)with - mark, raw correct; (2)if last mark *, it is dropped
      if (tmp != "") {
        p <- paste0("<", i - 1, ">")
        DRlist.rd1[r] <- paste(p, tmp)
        r <- r + 1
        tmp <- ""
      }
    }
  }
}
#utf8::utf8_print(errList.rd1)
cat((rawHW1)[1:10])
က ကညို ကာကာ ကတေ ကဒ် ကတိုး ကောက် ကောက်စပ် က် ကန
cat((corrHW1)[1:10])
ကစော့ ကညို့ ကတူး ကတောကမျော ကတော့ ကတိုးခွာ ကတောက် ကတောက်စပ် ကတွင်းပေါက် ကနဖေါ့
Convert original headwords and corrected headwords into syllables
First we tokenize the headwords into Unicode code points using the tokens() function of the quanteda package.
library(quanteda)
rawHW1.t <- tokens(rawHW1, what="character")
corrHW1.t <- tokens(corrHW1, what="character")
length(rawHW1.t)
[1] 742
length(corrHW1.t)
[1] 742
head(rawHW1.t)
Tokens consisting of 6 documents.
text1 :
[1] "က"
text2 :
[1] "က" "ညို"
text3 :
[1] "က" "ာ" "က" "ာ"
text4 :
[1] "က" "တေ"
text5 :
[1] "က" "ဒ\u103a"
text6 :
[1] "က" "တို" "း"
head(corrHW1.t)
Tokens consisting of 6 documents.
text1 :
[1] "က" "စေ" "ာ့"
text2 :
[1] "က" "ညို့"
text3 :
[1] "က" "တူ" "း"
text4 :
[1] "က" "တေ" "ာ" "က" "မ\u103bေ" "ာ"
text5 :
[1] "က" "တေ" "ာ့"
text6 :
[1] "က" "တို" "း" "ခ\u103d" "ာ"
Using the syllable segmentation of the raw headwords (round-1), we join the syllables of each headword with the OR operator (|) so that the result can be used as a pattern for the gsub() function.
# Syllable segmentation of the raw OCR'ed headwords (round-1).
# A token is merged into the current syllable when it is a dependent sign
# (vowel signs AA U+102B-U+102C, visarga/virama U+1038-U+1039, asat U+103A),
# when a run of Myanmar digits (U+1040-U+1049) continues, or when the
# previous token carried a virama (stacked consonant); otherwise the current
# syllable is flushed and a new one starts. Per headword, the syllables are
# joined with "|" so the result can serve directly as an alternation
# pattern for gsub().
rawHW1_syll <- c()
M <- length(rawHW1.t)
for (k in seq_len(M)) { # seq_len() is safe when rawHW1.t is empty
  rawHW1_syll.k <- list()
  TEMP <- rawHW1.t[[k]][1] # syllable currently being built
  j <- 1
  L <- length(rawHW1.t[[k]])
  if (L == 1) {
    # single-token headword: stored as-is (no "|" separator needed)
    rawHW1_syll.k[[j]] <- TEMP
    rawHW1_syll[[k]] <- unlist(rawHW1_syll.k)
    next
  }
  for (i in 2:L) {
    tok <- rawHW1.t[[k]][i]
    prev <- rawHW1.t[[k]][i - 1]
    # dependent signs never start a new syllable
    is.dep <- grepl("[\u102b-\u102c\u1038-\u1039\u103a]", tok)
    tok.digit <- grepl("[\u1040-\u1049]", tok)
    prev.digit <- grepl("[\u1040-\u1049]", prev)
    # merge when: dependent sign; digit run continues; or previous token
    # held a virama (stacked consonant). Same truth table as the original
    # nested conditionals.
    merge <- is.dep ||
      (tok.digit && prev.digit) ||
      (!tok.digit && !prev.digit && grepl("[\u1039]", prev))
    if (merge) {
      TEMP <- paste0(TEMP, tok)
    } else {
      rawHW1_syll.k[[j]] <- TEMP
      j <- j + 1
      TEMP <- tok
    }
  }
  rawHW1_syll.k[[j]] <- TEMP # flush the final syllable
  rawHW1_syll[[k]] <- paste0(unlist(rawHW1_syll.k), collapse = "|")
}
rawHW1_syll[1:10]
[[1]]
[1] "က"
[[2]]
[1] "က|ညို"
[[3]]
[1] "ကာ|ကာ"
[[4]]
[1] "က|တေ"
[[5]]
[1] "ကဒ\u103a"
[[6]]
[1] "က|တိုး"
[[7]]
[1] "ကောက\u103a"
[[8]]
[1] "ကောက\u103a|စပ\u103a"
[[9]]
[1] "က\u103a"
[[10]]
[1] "က|န"
Syllable segmentation of corrected headwords (round-1):
# Syllable segmentation of the corrected headwords (round-1).
# Same merge rules as for the raw headwords: a token joins the current
# syllable when it is a dependent sign (U+102B-U+102C, U+1038-U+1039,
# U+103A), when a Myanmar-digit run (U+1040-U+1049) continues, or when the
# previous token carried a virama (stacked consonant); otherwise the
# syllable is flushed. Syllables are joined with "|" per headword.
corrHW1_syll <- c()
M <- length(corrHW1.t)
for (k in seq_len(M)) { # seq_len() is safe when corrHW1.t is empty
  corrHW1_syll.k <- list()
  TEMP <- corrHW1.t[[k]][1] # syllable currently being built
  j <- 1
  L <- length(corrHW1.t[[k]])
  if (L == 1) {
    # single-token headword: stored as-is (no "|" separator needed)
    corrHW1_syll.k[[j]] <- TEMP
    corrHW1_syll[[k]] <- unlist(corrHW1_syll.k)
    next
  }
  for (i in 2:L) {
    tok <- corrHW1.t[[k]][i]
    prev <- corrHW1.t[[k]][i - 1]
    # dependent signs never start a new syllable
    is.dep <- grepl("[\u102b-\u102c\u1038-\u1039\u103a]", tok)
    tok.digit <- grepl("[\u1040-\u1049]", tok)
    prev.digit <- grepl("[\u1040-\u1049]", prev)
    # merge when: dependent sign; digit run continues; or previous token
    # held a virama (stacked consonant). Same truth table as the original
    # nested conditionals.
    merge <- is.dep ||
      (tok.digit && prev.digit) ||
      (!tok.digit && !prev.digit && grepl("[\u1039]", prev))
    if (merge) {
      TEMP <- paste0(TEMP, tok)
    } else {
      corrHW1_syll.k[[j]] <- TEMP
      j <- j + 1
      TEMP <- tok
    }
  }
  corrHW1_syll.k[[j]] <- TEMP # flush the final syllable
  corrHW1_syll[[k]] <- paste0(unlist(corrHW1_syll.k), collapse = "|")
}
corrHW1_syll[1:10]
[[1]]
[1] "က|စော့"
[[2]]
[1] "က|ညို့"
[[3]]
[1] "က|တူး"
[[4]]
[1] "က|တော|က|မ\u103bော"
[[5]]
[1] "က|တော့"
[[6]]
[1] "က|တိုး|ခ\u103dာ"
[[7]]
[1] "က|တောက\u103a"
[[8]]
[1] "က|တောက\u103a|စပ\u103a"
[[9]]
[1] "က|တ\u103dင\u103aး|ပေ\u102bက\u103a"
[[10]]
[1] "က|န|ဖေ\u102b့"
Find differences between raw headwords and corrected headwords
We need the syllabification of the raw/corrected headwords — with the syllables joined by OR — as the pattern, while keeping the corrected/raw headword unchanged, when using the regex function gsub().
Difference in syllables both ways: raw-corrected and corrected-raw.
# example showing first 10 and last 10 of the list
utf8::utf8_print(HW1_raw_corrList[c(1:10,733:742)])
[1] "Raw= က (Extra= ; Missed= စော့), Corrected= ကစော့"
[2] "Raw= ကညို (Extra= ညို; Missed= ့), Corrected= ကညို့"
[3] "Raw= ကာကာ (Extra= ာာ; Missed= ကတူး), Corrected= ကတူး"
[4] "Raw= ကတေ (Extra= တေ; Missed= ာမျော), Corrected= ကတောကမျော"
[5] "Raw= ကဒ် (Extra= ဒ်; Missed= ကတော့), Corrected= ကတော့"
[6] "Raw= ကတိုး (Extra= ; Missed= ခွာ), Corrected= ကတိုးခွာ"
[7] "Raw= ကောက် (Extra= ော်; Missed= ကတောက်), Corrected= ကတောက်"
[8] "Raw= ကောက်စပ် (Extra= ော်; Missed= ကတောက်), Corrected= ကတောက်စပ်"
[9] "Raw= က် (Extra= ်; Missed= ကတွင်းပေါ), Corrected= ကတွင်းပေါက်"
[10] "Raw= ကန (Extra= ; Missed= ဖေါ့), Corrected= ကနဖေါ့"
[11] "Raw= စုံဆယ် (Extra= ; Missed= ဖြာ), Corrected= စုံဆယ်ဖြာ"
[12] "Raw= စုံတွဲ (Extra= တွဲ; Missed= ဆွဲ), Corrected= စုံဆွဲ"
[13] "Raw= စုံစုံ (Extra= စုံစုံ; Missed= းး), Corrected= စုံးစုံး"
[14] "Raw= စ္ဆေ (Extra= စ္ဆေ; Missed= စွေ), Corrected= စွေ"
[15] "Raw= စိ (Extra= ; Missed= စွတ်ကပါနမ်းပါ), Corrected= စွတ်ကပါစိနမ်းပါစိ"
[16] "Raw= စွတ်စွတ်နခ (Extra= နခ; Missed= ာ), Corrected= စွတ်စွတ်နာ"
[17] "Raw= စွန်ပြီး (Extra= ပြီး; Missed= မြီး), Corrected= စွန်မြီး"
[18] "Raw= စွန်ပြီးတော့ (Extra= ပြီးတော့; Missed= မြီးကော့), Corrected= စွန်မြီးကော့"
[19] "Raw= စွန်ရှိ (Extra= ရှိ; Missed= ရဲ), Corrected= စွန်ရဲ"
[20] "Raw= စွယ်စုံ (Extra= စုံ; Missed= ရံ), Corrected= စွယ်ရံ"