Friday, November 13, 2020

Syllable-based analysis of OCR errors

In my previous analysis of OCR errors in the context of extracted headwords for volume-1 of the Abridged Myanmar Dictionary, I’d broken up both the raw erroneous headwords and corrected headwords into characters (Unicode code points) and used the setdiff() function to find the differences in the characters for each pair. This approach seems to be standard for a word-based language like English. The mechanism of this approach also works for a syllable-based language like our Myanmar language, but sometimes the result looked somewhat strange to me. So I tried out another way to look at the differences — and therefore the corrections made — in terms of syllable(s), as shown in the examples below:

Extract raw OCR’ed headwords and corrected headwords

To create syllables for original OCR’ed headword and corrected headword, we extract these words first:

library(stringr)
# Accumulators for the round-1 error analysis; all start empty and are grown
# element-by-element inside the extraction loop that follows.
errList.rd1 <- c()     # "raw ==> corrected" headword pairs
NElist.rd1 <- c()      # new entries (no preceding raw form)
DRlist.rd1 <- c()      # dropped raw entries (never followed by a correction)
errChrDiff.rd1 <- c()  # character-level diff string per pair
missErr.rd1 <- c()     # characters missing from the raw headword

# Nov.2, 2020
rawHW1 <- c()   # raw (erroneous) OCR'ed headwords, one per corrected pair
corrHW1 <- c()  # corrected headwords, parallel to rawHW1

j <- 1     # next index into errList.rd1 / errChrDiff.rd1 / rawHW1 / corrHW1
n <- 1     # next index into NElist.rd1
Q <- 0     # position of the last "*"-marked (raw) entry; 0 = none pending
r <- 1     # next index into DRlist.rd1
tmp <- ""  # text of the pending raw entry awaiting its correction
# Classify every entry of Z0 (the pre-processed OCR word list, built earlier):
# an entry containing "*" is a raw (erroneous) headword; an unmarked entry
# ending in a Myanmar character (U+1000-U+104F) is either the correction for
# the immediately preceding "*" entry or a brand-new entry; any other
# unmarked entry causes a pending raw form to be recorded as dropped.
# Results accumulate in errList.rd1, NElist.rd1, DRlist.rd1, errChrDiff.rd1,
# missErr.rd1 and the parallel vectors rawHW1 / corrHW1.
for (i in 1:length(Z0)){
  # if marked for error *
  if (grepl("[*]", Z0[i])){
    # A raw entry was already pending: it never received a correction, so
    # record it as dropped before remembering the new raw entry.
    if (tmp != "") {
      p <- paste0("<",i-1,">")
      DRlist.rd1[r] <- paste(p, tmp)
      r <- r+1
    }
    tmp <- Z0[i]  # remember this raw entry ...
    Q <- i        # ... and its position
  # not marked for error (= correct or corrected + new entry) with - or none 
  }else{
    # if corrected + new entry (none)
    if (grepl("[\u1000-\u104f]$", Z0[i])){
       # NOTE(review): '&' binds tighter than '|', so this condition reads
       # Q == 0 | (Q > 1 & i != Q+1) -- confirm that grouping is intended.
       # Scalar '||' / '&&' would also be clearer in an if() condition.
       if (Q == 0|Q>1 & i != Q+1) {  #new entry, test
        m <- paste0("<",i,">")
        NElist.rd1[n] <- paste(m, Z0[i])
        n <- n+1
      }else{    #Q > 0, current not marked at all, last element marked
        if (tmp != "") {   #corrected
          k <- paste0("<",i-1,">", tmp)
          # utf8::utf8_print(paste(k, Z0[i], sep = " ==> "))
          errList.rd1[j] <- paste(k, Z0[i], sep = " ==> ")

          # Strip everything outside the Myanmar block (e.g. the "*" marker).
          tmp0 <- gsub("[^\u1000-\u104f]", "", tmp)
          # setdiff() treats the code points as sets: duplicates and order
          # are ignored, so eD1/eD2 hold the characters unique to one side.
          eD1 <- intToUtf8(setdiff(utf8ToInt(tmp0), utf8ToInt(Z0[i])))
          eD2 <- intToUtf8(setdiff(utf8ToInt(Z0[i]), utf8ToInt(tmp0)))
          u <- paste0("<",i-1,">")
          v <- paste0("<",i,">")
          errChrDiff.rd1[j] <- paste0(u, "Raw= ", tmp0, " (Extra= ", eD1, ", ", "Missed= ", eD2, "); ", v, "Correct= ", Z0[i], collapse = "")
          missErr.rd1[j] <- eD2

          rawHW1[j] <- tmp0
          corrHW1[j] <- Z0[i]

          tmp <- ""
          j <- j+1
          tmp0 <- ""
        }
        Q <- 0  # pending raw entry resolved (or nothing to resolve)
      }
    }else{   #(1)with - mark, raw correct; (2)if last mark *, it is dropped
      # Entry neither "*"-marked nor Myanmar-final: a pending raw entry is
      # recorded as dropped.
      if (tmp != "") {
        p <- paste0("<",i-1,">")
        DRlist.rd1[r] <- paste(p, tmp)
        r <- r+1
        tmp <- ""
      }
    }
  }
}
#utf8::utf8_print(errList.rd1)
cat((rawHW1)[1:10])
က ကညို ကာကာ ကတေ ကဒ် ကတိုး ကောက် ကောက်စပ် က် ကန
cat((corrHW1)[1:10])
ကစော့ ကညို့ ကတူး ကတောကမျော ကတော့ ကတိုးခွာ ကတောက် ကတောက်စပ် ကတွင်းပေါက် ကနဖေါ့

Convert original headwords and corrected headwords into syllables

First we tokenize the headwords into characters using the tokens() function of the quanteda package.

library(quanteda)
# Tokenize each headword into per-character tokens.
# NOTE(review): the printed output below (e.g. text2) suggests that
# tokens(..., what = "character") keeps some combining marks attached to
# their base character -- the syllable segmentation relies on this.
rawHW1.t <- tokens(rawHW1, what="character")
corrHW1.t <- tokens(corrHW1, what="character")
length(rawHW1.t)
[1] 742
length(corrHW1.t)
[1] 742
head(rawHW1.t)
Tokens consisting of 6 documents.
text1 :
[1] "က"

text2 :
[1] "က" "ညို"

text3 :
[1] "က" "ာ" "က" "ာ"

text4 :
[1] "က"  "တေ"

text5 :
[1] "က"       "ဒ\u103a"

text6 :
[1] "က" "တို" "း"
head(corrHW1.t)
Tokens consisting of 6 documents.
text1 :
[1] "က"  "စေ" "ာ့" 

text2 :
[1] "က" "ညို့"

text3 :
[1] "က" "တူ" "း"

text4 :
[1] "က"        "တေ"       "ာ"        "က"        "မ\u103bေ" "ာ"       

text5 :
[1] "က"  "တေ" "ာ့" 

text6 :
[1] "က"       "တို"       "း"       "ခ\u103d" "ာ"      

Using the syllable segmentation of the raw headwords (round-1), we join the syllables of each headword with the OR operator ("|") so that the result can be used as a pattern for the gsub() function.

# Group the per-character tokens of a headword into Myanmar syllables.
#
# Segmentation rules, applied to each token after the first:
#   - vowel signs AA / TALL AA (U+102B-U+102C), visarga (U+1038), virama
#     (U+1039) and asat (U+103A) attach to the current syllable;
#   - a run of Myanmar digits (U+1040-U+1049) forms a single unit;
#   - a token that follows a virama (stacked consonant) attaches to the
#     current syllable;
#   - any other token starts a new syllable.
#
# @param toks character vector of single-character (or character-cluster)
#   tokens, as produced by tokens(x, what = "character").
# @return character vector of syllables (empty input gives character(0)).
segment_syllables <- function(toks) {
  if (length(toks) == 0) {
    return(character(0))  # guard: the old loop crashed on empty input
  }
  done <- character(0)
  current <- toks[1]
  if (length(toks) > 1) {
    for (pos in 2:length(toks)) {
      tok <- toks[pos]
      prev <- toks[pos - 1]
      if (grepl("[\u102b-\u102c\u1038-\u1039\u103a]", tok)) {
        current <- paste0(current, tok)
      } else {
        cur_digit <- grepl("[\u1040-\u1049]", tok)
        prev_digit <- grepl("[\u1040-\u1049]", prev)
        if ((cur_digit && prev_digit) ||
            (!cur_digit && !prev_digit && grepl("[\u1039]", prev))) {
          # digit run continues, or stacked consonant after a virama
          current <- paste0(current, tok)
        } else {
          done <- c(done, current)
          current <- tok
        }
      }
    }
  }
  # Final syllable is always flushed (the old `if (i == L)` was always TRUE).
  c(done, current)
}

# Syllable segmentation of the raw headwords (round-1): syllables of each
# headword are joined with "|" so the result can serve as an alternation
# pattern for gsub()/grepl(). A one-syllable word is returned unchanged.
rawHW1_syll <- vector("list", length(rawHW1.t))
for (k in seq_along(rawHW1.t)) {
  rawHW1_syll[[k]] <- paste0(segment_syllables(rawHW1.t[[k]]), collapse = "|")
}
rawHW1_syll[1:10]
[[1]]
[1] "က"

[[2]]
[1] "က|ညို"

[[3]]
[1] "ကာ|ကာ"

[[4]]
[1] "က|တေ"

[[5]]
[1] "ကဒ\u103a"

[[6]]
[1] "က|တိုး"

[[7]]
[1] "ကောက\u103a"

[[8]]
[1] "ကောက\u103a|စပ\u103a"

[[9]]
[1] "က\u103a"

[[10]]
[1] "က|န"

Syllable segmentation of corrected headwords (round-1):

# Group the per-character tokens of a headword into Myanmar syllables.
#
# Segmentation rules, applied to each token after the first:
#   - vowel signs AA / TALL AA (U+102B-U+102C), visarga (U+1038), virama
#     (U+1039) and asat (U+103A) attach to the current syllable;
#   - a run of Myanmar digits (U+1040-U+1049) forms a single unit;
#   - a token that follows a virama (stacked consonant) attaches to the
#     current syllable;
#   - any other token starts a new syllable.
#
# @param toks character vector of single-character (or character-cluster)
#   tokens, as produced by tokens(x, what = "character").
# @return character vector of syllables (empty input gives character(0)).
segment_syllables <- function(toks) {
  if (length(toks) == 0) {
    return(character(0))  # guard: the old loop crashed on empty input
  }
  done <- character(0)
  current <- toks[1]
  if (length(toks) > 1) {
    for (pos in 2:length(toks)) {
      tok <- toks[pos]
      prev <- toks[pos - 1]
      if (grepl("[\u102b-\u102c\u1038-\u1039\u103a]", tok)) {
        current <- paste0(current, tok)
      } else {
        cur_digit <- grepl("[\u1040-\u1049]", tok)
        prev_digit <- grepl("[\u1040-\u1049]", prev)
        if ((cur_digit && prev_digit) ||
            (!cur_digit && !prev_digit && grepl("[\u1039]", prev))) {
          # digit run continues, or stacked consonant after a virama
          current <- paste0(current, tok)
        } else {
          done <- c(done, current)
          current <- tok
        }
      }
    }
  }
  # Final syllable is always flushed (the old `if (i == L)` was always TRUE).
  c(done, current)
}

# Syllable segmentation of the corrected headwords (round-1): syllables of
# each headword are joined with "|" to act as an alternation pattern for
# gsub()/grepl(). A one-syllable word is returned unchanged.
corrHW1_syll <- vector("list", length(corrHW1.t))
for (k in seq_along(corrHW1.t)) {
  corrHW1_syll[[k]] <- paste0(segment_syllables(corrHW1.t[[k]]), collapse = "|")
}
corrHW1_syll[1:10]
[[1]]
[1] "က|စော့"

[[2]]
[1] "က|ညို့"

[[3]]
[1] "က|တူး"

[[4]]
[1] "က|တော|က|မ\u103bော"

[[5]]
[1] "က|တော့"

[[6]]
[1] "က|တိုး|ခ\u103dာ"

[[7]]
[1] "က|တောက\u103a"

[[8]]
[1] "က|တောက\u103a|စပ\u103a"

[[9]]
[1] "က|တ\u103dင\u103aး|ပေ\u102bက\u103a"

[[10]]
[1] "က|န|ဖေ\u102b့"

Find differences between raw headwords and corrected headwords

We use the OR-joined syllables of the raw (or corrected) headword as the pattern argument of the regex function gsub(), while keeping the corrected (or raw) headword unchanged as the string being searched.

Difference in syllables both ways: raw-corrected and corrected-raw.

# example showing first 10 and last 10 of the list
utf8::utf8_print(HW1_raw_corrList[c(1:10,733:742)])
 [1] "Raw= က (Extra= ; Missed= စော့), Corrected= ကစော့"                    
 [2] "Raw= ကညို (Extra= ညို; Missed= ့), Corrected= ကညို့"                       
 [3] "Raw= ကာကာ (Extra= ာာ; Missed= ကတူး), Corrected= ကတူး"                
 [4] "Raw= ကတေ (Extra= တေ; Missed= ာမျော), Corrected= ကတောကမျော"         
 [5] "Raw= ကဒ် (Extra= ဒ်; Missed= ကတော့), Corrected= ကတော့"                 
 [6] "Raw= ကတိုး (Extra= ; Missed= ခွာ), Corrected= ကတိုးခွာ"                  
 [7] "Raw= ကောက် (Extra= ော်; Missed= ကတောက်), Corrected= ကတောက်"            
 [8] "Raw= ကောက်စပ် (Extra= ော်; Missed= ကတောက်), Corrected= ကတောက်စပ်"        
 [9] "Raw= က် (Extra= ်; Missed= ကတွင်းပေါ), Corrected= ကတွင်းပေါက်"            
[10] "Raw= ကန (Extra= ; Missed= ဖေါ့), Corrected= ကနဖေါ့"                  
[11] "Raw= စုံဆယ် (Extra= ; Missed= ဖြာ), Corrected= စုံဆယ်ဖြာ"                
[12] "Raw= စုံတွဲ (Extra= တွဲ; Missed= ဆွဲ), Corrected= စုံဆွဲ"                      
[13] "Raw= စုံစုံ (Extra= စုံစုံ; Missed= းး), Corrected= စုံးစုံး"                  
[14] "Raw= စ္ဆေ (Extra= စ္ဆေ; Missed= စွေ), Corrected= စွေ"                  
[15] "Raw= စိ (Extra= ; Missed= စွတ်ကပါနမ်းပါ), Corrected= စွတ်ကပါစိနမ်းပါစိ"     
[16] "Raw= စွတ်စွတ်နခ (Extra= နခ; Missed= ာ), Corrected= စွတ်စွတ်နာ"             
[17] "Raw= စွန်ပြီး (Extra= ပြီး; Missed= မြီး), Corrected= စွန်မြီး"            
[18] "Raw= စွန်ပြီးတော့ (Extra= ပြီးတော့; Missed= မြီးကော့), Corrected= စွန်မြီးကော့"
[19] "Raw= စွန်ရှိ (Extra= ရှိ; Missed= ရဲ), Corrected= စွန်ရဲ"                    
[20] "Raw= စွယ်စုံ (Extra= စုံ; Missed= ရံ), Corrected= စွယ်ရံ"