Arabic renderer in four lines of Perl

From: Roman Czyborra (sender address not preserved in this archive copy)
Date: Thu Jun 18 1998 - 08:12:20 EDT


# arabjoin - a simple filter to render Arabic text
# © 1998-06-18
# Freeware license, latest version and PostScript printout: the URLs
# that followed "at" on these three lines were not preserved in this
# archive copy.

# This filter takes Arabic text (encoded in UTF-8 using the Unicode
# characters from the U+0600 Arabic block in logical order) as input
# and performs Arabic glyph joining on it and outputs a UTF-8 octet
# stream that is no longer logically arranged but in a visual order
# which gives readable results when formatted with a simple Unicode
# renderer like Yudit that does not handle Arabic differently yet
# but simply outputs all glyphs in left-to-right order.

# This little script also demonstrates that Arabic rendering is not
# that complicated after all (it makes you wonder why some software
# companies are still asking hundreds of dollars from poor students
# who just want to print their Arabic texts) and that even Perl 4 can
# handle Unicode text in UTF-8 without any nifty new add-ons.

# Usage examples:

# echo "أهلاً بالعالم!" | arabjoin
# prints !ﻢﻟﺎﻌﻟﺎﺑ ًﻼﻫﺃ
# which is the Arabic version of "Hello world!"

# | recode ISO-8859-6..UTF-8 | arabjoin | uniprint -f cyberbit.ttf
# prints an Arabic mail of charset=iso-8859-6-i on your printer

# | arabjoin | xviewer yudit
# delegates an Arabic UTF-8 message to a better viewer

# has uniprint in yudit-1.0
# has recode-3.4g
# has arabjoin
# has xviewer
# or
# or
# has cyberbit.ttf

# NOTE(review): the loop headers, braces, UTF-8 patterns, LAM+ALEF
# substitutions and the __END__ marker below were missing from the
# archived copy of this script and have been restored from the
# surrounding comments; verify them against the author's original.

# This is how we do it: First we learn the presentation forms of each
# Arabic letter from the table at the end of this script.  Each row
# holds a letter, whitespace, and then its isolated, final, medial and
# initial glyphs in that order; letters that only join to the right
# list just the first two, which leaves their medial and initial
# entries undefined.  Anything after a second run of whitespace on a
# row is ignored.

while (<DATA>)
{
    next unless /^(\S+)\s+(\S+)/; # skip blank or malformed rows
    ($char, $_) = ($1, $2);
    ($isolated{$char},$final{$char},$medial{$char},$initial{$char}) =
        /([\xC0-\xFF][\x80-\xBF]+)/g; # one UTF-8 multibyte sequence per form
}

# Then learn the (incomplete set of) transparent characters, i.e. the
# marks that are skipped over when looking for a letter's joining
# neighbour.  SHADDA and SUKUN are given as UTF-8 byte escapes:

foreach $char (split (" ", "
 ً ٌ ٍ َ ُ ِ ٰ
 ۗ ۘ ۙ ۚ ۛ ۜ ۟ ۠ ۡ ۢ ۣ ۤ ۧ ۨ ۪ ۫ ۬ ۭ"),
    "\xD9\x91", # U+0651 ARABIC SHADDA
    "\xD9\x92") # U+0652 ARABIC SUKUN
{
    $transparent{$char}=1;
}

# Finally we can process our text:

while (<>)
{
    s/\n$//; # chop off the end of the line so it won't jump upfront

    @uchar = # UTF-8 character chunks: one ASCII byte or one multibyte sequence
        /([\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+)/g;

    # We walk through the line of text and do contextual analysis.
    # $b is the current character, $c the next character that is not
    # transparent, and $a is true when the previous letter joins onto
    # the current one.  Transparent characters are stepped over and
    # stay in @uchar unchanged.

    $a = ''; # nothing precedes the first character of a line

    for ($i = 0; $i <= $#uchar; $i = $j)
    {
        for ($b=$uchar[$j=$i]; $transparent{$c=$uchar[++$j]};){};

        # The following assignment is the heart of the algorithm.
        # It reduces the Arabic joining algorithm described on
        # pages 6-24 to 6-26 of the Arabic character block description
        # in the Unicode 2.0 Standard to four lines of Perl.  In order
        # it picks the medial, initial, final or isolated glyph, and
        # falls back to the character itself when the table has no
        # entry for it:

        $uchar[$i] = $a && $final{$c} && $medial{$b}
        || $final{$c} && $initial{$b}
        || $a && $final{$b}
        || $isolated{$b}
        || $b;

        # The current letter joins onto the next one only if it has an
        # initial form and the next one has a final form.
        $a = $initial{$b} && $final{$c};
    }

    # Until the Unicode Consortium publishes its Unicode Technical
    # Report #9 (Bidirectional Algorithm Reference Implementation),
    # let us oversimplify things a bit and reverse everything:
    $_= join ('', reverse @uchar);

    # The following 8 obligatory LAM+ALEF ligatures are encoded in the
    # U+FE70 Arabic Presentation Forms-B block in Unicode's
    # compatibility zone.  After the reversal above the final ALEF
    # glyph comes first and the initial or medial LAM glyph second;
    # an initial LAM yields the isolated ligature and a medial LAM the
    # final ligature.  All glyphs are written as UTF-8 byte escapes:

    s/\xEF\xBA\x82\xEF\xBB\x9F/\xEF\xBB\xB5/g; # U+FE82 U+FEDF -> U+FEF5
    s/\xEF\xBA\x82\xEF\xBB\xA0/\xEF\xBB\xB6/g; # U+FE82 U+FEE0 -> U+FEF6
    s/\xEF\xBA\x84\xEF\xBB\x9F/\xEF\xBB\xB7/g; # U+FE84 U+FEDF -> U+FEF7
    s/\xEF\xBA\x84\xEF\xBB\xA0/\xEF\xBB\xB8/g; # U+FE84 U+FEE0 -> U+FEF8
    s/\xEF\xBA\x88\xEF\xBB\x9F/\xEF\xBB\xB9/g; # U+FE88 U+FEDF -> U+FEF9
    s/\xEF\xBA\x88\xEF\xBB\xA0/\xEF\xBB\xBA/g; # U+FE88 U+FEE0 -> U+FEFA
    s/\xEF\xBA\x8E\xEF\xBB\x9F/\xEF\xBB\xBB/g; # U+FE8E U+FEDF -> U+FEFB
    s/\xEF\xBA\x8E\xEF\xBB\xA0/\xEF\xBB\xBC/g; # U+FE8E U+FEE0 -> U+FEFC

    # Bitstream's Cyberbit font offers 57 of the other 466 optional
    # ligatures in the U+FB50 Arabic Presentation Forms-A block.
    # NOTE(review): the substitutions for those optional ligatures are
    # not present in this copy of the script and are not restored here.

    print "$_\n";
}

# The following table lists the presentation variants of each
# character. Each value from the U+0600 block means that the
# necessary glyph variant has not been assigned a code in Unicode's
# U+FA00 compatibility zone. You may want to insert your private
# glyphs or approximation glyphs for them:

__END__
ء ﺀ
آ ﺁﺂ
أ ﺃﺄ
ؤ ﺅﺆ
إ ﺇﺈ
ئ ﺉﺊﺌﺋ
ا ﺍﺎ
ب ﺏﺐﺒﺑ
ة ﺓﺔ
ت ﺕﺖﺘﺗ
ث ﺙﺚﺜﺛ
ج ﺝﺞﺠﺟ
ح ﺡﺢﺤﺣ
خ ﺥﺦﺨﺧ
د ﺩﺪ
ذ ﺫﺬ
ر ﺭﺮ
ز ﺯﺰ
س ﺱﺲﺴﺳ
ش ﺵﺶﺸﺷ
ص ﺹﺺﺼﺻ
ض ﺽﺾﻀﺿ
ط ﻁﻂﻄﻃ
ظ ﻅﻆﻈﻇ
ع ﻉﻊﻌﻋ
غ ﻍﻎﻐﻏ
ـ ــــ
ف ﻑﻒﻔﻓ
ق ﻕﻖﻘﻗ
ك ﻙﻚﻜﻛ
ل ﻝﻞﻠﻟ
م ﻡﻢﻤﻣ
ن ﻥﻦﻨﻧ
ه ﻩﻪﻬﻫ
و ﻭﻮ
ى ﻯﻰ // ﯩﯨ
ي ﻱﻲﻴﻳ
ٱ ﭐ // ﭑ
ٲ ٲٲ
ٳ ٳٳ
ٴ ٴ
ٵ ٵٵ
ٶ ٶٶ
ٷ ﯝٷ
ٸ ٸٸٸٸ
ٹ ﭦﭧﭩﭨ
ٺ ﭞﭟﭡﭠ
ٻ ﭒﭓﭕﭔ
ټ ټټټټ
ٽ ٽٽٽٽ
پ ﭖﭗﭙﭘ
ٿ ﭢﭣﭥﭤ
ڀ ﭚﭛﭝﭜ
ځ ځځځځ
ڂ ڂڂڂڂ
ڃ ﭶﭷﭹﭸ
ڄ ﭲﭳﭵﭴ
څ څڅڅڅ
چ ﭺﭻﭽﭼ
ڇ ﭾﭿﮁﮀ
ڈ ﮈﮉ
ډ ډډ
ڊ ڊڊ
ڋ ڋڋ
ڌ ﮄﮅ
ڍ ﮂﮃ
ڎ ﮆﮇ
ڏ ڏڏ
ڐ ڐڐ
ڑ ﮌﮍ
ڒ ڒڒ
ړ ړړ
ڔ ڔڔ
ڕ ڕڕ
ږ ږږ
ڗ ڗڗ
ژ ﮊﮋ
ڙ ڙڙ
ښ ښښښښ
ڛ ڛڛڛڛ
ڜ ڜڜڜڜ
ڝ ڝڝڝڝ
ڞ ڞڞڞڞ
ڟ ڟڟڟڟ
ڠ ڠڠڠڠ
ڡ ڡڡڡڡ
ڢ ڢڢڢڢ
ڣ ڣڣڣڣ
ڤ ﭪﭫﭭﭬ
ڥ ڥڥڥڥ
ڦ ﭮﭯﭱﭰ
ڧ ڧڧڧڧ
ڨ ڨڨڨڨ
ک ﮎﮏﮑﮐ
ڪ ڪڪڪڪ
ګ ګګګګ
ڬ ڬڬڬڬ
ڭ ﯓﯔﯖﯕ
ڮ ڮڮڮڮ
گ ﮒﮓﮕﮔ
ڰ ڰڰڰڰ
ڱ ﮚﮛﮝﮜ
ڲ ڲڲڲڲ
ڳ ﮖﮗﮙﮘ
ڴ ڴڴڴڴ
ڵ ڵڵڵڵ
ڶ ڶڶڶڶ
ڷ ڷڷڷڷ
ں ﮞﮟںں
ڻ ﮠﮡﮣﮢ
ڼ ڼڼڼڼ
ڽ ڽڽڽڽ
ھ ﮪﮫﮭﮬ
ۀ ﮤﮥ
ہ ﮦﮧﮩﮨ
ۂ ۂۂ
ۃ ۃۃ
ۄ ۄۄ
ۅ ﯠﯡ
ۆ ﯙﯚ
ۇ ﯗﯘ
ۈ ﯛﯜ
ۉ ﯢﯣ
ۊ ۊۊ
ۋ ﯞﯟ
ی ﯼﯽﯿﯾ
ۍ ۍۍ
ێ ێێێێ
ې ﯤﯥﯧﯦ
ۑ ۑۑۑۑ
ے ﮮﮯ
ۓ ﮰﮱ
ە ە
‍ ‍‍‍‍

This archive was generated by hypermail 2.1.2 : Tue Jul 10 2001 - 17:20:40 EDT