Arabic renderer in four lines of Perl

From: Roman Czyborra (czyborra@cs.tu-berlin.de)
Date: Thu Jun 18 1998 - 08:12:20 EDT


#!/usr/local/bin/perl

# arabjoin - a simple filter to render Arabic text
# © 1998-06-18 roman@czyborra.com
# Freeware license at http://czyborra.com/
# Latest version at http://czyborra.com/unicode/
# PostScript printout at http://czyborra.com/unicode/arabjoin.ps.gz

# This filter takes Arabic text (encoded in UTF-8 using the Unicode
# characters from the U+0600 Arabic block in logical order) as input
# and performs Arabic glyph joining on it and outputs a UTF-8 octet
# stream that is no longer logically arranged but in a visual order
# which gives readable results when formatted with a simple Unicode
# renderer like Yudit that does not handle Arabic differently yet
# but simply outputs all glyphs in left-to-right order.

# This little script also demonstrates that Arabic rendering is not
# that complicated after all (it makes you wonder why some software
# companies are still asking hundreds of dollars from poor students
# who just want to print their Arabic texts) and that even Perl 4 can
# handle Unicode text in UTF-8 without any nifty new add-ons.

# Usage examples:

# echo "أهلاً بالعالم!" | arabjoin
# prints !ﻢﻟﺎﻌﻟﺎﺑ ًﻼﻫﺃ
# which is the Arabic version of "Hello world!"

# | recode ISO-8859-6..UTF-8 | arabjoin | uniprint -f cyberbit.ttf
# prints an Arabic mail of charset=iso-8859-6-i on your printer

# | arabjoin | xviewer yudit
# delegates an Arabic UTF-8 message to a better viewer

# ftp://sunsite.unc.edu/pub/Linux/apps/editors/X/ has uniprint in yudit-1.0
# ftp://ftp.iro.umontreal.ca/pub/contrib/pinard/pretest/ has recode-3.4g
# http://czyborra.com/unicode/ has arabjoin
# http://czyborra.com/unix/ has xviewer
# http://www.bitstream.com/cyberbit.htm or
# ftp://ccic.ifcss.org/pub/software/fonts/unicode/ms-win/ or
# ftp://ftp.irdu.nus.sg/pub/language/bitstream/ has cyberbit.ttf

# This is how we do it: First we learn the presentation forms of each
# Arabic letter from the end of this script:

while(<DATA>)
{
    # A table row is "<character> <glyphs>".  Only the first two
    # whitespace-separated columns are used; blank lines and lines
    # with a single column carry no glyphs and are skipped.
    next unless /^(\S+)\s+(\S+)/;
    ($char, $forms) = ($1, $2);

    # The glyph column holds up to four multi-byte UTF-8 sequences,
    # in the order isolated, final, medial, initial.  Characters that
    # only join to the right list just the first two.
    @forms = $forms =~ /([\xC0-\xFF][\x80-\xBF]+)/g;
    next unless @forms;

    # Some characters are listed twice in the table, the second time
    # with nothing but the bare character standing in for every form.
    # Such a stand-in row must not wipe out the real presentation
    # forms learned from an earlier row, so it is ignored when the
    # character is already known.  A later row that does bring real
    # glyphs still replaces the earlier one.
    next if defined($isolated{$char}) && !grep($_ ne $char, @forms);

    ($isolated{$char},$final{$char},$medial{$char},$initial{$char}) =
        @forms;
}

# Then learn the (incomplete set of) transparent characters:

# Transparent characters are marks that sit on a letter without
# interrupting the joining between the letters on either side of it.
# The first two lines of the list are literal UTF-8; the last line adds
# SHADDA (U+0651), SUKUN (U+0652) and the Koranic mark U+06D6 as
# explicit UTF-8 byte sequences.  Without SHADDA and SUKUN in the set,
# a letter carrying one of them would be shaped as if the word ended
# there.

foreach $char (split (" ", "
 ً ٌ ٍ َ ُ ِ ٰ
 ۗ ۘ ۙ ۚ ۛ ۜ ۟ ۠ ۡ ۢ ۣ ۤ ۧ ۨ ۪ ۫ ۬ ۭ
 \xD9\x91 \xD9\x92 \xDB\x96"))
{
    $transparent{$char}=1;
}

# Finally we can process our text:

while (<>)
{
    s/\n$//; # chop off the end of the line so it won't jump upfront

    @uchar = # UTF-8 character chunks
        /([\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+)/g;

    # We walk through the line of text and do contextual analysis.
    # In each round $b is the character being shaped, $c is the next
    # character that is not transparent (undefined at the end of the
    # line), and $a is true when the preceding letter joins onto $b.
    # Transparent characters between $b and $c are stepped over and
    # left untouched.  Indexing starts at 0; the obsolete $[ variable
    # is not consulted.

    for ($i = 0; $i <= $#uchar; $i = $j)
    {
        for ($b=$uchar[$j=$i]; $transparent{$c=$uchar[++$j]};){};

        # The following assignment is the heart of the algorithm.
        # It reduces the Arabic joining algorithm described on
        # pages 6-24 to 6-26 of the Arabic character block description
        # in the Unicode 2.0 Standard to four lines of Perl:
        # joined on both sides -> medial, joined to the next letter
        # only -> initial, joined to the previous letter only -> final,
        # otherwise isolated, and anything unknown stays as it is.

        $uchar[$i] = $a && $final{$c} && $medial{$b}
        || $final{$c} && $initial{$b}
        || $a && $final{$b}
        || $isolated{$b}
        || $b;

        # $b connects to $c only if $b can join leftwards (it has an
        # initial form) and $c can join rightwards (it has a final form).

        $a = $initial{$b} && $final{$c};
    }

    # Until the Unicode Consortium publishes its Unicode Technical
    # Report #9 (Bidirectional Algorithm Reference Implementation)
    # at http://www.unicode.org/unicode/reports/techreports.html
    # let us oversimplify things a bit and reverse everything:

    $_= join ('', reverse @uchar);

    # From here on the line is in visual order, so every pattern below
    # spells its glyphs back to front.

    # The following 8 obligatory LAM+ALEF ligatures are encoded in the
    # U+FE70 Arabic Presentation Forms-B block in Unicode's
    # compatibility zone:

    s/ﺂﻟ/ﻵ/g;
    s/ﺂﻠ/ﻶ/g;
    s/ﺄﻟ/ﻷ/g;
    s/ﺄﻠ/ﻸ/g;
    s/ﺈﻟ/ﻹ/g;
    s/ﺈﻠ/ﻺ/g;
    s/ﺎﻟ/ﻻ/g;
    s/ﺎﻠ/ﻼ/g;

    # Bitstream's Cyberbit font offers 57 of the other 466 optional
    # ligatures in the U+FB50 Arabic Presentation Forms-A block:

    s/ﻢﺗ/ﰎ/g;
    s/ﻲﻓ/ﰲ/g;
    s/ﺞﻟ/ﰿ/g;
    s/ﺢﻟ/ﱀ/g;
    s/ﺦﻟ/ﱁ/g;
    s/ﻢﻟ/ﱂ/g;
    s/ﻰﻟ/ﱃ/g;
    s/ﻲﻟ/ﱄ/g;
    s/ﻢﻧ/ﱎ/g;
    s/ٌّ/ﱞ/g;
    s/ٍّ/ﱟ/g;
    s/َّ/ﱠ/g;
    s/ُّ/ﱡ/g;
    s/ِّ/ﱢ/g;
    s/ﺮﺒ/ﱪ/g;
    s/ﻦﺒ/ﱭ/g;
    s/ﻲﺒ/ﱯ/g;
    s/ﺮﺘ/ﱰ/g;
    s/ﻦﺘ/ﱳ/g;
    s/ﻲﺘ/ﱵ/g;
    s/ﻲﻨ/ﲏ/g;
    s/ﺮﻴ/ﲑ/g;
    s/ﻦﻴ/ﲔ/g;
    s/ﺠﺑ/ﲜ/g;
    s/ﺤﺑ/ﲝ/g;
    s/ﺨﺑ/ﲞ/g;
    s/ﻤﺑ/ﲟ/g;
    s/ﺠﺗ/ﲡ/g;
    s/ﺤﺗ/ﲢ/g;
    s/ﺨﺗ/ﲣ/g;
    s/ﻤﺗ/ﲤ/g;
    s/ﻤﺛ/ﲦ/g;
    s/ﻤﺟ/ﲨ/g;
    s/ﻤﺣ/ﲪ/g;
    s/ﻤﺧ/ﲬ/g;
    s/ﻤﺳ/ﲰ/g;
    s/ﺠﻟ/ﳉ/g;
    s/ﺤﻟ/ﳊ/g;
    s/ﺨﻟ/ﳋ/g;
    s/ﻤﻟ/ﳌ/g;
    s/ﻬﻟ/ﳍ/g;
    s/ﺠﻣ/ﳎ/g;
    s/ﺤﻣ/ﳏ/g;
    s/ﺨﻣ/ﳐ/g;
    s/ﻤﻣ/ﳑ/g;
    s/ﺠﻧ/ﳒ/g;
    s/ﺤﻧ/ﳓ/g;
    s/ﺨﻧ/ﳔ/g;
    s/ﻤﻧ/ﳕ/g;
    s/ﺠﻳ/ﳚ/g;
    s/ﺤﻳ/ﳛ/g;
    s/ﺨﻳ/ﳜ/g;
    s/ﻤﻳ/ﳝ/g;
    s/ﺤﻤﻟ/ﶈ/g;
    s/ﻪﻠﻟﺍ/ﷲ/g;

    # The last two are whole-phrase ligatures.  Each pattern is the
    # complete phrase as it looks at this point, i.e. after the
    # substitutions above: the first contains the U+FDF2 glyph just
    # produced, the second the U+FEFC LAM+ALEF ligature.  Only the full
    # phrase is replaced, and only by its single ligature glyph
    # (U+FDFA and U+FDFB), so no word is ever turned into another.

    s/ﻢﻠﺳﻭ ﻪﻴﻠﻋ ﷲ ﻰﻠﺻ/ﷺ/g;
    s/ﻪﻟﻼﺟ ﻞﺟ/ﷻ/g;

    print "$_\n";
}

# The following table lists the presentation variants of each
# character. Each value from the U+0600 block means that the
# necessary glyph variant has not been assigned a code in Unicode's
# U+FA00 compatibility zone. You may want to insert your private
# glyphs or approximation glyphs for them:

__END__
ء ﺀ
آ ﺁﺂ
أ ﺃﺄ
ؤ ﺅﺆ
إ ﺇﺈ
ئ ﺉﺊﺌﺋ
ا ﺍﺎ
ب ﺏﺐﺒﺑ
ة ﺓﺔ
ت ﺕﺖﺘﺗ
ث ﺙﺚﺜﺛ
ج ﺝﺞﺠﺟ
ح ﺡﺢﺤﺣ
خ ﺥﺦﺨﺧ
د ﺩﺪ
ذ ﺫﺬ
ر ﺭﺮ
ز ﺯﺰ
س ﺱﺲﺴﺳ
ش ﺵﺶﺸﺷ
ص ﺹﺺﺼﺻ
ض ﺽﺾﻀﺿ
ط ﻁﻂﻄﻃ
ظ ﻅﻆﻈﻇ
ع ﻉﻊﻌﻋ
غ ﻍﻎﻐﻏ
ـ ــــ
ف ﻑﻒﻔﻓ
ق ﻕﻖﻘﻗ
ك ﻙﻚﻜﻛ
ل ﻝﻞﻠﻟ
م ﻡﻢﻤﻣ
ن ﻥﻦﻨﻧ
ه ﻩﻪﻬﻫ
و ﻭﻮ
ى ﻯﻰ // ﯩﯨ
ي ﻱﻲﻴﻳ
ٱ ﭐ // ﭑ
ٲ ٲٲ
ٳ ٳٳ
ٴ ٴ
ٵ ٵٵ
ٶ ٶٶ
ٷ ﯝٷ
ٸ ٸٸٸٸ
ٹ ﭦﭧﭩﭨ
ٺ ﭞﭟﭡﭠ
ٻ ﭒﭓﭕﭔ
ټ ټټټټ
ٽ ٽٽٽٽ
پ ﭖﭗﭙﭘ
ٿ ﭢﭣﭥﭤ
ڀ ﭚﭛﭝﭜ
ځ ځځځځ
ڂ ڂڂڂڂ
ڃ ﭶﭷﭹﭸ
ڄ ﭲﭳﭵﭴ
څ څڅڅڅ
چ ﭺﭻﭽﭼ
ڇ ﭾﭿﮁﮀ
ڈ ﮈﮉ
ډ ډډ
ڊ ڊڊ
ڋ ڋڋ
ڌ ﮄﮅ
ڍ ﮂﮃ
ڎ ﮆﮇ
ڏ ڏڏ
ڐ ڐڐ
ڑ ﮌﮍ
ڒ ڒڒ
ړ ړړ
ڔ ڔڔ
ڕ ڕڕ
ږ ږږ
ڗ ڗڗ
ژ ﮊﮋ
ڙ ڙڙ
ښ ښښښښ
ڛ ڛڛڛڛ
ڜ ڜڜڜڜ
ڝ ڝڝڝڝ
ڞ ڞڞڞڞ
ڟ ڟڟڟڟ
ڠ ڠڠڠڠ
ڡ ڡڡڡڡ
ڢ ڢڢڢڢ
ڣ ڣڣڣڣ
ڤ ﭪﭫﭭﭬ
ڥ ڥڥڥڥ
ڦ ﭮﭯﭱﭰ
ڧ ڧڧڧڧ
ڨ ڨڨڨڨ
ک ﮎﮏﮑﮐ
ڪ ڪڪڪڪ
ګ ګګګګ
ڬ ڬڬڬڬ
ڭ ﯓﯔﯖﯕ
ڮ ڮڮڮڮ
گ ﮒﮓﮕﮔ
ڰ ڰڰڰڰ
ڱ ﮚﮛﮝﮜ
ڲ ڲڲڲڲ
ڳ ﮖﮗﮙﮘ
ڴ ڴڴڴڴ
ڵ ڵڵڵڵ
ڶ ڶڶڶڶ
ڷ ڷڷڷڷ
ں ﮞﮟںں
ڻ ﮠﮡﮣﮢ
ڼ ڼڼڼڼ
ڽ ڽڽڽڽ
ھ ﮪﮫﮭﮬ
ۀ ﮤﮥ
ہ ﮦﮧﮩﮨ
ۂ ۂۂ
ۃ ۃۃ
ۄ ۄۄ
ۅ ﯠﯡ
ۆ ﯙﯚ
ۇ ﯗﯘ
ۈ ﯛﯜ
ۉ ﯢﯣ
ۊ ۊۊ
ۋ ﯞﯟ
ی ﯼﯽﯿﯾ
ۍ ۍۍ
ێ ێێێێ
ې ﯤﯥﯧﯦ
ہ ہہہہ
ۂ ۂۂ
ۃ ۃۃ
ۄ ۄۄ
ۅ ۅۅ
ۆ ۆۆ
ۇ ۇۇ
ۈ ۈۈ
ۉ ۉۉ
ۊ ۊۊ
ۋ ۋۋ
ی یییی
ۍ ۍۍ
ێ ێێێێ
ې ېېېې
ۑ ۑۑۑۑ
ے ﮮﮯ
ۓ ﮰﮱ
ە ە
‍ ‍‍‍‍




This archive was generated by hypermail 2.1.2 : Tue Jul 10 2001 - 17:20:40 EDT