EUC-UTF8 is possible!

From: Dan Kogai (dankogai@dan.co.jp)
Date: Sat Mar 17 2007 - 14:17:53 CST

  • Next message: Doug Ewell: "Re: EUC-UTF8 is possible!"

    -----BEGIN PGP SIGNED MESSAGE-----
    Hash: SHA1

    Folks,

    I am really surprised to find that EUC and UTF-8 can be mashed up
    easily.

    The secret is \xFF. This byte NEVER appears in EUC or UTF-8. So you
    can define the combo character as follows:

      EUC_UTF8_CHAR = EUC_CHAR | \xFF + UTF8_CHAR

    What's that good for? You need fewer bytes to represent text which
    consists mostly of EUC-mappable characters plus a few characters which
    require full Unicode. That is the case for most documents that are
    locally used in SE Asia.

    EUC-UTF8 is so simple I already came up with a crude implementation
    in Perl 5. The module provides transcoding for euc-jp-utf8,
    euc-cn-utf8, and euc-kr-utf8 all at once. The source code follows
    right after my signature.

    Since UTF-8 is already popular enough, I am not sure how seriously we
    should take EUC-UTF8. I just wanted to tell you such an encoding is
    possible.

    Dan the Maintainer of Encode.pm
    ====

    package Encode::EUCUTF8;
    use 5.008001;
    use strict;
    use warnings;
    use Encode 2.12;

    # Module version, derived from the keyword-expansion style "Revision" tag:
    # the digit groups of "0.1" are captured by /(\d+)/g and reformatted with
    # "%d.%02d", which yields "0.01".
    our $VERSION = sprintf "%d.%02d", q$Revision: 0.1 $ =~ /(\d+)/g;

    # Pattern matching exactly one EUC-UTF8 character; the whole character is
    # captured in $1.  A character is either
    #   * one plain EUC character (ASCII, 2-byte, 3-byte or kana form), or
    #   * the escape byte \xFF followed by one well-formed, non-ASCII UTF-8
    #     sequence (surrogates, U+FFFE and U+FFFF are not accepted).
    # The pattern describes octets, so it must be applied to a byte string.
    #
    # Fixes relative to the first draft: the \xEE lead byte (U+E000 - U+EFFF)
    # was missing, and the \xEF branch rejected every code point whose last
    # octet was \xBE or \xBF instead of only U+FFFE and U+FFFF.
    our $EU_CHAR = qr{(
             [\x00-\x7F]                            # ASCII
             | [\xa1-\xfe][\xa1-\xfe]               # 2byte EUC
             | \x8f[\xa1-\xfe][\xa1-\xfe]           # 3byte EUC
             | \x8e[\xa1-\xdf]                      # EUC Kana
             | \xFF(?:                              # escape byte, then non-ASCII UTF-8
                     [\xC2-\xDF][\x80-\xBF]              # U+0080 - U+07FF
                     | \xE0[\xA0-\xBF][\x80-\xBF]        # U+0800 - U+0FFF
                     | [\xE1-\xEC][\x80-\xBF]{2}         # U+1000 - U+CFFF
                     | \xED[\x80-\x9F][\x80-\xBF]        # U+D000 - U+D7FF
                     | \xEE[\x80-\xBF]{2}                # U+E000 - U+EFFF
                     | \xEF[\x80-\xBE][\x80-\xBF]        # U+F000 - U+FFBF
                     | \xEF\xBF[\x80-\xBD]               # U+FFC0 - U+FFFD
                     | \xF0[\x90-\xBF][\x80-\xBF]{2}     # U+10000 - U+3FFFF
                     | [\xF1-\xF3][\x80-\xBF]{3}         # U+40000 - U+FFFFF
                     | \xF4[\x80-\x8F][\x80-\xBF]{2}     # U+100000 - U+10FFFF
                    )
            )}x;

    # gen_decode($euc_what)
    #
    # Builds the decode() method for one EUC-UTF8 flavour.
    #
    #   $euc_what - name of the underlying EUC encoding as understood by
    #               Encode::decode, e.g. 'euc-jp'.
    #
    # Returns a code reference taking ($obj, $bytes, $chk).  Every match of
    # $Encode::EUCUTF8::EU_CHAR in $bytes is replaced by its decoded form:
    # a match starting with \xFF has that byte stripped and the rest decoded
    # as UTF-8, any other match is decoded with $euc_what.  Octets that the
    # pattern does not match are left in place.  When $chk is true the
    # caller's source argument ($_[1]) is emptied; $chk has no other effect.
    #
    # The pattern is referenced by its fully qualified name.  The previous
    # spelling, "$Encode::EUCUTF8::$EU_CHAR", interpolated the undefined
    # scalar $Encode::EUCUTF8:: followed by $EU_CHAR and only worked by
    # accident.
    sub gen_decode($) {
        my $euc_what = shift;

        return sub ($$;$) {
            my ( $obj, $bytes, $chk ) = @_;

            # Keeps an undefined $bytes quiet; it is returned unchanged.
            no warnings 'uninitialized';

            # $1 is copied into a lexical before calling into Encode so that
            # any pattern match performed there cannot disturb it.
            $bytes =~ s{
                    $Encode::EUCUTF8::EU_CHAR
                   }{
                       my $char = $1;
                       substr( $char, 0, 1 ) eq "\xFF"
                           ? Encode::decode( 'utf8',    substr( $char, 1 ) )
                           : Encode::decode( $euc_what, $char );
                   }egx;
            $_[1] = '' if $chk;
            return $bytes;
        };
    }

    # gen_encode($euc_what)
    #
    # Builds the encode() method for one EUC-UTF8 flavour.
    #
    #   $euc_what - name of the underlying EUC encoding as understood by
    #               Encode::encode, e.g. 'euc-jp'.
    #
    # Returns a code reference taking ($obj, $str, $chk).  $str is encoded
    # with $euc_what; a fallback callback passed as the CHECK argument turns
    # each character handed to it (by ordinal) into \xFF followed by that
    # character's UTF-8 octets.  When $chk is true the caller's source
    # argument ($_[1]) is emptied.
    sub gen_encode {
        my ($euc_what) = @_;

        # Fallback for Encode::encode: receives a code point, returns the
        # escape byte plus the UTF-8 form of that code point.
        my $escape_as_utf8 = sub {
            my $ordinal = shift;
            return "\xFF" . Encode::encode_utf8( pack( "U", $ordinal ) );
        };

        return sub ($$;$) {
            my ( $self, $text, $check ) = @_;
            my $octets = Encode::encode( $euc_what, $text, $escape_as_utf8 );
            $_[1] = '' if $check;
            return $octets;
        };
    }

    # EUC-JP flavour: registered with Encode under the name 'euc-jp-utf8'.
    package Encode::EUCUTF8::JP;
    use base 'Encode::Encoding';

    __PACKAGE__->Define( 'euc-jp-utf8' );

    # Encode::Encoding hook: a true value asks for line-buffered input
    # when the encoding is used through PerlIO.
    sub needs_lines { return 1 }

    # Codec methods are closures generated for the underlying 'euc-jp'.
    *encode = Encode::EUCUTF8::gen_encode( 'euc-jp' );
    *decode = Encode::EUCUTF8::gen_decode( 'euc-jp' );

    # EUC-CN flavour: registered with Encode under the name 'euc-cn-utf8'.
    package Encode::EUCUTF8::CN;
    use base 'Encode::Encoding';

    __PACKAGE__->Define( 'euc-cn-utf8' );

    # Encode::Encoding hook: a true value asks for line-buffered input
    # when the encoding is used through PerlIO.
    sub needs_lines { return 1 }

    # Codec methods are closures generated for the underlying 'euc-cn'.
    *encode = Encode::EUCUTF8::gen_encode( 'euc-cn' );
    *decode = Encode::EUCUTF8::gen_decode( 'euc-cn' );

    # EUC-KR flavour: registered with Encode under the name 'euc-kr-utf8'.
    package Encode::EUCUTF8::KR;
    use base 'Encode::Encoding';

    __PACKAGE__->Define( 'euc-kr-utf8' );

    # Encode::Encoding hook: a true value asks for line-buffered input
    # when the encoding is used through PerlIO.
    sub needs_lines { return 1 }

    # Codec methods are closures generated for the underlying 'euc-kr'.
    *encode = Encode::EUCUTF8::gen_encode( 'euc-kr' );
    *decode = Encode::EUCUTF8::gen_decode( 'euc-kr' );

    1;

    -----BEGIN PGP SIGNATURE-----
    Version: GnuPG v1.4.3 (Darwin)

    iD8DBQFF/Ez1ErJia/WXtBsRAvT4AJ9igfpe/aqdwp+3RiOMHAGGGTjLRQCgkbmM
    b8gM9+6IooXUH32zvqOAuQo=
    =PsQO
    -----END PGP SIGNATURE-----



    This archive was generated by hypermail 2.1.5 : Sat Mar 17 2007 - 14:19:29 CST