/*
******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*   For licensing terms see the ICU X License:
*   http://oss.software.ibm.com/cvs/icu/~checkout~/icu/license.html
*
******************************************************************************
*   file name:  bocu-1tst.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan24
*   created by: Markus W. Scherer
*
*   This is test code for a sample implementation of BOCU-1,
*   a MIME-compatible Binary Ordered Compression for Unicode.
*/

#include <stdio.h>
#include <string.h>

/*
 * Standard ICU header.
 * - Includes inttypes.h or defines its types.
 * - Defines UChar for UTF-16 as an unsigned 16-bit type (wchar_t or uint16_t).
 * - Defines UTF* macros to handle reading and writing
 *   of in-process UTF-8/16 strings.
 *
 * For ICU see http://oss.software.ibm.com/icu/
 *
 * This BOCU-1 test code uses only one implementation module from ICU,
 * icu/source/common/utf_impl.c .
 * http://oss.software.ibm.com/cvs/icu/~checkout~/icu/source/common/utf_impl.c
 *
 * It is sufficient for testing to use
 * public ICU headers (from the icu/source/common/unicode/ folder)
 * - where for Unixes platform.h must be generated by the configure script -
 * and to compile and link directly with utf_impl.c .
 */
#include "unicode/utypes.h"

#include "bocu1.h"

/* test code ---------------------------------------------------------------- */

/* test code options */

/* ignore comma when processing name lists in testText() */
#define TEST_IGNORE_COMMA 1

/**
 * Write a packed BOCU-1 byte sequence into a byte array,
 * without overflow check.
 * Test function.
* * @param packed packed BOCU-1 byte sequence, see packDiff() * @param p pointer to byte array * @return number of bytes * * @see packDiff */ static int32_t writePacked(int32_t packed, uint8_t *p) { int32_t count=BOCU1_LENGTH_FROM_PACKED(packed); switch(count) { case 4: *p++=(uint8_t)(packed>>24); case 3: *p++=(uint8_t)(packed>>16); case 2: *p++=(uint8_t)(packed>>8); case 1: *p++=(uint8_t)packed; default: break; } return count; } /** * Unpack a packed BOCU-1 non-C0/space byte sequence and get * the difference to initialPrev. * Used only for round-trip testing of the difference encoding and decoding. * Test function. * * @param initialPrev bogus "previous code point" value to make sure that * the resulting code point is in the range 0..0x10ffff * @param packed packed BOCU-1 byte sequence * @return the difference to initialPrev * * @see packDiff * @see writeDiff */ static int32_t unpackDiff(int32_t initialPrev, int32_t packed) { Bocu1Rx rx={ initialPrev, 0, 0 }; int32_t count; count=BOCU1_LENGTH_FROM_PACKED(packed); switch(count) { case 4: decodeBocu1(&rx, (uint8_t)(packed>>24)); case 3: decodeBocu1(&rx, (uint8_t)(packed>>16)); case 2: decodeBocu1(&rx, (uint8_t)(packed>>8)); case 1: /* subtract initial prev */ return decodeBocu1(&rx, (uint8_t)packed)-initialPrev; default: return -0x7fffffff; } } /** * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, * preserving lexical order. * Also checks for roundtripping of the difference encoding. * Test function. 
* * @param diff difference value to test, -0x10ffff..0x10ffff * @param p pointer to output byte array * @return p advanced by number of bytes output * * @see unpackDiff */ static uint8_t * writeDiff(int32_t diff, uint8_t *p) { /* generate the difference as a packed value and serialize it */ int32_t packed, initialPrev; packed=packDiff(diff); /* * bogus initial "prev" to work around * code point range check in decodeBocu1() */ if(diff<=0) { initialPrev=0x10ffff; } else { initialPrev=-1; } if(diff!=unpackDiff(initialPrev, packed)) { fprintf(stderr, "error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n", diff, packed, unpackDiff(initialPrev, packed)); } return p+writePacked(packed, p); } /** * Encode a UTF-16 string in BOCU-1. * Does not check for overflows, but otherwise useful function. * * @param s input UTF-16 string * @param length number of UChar code units in s * @param p pointer to output byte array * @return number of bytes output */ static int32_t writeString(const UChar *s, int32_t length, uint8_t *p) { uint8_t *p0; int32_t c, prev, i; prev=0; p0=p; i=0; while(i=0) { UTF_APPEND_CHAR_UNSAFE(s, sLength, c); } } return sLength; } /** * Pretty-print 0-terminated byte values. * Helper function for test output. * * @param bytes 0-terminated byte array to print */ static void printBytes(uint8_t *bytes) { int i; i=0; while(*bytes!=0) { printf(" %02x", *bytes++); ++i; } printf("%.*s", 3*(5-i), " "); } /** * Basic BOCU-1 test function, called when there are no command line arguments. * Prints some of the #define values and performs round-trip tests of the * difference encoding and decoding. 
*/
static void
testDiff(void) {
    uint8_t prev[5], level[5];
    int32_t i, cmp, countErrors;

    /*
     * All values printed below are cast to the exact type that their
     * conversion specifier expects (long for %ld, int for %d,
     * unsigned int for %x); passing int, int32_t or size_t as is
     * would be a type mismatch on many platforms.
     */
    printf("reach of single bytes: %ld\n", (long)(1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1));
    printf("reach of 2 bytes : %ld\n", (long)(1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2));
    printf("reach of 3 bytes : %ld\n", (long)(1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3));
    puts("");

    printf(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n",
           (long)BOCU1_REACH_NEG_1, (long)BOCU1_REACH_POS_1);
    printf(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n",
           (long)BOCU1_REACH_NEG_2, (long)BOCU1_REACH_POS_2);
    printf(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n",
           (long)BOCU1_REACH_NEG_3, (long)BOCU1_REACH_POS_3);
    puts("");

    printf(" BOCU1_MIDDLE 0x%02x\n", (unsigned int)BOCU1_MIDDLE);
    printf(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n",
           (unsigned int)BOCU1_START_NEG_2, (unsigned int)BOCU1_START_POS_2);
    printf(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n",
           (unsigned int)BOCU1_START_NEG_3, (unsigned int)BOCU1_START_POS_3);
    puts("");

    /* test packDiff() & unpackDiff() with some specific values */
    writeDiff(0, level);
    writeDiff(1, level);
    writeDiff(65, level);
    writeDiff(130, level);
    writeDiff(30000, level);
    writeDiff(1000000, level);
    writeDiff(-65, level);
    writeDiff(-130, level);
    writeDiff(-30000, level);
    writeDiff(-1000000, level);

    /* test that each value is smaller than any following one */
    countErrors=0;
    i=-0x10ffff;
    /* writeDiff() returns the position after its output: terminate there */
    *writeDiff(i, prev)=0;

    /* show first number and bytes */
    printf(" wD(%8ld) ", (long)i);
    printBytes(prev);
    puts("");

    for(++i; i<=0x10ffff; ++i) {
        *writeDiff(i, level)=0;
        cmp=strcmp((const char *)prev, (const char *)level);
        if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
            printf("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
                   (unsigned int)level[0],
                   (long)BOCU1_LENGTH_FROM_LEAD(level[0]),
                   (long)strlen((const char *)level),
                   (long)i);
        }
        if(cmp<0) {
            if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
                /*
                 * if the result is good, then print only if the length changed
                 * to get little but interesting output
                 */
                printf("ok: strcmp(wD(%8ld), wD(%8ld))=%2d ", (long)(i-1), (long)i, (int)cmp);
                printBytes(prev);
                printBytes(level);
                puts("");
            }
        } else {
            ++countErrors;
            printf("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d ", (long)(i-1), (long)i, (int)cmp);
            printBytes(prev);
            printBytes(level);
            puts("");
        }

        /*
         * remember the previous bytes;
         * prev[4] is not copied and keeps the 0 stored before the loop
         */
        memcpy(prev, level, 4);
    }

    /* show last number and bytes */
    printf(" wD(%8ld) ", (long)(i-1));
    printBytes((uint8_t *)"");
    printBytes(prev);
    puts("");

    if(countErrors==0) {
        puts("writeDiff(-0x10ffff..0x10ffff) works fine");
    } else {
        printf("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", (int)countErrors);
    }

    /* output signature byte sequence */
    i=0;
    writePacked(encodeBocu1(&i, 0xfeff), level);
    printf("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
           (unsigned int)level[0], (unsigned int)level[1], (unsigned int)level[2]);
}

/**
 * BOCU-1 test function for strings,
 * called when there is one filename argument on the command line.
 *
 * Reads a UTF-8 plain text file line by line.
 * For each line, converts to BOCU-1 and back,
 * prints statistics and tests for correct round-tripping.
 *
 * This function does not check for overflows and
 * will be trashed with long input lines.
*/
static void
testText(FILE *in) {
    uint8_t line[200], bocu1[200];
    UChar u[200], v[200];
    UChar32 c;
    int32_t i, j, k, totalUChars=0, totalBytes=0;

    while(fgets((char *)line, sizeof(line), in)!=NULL) {
        /* convert the line from UTF-8 to UTF-16 */
        i=j=0;
        for(;;) {
            UTF8_NEXT_CHAR_SAFE(line, i, sizeof(line), c, FALSE);
            if(c==0 || c==0xa || c==0xd) {
                /* end of the fgets() string or end of the line */
                break;
            }
            if(c==0xfeff && j==0) {
                /* ignore the signature byte sequence */
                continue;
            }
#if TEST_IGNORE_COMMA
            if(c==0x2c) {
                /* ignore comma */
                continue;
            }
#endif
            UTF_APPEND_CHAR_UNSAFE(u, j, c);
        }
        /* the UTF-16 string length is j */

        i=writeString(u, j, bocu1);

        /*
         * int32_t is cast to long to match %ld.
         * An empty line (j==0) reports a ratio of 0 instead of dividing by 0.
         */
        printf("%3ld UChars -> %3ld bytes %.4f bytes/UChar\n",
               (long)j, (long)i, j!=0 ? (double)i/j : 0.0);

        k=readString(bocu1, i, v);
        /* compare j code units; do not hard-code the size of a UChar */
        if(k!=j || 0!=memcmp(u, v, (size_t)j*sizeof(u[0]))) {
            fprintf(stderr, "error: readString(writeString()) does not roundtrip at file code point index %ld\n", (long)totalUChars);
        }

        totalUChars+=j;
        totalBytes+=i;
    }

    /* same guard as above for input without any code units */
    printf(" totals: %3ld UChars -> %3ld bytes %.4f bytes/UChar\n",
           (long)totalUChars, (long)totalBytes,
           totalUChars!=0 ? (double)totalBytes/totalUChars : 0.0);
}

/**
 * File converter from UTF-8 files to BOCU-1 files.
 * Also outputs interesting statistics.
 * Called when the first command line argument is "encode".
 *
 * Checks for buffer overflows.
 * Illegal UTF-8 sequences are treated like
 * U+ffff or other code points and encoded as such.
*/ static void encodeFile(FILE *in, FILE *out) { uint8_t buffer[1024], bytes[4]; unsigned long inLength, inCount, outLength; unsigned int read, count, i, j; int32_t prev, c, packed; inLength=inCount=outLength=0; prev=0; i=read=0; while(!feof(in)) { /* read a block; the buffer still has i bytes from the previous time */ read=fread(buffer+i, 1, sizeof(buffer)-i, in); inLength+=read; read+=i; /* total number of bytes in the buffer */ /* * if not at the end of the file, * then avoid reading UTF-8 to the last byte * in case a character is split across buffers */ if(feof(in)) { count=read; } else { count=read-5; } /* get code points from UTF-8 and write in BOCU-1 */ for(i=0; i=0) { j=0; UTF8_APPEND_CHAR_SAFE(bytes, j, sizeof(bytes), c); fwrite(bytes, 1, j, out); outLength+=j; ++inCount; } } } printf(" input: %ld BOCU-1 bytes %ld code points output: %ld UTF-8 bytes\n", inLength, inCount, outLength); } /** * Main function of the BOCU-1 test and sample code. * For usage, run with command line "?" and see the source code above. 
*/
extern int
main(int argc, const char *argv[]) {
    if(argc>1) {
        FILE *in, *out;

        if( strcmp(argv[1], "?")==0 ||
            strcmp(argv[1], "-?")==0 ||
            strcmp(argv[1], "-h")==0 ||
            strcmp(argv[1], "--help")==0
        ) {
            /*
             * The <filename> placeholders name the argument that the
             * branches below read from argv[1] or argv[2].
             */
            fprintf(stderr,
                "usage:\n"
                " bocu1 (no arguments) -> test basic BOCU-1 implementation\n\n"
                " bocu1 <filename> -> read UTF-8 <filename>, encode each line in BOCU-1,\n"
                " round-trip test, print encoding ratio\n\n"
                " bocu1 encode <filename> -> read UTF-8 <filename>,\n"
                " convert to BOCU-1, write to bocu-1.txt\n\n"
                " bocu1 decode <filename> -> read BOCU-1 file bocu-1.txt,\n"
                " convert to UTF-8, write to <filename>\n\n");
            return 0;
        } else if(argc>2 && strcmp(argv[1], "encode")==0) {
            /* convert a UTF-8 file to BOCU-1 */
            in=fopen(argv[2], "rb");
            if(in==NULL) {
                printf("unable to open UTF-8 input file \"%s\"\n", argv[2]);
                return 1;
            }
            out=fopen("bocu-1.txt", "wb");
            if(out==NULL) {
                printf("unable to open BOCU-1 output file \"bocu-1.txt\"\n");
                /* do not leak the input file that is already open */
                fclose(in);
                return 1;
            }
            printf("converting \"%s\" to bocu-1.txt\n", argv[2]);

            encodeFile(in, out);

            fclose(out);
            fclose(in);
        } else if(argc>2 && strcmp(argv[1], "decode")==0) {
            /* convert a BOCU-1 file to UTF-8 */
            in=fopen("bocu-1.txt", "rb");
            if(in==NULL) {
                printf("unable to open BOCU-1 input file \"bocu-1.txt\"\n");
                return 1;
            }
            out=fopen(argv[2], "wb");
            if(out==NULL) {
                printf("unable to open UTF-8 output file \"%s\"\n", argv[2]);
                /* do not leak the input file that is already open */
                fclose(in);
                return 1;
            }
            printf("converting \"%s\" from bocu-1.txt\n", argv[2]);

            decodeFile(in, out);

            fclose(out);
            fclose(in);
        } else /* neither encode nor decode, test BOCU-1 on lines of input file */ {
            in=fopen(argv[1], "rb");
            if(in==NULL) {
                printf("unable to open UTF-8 input file \"%s\"\n", argv[1]);
                return 1;
            }
            printf("test difference encoding with UTF-8 input file\n"
                   " \"%s\"\n"
                   " ignore comma: %d\n",
                   argv[1], TEST_IGNORE_COMMA);

            testText(in);

            fclose(in);
        }
    } else /* no arguments, test difference encoding */ {
        testDiff();
    }

    return 0;
}