/*
******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*   For licensing terms see the ICU X License:
*   http://oss.software.ibm.com/cvs/icu/~checkout~/icu/license.html
*
******************************************************************************
*   file name:  bocu-1tst.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002jan24
*   created by: Markus W. Scherer
*
*   This is test code for a sample implementation of BOCU-1,
*   a MIME-compatible Binary Ordered Compression for Unicode.
*/

#include <stdio.h>
#include <string.h>

/*
 * Standard ICU header.
 * - Includes inttypes.h or defines its types.
 * - Defines UChar for UTF-16 as an unsigned 16-bit type (wchar_t or uint16_t).
 * - Defines UTF* macros to handle reading and writing
 *   of in-process UTF-8/16 strings.
 *
 * For ICU see http://oss.software.ibm.com/icu/
 *
 * This BOCU-1 test code uses only one implementation module from ICU,
 * icu/source/common/utf_impl.c .
 * http://oss.software.ibm.com/cvs/icu/~checkout~/icu/source/common/utf_impl.c
 *
 * It is sufficient for testing to use
 * public ICU headers (from the icu/source/common/unicode/ folder)
 * - where for Unixes platform.h must be generated by the configure script -
 * and to compile and link directly with utf_impl.c .
 */
#include "unicode/utypes.h"

#include "bocu1.h"

/* test code ---------------------------------------------------------------- */

/* test code options */

/*
 * Ignore comma when processing name lists in testText():
 * when non-zero, testText() drops every U+002C from a line before
 * encoding it. main() also prints this value in its test banner.
 */
#define TEST_IGNORE_COMMA       1

/**
 * Serialize a packed BOCU-1 byte sequence into a byte array,
 * most significant used byte first.
 * The output array is not checked for overflow.
 * Test function.
 *
 * @param packed packed BOCU-1 byte sequence, see packDiff()
 * @param p pointer to the output byte array
 * @return number of bytes, as given by BOCU1_LENGTH_FROM_PACKED(packed);
 *         bytes are only written when that number is 1..4
 *
 * @see packDiff
 */
static int32_t
writePacked(int32_t packed, uint8_t *p) {
    int32_t length=BOCU1_LENGTH_FROM_PACKED(packed);
    int32_t shift;

    /* a length outside 1..4 writes nothing but is still returned */
    if(1<=length && length<=4) {
        /* emit the bytes from the highest used one down to the lowest */
        for(shift=8*(length-1); shift>=0; shift-=8) {
            *p++=(uint8_t)(packed>>shift);
        }
    }

    return length;
}

/**
 * Decode a packed BOCU-1 non-C0/space byte sequence and return
 * its difference to initialPrev.
 * Exists only for round-trip testing of the difference
 * encoding and decoding.
 * Test function.
 *
 * @param initialPrev bogus "previous code point" value to make sure that
 *                    the resulting code point is in the range 0..0x10ffff
 * @param packed packed BOCU-1 byte sequence
 * @return the difference to initialPrev, or -0x7fffffff if
 *         BOCU1_LENGTH_FROM_PACKED(packed) is not 1..4
 *
 * @see packDiff
 * @see writeDiff
 */
static int32_t
unpackDiff(int32_t initialPrev, int32_t packed) {
    Bocu1Rx rx={ initialPrev, 0, 0 };
    int32_t length, shift;

    length=BOCU1_LENGTH_FROM_PACKED(packed);
    if(length<1 || 4<length) {
        return -0x7fffffff;
    }

    /* feed all bytes before the last one; their results are not needed */
    for(shift=8*(length-1); shift>0; shift-=8) {
        decodeBocu1(&rx, (uint8_t)(packed>>shift));
    }

    /* the last byte yields the code point; subtract initial prev */
    return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
}

/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding and
 * reports a mismatch on stderr; the bytes are written in either case.
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array (not checked for overflow)
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, roundtrip;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    /* decode once and reuse the result for both the check and the message */
    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /*
         * int32_t is not necessarily long: convert explicitly so that the
         * arguments match %ld/%lx; going through uint32_t keeps the hex
         * output at 8 digits even when long is wider than 32 bits
         */
        fprintf(stderr,
                "error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }
    return p+writePacked(packed, p);
}

/**
 * Convert a UTF-16 string to BOCU-1.
 * The output array is not checked for overflow;
 * apart from that this is a generally useful function.
 *
 * @param s input UTF-16 string
 * @param length number of UChar code units in s
 * @param p pointer to output byte array
 * @return number of bytes output
 */
static int32_t
writeString(const UChar *s, int32_t length, uint8_t *p) {
    uint8_t *const start=p;
    int32_t index=0, prev=0, codePoint, packed;

    while(index<length) {
        /* fetch the next code point; the macro advances index */
        UTF_NEXT_CHAR(s, index, length, codePoint);

        /* encode it relative to the running state and serialize the bytes */
        packed=encodeBocu1(&prev, codePoint);
        p+=writePacked(packed, p);
    }
    return p-start;
}

/**
 * Decode a BOCU-1 byte sequence to a UTF-16 string.
 * Does not check for overflows, but otherwise useful function.
 *
 * Each input byte is passed to decodeBocu1(). A result >=0 is appended
 * to s as a code point, a result of -1 produces no output
 * (presumably an incomplete multi-byte sequence - confirm against
 * decodeBocu1()), and any result below -1 is treated as an encoding error.
 *
 * @param p pointer to input BOCU-1 bytes
 * @param length number of input bytes
 * @param s pointer to output UTF-16 string array
 * @return number of UChar code units output,
 *         or -1 if an encoding error was detected (reported on stderr)
 */
static int32_t
readString(const uint8_t *p, int32_t length, UChar *s) {
    Bocu1Rx rx={ 0, 0, 0 };
    int32_t c, i, sLength;

    i=sLength=0;
    while(i<length) {
        c=decodeBocu1(&rx, p[i++]);
        if(c<-1) {
            /*
             * i was already advanced, so the reported index is
             * one past the offending byte;
             * int32_t is not necessarily long, convert to match %ld
             */
            fprintf(stderr, "error: readString detects encoding error at string index %ld\n", (long)i);
            return -1;
        }
        if(c>=0) {
            UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
        }
    }
    return sLength;
}

/**
 * Print the bytes of a 0-terminated byte array as " xx" hex pairs,
 * then pad with spaces up to the width of 5 such pairs
 * so that consecutive columns of test output line up.
 * Helper function for test output.
 *
 * @param bytes 0-terminated byte array to print
 */
static void
printBytes(uint8_t *bytes) {
    static const char padding[]="               ";
    int printed;

    for(printed=0; *bytes!=0; ++bytes, ++printed) {
        printf(" %02x", *bytes);
    }

    /* each printed byte took 3 columns; fill the remaining ones */
    printf("%.*s", 3*(5-printed), padding);
}

/**
 * Basic BOCU-1 test function, called when there are no command line arguments.
 * Prints some of the #define values and performs round-trip tests of the
 * difference encoding and decoding:
 * - writeDiff() is called for a few specific values
 *   (it reports round-trip failures itself on stderr)
 * - writeDiff() is called for every difference -0x10ffff..0x10ffff,
 *   checking that the bytes of each value compare lower (strcmp) than those
 *   of the next value, and that BOCU1_LENGTH_FROM_LEAD() of the first byte
 *   agrees with the number of bytes written
 * - finally the BOCU-1 encoding of U+FEFF from the initial state is printed
 *
 * All integer arguments are converted to the exact type
 * that their printf conversion specifier expects.
 */
static void
testDiff(void) {
    /*
     * writePacked() writes at most 4 bytes, so index 4 is only ever used
     * for a terminator; zero-initialize so that both arrays are always
     * 0-terminated regardless of the lengths written so far
     */
    uint8_t prev[5]={ 0 }, level[5]={ 0 };
    int32_t i, cmp, countErrors, leadLength;
    size_t prevLength, levelLength;

    printf("reach of single bytes: %ld\n", (long)(1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1));
    printf("reach of 2 bytes     : %ld\n", (long)(1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2));
    printf("reach of 3 bytes     : %ld\n", (long)(1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3));
    puts("");

    printf("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", (long)BOCU1_REACH_NEG_1, (long)BOCU1_REACH_POS_1);
    printf("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", (long)BOCU1_REACH_NEG_2, (long)BOCU1_REACH_POS_2);
    printf("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n", (long)BOCU1_REACH_NEG_3, (long)BOCU1_REACH_POS_3);
    puts("");

    printf("    BOCU1_MIDDLE      0x%02x\n", (unsigned int)BOCU1_MIDDLE);
    printf("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", (unsigned int)BOCU1_START_NEG_2, (unsigned int)BOCU1_START_POS_2);
    printf("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n", (unsigned int)BOCU1_START_NEG_3, (unsigned int)BOCU1_START_POS_3);
    puts("");

    /* test packDiff() & unpackDiff() with some specific values */
    writeDiff(0, level);
    writeDiff(1, level);
    writeDiff(65, level);
    writeDiff(130, level);
    writeDiff(30000, level);
    writeDiff(1000000, level);
    writeDiff(-65, level);
    writeDiff(-130, level);
    writeDiff(-30000, level);
    writeDiff(-1000000, level);

    /* test that each value is smaller than any following one */
    countErrors=0;
    i=-0x10ffff;
    *writeDiff(i, prev)=0;

    /* show first number and bytes */
    printf("              wD(%8ld)                    ", (long)i);
    printBytes(prev);
    puts("");

    for(++i; i<=0x10ffff; ++i) {
        *writeDiff(i, level)=0;
        cmp=strcmp((const char *)prev, (const char *)level);

        /* the lead byte alone must predict the sequence length */
        levelLength=strlen((const char *)level);
        leadLength=BOCU1_LENGTH_FROM_LEAD(level[0]);
        if(leadLength!=(int32_t)levelLength) {
            printf("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
                   level[0], (long)leadLength, (long)levelLength, (long)i);
        }
        if(cmp<0) {
            prevLength=strlen((const char *)prev);
            if(i==0 || i==1 || prevLength!=levelLength) {
                /*
                 * if the result is good, then print only if the length changed
                 * to get little but interesting output
                 */
                printf("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  ", (long)(i-1), (long)i, (int)cmp);
                printBytes(prev);
                printBytes(level);
                puts("");
            }
        } else {
            ++countErrors;
            printf("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  ", (long)(i-1), (long)i, (int)cmp);
            printBytes(prev);
            printBytes(level);
            puts("");
        }
        /* remember the previous bytes, including the terminator */
        memcpy(prev, level, sizeof(prev));
    }

    /* show last number and bytes */
    printf("                            wD(%8ld)      ", (long)(i-1));
    printBytes((uint8_t *)"");
    printBytes(prev);
    puts("");

    if(countErrors==0) {
        puts("writeDiff(-0x10ffff..0x10ffff) works fine");
    } else {
        printf("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", (int)countErrors);
    }

    /* output signature byte sequence: U+FEFF encoded from the initial state */
    i=0;
    writePacked(encodeBocu1(&i, 0xfeff), level);
    printf("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
           level[0], level[1], level[2]);
}

/**
 * BOCU-1 test function for strings,
 * called when there is one filename argument on the command line.
 *
 * Reads a UTF-8 plain text file line by line.
 * For each line, converts to BOCU-1 and back,
 * prints statistics and tests for correct round-tripping.
 *
 * This function does not check for overflows and
 * will be trashed with long input lines.
 */
static void
testText(FILE *in) {
    uint8_t line[200], bocu1[200];
    UChar u[200], v[200];
    UChar32 c;
    int32_t i, j, k, totalUChars=0, totalBytes=0;

    while(fgets((char *)line, sizeof(line), in)!=NULL) {
        /* convert the line from UTF-8 to UTF-16 */
        i=j=0;
        for(;;) {
            UTF8_NEXT_CHAR_SAFE(line, i, sizeof(line), c, FALSE);
            if(c==0 || c==0xa || c==0xd) {
                break;
            }
            if(c==0xfeff && j==0) {
                /* ignore the signature byte sequence */
                continue;
            }
#if TEST_IGNORE_COMMA
            if(c==0x2c) {
                /* ignore comma */
                continue;
            }
#endif
            UTF_APPEND_CHAR_UNSAFE(u, j, c);
        }
        /* the UTF-16 string length is j */

        i=writeString(u, j, bocu1);
        printf("%3ld UChars -> %3ld bytes  %.4f bytes/UChar\n",
               j, i, (double)i/j);

        k=readString(bocu1, i, v);
        if(k!=j || 0!=memcmp(u, v, 2*j)) {
            fprintf(stderr, "error: readString(writeString()) does not roundtrip at file code point index %ld\n", totalUChars);
        }

        totalUChars+=j;
        totalBytes+=i;
    }

    printf("    totals: %3ld UChars -> %3ld bytes  %.4f bytes/UChar\n",
           totalUChars, totalBytes, (double)totalBytes/totalUChars);
}

/**
 * File converter from UTF-8 files to BOCU-1 files.
 * Also outputs interesting statistics.
 * Called when the first command line argument is "encode".
 *
 * Checks for buffer overflows.
 * Illegal UTF-8 sequences are treated like
 * U+ffff or other code points and encoded as such.
 *
 * A read error ends the conversion after the bytes that were read so far
 * have been converted; a write error ends it immediately.
 * Both are reported on stderr.
 * Ratios are reported as 0 when the input is empty.
 *
 * @param in input stream with UTF-8 text, read until end of file
 * @param out output stream that receives the BOCU-1 bytes
 */
static void
encodeFile(FILE *in, FILE *out) {
    uint8_t buffer[1024], bytes[4];
    unsigned long inLength, inCount, outLength;
    unsigned int read, count, i, j;
    int32_t prev, c, packed;
    int readFailed;

    inLength=inCount=outLength=0;
    prev=0;
    i=read=0;
    while(!feof(in)) {
        /* read a block; the buffer still has i bytes from the previous time */
        read=(unsigned int)fread(buffer+i, 1, sizeof(buffer)-i, in);
        inLength+=read;
        read+=i; /* total number of bytes in the buffer */

        /* without this check, a read error would loop forever: feof() stays false */
        readFailed=ferror(in);

        /*
         * if not at the end of the input,
         * then avoid reading UTF-8 to the last byte
         * in case a character is split across buffers
         * (5 bytes are held back - presumably one less than the longest
         * sequence that UTF8_NEXT_CHAR_SAFE may consume; confirm there)
         */
        if(feof(in) || readFailed) {
            count=read;
        } else if(read>5) {
            count=read-5;
        } else {
            /* too few bytes to hold any back: read-5 would wrap around */
            count=0;
        }

        /* get code points from UTF-8 and write in BOCU-1 */
        for(i=0; i<count;) {
            UTF8_NEXT_CHAR_SAFE(buffer, i, read, c, FALSE);
            packed=encodeBocu1(&prev, c);
            j=(unsigned int)writePacked(packed, bytes);
            if(fwrite(bytes, 1, j, out)!=j) {
                fprintf(stderr, "error: unable to write BOCU-1 output after %lu bytes\n", outLength);
                return;
            }
            outLength+=j;
            ++inCount;
        }

        if(readFailed) {
            fprintf(stderr, "error: unable to read UTF-8 input after %lu bytes\n", inLength);
            break;
        }

        /* move the unconsumed tail to the front for the next round */
        if(i<read) {
            memmove(buffer, buffer+i, read-i);
        }
        i=read-i;
    }

    /* the counters are unsigned long, so they need %lu */
    printf("    input: %lu UTF-8 bytes %lu code points output: %lu BOCU-1 bytes\n",
            inLength, inCount, outLength);
    printf("    BOCU-1/UTF-8: %f    BOCU-1/code point: %f\n",
            inLength!=0 ? (double)outLength/inLength : 0.0,
            inCount!=0 ? (double)outLength/inCount : 0.0);
}

/**
 * File converter from BOCU-1 files to UTF-8 files.
 * Also outputs interesting statistics.
 * Called when the first command line argument is "decode".
 *
 * Checks for buffer overflows and illegal BOCU-1 byte sequences.
 * An illegal sequence or a write error ends the conversion immediately
 * without statistics; a read error ends it after the bytes that were read
 * so far have been converted. All three are reported on stderr.
 *
 * @param in input stream with BOCU-1 bytes, read until end of file
 * @param out output stream that receives the UTF-8 text
 */
static void
decodeFile(FILE *in, FILE *out) {
    uint8_t buffer[1024], bytes[4];
    Bocu1Rx rx={ 0, 0, 0 };
    unsigned long inLength, inCount, outLength;
    unsigned int read, i, j;
    int32_t c;

    inLength=inCount=outLength=0;
    read=0;
    while(!feof(in)) {
        /* read a block */
        read=(unsigned int)fread(buffer, 1, sizeof(buffer), in);
        inLength+=read;

        /* get code points from BOCU-1 and write in UTF-8 */
        for(i=0; i<read; ++i) {
            c=decodeBocu1(&rx, buffer[i]);
            if(c<-1) {
                /* inLength already includes this block: step back to its start */
                fprintf(stderr, "error: illegal BOCU-1 sequence at file byte index %lu\n", inLength-read+i);
                return;
            }
            if(c>=0) {
                j=0;
                UTF8_APPEND_CHAR_SAFE(bytes, j, sizeof(bytes), c);
                if(fwrite(bytes, 1, j, out)!=j) {
                    fprintf(stderr, "error: unable to write UTF-8 output after %lu bytes\n", outLength);
                    return;
                }
                outLength+=j;
                ++inCount;
            }
        }

        /* without this check, a read error would loop forever: feof() stays false */
        if(ferror(in)) {
            fprintf(stderr, "error: unable to read BOCU-1 input after %lu bytes\n", inLength);
            break;
        }
    }

    /* the counters are unsigned long, so they need %lu */
    printf("    input: %lu BOCU-1 bytes %lu code points output: %lu UTF-8 bytes\n",
            inLength, inCount, outLength);
}

/**
 * Main function of the BOCU-1 test and sample code.
 * For usage, run with command line "?" and see the source code above.
 *
 * - no arguments: run testDiff()
 * - "?", "-?", "-h" or "--help": print the usage text to stderr
 * - "encode" <filename>: convert UTF-8 <filename> to BOCU-1 in bocu-1.txt
 * - "decode" <filename>: convert BOCU-1 bocu-1.txt to UTF-8 in <filename>
 * - anything else: treat the first argument as a UTF-8 file for testText()
 *
 * @return 0 normally, 1 if a file cannot be opened
 *         or an output file cannot be closed successfully
 */
extern int
main(int argc, const char *argv[]) {
    if(argc>1) {
        FILE *in, *out;

        if( strcmp(argv[1], "?")==0 || strcmp(argv[1], "-?")==0 ||
            strcmp(argv[1], "-h")==0 || strcmp(argv[1], "--help")==0
        ) {
            fprintf(stderr,
                "usage:\n"
                "    bocu1 (no arguments) -> test basic BOCU-1 implementation\n\n"
                "    bocu1 <filename> -> read UTF-8 <filename>, encode each line in BOCU-1,\n"
                "                        round-trip test, print encoding ratio\n\n"
                "    bocu1 encode <filename> -> read UTF-8 <filename>,\n"
                "                               convert to BOCU-1, write to bocu-1.txt\n\n"
                "    bocu1 decode <filename> -> read BOCU-1 file bocu-1.txt,\n"
                "                               convert to UTF-8, write to <filename>\n\n");
            return 0;
        } else if(argc>2 && strcmp(argv[1], "encode")==0) {
            /* convert a UTF-8 file to BOCU-1 */
            in=fopen(argv[2], "rb");
            if(in==NULL) {
                printf("unable to open UTF-8 input file \"%s\"\n", argv[2]);
                return 1;
            }

            out=fopen("bocu-1.txt", "wb");
            if(out==NULL) {
                printf("unable to open BOCU-1 output file \"bocu-1.txt\"\n");
                /* do not leak the input stream that is already open */
                fclose(in);
                return 1;
            }

            printf("converting \"%s\" to bocu-1.txt\n", argv[2]);
            encodeFile(in, out);
            fclose(in);
            /* fclose() flushes buffered output, so it can fail like a write */
            if(fclose(out)!=0) {
                printf("unable to finish writing BOCU-1 output file \"bocu-1.txt\"\n");
                return 1;
            }
        } else if(argc>2 && strcmp(argv[1], "decode")==0) {
            /* convert a BOCU-1 file to UTF-8 */
            in=fopen("bocu-1.txt", "rb");
            if(in==NULL) {
                printf("unable to open BOCU-1 input file \"bocu-1.txt\"\n");
                return 1;
            }

            out=fopen(argv[2], "wb");
            if(out==NULL) {
                printf("unable to open UTF-8 output file \"%s\"\n", argv[2]);
                /* do not leak the input stream that is already open */
                fclose(in);
                return 1;
            }

            printf("converting \"%s\" from bocu-1.txt\n", argv[2]);
            decodeFile(in, out);
            fclose(in);
            /* fclose() flushes buffered output, so it can fail like a write */
            if(fclose(out)!=0) {
                printf("unable to finish writing UTF-8 output file \"%s\"\n", argv[2]);
                return 1;
            }
        } else /* neither encode nor decode, test BOCU-1 on lines of input file */ {
            in=fopen(argv[1], "rb");
            if(in==NULL) {
                printf("unable to open UTF-8 input file \"%s\"\n", argv[1]);
                return 1;
            }

            printf("test difference encoding with UTF-8 input file\n"
                   "    \"%s\"\n"
                   "    ignore comma: %d\n",
                   argv[1],
                   TEST_IGNORE_COMMA);
            testText(in);
            fclose(in);
        }
    } else /* no arguments, test difference encoding */ {
        testDiff();
    }

    return 0;
}
