/*
 * Format converter for UnicodeData.txt.
 *
 * Changes the current two-line range convention to a single
 * line XXXX..YYYY convention. (Also deletes comment lines
 * or blank lines in the input.)
 *
 * This version also checks the validity of range pairs (to
 * ensure that they have the same names and property values).
 * Use the -n option to just check validity, without actually
 * producing the converted format file.
 *
 * This source code is placed in the public domain.
 * Use at your own risk. It is not warrantied to do anything
 * useful. Feel free to hack away.
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char versionString[] = "UCDReformat version 1.1, 2001-Jan-24\n";

/* Default input and output file names, used when none are given. */
char infile[] = "UnicodeData.txt";
char outfile[] = "UnicodeData2.txt";

/* Effective input (fileName) and output (fileName2) file names. */
char fileName[40];
char fileName2[40];

typedef unsigned char UChar8;
typedef unsigned int UInt32;

/*
 * Size of the line buffer handed to fgets. Every per-line scratch
 * buffer that holds a piece of an input line uses the same size, so a
 * piece of a line can never be larger than the buffer receiving it.
 */
enum { LINE_BUF_SIZE = 512 };

/*
 * Globals
 */

int linesProcessed = 0;    /* number of data lines handed to processData */
int rangeCheckFailed = 0;  /* set to 1 when a range pair fails validation */
int suppressOutput = 0;    /* set to 1 by -n: validate only, write nothing */

UInt32 rangeFirst = 0xFFFFFFFF;  /* code point of the latest "First" line */
UInt32 rangeLast = 0xFFFFFFFF;   /* code point of the latest "Last" line */

/***********************************************************/

/*
 * SECTION: Utility routines.
 */

/*
 * Convert an "ASCII" representation of sb/mb int to real int.
 *
 * Bounded to 0 .. 0xFFFFFFFF.
 *
 * i.e. "c2ba" to 0xC2BA, "8000000A" to 0x8000000A, etc.
 *
 * *dest is written only when the whole string converts.
 *
 * RETURNS:
 *    0 converted o.k.
 *   -1 input string too long to convert or too short to be valid
 *      (fewer than 4 or more than 8 digits).
 *   -2 non-hex digit input.
 */
static int convertHexToInt ( UInt32 *dest, const char *source )
{
    size_t slen;
    size_t i;
    UInt32 value = 0;

    slen = strlen ( source );
    if ( ( slen > 8 ) || ( slen < 4 ) )
    {
        return ( -1 );
    }

    for ( i = 0; i < slen; i++ )
    {
        /* Work on unsigned char so bytes >= 0x80 never go negative. */
        unsigned char ch = (unsigned char) source[i];
        UInt32 digit;

        if ( ( ch >= '0' ) && ( ch <= '9' ) )
        {
            digit = (UInt32)( ch - '0' );
        }
        else if ( ( ch >= 'A' ) && ( ch <= 'F' ) )
        {
            digit = (UInt32)( ch - 'A' + 10 );
        }
        else if ( ( ch >= 'a' ) && ( ch <= 'f' ) )
        {
            digit = (UInt32)( ch - 'a' + 10 );
        }
        else
        {
            return ( -2 );
        }
        value = ( value << 4 ) | digit;
    }

    *dest = value;
    return 0;
}

/***********************************************************/

/*
 * SECTION: Field parsing utility routines.
 */

#define DELIM ';'
/* #define DELIM '\t' */

/*
 * Copy one field from src into dest.
 *
 * A field ends at DELIM, at a newline, or at the end of the string.
 * At most destSize - 1 characters are stored and dest is always
 * NUL-terminated (when destSize > 0); characters of an over-long
 * field that do not fit are skipped, not stored.
 *
 * RETURNS: pointer to the first character after the field's
 * terminating DELIM or newline, or to the terminating NUL when the
 * field ran to the end of the string.
 */
static char *copyField ( char *dest, size_t destSize, char *src )
{
    char *sp = src;
    size_t used = 0;

    while ( ( *sp != DELIM ) && ( *sp != '\0' ) && ( *sp != '\n' ) )
    {
        if ( used + 1 < destSize )
        {
            dest[used++] = *sp;
        }
        sp++;
    }
    if ( destSize > 0 )
    {
        dest[used] = '\0';
    }

    if ( ( *sp == DELIM ) || ( *sp == '\n' ) )
    {
        sp++;
    }
    return sp;
}

/***********************************************************/

/*
 * SECTION: processData -- Process the line and reformat ranges.
 *
 * This can be modified arbitrarily to produce other outputs.
 */

/* Comparison key built from the most recent "First" line. */
static char saveline[LINE_BUF_SIZE];

/*
 * Build the comparison/output key for one half of a range pair:
 * the part of the name before its first comma, then ">;", then the
 * remainder of the line (all fields after the name).
 *
 * RETURNS:
 *    0 key stored in dest.
 *   -1 name has no comma, or the key does not fit in destSize;
 *      dest is not meaningful in that case.
 */
static int buildRangeKey ( char *dest, size_t destSize,
                           const char *name, const char *rest )
{
    const char *comma;
    int written;

    comma = strchr ( name, ',' );
    if ( comma == NULL )
    {
        return ( -1 );
    }

    written = snprintf ( dest, destSize, "%.*s>;%s",
                         (int)( comma - name ), name, rest );
    if ( ( written < 0 ) || ( (size_t) written >= destSize ) )
    {
        return ( -1 );
    }
    return 0;
}

/*
 * Report a range-pair failure: print the message and the offending
 * input line, and record the failure in rangeCheckFailed.
 *
 * The line is written with fputs, never used as a format string, so
 * a '%' in the input cannot be interpreted as a conversion.
 *
 * RETURNS: always -1, so callers can "return reportRangeFailure(...)".
 */
static int reportRangeFailure ( const char *message, const char *line )
{
    fputs ( message, stdout );
    fputs ( line, stdout );
    rangeCheckFailed = 1;
    return ( -1 );
}

/*
 * Process one data line.
 *
 *   - A line whose name field contains "First" is not written; its
 *     code point goes to rangeFirst and its key to saveline.
 *   - A line whose name field contains "Last" must produce the same
 *     key as the saved "First" line; if so, a single
 *     "XXXX..YYYY;<name>;rest" line is written.
 *   - Any other line is written unchanged.
 *
 * Nothing is written to fdo when suppressOutput is set (fdo may then
 * be NULL).
 *
 * RETURNS: 0 on success, -1 on a parse, validation or write error.
 */
static int processData ( char *buf, FILE* fdo )
{
    char *sp;
    int nn;
    UInt32 n;
    char hexField[12];
    char nameField[LINE_BUF_SIZE];
    char rangeKey[LINE_BUF_SIZE];

    /* field 0: Unicode value */

    sp = copyField ( hexField, sizeof hexField, buf );
    nn = convertHexToInt ( &n, hexField );
    if ( nn != 0 )
    {
        if ( nn == -1 )
        {
            printf ( "Error: Hex value too long or too short.\n" );
        }
        else
        {
            printf ( "Error: Invalid hex digit in unidata file.\n" );
        }
        return ( -1 );
    }

    /* field 1: Unicode name */

    sp = copyField ( nameField, sizeof nameField, sp );

    if ( strstr ( nameField, "First" ) != NULL )
    {
        /* First entry of a range pair: remember it for the check
         * that happens when the matching "Last" line arrives. */
        rangeFirst = n;
        if ( buildRangeKey ( saveline, sizeof saveline, nameField, sp ) != 0 )
        {
            return reportRangeFailure ( "Invalid range name.\n", buf );
        }
    }
    else if ( strstr ( nameField, "Last" ) != NULL )
    {
        /* Second entry of a range pair: validate, reformat, output. */
        rangeLast = n;
        if ( buildRangeKey ( rangeKey, sizeof rangeKey, nameField, sp ) != 0 )
        {
            return reportRangeFailure ( "Invalid range name.\n", buf );
        }
        if ( strcmp ( saveline, rangeKey ) != 0 )
        {
            return reportRangeFailure (
                "Validity check failure on range pair.\n", buf );
        }

        /* The pair is consumed; a later "Last" line without its own
         * "First" must not match this stale key. */
        saveline[0] = '\0';

        if ( !suppressOutput )
        {
            /* Written straight to the stream: no fixed-size buffer
             * has to hold prefix plus key. */
            if ( fprintf ( fdo, "%04X..%04X;%s",
                           rangeFirst, rangeLast, rangeKey ) < 0 )
            {
                printf ( "Write error.\n" );
                return ( -1 );
            }
        }
    }
    else if ( !suppressOutput )
    {
        /* all other lines, just dump out */
        if ( fputs ( buf, fdo ) == EOF )
        {
            printf ( "Write error.\n" );
            return ( -1 );
        }
    }

    return ( 0 );
}

/***********************************************************/

/*
 * SECTION: parseUnicodeData
 *
 * Parse the UnicodeData.txt file (or any other file using
 * the same generic format).
 */

/*
 * RETURNS: 1 when line is empty or holds only whitespace, else 0.
 */
static int isBlankLine ( const char *line )
{
    const unsigned char *p = (const unsigned char *) line;

    while ( *p != '\0' )
    {
        if ( !isspace ( *p ) )
        {
            return 0;
        }
        p++;
    }
    return 1;
}

/*
 * Read fileName line by line and pass every line that is neither a
 * '#' comment nor blank to processData. Output goes to fileName2
 * unless suppressOutput is set. Processing stops at the first line
 * for which processData fails.
 *
 * Input is read with fgets into a LINE_BUF_SIZE buffer, so a longer
 * physical line reaches processData in several pieces.
 *
 * RETURNS: 0 on success, -1 on any open, read, parse or write error.
 */
static int parseUnicodeData ( void )
{
    FILE *fdi;
    FILE *fdo = NULL;
    int rc = 0;    /* stays 0 when the input has no data lines */
    char buffer[LINE_BUF_SIZE];

    fdi = fopen ( fileName, "rt" );
    if ( fdi == NULL )
    {
        printf ( "Cannot open input.\n" );
        return -1;
    }

    if ( !suppressOutput )
    {
        fdo = fopen ( fileName2, "wt" );
        if ( fdo == NULL )
        {
            printf ( "Cannot open output.\n" );
            fclose ( fdi );
            return -1;
        }
    }

    /* Do the work */

    while ( fgets ( buffer, sizeof buffer, fdi ) != NULL )
    {
        /* Don't process comments, empty lines or whitespace-only lines. */
        if ( ( buffer[0] == '#' ) || isBlankLine ( buffer ) )
        {
            continue;
        }

        rc = processData ( buffer, fdo );
        linesProcessed++;
        if ( rc == -1 )
        {
            break;
        }
    }

    /* fgets returns NULL for a read error as well as for end of file. */
    if ( ( rc == 0 ) && ferror ( fdi ) )
    {
        printf ( "Read error.\n" );
        rc = -1;
    }

    fclose ( fdi );
    if ( fdo != NULL )
    {
        /* Buffered output is flushed here, so a failure is a write error. */
        if ( ( fclose ( fdo ) == EOF ) && ( rc == 0 ) )
        {
            printf ( "Write error.\n" );
            rc = -1;
        }
    }

    if ( rc < 0 )
    {
        printf ( "Abnormal termination at line %d.\n", linesProcessed );
        return ( rc );
    }

    printf ( "Processed %d lines.\n", linesProcessed );
    return ( 0 );
}

/***********************************************************/

/*
 * SECTION: Command Line Processing.
 */

static void usageMsg( void )
{
    fputs ("Usage: ucdrefmt (-v)(-?)(-n)(-o outfilename) filename\n", stdout );
    fputs (" -v show version.\n", stdout );
    fputs (" -? show this usage message.\n", stdout );
    fputs (" -h show this usage message.\n", stdout );
    fputs (" -n no output (validity check only).\n", stdout );
}

static void versionMsg(void)
{
    fputs ( versionString, stdout );
}

/*
 * Store a file name given on the command line.
 *
 * A name that does not fit is rejected with a message rather than
 * truncated, because a truncated name refers to a different file.
 *
 * RETURNS: 0 on success, -1 when src needs more than destSize bytes.
 */
static int setFileName ( char *dest, size_t destSize, const char *src )
{
    if ( strlen ( src ) >= destSize )
    {
        printf ( "Error: File name too long (limit %d characters): %s\n",
                 (int)( destSize - 1 ), src );
        return ( -1 );
    }
    strcpy ( dest, src );
    return 0;
}

/*
 * Parse the command line into fileName, fileName2 and suppressOutput.
 *
 * Only the character directly after '-' selects an option. An
 * argument that does not start with '-' is the input file name; the
 * argument following -o is the output file name. Missing names
 * default to infile and outfile.
 *
 * RETURNS:
 *    1 arguments accepted; go on and process the file.
 *    0 version or usage was printed on request; nothing more to do.
 *   -1 invalid arguments (a message has been printed).
 */
static int processArguments( int argc, char *argv[] )
{
    int remaining = argc;
    int foundFile = 0;
    int foundOutFile = 0;

    while ( remaining > 1 )
    {
        const char *arg = *++argv;

        remaining--;

        if ( arg[0] != '-' )
        {
            if ( setFileName ( fileName, sizeof fileName, arg ) != 0 )
            {
                return ( -1 );
            }
            foundFile = 1;
            continue;
        }

        switch ( arg[1] )
        {
        case 'o' :
            if ( remaining <= 1 )
            {
                /* -o was the last argument: no file name follows. */
                usageMsg();
                return ( -1 );
            }
            if ( setFileName ( fileName2, sizeof fileName2, *++argv ) != 0 )
            {
                return ( -1 );
            }
            remaining--;
            foundOutFile = 1;
            break;
        case 'v' :
            versionMsg();
            return 0;
        case '?' :
        case 'h' :
            usageMsg();
            return 0;
        case 'n' :
            suppressOutput = 1;
            break;
        default:
            usageMsg();
            return -1;
        }
    }

    /*
     * Default input file name to UnicodeData.txt.
     */
    if ( foundFile == 0 )
    {
        strcpy ( fileName, infile );
    }
    if ( foundOutFile == 0 )
    {
        strcpy ( fileName2, outfile );
    }
    return 1;
}

/***********************************************************/

/*
 * Program entry point.
 *
 * RETURNS: EXIT_SUCCESS when the file was processed and all range
 * pairs checked out, or when only version/usage output was requested;
 * EXIT_FAILURE on invalid arguments or any processing error.
 */
int main ( int argc, char *argv[] )
{
    int rc;

    rc = processArguments ( argc, argv );
    if ( rc != 1 )
    {
        return ( rc == 0 ) ? EXIT_SUCCESS : EXIT_FAILURE;
    }

    if ( suppressOutput )
    {
        printf ( "Parsing %s for Range Pair Integrity\n", fileName );
    }
    else
    {
        printf ( "Parsing %s, outputting to %s.\n", fileName, fileName2 );
    }

    rc = parseUnicodeData ();

    if ( !rangeCheckFailed && ( rc == 0 ) )
    {
        printf ( "Range Pair Integrity checks o.k.\n" );
        return EXIT_SUCCESS;
    }

    return EXIT_FAILURE;
}