/*
 * This code is provided AS IS. No guarantees or
 * warranties, express or implied. Don't come looking
 * for me if it doesn't do what you expect it to!
 *
 * Straightforward C -- but assumes use of a 32-bit int. So
 * check your compiler!
 *
 * Parse UNIDATA2.TXT file (semicolon-separated fields, one
 * character per line; lines starting with ';' are comments).
 *
 * Extract fields:
 *   0 : Codepoint in 4-digit hex
 *   1 : Unicode 2.0 Character Name
 *   5 : Character decomposition
 *
 * Outputs:
 *
 * UNIDLST.TXT
 *
 *   This is a list of the data, in input order, with the
 *   following format:
 *
 *     0041    ***               LATIN CAPITAL LETTER A
 *     00BE    0033 2044 0034    VULGAR FRACTION THREE QUARTERS
 *     00C0    0041 0300         LATIN CAPITAL LETTER A WITH GRAVE
 *
 *   etc. The fields are tab-delimited. Unicode values which have
 *   no decomposition are indicated with "***" to make it easy to
 *   filter those values out with grep or other utilities.
 *
 * UNIDCMP.TXT
 *
 *   This is a formatted listing, in Unicode numeric order,
 *   of just those characters with decompositions, with the
 *   following format:
 *
 *     00BE    VULGAR FRACTION THREE QUARTERS
 *         0033 DIGIT THREE
 *         2044 FRACTION SLASH
 *         0034 DIGIT FOUR
 *
 *     00C0    LATIN CAPITAL LETTER A WITH GRAVE
 *         0041 LATIN CAPITAL LETTER A
 *         0300 COMBINING GRAVE ACCENT
 *
 *   etc. Full names are provided for the character elements of
 *   the decompositions, to make it easy to check their values and
 *   validity.
 *
 * Only 16-bit code points (0000..FFFF) are supported; anything
 * else is reported as an error instead of being stored.
 *
 * Define UNIDATA_NO_MAIN to compile the parsing routines without
 * the program entry point (e.g. to link them into a test driver).
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Needed for isspace(). */
#include <ctype.h>

typedef unsigned int   UInt32;   /* assumes 32-bit compiler */
typedef unsigned short UShort16;
typedef unsigned char  UChar8;

char infile[]   = "UNIDATA2.TXT";
char outfile[]  = "UNIDLST.TXT";
char outfile2[] = "UNIDCMP.TXT";

typedef struct {
    UShort16 haveData;   /* non-zero once a data line was seen for this slot */
    char     value[6];   /* code point as hex string (at most 5 chars + NUL) */
    char    *name;       /* heap copy of field 1, or NULL if it was empty    */
    char    *decomp;     /* heap copy of field 5, or NULL if it was empty    */
} WALNUTS;

typedef WALNUTS *WALNUTPTR;

/* Counters updated by parseToken(); reported by main(). */
int badValues;
int badChars;
int badNames;

#define NullToken   (0)   /* no token found (end of decomposition)      */
#define CharToken   (1)   /* token is a hex code point                  */
#define FormatToken (2)   /* token starts with '<' (formatting tag)     */

#define BagSize      (0x10000)   /* one slot per 16-bit code point       */
#define CodeFieldMax (20)        /* buffer size for field 0              */
#define TextFieldMax (128)       /* buffer size for fields 1 and 5       */
/* tab + token + " " + name + NUL, with some slack for the !!BAD!! tags */
#define TokenLineMax (2 * TextFieldMax + 16)

WALNUTS bagof[BagSize];

/* Value of one hex digit character, or -1 if c is not a hex digit. */
static int hexDigitValue ( int c )
{
    if ( c >= '0' && c <= '9' ) return c - '0';
    if ( c >= 'A' && c <= 'F' ) return c - 'A' + 10;
    if ( c >= 'a' && c <= 'f' ) return c - 'a' + 10;
    return -1;
}

/*
 * Convert an "ASCII" representation of sb/mb int to real int.
 *
 * i.e. "c2ba" to 0xC2BA
 *
 * source must consist of whole bytes (an even number of hex digits,
 * upper or lower case) and fit in 32 bits (at most 8 digits).
 * An empty string converts to 0.
 *
 * Returns 0 and stores the value in *dest on success.
 * Returns -1 and leaves *dest untouched if the length is odd, the
 * string is too long, or any character is not a hex digit.
 */
int convertToInt ( UInt32 *dest, char *source )
{
    size_t slen;
    size_t i;
    UInt32 result;

    slen = strlen ( source );
    if ( ( slen % 2 ) != 0 || slen > 8 )
        return -1;

    result = 0;
    for ( i = 0; i < slen; i++ ) {
        int digit = hexDigitValue ( (unsigned char)source[i] );

        if ( digit < 0 )
            return -1;
        result = ( result << 4 ) | (UInt32)digit;
    }

    *dest = result;
    return 0;
}

/*
 * Copy the next space-delimited token of a decomposition string into
 * dp, prefixed with a tab, and annotate it.
 *
 *   sp     : current position in the decomposition string
 *   dp     : output buffer; must hold the tab, the token, one
 *            annotation (a character name or a !!BAD ...!! tag) and
 *            the terminator -- TokenLineMax bytes is enough for
 *            anything stored by processData()
 *   tokenT : receives NullToken, CharToken or FormatToken
 *
 * A token starting with '<' is a FormatToken and is copied as is.
 * Any other token is a CharToken: it is looked up in bagof[] and the
 * character name is appended after a space. Tokens that are not valid
 * hex or do not fit the 16-bit table get " !!BAD VALUE!!"; code points
 * with no data get " IDEOGRAPH" (4E00..9FA5) or " !!BAD CHAR!!"; code
 * points without a name get " !!BAD NAME!!". The matching bad*
 * counter is incremented.
 *
 * Returns the position of the next token (trailing spaces skipped).
 */
char *parseToken ( char *sp, char *dp, int *tokenT )
{
    int    kind = NullToken;
    char  *out = dp;
    char  *hex;
    UInt32 code = 0;

    /* Prepend a tab */
    *out++ = '\t';
    hex = out;

    if ( *sp != ' ' && *sp != '\0' )
        kind = ( *sp == '<' ) ? FormatToken : CharToken;

    while ( *sp != ' ' && *sp != '\0' )
        *out++ = *sp++;
    *out = '\0';

    /* out now points at the terminator, so strcpy() appends. */
    if ( kind == CharToken ) {
        if ( convertToInt ( &code, hex ) < 0 || code >= BagSize ) {
            /* Unparseable, or outside the table: never index bagof[]. */
            strcpy ( out, " !!BAD VALUE!!" );
            badValues++;
        } else if ( bagof[code].haveData == 0 ) {
            if ( code >= 0x4E00 && code <= 0x9FA5 ) {
                strcpy ( out, " IDEOGRAPH" );
            } else {
                strcpy ( out, " !!BAD CHAR!!" );
                badChars++;
            }
        } else if ( bagof[code].name == NULL ) {
            strcpy ( out, " !!BAD NAME!!" );
            badNames++;
        } else {
            *out++ = ' ';
            strcpy ( out, bagof[code].name );
        }
    }

    while ( *sp == ' ' )
        sp++;

    *tokenT = kind;
    return ( sp );
}

/*
 * Write one UNIDCMP.TXT entry for t1 to fd: a "value<TAB>name" line,
 * one annotated line per decomposition token, then a blank line.
 * A missing name is written as an empty string; a missing
 * decomposition produces no token lines. Always returns 0.
 */
int processDecomps ( WALNUTPTR t1, FILE *fd )
{
    char  line[TokenLineMax];
    char *sp;
    int   tokenType;

    fprintf ( fd, "%s\t%s\n", t1->value,
              ( t1->name != NULL ) ? t1->name : "" );

    sp = t1->decomp;
    while ( sp != NULL ) {
        char *next = parseToken ( sp, line, &tokenType );

        if ( tokenType == NullToken )
            break;
        fputs ( line, fd );
        fputs ( "\n", fd );
        sp = next;
    }

    fputs ( "\n", fd );
    return ( 0 );
}

/*
 * Copy one ';'-terminated field from src into dst (dstSize bytes,
 * always NUL-terminated). Characters that do not fit are dropped, but
 * the scan still advances to the delimiter so later fields stay
 * aligned. Returns a pointer to the ';' or '\0' that ended the field.
 */
static char *copyField ( char *src, char *dst, size_t dstSize )
{
    size_t used = 0;

    while ( *src != ';' && *src != '\0' ) {
        if ( used + 1 < dstSize )
            dst[used++] = *src;
        src++;
    }
    dst[used] = '\0';
    return src;
}

/* Heap copy of the first len characters of text plus NUL; NULL on failure. */
static char *dupString ( const char *text, size_t len )
{
    char *copy = malloc ( len + 1 );

    if ( copy != NULL ) {
        memcpy ( copy, text, len );
        copy[len] = '\0';
    }
    return copy;
}

/*
 * Process one data line: extract fields 0 (code point), 1 (name) and
 * 5 (decomposition), write the UNIDLST.TXT line for it to fd, and
 * record the entry in bagof[] for the later decomposition pass.
 *
 * Returns 0 on success. Returns -1, without writing or storing
 * anything, if the line has too few fields or the code point is not
 * valid hex, exceeds FFFF, or is too long for WALNUTS.value. Returns
 * -1 after writing the line if memory for the strings runs out.
 */
int processData ( char *buf, FILE *fd )
{
    char     *sp;
    int       i;
    int       rc;
    size_t    tlen;
    size_t    tlen2;
    UInt32    uvalue = 0;
    WALNUTPTR p;
    char      token1[CodeFieldMax];   /* Codepoint in Hex */
    char      token2[TextFieldMax];   /* Unicode 2.0 name */
    char      token3[TextFieldMax];   /* decomposition string */

    /* Span and copy first token (field 0), then span the semicolon */
    sp = copyField ( buf, token1, sizeof token1 );
    if ( *sp == '\0' ) return -1;
    sp++;
    if ( *sp == '\0' ) return -1;

    /* Span and copy second token (field 1), then span the semicolon */
    sp = copyField ( sp, token2, sizeof token2 );
    if ( *sp == '\0' ) return -1;
    sp++;
    if ( *sp == '\0' ) return -1;

    /* Span fields 2, 3, 4 */
    for ( i = 0; i < 3; i++ ) {
        while ( *sp != ';' && *sp != '\0' )
            sp++;
        if ( *sp == '\0' ) return -1;
        sp++;
        if ( *sp == '\0' ) return -1;
    }

    /* Span and copy third token (field 5) */
    copyField ( sp, token3, sizeof token3 );

    tlen  = strlen ( token2 );
    tlen2 = strlen ( token3 );

    rc = convertToInt ( &uvalue, token1 );
    if ( rc < 0 )
        return rc;

    /* The table has one slot per 16-bit code point and value[] is small. */
    if ( uvalue >= BagSize || strlen ( token1 ) >= sizeof bagof[0].value )
        return -1;

    if ( tlen2 > 0 )
        fprintf ( fd, "%s\t%s\t%s\n", token1, token3, token2 );
    else
        fprintf ( fd, "%s\t***\t%s\n", token1, token2 );

    /* Now build up an entry in bagof(WALNUTS) to process decomps later */
    p = &(bagof[uvalue]);
    p->haveData = 1;
    strcpy ( p->value, token1 );

    if ( tlen > 0 ) {
        char *copy = dupString ( token2, tlen );

        if ( copy == NULL ) return -1;
        free ( p->name );      /* repeated code point: drop the old copy */
        p->name = copy;
    }

    if ( tlen2 > 0 ) {
        char *copy = dupString ( token3, tlen2 );

        if ( copy == NULL ) return -1;
        free ( p->decomp );
        p->decomp = copy;
    }

    return ( rc );
}

/* Mark every slot of bagof[] as empty. */
void InitBagOfWalnuts ( void )
{
    WALNUTPTR p;

    for ( p = bagof; p < bagof + BagSize; p++ ) {
        p->haveData = 0;
        p->value[0] = '\0';
        p->name     = NULL;
        p->decomp   = NULL;
    }
}

/*
 * Release the name and decomposition strings of every slot. The
 * pointers are reset to NULL so a second call is harmless.
 */
void FreeBagOfWalnuts ( void )
{
    WALNUTPTR p;

    for ( p = bagof; p < bagof + BagSize; p++ ) {
        free ( p->name );
        p->name = NULL;
        free ( p->decomp );
        p->decomp = NULL;
    }
}

#ifndef UNIDATA_NO_MAIN

/* Non-zero if text is empty or contains only whitespace. */
static int lineIsBlank ( const char *text )
{
    while ( *text != '\0' ) {
        /* <ctype.h> functions need an unsigned char value. */
        if ( !isspace ( (unsigned char)*text ) )
            return 0;
        text++;
    }
    return 1;
}

/*
 * Read UNIDATA2.TXT, write UNIDLST.TXT while filling bagof[], then
 * write UNIDCMP.TXT from the collected decompositions and print the
 * error counters. Returns -1 if a file cannot be opened, else 0.
 */
int main ( void )
{
    FILE *fdi;
    FILE *fdo;
    int   rc = 0;   /* stays 0 when the input has no data lines */
    int   i;
    char  buffer[512];

    /* Text mode is the default; a 't' flag is not part of ISO C. */
    fdi = fopen ( infile, "r" );
    if ( fdi == NULL ) {
        printf ( "Cannot open input.\n" );
        return -1 ;
    }

    fdo = fopen ( outfile, "w" );
    if ( fdo == NULL ) {
        fclose ( fdi ) ;
        printf ( "Cannot open output.\n" );
        return -1 ;
    }

    /* Do the work */
    InitBagOfWalnuts();

    while ( fgets ( buffer, (int)sizeof buffer, fdi ) != NULL ) {
        /* Don't process comments, empty lines or whitespace-only lines. */
        if ( buffer[0] == ';' || lineIsBlank ( buffer ) )
            continue ;

        rc = processData ( buffer, fdo );
        if ( rc != 0 )
            break;
    }

    if ( rc < 0 ) {
        printf ( "Abnormal termination.\n" );
    }

    fclose ( fdi );
    fclose ( fdo );

    /* Now process through the bagof(WALNUTS), looking for
     * decomps and producing a file of explicit decomp names
     */
    badValues = 0;
    badChars  = 0;
    badNames  = 0;

    fdo = fopen ( outfile2, "w" );
    if ( fdo == NULL ) {
        FreeBagOfWalnuts();
        printf ( "Cannot open output2.\n" );
        return -1 ;
    }

    for ( i = 0 ; i < BagSize; i++ ) {
        if ( bagof[i].haveData && bagof[i].decomp != NULL )
            processDecomps ( &(bagof[i]), fdo );
    }

    FreeBagOfWalnuts();
    fclose ( fdo );

    printf ( "Bad values: %d\n", badValues );
    printf ( "Bad chars: %d\n", badChars );
    printf ( "Bad names: %d\n", badNames );

    return ( 0 );
}

#endif /* UNIDATA_NO_MAIN */