#! /usr/bin/python
# -*- coding: utf-8 -*-
#
#   Update the pair table in Unicode UAX-14 (Line Break) based on the rules
#
#   Usage
#      python table-update.py  tr14-in.html  [tr14-old.html]  > tr14-new.html
#
#   tr14-in.html is read, the existing pair table within it is replaced by a new
#   one computed from the line break rules within TR-14 itself, and the result
#   (a complete, updated TR-14.html) is written to stdout.
#
#   If a tr14-old.html is specified, any table cells that differ in the new
#   one will include 'class="changed"', the standard Unicode report mechanism
#   for tagging changes.
#

import sys
import os
import re
try:
    import StringIO
except ImportError:
    # Python 3 has no StringIO module; io provides a StringIO class under the
    # same attribute name, so existing "StringIO.StringIO()" calls keep working.
    import io as StringIO


#   Parse the LB classes that can appear on the left or right side of a rule
#   Examples:
#       SY
#       ( BK | CR | LF | NL )
#       [^SP BA HY]
#       QU SP*
#   Quick and dirty job:
#      Ignore ( and [ and |
#      Empty string (or the literal "ALL") means all character classes.
#      Any occurrence of '^' inverts the set.
#      Ignore 'SP*', handled at a higher level if it occurs.
#      "sot" and "eot" are empty sets.
#
#   input:    the text of one side of a rule.
#   returns:  a new set of class-name strings; the caller may modify it freely.
#
#   Reads the module global char_classes_set, so it must be set before this
#   function is called.  A class name that is not in char_classes_set is reported
#   on stderr and still added to a non-inverted result (as before).
def GetClasses(input):
    input = input.strip()
    if input == "" or input == "ALL":
        return set(char_classes_set)

    result = set()
    # Raw string: a "\*" escape in an ordinary string literal is rejected by
    # newer Pythons, and '*' needs no escaping inside a character class anyway.
    for m in re.finditer(r"(?i)[A-Z0-9*]+", input):
        char_class = m.group()
        if char_class in ("SP*", "sot", "eot"):
            continue
        if char_class not in char_classes_set:
            sys.stderr.write("Unrecognized char class: " + char_class + "\n")
        result.add(char_class)

    if "^" in input:
        # Set difference rather than symmetric difference: with '^=' an
        # unrecognized name listed in "[^...]" would end up *inside* the
        # inverted set, which is the opposite of what the rule says.
        result = char_classes_set - result
    return result


#
#   Classes to put into the table for the UAX.
#      (class name, sample character, description)
#      These are lifted from the old generated table, with the intent of
#      minimizing unnecessary changes.
#
# Each entry is (class name, sample character, description); the rows and
# columns of the generated pair table follow this order.
#
# The CL sample used to read "U+0029 RIGHT CURLY BRACKET".  U+0029 is the
# RIGHT PARENTHESIS already used for CP on the next line; the curly bracket
# is U+007D, so the code point was corrected to match the name.
#
# NOTE(review): the CM sample pairs U+0302 with the name COMBINING ACUTE ACCENT.
# In the Unicode names list U+0302 is COMBINING CIRCUMFLEX ACCENT and the acute
# accent is U+0301.  It is not clear which half was intended, so the entry is
# left unchanged -- please confirm.
TableClasses = (
    ("OP", "U+0028 LEFT PARENTHESIS", "OpenPunctuation"),
    ("CL", "U+007D RIGHT CURLY BRACKET", "ClosePunctuation"),
    ("CP", "U+0029 RIGHT PARENTHESIS", "CloseParenthesis"),
    ("QU", "U+0022 QUOTATION MARK", "Quotation"),
    ("GL", "U+00A0 NO-BREAK SPACE", "Glue"),
    ("NS", "U+30A1 KATAKANA LETTER SMALL A", "Nonstarter"),
    ("EX", "U+0021 EXCLAMATION MARK", "Exclamation"),
    ("SY", "U+002F SOLIDUS", "BreakSymbols"),
    ("IS", "U+002C COMMA", "InfixNumeric"),
    ("PR", "U+0024 DOLLAR SIGN", "PrefixNumeric"),
    ("PO", "U+0025 PERCENT SIGN", "PostfixNumeric"),
    ("NU", "U+0030 DIGIT ZERO", "Numeric"),
    ("AL", "U+0023 NUMBER SIGN", "Alphabetic"),
    ("HL", "U+05D0 HEBREW LETTER ALEF", "HebrewLetter"),
    ("ID", "U+2E80 CJK RADICAL REPEAT", "Ideographic"),
    ("IN", "U+2024 ONE DOT LEADER", "Inseparable"),
    ("HY", "U+002D HYPHEN-MINUS", "Hyphen"),
    ("BA", "U+2010 HYPHEN", "BreakAfter"),
    ("BB", "U+00B4 ACUTE ACCENT", "BreakBefore"),
    ("B2", "U+2014 EM DASH", "BreakBoth"),
    ("ZW", "U+200B ZERO WIDTH SPACE", "ZWSpace"),
    ("CM", "U+0302 COMBINING ACUTE ACCENT", "CombiningMark"),
    ("WJ", "U+2060 WORD JOINER", "WordJoiner"),
    ("H2", "U+AC00 HANGUL SYLLABLE GA", "H2"),
    ("H3", "U+AC01 HANGUL SYLLABLE GAG", "H3"),
    ("JL", "U+1100 HANGUL CHOSEONG KIYEOK", "JL"),
    ("JV", "U+1161 HANGUL JUNGSEONG A", "JV"),
    ("JT", "U+11A8 HANGUL JONGSEONG KIYEOK", "JT"),
)


#
#  When outputting the table, create an html title for a cell, the
#  flyover attribute that displays the rules that contributed to the cell.
#    rulelist is a sequence of the rules.  It may contain duplicates.  It may
#    contain null (None) elements that need to be ignored.
#  Extract the names of the rules and concatenate them.
#
#  Returns a string of "name: text" items, one per distinct rule, in first-seen
#  order and separated by "; ".  Returns "" when no rule contributed.
#
def TitleJoin(rulelist):
    parts = []
    rulesdone = set()
    for rule in rulelist:
        # Skip the None placeholders and anything already emitted.
        if rule and rule not in rulesdone:
            rulesdone.add(rule)
            parts.append("%s: %s" % (rule.name, rule.text))
    return "; ".join(parts)


#  cell_value:  output, the visible cell contents.
#  cell_title:  output, the flyover value, a concatenation of the rules that contribute
#
#  Work out the pair-table entry for one (row_class, column_class) cell.
#
#    rules:         the line break rules, in the order they are to be applied.
#                   Each needs .left_classes, .right_classes, .op, .space_star,
#                   and (for the title) .name and .text.
#    row_class:     class of the character before the potential break.
#    column_class:  class of the character after the potential break.
#    cell_value,
#    cell_title:    kept for compatibility with the original signature; any
#                   value passed in is ignored.
#
#  Returns (cell_value, cell_title).  Assigning to a parameter does not pass
#  anything back to the caller, so previously both results were computed and
#  then thrown away (the function returned None).
#
#  NOTE(review): the nesting below was reconstructed from source whose
#  indentation had been lost.  It mirrors the statement sequence of the inline
#  loop in WriteTable; worth a check against a pristine copy.
def EvaluateTableCell(rules, row_class, column_class, cell_value=None, cell_title=None):
    # Remember which rules contributed to a table cell.
    # The rules will eventually be emitted as the title (fly-over) attribute for the cell.
    prohibit_rule = None       # row SP* x col
    indirect_rule = None       # row x col
    indirect_sp_rule = None    # SP ÷ col
    direct_rule = None         # row ÷ col

    cell_value = ""            # the symbol to put in the table, one of [^ % @ # _]

    # The CM (Combining Character) class needs special handling.
    # They are treated like AL when they can't combine, and it is that
    # behavior that is reflected in the table.
    # But the UAX rules don't express that directly, so we need to fake it.
    column_class_x = 'AL' if column_class == 'CM' else column_class
    row_class_x = 'AL' if row_class == 'CM' else row_class

    # Run through the rules, in order, to see which apply to this table cell.
    # SPACE logic is messy, looking at the behavior of spaces between the pair, checking
    # whether adding a space induces a break where there wasn't one otherwise.
    for rule in rules:
        if ('SP' in rule.left_classes
                and column_class_x in rule.right_classes
                and rule.op == '÷'):
            if not indirect_sp_rule:
                indirect_sp_rule = rule
            if indirect_rule:
                # No break without a space, break with one: indirect.
                cell_value = "%"
                break

        if row_class_x in rule.left_classes and column_class_x in rule.right_classes:
            if rule.op == '÷':
                direct_rule = rule
                if not cell_value:
                    cell_value = '_'
                break
            if rule.op == '×' and rule.space_star:
                prohibit_rule = rule
                cell_value = '^'
                break
            if rule.op == '×' and not rule.space_star:
                if not indirect_rule:
                    indirect_rule = rule
                if indirect_sp_rule:
                    cell_value = "%"
                    break
                if 'SP' in rule.left_classes:
                    # The rule also holds with SP on the left, so an
                    # intervening space cannot introduce a break: prohibited.
                    cell_value = '^'
                    direct_rule = rule
                    break

    if column_class == 'CM':
        # For unknown reasons, the table in the UAX uses different symbols in the CM column,
        # but with the same meaning as the corresponding symbols in other columns.
        # Additionally, the rules do not produce the desired table results!
        # Just hardcode it for now.
        if row_class == 'OP':
            cell_value = '@'
        elif row_class == 'ZW':
            cell_value = '_'
        else:
            cell_value = '#'

    cell_title = TitleJoin((prohibit_rule, indirect_rule, indirect_sp_rule, direct_rule))
    return cell_value, cell_title


#
#  The meanings of the symbols in the pair table, as described in UAX 14:
#
#  ^ denotes a prohibited break: B ^ A is equivalent to B SP* × A; in other words,
#    never break before A and after B, even if one or more spaces intervene.
#
#  % denotes an indirect break opportunity: B % A is equivalent to B × A and B SP+ ÷ A;
#    in other words, do not break before A, unless one or more spaces follow B.
#
#  @ denotes a prohibited break for combining marks: B @ A is equivalent to B SP* × A,
#    where A is of class CM. For more details, see Section 7.5, Combining Marks.
#
#  # denotes an indirect break opportunity for combining marks following a space:
#    B # A is equivalent to (B × A and B SP+ ÷ A), where A is of class CM.
#
#  _ denotes a direct break opportunity (equivalent to ÷ as defined above).
#
#  TODO: WHY does CM get its own symbols? They have the same meaning as the non-CM symbols.
#        We should probably get rid of the distinction for this table.
#
#  TODO: Add consistency checking. The rules can express things that the pair table cannot
#        represent.
Verify that these cases don't arise def WriteTable(f, rules, rules_old): print >> f, '' print >> f, ' ' print >> f, '  ' for char_class, sample_char, description in TableClasses: print >> f, ' %s' \ % (sample_char, char_class, description, char_class, char_class) print >> f, ' ' for row_class, sample_char, description in TableClasses: print >> f, ' ' print >> f, ' %s' % \ (sample_char, description, row_class, row_class) for column_class, sc, dc in TableClasses: # Remember which rules contributed to a table cell. # The rules will eventuall be emitted as the title (fly-over) attribute for the cell. prohibit_rule = None # row SP* x col indirect_rule = None # row x col indirect_sp_rule = None # SP ÷ col direct_rule = None # row ÷ col table_symbol = "" # the symbol to put in the table, one of [^ % @ # _] # The CM (Combining Character) class needs special handling. # They are treated like AL when they can't combine, and it is that # behavior that is reflected in the table. # But the UAX rules don't express that directly, so we need to fake it. column_class_x = column_class row_class_x = row_class if column_class_x == 'CM': column_class_x = 'AL' if row_class_x == 'CM': row_class_x = 'AL' # Run through the rules, in order, to see which apply to this table cell. # SPACE logic is messy, looking at the behavior of spaces between the pair, checking # whether adding a space induces a break where there wasn't one otherwise. for rule in rules: if 'SP' in rule.left_classes and column_class_x in rule.right_classes and rule.op == '÷': if not indirect_sp_rule: indirect_sp_rule = rule; if indirect_rule: table_symbol = "%" break if row_class_x in rule.left_classes and column_class_x in rule.right_classes: if rule.op == '÷': direct_rule = rule if not table_symbol: # table symbol may already be '%' from an earlier rule. 
table_symbol = '_' break if rule.op == '×' and rule.space_star: prohibit_rule = rule table_symbol = '^' break if rule.op == '×' and not rule.space_star: if not indirect_rule: indirect_rule = rule if indirect_sp_rule: table_symbol = "%" break if 'SP' in rule.left_classes: table_symbol = '^' direct_rule = rule; break if column_class == 'CM': # For unknown reasons, the table in the UAX uses different symbols in the CM column, # but with the same meaning as the corresponding symbols in other columns. # Additionally, the rules do not produce the desired table results! # Just hardcode it for now. if row_class == 'OP': table_symbol = '@' elif row_class == 'ZW': table_symbol = '_' else: table_symbol = '#' title = TitleJoin((prohibit_rule, indirect_rule, indirect_sp_rule, direct_rule)) print >> f, ' %s' % (title, table_symbol) print >>f, ' ' print >> f, '', # Note: don't print a final new line. # # Reprsent one line break rule. Has attributes describing the rule. # TODO: list them, make slots? # class Rule: pass def ExtractRules(tr14, rules): # Extract all of the character class names from the UAX 14 original HTML. # Pull them from the HTML anchor for their definition, which looks like # XX # (with possibly a added in. global char_classes_set char_classes = re.findall('(?:)?[A-Z0-9]{2}<', tr14) char_classes_set = set(char_classes) # Extract the rules. A scrape from the HTML of the UAX # TODO: tag the rules in the HTML expicitly, to make this process less fragile. rule_name = "" for m in re.finditer('

(.*?))', tr14): # group # 3, after last rule if rule_name != "" and m.group(3): # Matched the "

> sys.stderr, "Skipping rule " + rule.name + ": " + rule.text; continue; rule.text = re.sub("", "", rule.text) # strip markup rules.append(rule) # # Build up a set of character classes that corresponds to what is allowed on the # left and right side of each of the rules. Attach these to the rule objects. # for rule in rules[:]: # iterate on a copy because we will remove some dud rules. # print rule.name + ": " + rule.text if not re.search("(×|÷|!)", rule.text): print >> sys.stderr, "Skipping rule %s: %s" % (rule.name, rule.text) rules.remove(rule) continue if rule.name == "LB21a": # HL (HY | BA) × # Cannot be represented by the pair table. Skip it. print >> sys.stderr, "Skipping rule %s: %s" % (rule.name, rule.text) rules.remove(rule) continue left_side, op, right_side = re.split("(×|÷|!)", rule.text) # print left_side, op, right_side rule.op = op; rule.left_classes = GetClasses(left_side); rule.right_classes = GetClasses(right_side); rule.space_star = left_side.find("SP*") != -1 # print rule.name, op, rule.left_classes, rule.right_classes def main(args): f = open(args[0]) tr14 = f.read() rules = list() ExtractRules(tr14, rules) if args[1]: print >> sys.stderr, "Second arg exists." f_old = open(args[1]) tr14_old = f.read() rules_old = ExtractRules(tr14_old, rules_old) table = StringIO.StringIO() WriteTable(table, rules, rules_old) newtable = table.getvalue() # print table updatedtr14 = \ re.sub('(?s)\<\!-- Begin Generated Pair Table -->.*?\<\!-- End Generated Pair Table -->', newtable, tr14) print updatedtr14, if __name__ == "__main__": main(sys.argv[1:])