package org.unicode.bidi;
/*
* Last Revised: 2016-09-21
*
* Credits:
* Originally written by Asmus Freytag
*
* Updated for Unicode 8.0 by Deepak Jois, with feedback from Ken Whistler
*
* (C) Copyright ASMUS, Inc. 2013, All Rights Reserved
* (C) Copyright Deepak Jois 2016, All Rights Reserved
*
* Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
*/
/*
* Revision info (2016-09-21):
* - Added MAX_PAIRING_DEPTH to support max depth for nested brackets in Unicode 8.0
* - Changes to support updated definitions BD14 and BD15 in Unicode 8.0
* - Changes to support clarifications to rule N0 in Unicode 8.0
*/
/**
* Reference implementation of the BPA algorithm of the Unicode 6.3 Bidi algorithm.
*
*
* This implementation is not optimized for performance. It is intended
* as a reference implementation that closely follows the specification
* of the paired bracket part of the Bidirectional Algorithm in
* The Unicode Standard version 6.3 (and revised in Unicode Standard version 8.0)
*
* The implementation covers definitions BD14-BD16 and rule N0.
*
* Like the BidiReference class which uses the BidiBPAReference class, the implementation is
* designed to decouple the mapping of Unicode properties to characters from the handling
* of the Bidi Paired-bracket Algorithm. Such mappings are to be performed by the caller.
*
* One of the properties, the Bidi_Paired_Bracket requires some pre-processing to translate
* it into the format used here. Instead of being a code-point mapping from a bracket character
* to the other partner of the bracket pair, this implementation accepts any unique identifier
* common to BOTH parts of the pair, and 0 or some unique value for all non-bracket characters.
* The actual values of these unique identifiers are not defined.
*
* The BPA algorithm requires that bracket characters that are canonical equivalents of each
* other must be able to be substituted for each other. Callers can accomplish this by re-using
* the same unique identifier for such equivalent characters.
*
* The resultant values become the pairValues array used in calling the resolvePairedBrackets member.
*
* In implementing BD16, this implementation departs slightly from the "logical" algorithm defined
* in UAX#9. In particular, the stack referenced there supports operations that go beyond a "basic"
* stack. An equivalent implementation based on a linked list is used here.
*
* @author Asmus Freytag
* @author Deepak Jois
*/
import java.util.*;
public class BidiPBAReference {
/*
* BD14. An opening paired bracket is a character whose
* Bidi_Paired_Bracket_Type property value is Open.
*
* BD15. A closing paired bracket is a character whose
* Bidi_Paired_Bracket_Type property value is Close.
*/
// Bidi_Paired_Bracket_Type
public final static byte n = 0;
public final static byte o = 1;
public final static byte c = 2;
byte sos; // direction corresponding to start of sequence
public BidiPBAReference() {
}
// Holds a pair of index values for opening and closing bracket location of
// a bracket pair
// Contains additional methods to allow pairs to be sorted by the location
// of the opening bracket
public static final class BracketPair implements
java.lang.Comparable {
private int ichOpener;
private int ichCloser;
public BracketPair(int ichOpener, int ichCloser) {
super();
this.ichOpener = ichOpener;
this.ichCloser = ichCloser;
}
public boolean equals(Object other) {
if (other instanceof BracketPair) {
BracketPair otherPair = (BracketPair) other;
return this.ichOpener == otherPair.ichOpener
&& this.ichCloser == otherPair.ichCloser;
}
return false;
}
public int compareTo(BracketPair otherPair) {
if (this.ichOpener == otherPair.ichOpener)
return 0;
if (this.ichOpener < otherPair.ichOpener)
return -1;
else
return 1;
}
public String toString() {
return "(" + ichOpener + ", " + ichCloser + ")";
}
public int getOpener() {
return ichOpener;
}
public int getCloser() {
return ichCloser;
}
}
// The following is a restatement of BD 16 using non-algorithmic language.
//
// A bracket pair is a pair of characters consisting of an opening
// paired bracket and a closing paired bracket such that the
// Bidi_Paired_Bracket property value of the former equals the latter,
// subject to the following constraints.
// - both characters of a pair occur in the same isolating run sequence
// - the closing character of a pair follows the opening character
// - any bracket character can belong at most to one pair, the earliest possible one
// - any bracket character not part of a pair is treated like an ordinary character
// - pairs may nest properly, but their spans may not overlap otherwise
// Bracket characters with canonical decompositions are supposed to be treated
// as if they had been normalized, to allow normalized and non-normalized text
// to give the same result. In this implementation that step is pushed out to
// the caller - see definition of the pairValues array.
LinkedList openers; // list of positions for opening brackets
// bracket pair positions sorted by location of opening bracket
private SortedSet pairPositions;
public String getPairPositionsString()
{
SortedSet tempPositions = new TreeSet();
for (BidiPBAReference.BracketPair pair : pairPositions) {
tempPositions.add(new BidiPBAReference.BracketPair(indexes[pair.getOpener()], indexes[pair.getCloser()]));
}
return tempPositions.toString();
}
private byte[] initialCodes; // direction bidi codes initially assigned to the original string
public byte[] codesIsolatedRun; // directional bidi codes for an isolated run
private int[] indexes; // array of index values into the original string
/**
* check whether characters at putative positions could form a bracket pair
* based on the paired bracket character properties
*
* @param pairValues
* - unique ID for the pair (or set) of canonically matched
* brackets
* @param ichOpener
* - position of the opening bracket
* @param ichCloser
* - position of the closing bracket
* @return true if match
*/
private Boolean matchOpener(int[] pairValues, int ichOpener, int ichCloser) {
return pairValues[indexes[ichOpener]] == pairValues[indexes[ichCloser]];
}
/**
* removes any list elements from First to index
*
* @param list
* @param index
*/
private void removeHead(LinkedList list, int index) {
ListIterator iter = list.listIterator(index);
while (iter.hasPrevious()) {
iter.previous();
iter.remove();
}
}
public static final int MAX_PAIRING_DEPTH = 63;
/**
* locate all Paired Bracket characters and determine whether they form
* pairs according to BD16. This implementation uses a linked list instead
* of a stack, because, while elements are added at the front (like a push)
* there are not generally removed in atomic 'pop' operations, reducing the
* benefit of the stack archetype.
*
* @param pairTypes
* - array of paired Bracket types
* @param pairValues
* - array of characters codes such that for all bracket
* characters it contains the same unique value if their
* Bidi_Paired_Bracket properties map between them. For
* brackets hat have canonical decompositions (singleton
* mappings) it contains the same value as for the canonically
* decomposed character. For characters that have paired
* bracket type of "n" the value is ignored.
*/
private void locateBrackets(byte[] pairTypes, int[] pairValues) {
openers = new LinkedList();
pairPositions = new TreeSet();
// traverse the run
// do that explicitly (not in a for-each) so we can record position
for (int ich = 0; ich < indexes.length; ich++) {
// look at the bracket type for each character
if ((pairTypes[indexes[ich]] == n) || (codesIsolatedRun[ich] != BidiReference.ON)) {
continue; // continue scanning
}
switch (pairTypes[indexes[ich]]) {
// opening bracket found, note location
case o:
// check if maximum pairing depth reached
if (openers.size() == MAX_PAIRING_DEPTH) {
openers.clear();
return;
}
// remember opener location, most recent first
openers.addFirst(ich);
break;
// closing bracket found
case c:
// see if there is a match
if (openers.isEmpty())
continue; // no opening bracket defined
ListIterator iter = openers.listIterator();
while (iter.hasNext()) {
Integer opener = iter.next();
if (matchOpener(pairValues, opener, ich)) {
// if the opener matches, add nested pair to the ordered list
pairPositions
.add(new BracketPair(opener, (Integer) ich));
// remove up to and including matched opener
removeHead(openers, iter.nextIndex());
break;
}
}
// if we get here, the closing bracket matched no openers
// and gets ignored
continue;
}
}
}
/*
* Bracket pairs within an isolating run sequence are processed as units so
* that both the opening and the closing paired bracket in a pair resolve to
* the same direction.
*
* N0. Process bracket pairs in an isolating run sequence sequentially in
* the logical order of the text positions of the opening paired brackets
* using the logic given below. Within this scope, bidirectional types EN
* and AN are treated as R.
*
* Identify the bracket pairs in the current isolating run sequence
* according to BD16. For each bracket-pair element in the list of pairs of
* text positions:
*
* a Inspect the bidirectional types of the characters enclosed within the
* bracket pair.
*
* b If any strong type (either L or R) matching the embedding direction is
* found, set the type for both brackets in the pair to match the embedding
* direction.
*
* o [ e ] o -> o e e e o
*
* o [ o e ] -> o e o e e
*
* o [ NI e ] -> o e NI e e
*
* c Otherwise, if a strong type (opposite the embedding direction) is
* found, test for adjacent strong types as follows: 1 First, check
* backwards before the opening paired bracket until the first strong type
* (L, R, or sos) is found. If that first preceding strong type is opposite
* the embedding direction, then set the type for both brackets in the pair
* to that type. 2 Otherwise, set the type for both brackets in the pair to
* the embedding direction.
*
* o [ o ] e -> o o o o e
*
* o [ o NI ] o -> o o o NI o o
*
* e [ o ] o -> e e o e o
*
* e [ o ] e -> e e o e e
*
* e ( o [ o ] NI ) e -> e e o o o o NI e e
*
* d Otherwise, do not set the type for the current bracket pair. Note that
* if the enclosed text contains no strong types the paired brackets will
* both resolve to the same level when resolved individually using rules N1
* and N2.
*
* e ( NI ) o -> e ( NI ) o
*/
/**
* map character's directional code to strong type as required by rule N0
*
* @param ich
* - index into array of directional codes
* @return R or L for strong directional codes, ON for anything else
*/
private byte getStrongTypeN0(int ich) {
switch (codesIsolatedRun[ich]) {
default:
return BidiReference.ON;
// in the scope of N0, number types are treated as R
case BidiReference.EN:
case BidiReference.AN:
case BidiReference.AL:
case BidiReference.R:
return BidiReference.R;
case BidiReference.L:
return BidiReference.L;
}
}
/**
* determine which strong types are contained inside a Bracket Pair
*
* @param pairedLocation
* - a bracket pair
* @param dirEmbed
* - the embedding direction
* @return ON if no strong type found, otherwise return the embedding
* direction, unless the only strong type found is opposite the
* embedding direction, in which case that is returned
*/
byte classifyPairContent(BracketPair pairedLocation, byte dirEmbed) {
byte dirOpposite = BidiReference.ON;
for (int ich = pairedLocation.getOpener() + 1; ich < pairedLocation
.getCloser(); ich++) {
byte dir = getStrongTypeN0(ich);
if (dir == BidiReference.ON)
continue;
if (dir == dirEmbed)
return dir; // type matching embedding direction found
dirOpposite = dir;
}
// return ON if no strong type found, or class opposite to dirEmbed
return dirOpposite;
}
/**
* determine which strong types are present before a Bracket Pair
*
* @param pairedLocation
* - a bracket pair
* @return R or L if strong type found, otherwise ON
*/
byte classBeforePair(BracketPair pairedLocation) {
for (int ich = pairedLocation.getOpener() - 1; ich >= 0; --ich) {
byte dir = getStrongTypeN0(ich);
if (dir != BidiReference.ON)
return dir;
}
// no strong types found, return sos
return sos;
}
/**
* Implement rule N0 for a single bracket pair
*
* @param pairedLocation
* - a bracket pair
* @param dirEmbed
* - the embedding direction
*/
void assignBracketType(BracketPair pairedLocation, byte dirEmbed) {
// rule "N0, a", inspect contents of pair
byte dirPair = classifyPairContent(pairedLocation, dirEmbed);
// dirPair is now L, R, or N (no strong type found)
// the following logical tests are performed out of order compared to
// the statement of the rules but yield the same results
if (dirPair == BidiReference.ON)
return; // case "d" - nothing to do
if (dirPair != dirEmbed) {
// case "c": strong type found, opposite - check before (c.1)
dirPair = classBeforePair(pairedLocation);
if (dirPair == dirEmbed || dirPair == BidiReference.ON) {
// no strong opposite type found before - use embedding (c.2)
dirPair = dirEmbed;
}
}
// else: case "b", strong type found matching embedding,
// no explicit action needed, as dirPair is already set to embedding
// direction
// set the bracket types to the type found
setBracketsToType(pairedLocation, dirPair);
}
private void setBracketsToType(BracketPair pairedLocation, byte dirPair) {
codesIsolatedRun[pairedLocation.getOpener()] = dirPair;
codesIsolatedRun[pairedLocation.getCloser()] = dirPair;
for(int i = pairedLocation.getOpener() + 1; i < pairedLocation.getCloser(); i++) {
int index = indexes[i];
if (initialCodes[index] == BidiReference.NSM) {
codesIsolatedRun[i] = dirPair;
} else {
break;
}
}
for(int i = pairedLocation.getCloser() + 1; i < indexes.length; i++) {
int index = indexes[i];
if (initialCodes[index] == BidiReference.NSM) {
codesIsolatedRun[i] = dirPair;
} else {
break;
}
}
}
// this implements rule N0 for a list of pairs
public void resolveBrackets(byte dirEmbed) {
for (BracketPair pair : pairPositions) {
assignBracketType(pair, dirEmbed);
}
}
/**
* runAlgorithm - runs the paired bracket part of the UBA algorithm
*
* @param indexes
* - indexes into the original string
* @param initialCodes
* - bidi classes (directional codes) initially assigned to each
* character in the original string (prior to any modifications by
* subsequent steps.
* @param codes
* - bidi classes (directional codes) for each character in the
* original string
* @param pairTypes
* - array of paired bracket types for each character in the
* original string
* @param pairValues
* - array of unique integers identifying which pair of brackets
* (or canonically equivalent set) a bracket character
* belongs to. For example in the string "[Test(s)>" the
* characters "(" and ")" would share one value and "[" and ">"
* share another (assuming that "]" and ">" are canonically equivalent).
* Characters that have pairType = n might always get pairValue = 0.
*
* The actual values are no important as long as they are unique,
* so one way to assign them is to use the code position value for
* the closing element of a paired set for both opening and closing
* character - paying attention to first applying canonical decomposition.
* @param sos
* - direction for sos
* @param level
* - the embedding level
*/
public void resolvePairedBrackets(int[] indexes, byte[] initialCodes, byte[] codes, byte[] pairTypes,
int[] pairValues, byte sos, byte level) {
final byte dirEmbed = 1 == (level & 1) ? BidiReference.R
: BidiReference.L;
this.sos = sos;
this.indexes = indexes;
codesIsolatedRun = codes;
this.initialCodes = initialCodes;
locateBrackets(pairTypes, pairValues);
resolveBrackets(dirEmbed);
}
/**
* Entry point for testing the BPA algorithm in isolation. Does not use an indexes
* array for indirection. Actual work is carried out by resolvePairedBrackets.
*
* @param codes
* - bidi classes (directional codes) for each character
* @param pairTypes
* - array of paired bracket type values for each character
* @param pairValues
* - array of unique integers identifying which bracket pair
* see resolvePairedBrackets for details.
* @param sos
* - direction for sos
* @param level
* - the embedding level
*/
public void runAlgorithm(byte[] codes, byte[] pairTypes,
int[] pairValues, byte sos, byte level) {
// dummy up an indexes array that represents an identity mapping
this.indexes = new int[codes.length];
for (int ich = 0; ich < indexes.length; ich++)
indexes[ich] = ich;
resolvePairedBrackets(indexes, codes, codes, pairTypes, pairValues, sos, level);
}
}