classpath
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

java.text.RuleBasedCollator && CollationElementIterator.


From: Guilhem Lavaux
Subject: java.text.RuleBasedCollator && CollationElementIterator.
Date: Sun, 16 May 2004 18:51:54 +0200
User-agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030630

Hi,

Here is a new patch for RuleBasedCollator and CollationElementIterator. It should fix the handling of getOffset()/setOffset(). CollationKeys are now consistently built up to the TERTIARY order. It should also fix how expansion sequences are handled. As always, there may still be bugs and missing features (such as the missing '!' and some uncertainty about the implicit reordering of accented characters).

I'll commit it if nobody is against it.

Cheers,

Guilhem.

ChangeLog entry:
2004-05-16  Guilhem Lavaux <address@hidden>


        * java/text/CollationElementIterator.java
        (nextBlock, previousBlock): Use text_indexes to compute
        textIndex.
        (setOffset): Use text_indexes to choose the right position.
        (text_indexes): New field.
        (setText): Build text_indexes. Better handling of expansion
        ordering.



        * java/text/RuleBasedCollator.java
        (CollationElement, CollationSorter): Made static.
        (last_tertiary_value, SPECIAL_UNKNOWN_SEQ): Introduced special
        collation elements for unknown characters; these are needed for
        sequences using resets.
        (mergeRules): Fixed insertion point.
        (buildCollationVector): Initialize last_tertiary_value.
        (compare): Handle special cases of accented characters.
        (getDefaultAccentedElement): New method.
        (getCollationFixed): Fixed key building.


Index: java/text/CollationElementIterator.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/text/CollationElementIterator.java,v
retrieving revision 1.16
diff -u -r1.16 CollationElementIterator.java
--- java/text/CollationElementIterator.java     23 Apr 2004 16:03:10 -0000      
1.16
+++ java/text/CollationElementIterator.java     16 May 2004 16:43:03 -0000
@@ -92,6 +92,11 @@
   private Object[] text_decomposition;
 
   /**
+   * Array containing the index of the specified block.
+   */
+  private int[] text_indexes;
+
+  /**
    * This method initializes a new instance of 
<code>CollationElementIterator</code>
    * to iterate over the specified <code>String</code> using the rules in the
    * specified <code>RuleBasedCollator</code>.
@@ -112,9 +117,11 @@
       return null;
     
     RuleBasedCollator.CollationElement e =
-      (RuleBasedCollator.CollationElement) text_decomposition[index++];
+      (RuleBasedCollator.CollationElement) text_decomposition[index];
     
-    textIndex += e.key.length();
+    textIndex = text_indexes[index+1];
+
+    index++;
 
     return e;
   }
@@ -128,7 +135,7 @@
     RuleBasedCollator.CollationElement e =
       (RuleBasedCollator.CollationElement) text_decomposition[index];
 
-    textIndex -= e.key.length();
+    textIndex = text_indexes[index+1];
     
     return e;
   }
@@ -231,6 +238,9 @@
   public void setText(String text)
   {
     int idx = 0;
+    int idx_idx = 0;
+    int alreadyExpanded = 0;
+    int idxToMove = 0;
 
     this.text = text;
     this.index = 0;
@@ -238,6 +248,8 @@
     String work_text = text.intern();
 
     Vector v = new Vector();
+    Vector vi = new Vector();
+
     // Build element collection ordered as they come in "text".
     while (idx < work_text.length())
       {
@@ -254,6 +266,16 @@
              key_old = key;
            key = work_text.substring (idx, idx+p);
            object = collator.prefix_tree.get (key);
+           if (object != null && idx < alreadyExpanded)
+             {
+               RuleBasedCollator.CollationElement prefix = 
(RuleBasedCollator.CollationElement)object;
+               if (prefix.expansion != null && 
+                   prefix.expansion.startsWith(work_text.substring(0, idx)))
+               {
+                 object = null;
+                 key = key_old;
+               }
+             }
            p++;
          }
        while (idx+p <= work_text.length());
@@ -263,33 +285,107 @@
        
        RuleBasedCollator.CollationElement prefix =
          (RuleBasedCollator.CollationElement) collator.prefix_tree.get (key);
-       
+
+       /*
+        * First case: There is no such sequence in the database.
+        * We will have to build one from the context.
+        */
        if (prefix == null)
          {
-           RuleBasedCollator.CollationElement e =
-             collator.getDefaultElement(work_text.charAt (idx));
-           
-           v.add (e);
-           idx++;
+           /*
+            * We are dealing with sequences in an expansion. They
+            * are treated as accented characters (tertiary order).
+            */
+           if (alreadyExpanded > 0)
+             {
+               RuleBasedCollator.CollationElement e =
+                 collator.getDefaultAccentedElement (work_text.charAt (idx));
+               
+               v.add (e);
+               vi.add (new Integer(idx_idx));
+               idx++;
+               alreadyExpanded--;
+               if (alreadyExpanded == 0)
+                 {
+                   /* There is not any characters left in the expansion set.
+                    * We can increase the pointer in the source string.
+                    */
+                   idx_idx += idxToMove;
+                   idxToMove = 0; 
+                 }
+               else
+                 idx_idx++;
+             }
+           else
+             {
+               /* This is a normal character. */
+               RuleBasedCollator.CollationElement e =
+                 collator.getDefaultElement (work_text.charAt (idx));
+               Integer i_ref = new Integer(idx_idx);
+
+               /* Don't forget to mark it as a special sequence so the
+                * string can be ordered.
+                */
+               v.add (RuleBasedCollator.SPECIAL_UNKNOWN_SEQ);
+               vi.add (i_ref);
+               v.add (e);
+               vi.add (i_ref);
+               idx_idx++;
+               idx++;
+             }
            continue;
          }
-
+ 
+       /*
+        * Second case: Here we have found a matching sequence.
+        * Here we have an expansion string prepend it to the "work text" and
+        * add the corresponding sorting element. We must also mark 
+        */
        if (prefix.expansion != null)
          {
            work_text = prefix.expansion
              + work_text.substring (idx+prefix.key.length());
            idx = 0;
            v.add (prefix);
+           vi.add (new Integer(idx_idx));
+           if (alreadyExpanded == 0)
+             idxToMove = prefix.key.length();
+           alreadyExpanded += prefix.expansion.length()-prefix.key.length();
          }
        else
          {
+           /* Third case: the simplest. We have got the prefix and it
+            * has not to be expanded.
+            */
            if (!prefix.ignore)
-             v.add (prefix);
+             {
+               v.add (prefix);
+               vi.add (new Integer(idx_idx));
+             }
            idx += prefix.key.length();
+           /* If the sequence is in an expansion, we must decrease the
+            * counter.
+            */
+           if (alreadyExpanded > 0)
+             {
+               alreadyExpanded -= prefix.key.length();
+               if (alreadyExpanded == 0)
+                 {
+                   idx_idx += idxToMove;
+                   idxToMove = 0;
+                 }
+             } else
+               idx_idx += prefix.key.length();
          }
       }
     
     text_decomposition = v.toArray();
+    text_indexes = new int[vi.size()+1];
+    for (int i = 0; i < vi.size(); i++) 
+      {
+       text_indexes[i] = ((Integer)vi.elementAt(i)).intValue();
+      }
+    text_indexes[vi.size()] = text.length();
   }
 
   /**
@@ -341,22 +437,22 @@
     if (offset < 0)
       throw new IllegalArgumentException("Negative offset: " + offset);
 
-    if ((text.length() > 0) && (offset > 0))
+    if (offset > (text.length() - 1))
       throw new IllegalArgumentException("Offset too large: " + offset);
-    else if (offset > (text.length() - 1))
-      throw new IllegalArgumentException("Offset too large: " + offset);
-
-    textIndex = 0;
-    for (int i=0;i<text_decomposition.length;i++)
-      {
-       RuleBasedCollator.CollationElement e =
-         (RuleBasedCollator.CollationElement) text_decomposition[i];
-       int idx = textIndex + e.key.length();
-       
-       if (idx > offset)
+    
+    for (index = 0; index < text_decomposition.length; index++)
+      {        
+       if (offset <= text_indexes[index])
          break;
-       textIndex = idx;
       }
+    /*
+     * As text_indexes[0] == 0, we should not have to take care whether index 
is
+     * greater than 0. It is always.
+     */
+    if (text_indexes[index] == offset)
+      textIndex = offset;
+    else
+      textIndex = text_indexes[index-1];
   }
 
   /**
Index: java/text/RuleBasedCollator.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/text/RuleBasedCollator.java,v
retrieving revision 1.23
diff -u -r1.23 RuleBasedCollator.java
--- java/text/RuleBasedCollator.java    23 Apr 2004 17:37:46 -0000      1.23
+++ java/text/RuleBasedCollator.java    16 May 2004 16:43:03 -0000
@@ -147,7 +147,7 @@
    * This class describes what rank has a character (or a sequence of 
characters) 
    * in the lexicographic order. Each element in a rule has a collation 
element.
    */
-  final class CollationElement
+  final static class CollationElement
   {
     String key;
     int primary;
@@ -189,7 +189,7 @@
    * address@hidden 
#mergeRules(int,java.lang.String,java.util.Vector,java.util.Vector)})
    * as a temporary state while merging two sets of instructions.
    */
-  final class CollationSorter
+  final static class CollationSorter
   {
     static final int GREATERP = 0;
     static final int GREATERS = 1;
@@ -230,10 +230,27 @@
   private int last_primary_value;
 
   /**
+   * This is the value of the last secondary sequence of the
+   * primary 0, entered into
+   * <code>ce_table</code>. It is used to compute the
+   * ordering value of an unspecified accented character.
+   */
+  private int last_tertiary_value;
+
+  /**
    * This variable is true if accents need to be sorted
    * in the other direction.
    */
   private boolean inverseAccentComparison;
+
+  /**
+   * This collation element is special to unknown sequence.
+   * The JDK uses it to mark and sort the characters which has
+   * no collation rules.
+   */
+  static final CollationElement SPECIAL_UNKNOWN_SEQ = 
+    new CollationElement("", (short) 32767, (short) 0, (short) 0,
+                        (short) 0, null);
   
   /**
    * This method initializes a new instance of <code>RuleBasedCollator</code>
@@ -356,14 +373,14 @@
          (CollationSorter) main.elementAt(insertion_point-1);
        
        sorter.expansionOrdering = starter.substring(max_length); // Skip the 
first good prefix element
-       
+               
        main.insertElementAt(sorter, insertion_point);
        
        /*
         * This is a new set of rules. Append to the list.
         */
        patch.removeElementAt(0);
-       insertion_point = main.size();
+       insertion_point++;
       }
 
     // Now insert all elements of patch at the insertion point.
@@ -392,7 +409,7 @@
   {
     boolean ignoreChars = (base_offset == 0);
     int operator = -1;
-    StringBuffer sb = new StringBuffer("");
+    StringBuffer sb = new StringBuffer();
     boolean doubleQuote = false;
     boolean eatingChars = false;
     boolean nextIsModifier = false;
@@ -605,6 +622,7 @@
     throws ParseException
   {
     int primary_seq = 0;
+    int last_tertiary_seq = 0;
     short secondary_seq = 0;
     short tertiary_seq = 0;
     short equality_seq = 0;
@@ -652,6 +670,8 @@
            continue element_loop;
          case CollationSorter.GREATERT:
            tertiary_seq++;
+           if (primary_seq == 0)
+             last_tertiary_seq = tertiary_seq;
            equality_seq = 0;
            break;
          case CollationSorter.IGNORE:
@@ -686,6 +706,7 @@
     ce_table = v.toArray();
 
     last_primary_value = primary_seq+1;
+    last_tertiary_value = last_tertiary_seq+1;
   }
 
   /**
@@ -757,6 +778,17 @@
         // Check for primary strength differences
         int prim1 = CollationElementIterator.primaryOrder(ord1); 
         int prim2 = CollationElementIterator.primaryOrder(ord2); 
+       
+       if (prim1 == 0 && getStrength() < TERTIARY)
+         {
+           ct.previousBlock();
+           continue;
+         }
+       else if (prim2 == 0 && getStrength() < TERTIARY)
+         {
+           cs.previousBlock();
+           continue;
+         }
 
         if (prim1 < prim2)
           return -1;
@@ -769,7 +801,7 @@
         int sec1 = CollationElementIterator.secondaryOrder(ord1);
         int sec2 = CollationElementIterator.secondaryOrder(ord2);
 
-        if (sec1 < sec2)
+       if (sec1 < sec2)
           return -1;
         else if (sec1 > sec2)
           return 1;
@@ -833,6 +865,28 @@
   }
 
   /**
+   * This method builds a default collation element for an accented character
+   * without invoking the database created from the rules passed to the 
constructor.
+   *
+   * @param c Character which needs a collation element.
+   * @return A valid brand new CollationElement instance.
+   */
+  CollationElement getDefaultAccentedElement(char c)
+  {
+    int v;
+
+    // Preliminary support for generic accent sorting inversion (I don't know 
if all
+    // characters in the range should be sorted backward). This is the place
+    // to fix this if needed.
+    if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
+      v = 0x0361 - ((int) c - 0x02B9);
+    else
+      v = (short) c;
+    return new CollationElement("" + c, (short) 0,
+                               (short) 0, (short) (last_tertiary_value + v), 
(short) 0, null);
+  }
+
+  /**
    * This method returns an instance for <code>CollationElementIterator</code>
    * for the specified <code>String</code> under the collation rules for this
    * object.
@@ -894,11 +948,12 @@
         switch (getStrength())
           {
             case PRIMARY:
-               ord = CollationElementIterator.primaryOrder(ord);
-               break;
-
+             ord = CollationElementIterator.primaryOrder(ord);
+             break;
+             
             case SECONDARY:
-               ord = CollationElementIterator.secondaryOrder(ord);
+             ord = CollationElementIterator.primaryOrder(ord) << 8;
+             ord |= CollationElementIterator.secondaryOrder(ord);
 
             default:
                break;

reply via email to

[Prev in Thread] Current Thread [Next in Thread]