emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: regex and case-fold-search problem


From: Stefan Monnier
Subject: Re: regex and case-fold-search problem
Date: Fri, 23 Aug 2002 17:52:37 -0400

"Stefan Monnier" <monnier+gnu/address@hidden> wrote:
> For ASCII it's pretty easy to fix.  But for other charsets, it's
> indeed more tricky.  Maybe we can simply use the smallest contiguous
> range of chars that includes all the chars we should match,
> so the behavior is indeed "implementation-defined" (in the sense
> that it's not necessarily obvious to the user what happens) but
> it's at least less confusing (in the sense that (case-fold-search t)
> matches at least as much as (case-fold-search nil)).

How about the patch below?


        Stefan


Index: regex.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/regex.c,v
retrieving revision 1.176
diff -u -u -b -r1.176 regex.c
--- regex.c     25 Mar 2002 00:45:48 -0000      1.176
+++ regex.c     23 Aug 2002 21:49:10 -0000
@@ -1914,12 +1914,13 @@
 #define BIT_UPPER      0x10
 #define BIT_MULTIBYTE  0x20
 
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)   \
+/* Set a range START..END to WORK_AREA.
+   The range is passed through TRANSLATE, so START and END
+   should be untranslated.  */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end)       \
   do {                                                                 \
     EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);                     \
-    (work_area).table[(work_area).used++] = (range_start);             \
-    (work_area).table[(work_area).used++] = (range_end);               \
+    set_image_of_range (&work_area, start, end, translate);    \
   } while (0)
 
 /* Free allocated memory for WORK_AREA.         */
@@ -2077,6 +2078,31 @@
 }
 #endif
 
+
+
+/* We need to find the image of the range start..end when passed through
+   TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
+   and is not even necessarily contiguous.
+   We approximate it with the smallest contiguous range that contains
+   all the chars we need.  */
+static void
+set_image_of_range (work_area, start, end, translate)
+     RE_TRANSLATE_TYPE translate;
+     struct range_table_work_area *work_area;
+     re_wchar_t start, end;
+{
+  re_wchar_t cmin = TRANSLATE (start), cmax = TRANSLATE (end);
+  if (RE_TRANSLATE_P (translate))
+    for (; start <= end; start++)
+      {
+       re_wchar_t c = TRANSLATE (start);
+       cmin = MIN (cmin, c);
+       cmax = MAX (cmax, c);
+      }
+  work_area->table[work_area->used++] = (cmin);
+  work_area->table[work_area->used++] = (cmax);
+}
+
 /* Explicit quit checking is only used on NTemacs.  */
 #if defined WINDOWSNT && defined emacs && defined QUIT
 extern int immediate_quit;
@@ -2525,14 +2551,18 @@
 
                if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
-               PATFETCH (c);
+               /* Don't translate yet.  The range TRANSLATE(X..Y) cannot
+                  always be determined from TRANSLATE(X) and TRANSLATE(Y).
+                  So the translation is done later in a loop.  Example:
+                  (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
+               PATFETCH_RAW (c);
 
                /* \ might escape characters inside [...] and [^...].  */
                if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
                  {
                    if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);
 
-                   PATFETCH (c);
+                   PATFETCH_RAW (c);
                    escaped_char = true;
                  }
                else
@@ -2636,10 +2668,10 @@
                  {
 
                    /* Discard the `-'. */
-                   PATFETCH (c1);
+                   PATFETCH_RAW (c1);
 
                    /* Fetch the character which ends the range. */
-                   PATFETCH (c1);
+                   PATFETCH_RAW (c1);
 
                    if (SINGLE_BYTE_CHAR_P (c))
                      {
@@ -2653,7 +2685,7 @@
                               starting at the smallest character in
                               the charset of C1 and ending at C1.  */
                            int charset = CHAR_CHARSET (c1);
-                           int c2 = MAKE_CHAR (charset, 0, 0);
+                           re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
                            
                            SET_RANGE_TABLE_WORK_AREA (range_table_work,
                                                       c2, c1);
@@ -2672,7 +2704,7 @@
                  /* ... into bitmap.  */
                  {
                    re_wchar_t this_char;
-                   int range_start = c, range_end = c1;
+                   re_wchar_t range_start = c, range_end = c1;
 
                    /* If the start is after the end, the range is empty.  */
                    if (range_start > range_end)





reply via email to

[Prev in Thread] Current Thread [Next in Thread]