bug-grep
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Rational Range Interpretation patches


From: Aharon Robbins
Subject: Rational Range Interpretation patches
Date: Thu, 01 Dec 2011 22:21:31 +0200
User-agent: Heirloom mailx 12.4 7/29/08

Hi. Here are the patches. I did my best to use git the way y'all want but
if not, please just fix it up...

First patch is for regcomp.c in gnulib.  Paulo, you should be able
to use this in sed.  Please do. :-)

Second is for dfa.c and grep.texi in grep.

Thanks,

Arnold
--------------------------------------------
>From 7c71a3b5b4459187b8eef913598b3fe4d887c7d1 Mon Sep 17 00:00:00 2001
From: Arnold D. Robbins <address@hidden>
Date: Thu, 1 Dec 2011 22:15:17 +0200
Subject: [PATCH] Implement Rational Range Interpretation.

---
 lib/regcomp.c |   13 +++----------
 1 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/lib/regcomp.c b/lib/regcomp.c
index 81c5d4a..fe1e707 100644
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -2654,7 +2654,6 @@ build_range_exp (const reg_syntax_t syntax,
     wchar_t wc;
     wint_t start_wc;
     wint_t end_wc;
-    wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
 
     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
                : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
@@ -2668,11 +2667,7 @@ build_range_exp (const reg_syntax_t syntax,
              ? __btowc (end_ch) : end_elem->opr.wch);
     if (start_wc == WEOF || end_wc == WEOF)
       return REG_ECOLLATE;
-    cmp_buf[0] = start_wc;
-    cmp_buf[4] = end_wc;
-
-    if (BE ((syntax & RE_NO_EMPTY_RANGES)
-            && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
+    else if ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc)
       return REG_ERANGE;
 
     /* Got valid collation sequence values, add them as a new entry.
@@ -2713,10 +2708,8 @@ build_range_exp (const reg_syntax_t syntax,
     /* Build the table for single byte characters.  */
     for (wc = 0; wc < SBC_MAX; ++wc)
       {
-       cmp_buf[2] = wc;
-       if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
-           && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
-         bitset_set (sbcset, wc);
+         if (start_wc <= wc && wc <= end_wc)
+           bitset_set (sbcset, wc);
       }
   }
 # else /* not RE_ENABLE_I18N */
-- 
1.7.1


>From a39f6840e57e5d90a58f975d7a1013b3e27cd54e Mon Sep 17 00:00:00 2001
From: Arnold D. Robbins <address@hidden>
Date: Thu, 1 Dec 2011 22:16:15 +0200
Subject: [PATCH] Implement and document Rational Range Interpretation.

---
 doc/grep.texi |   21 ++++++++++++++++-----
 src/dfa.c     |   28 +++-------------------------
 2 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/doc/grep.texi b/doc/grep.texi
index b1b879a..51d2560 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -896,9 +896,7 @@ for 88-color and 256-color modes background colors.
 @cindex character type
 @cindex national language support
 @cindex NLS
-These variables specify the locale for the @code{LC_COLLATE} category,
-which determines the collating sequence
-used to interpret range expressions like @samp{[a-z]}.
+These variables specify the locale for the @code{LC_COLLATE} category.
 
 @item LC_ALL
 @itemx LC_CTYPE
@@ -1156,7 +1154,12 @@ For example, the regular expression
 Within a bracket expression, a @dfn{range expression} consists of two
 characters separated by a hyphen.
 It matches any single character that
-sorts between the two characters, inclusive, using the locale's
+sorts between the two characters, inclusive,
+using the machine's character set.
+
+Up to and including version 2.10 of @command{grep},
+range expressions would match any single character that sorted between
+the two characters, inclusive, using the current locale's
 collating sequence and character set.
 For example, in the default C
 locale, @samp{[a-d]} is equivalent to @samp{[abcd]}.
@@ -1165,9 +1168,17 @@ characters in dictionary order, and in these locales @samp{[a-d]} is
 typically not equivalent to @samp{[abcd]};
 it might be equivalent to @samp{[aBbCcDd]}, for example.
 To obtain the traditional interpretation
-of bracket expressions, you can use the @samp{C} locale by setting the
+of bracket expressions, it was necessary to use the @samp{C} locale
+by setting the
 @env{LC_ALL} environment variable to the value @samp{C}.
 
+Since the current POSIX standard now makes the behavior of range expressions
+be implementation-defined, instead of requiring the locale's
+collating order, @command{grep} has reverted to the traditional Unix
+behavior of defining ranges based on the machine character set.@footnote{This
+is known as ``Rational Range Interpretation,'' a lovely phrase
+coined by Karl Berry.}
+
 Finally, certain named classes of characters are predefined within
 bracket expressions, as follows.
 Their interpretation depends on the @code{LC_CTYPE} locale;
diff --git a/src/dfa.c b/src/dfa.c
index 26ea4b5..d077d95 100644
--- a/src/dfa.c
+++ b/src/dfa.c
@@ -29,6 +29,7 @@
 #include <limits.h>
 #include <string.h>
 #include <locale.h>
+#include <stdbool.h>
 
 #define STREQ(a, b) (strcmp (a, b) == 0)
 
@@ -56,7 +57,6 @@
 
 #include "regex.h"
 #include "dfa.h"
-#include "hard-locale.h"
 #include "xalloc.h"
 
 /* HPUX, define those as macros in sys/param.h */
@@ -650,7 +650,6 @@ static int laststart;               /* True if we're separated from beginning or (, |
                                    only by zero-width characters. */
 static int parens;             /* Count of outstanding left parens. */
 static int minrep, maxrep;     /* Repeat counts for {m,n}. */
-static int hard_LC_COLLATE;    /* Nonzero if LC_COLLATE is hard.  */
 
 static int cur_mb_len = 1;     /* Length of the multibyte representation of
                                    wctok.  */
@@ -996,26 +995,8 @@ parse_bracket_exp (void)
                   c1 = tolower (c1);
                   c2 = tolower (c2);
                 }
-              if (!hard_LC_COLLATE)
-                for (c = c1; c <= c2; c++)
-                  setbit_case_fold_c (c, ccl);
-              else
-                {
-                  /* Defer to the system regex library about the meaning
-                     of range expressions.  */
-                  regex_t re;
-                  char pattern[6] = { '[', c1, '-', c2, ']', 0 };
-                  char subject[2] = { 0, 0 };
-                  regcomp (&re, pattern, REG_NOSUB);
-                  for (c = 0; c < NOTCHAR; ++c)
-                    {
-                      subject[0] = c;
-                      if (!(case_fold && isupper (c))
-                          && regexec (&re, subject, 0, NULL, 0) != REG_NOMATCH)
-                        setbit_case_fold_c (c, ccl);
-                    }
-                  regfree (&re);
-                }
+              for (c = c1; c <= c2; c++)
+                setbit_case_fold_c (c, ccl);
             }
 
           colon_warning_state |= 8;
@@ -1796,9 +1777,6 @@ dfaparse (char const *s, size_t len, struct dfa *d)
   lasttok = END;
   laststart = 1;
   parens = 0;
-#ifdef LC_COLLATE
-  hard_LC_COLLATE = hard_locale (LC_COLLATE);
-#endif
   if (MB_CUR_MAX > 1)
     {
       cur_mb_len = 0;
-- 
1.7.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]