emacs-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Explicitly numbered subgroups in regular expressions


From: Stefan Monnier
Subject: Explicitly numbered subgroups in regular expressions
Date: Sat, 09 Jun 2007 16:37:44 -0400
User-agent: Gnus/5.11 (Gnus v5.11) Emacs/22.1.50 (gnu/linux)

Any objection to the patch below for the trunk (it'll come with a NEWS
and ChangeLog entry, of course)?

What it does is add a new regexp syntax \(?<num>:<regexp>\) which is like
\(<regexp>\) except that it explicitly specifies the number of the subgroup.
E.g. (and (string-match "\\(?3:a\\)" "a") (match-data)) returns (0 1 nil
nil nil nil 0 1).  There is no backward compatibility issue with this patch:
such regexps are currently rejected as invalid.

Cases where this is useful:

1 - when we need to match either '<regexp>' or "<regexp>" and we need to
    extract the text within the quotes.  Currently we either use
    "[\"']\\(<regexp>\\)[\"']" which is not quite correct (it also accepts
    mismatched quotes such as '<regexp>"), or we use
    "\"\\(<regexp>\\)\"\\|'\\(<regexp>\\)'" and then have to use
    (or (match-string 1) (match-string 2)).
    With this patch we can use "\"\\(?1:<regexp>\\)\"\\|'\\(?1:<regexp>\\)'".
    In log-view-mode, we have such a situation where we regularly need to
    add one case and hence change some of the code from (or (match-string 1)
    (match-string 2)) to (or (match-string 1) (match-string 2) (match-string 3))
    and update other match-data indices.

2 - there are several places where some customizable data specifies
    a regular expression along with the match-data indices where the
    relevant subdata can be found.  E.g. in compilation-error-regexp-alist.
    With such a patch, we could use instead a scheme where the customizable
    data only includes a regexp because the match-data indices in which the
    relevant subdata can be found are always the same.

3 - in some rare cases such as comment-start-skip we have declared that
    subgroup N has a special meaning.  The problem is that this can
    occasionally be problematic.

Number 3 is a rare circumstance (see the comment around fortran-mode's setting
of comment-start-skip) and I can't remember it being anything other than
a minor problem.
Number 2 seems useful, but I have to admit that I have not actually tried it.
Number 1 was the motivating case.


        Stefan


--- regex.c     29 jan 2007 13:35:35 -0500      1.222
+++ regex.c     09 jun 2007 16:13:59 -0400      
@@ -2482,11 +2482,6 @@
      last -- ends with a forward jump of this sort.  */
   unsigned char *fixup_alt_jump = 0;
 
-  /* Counts open-groups as they are encountered.  Remembered for the
-     matching close-group on the compile stack, so the same register
-     number is put in the stop_memory as the start_memory.  */
-  regnum_t regnum = 0;
-
   /* Work area for range table of charset.  */
   struct range_table_work_area range_table_work;
 
@@ -3123,28 +3118,54 @@
            handle_open:
              {
                int shy = 0;
+               regnum_t regnum = 0;
                if (p+1 < pend)
                  {
                    /* Look for a special (?...) construct */
                    if ((syntax & RE_SHY_GROUPS) && *p == '?')
                      {
                        PATFETCH (c); /* Gobble up the '?'.  */
+                       while (!shy)
+                         {
                        PATFETCH (c);
                        switch (c)
                          {
                          case ':': shy = 1; break;
+                             case '0':
+                               /* An explicitly specified regnum must start
+                                  with non-0. */
+                               if (regnum == 0)
+                                 FREE_STACK_RETURN (REG_BADPAT);
+                             case '1': case '2': case '3': case '4':
+                             case '5': case '6': case '7': case '8': case '9':
+                               regnum = 10*regnum + (c - '0'); break;
                          default:
                            /* Only (?:...) is supported right now. */
                            FREE_STACK_RETURN (REG_BADPAT);
                          }
                      }
                  }
+                 }
 
                if (!shy)
-                 {
-                   bufp->re_nsub++;
-                   regnum++;
+                 regnum = ++bufp->re_nsub;
+               else if (regnum)
+                 { /* It's actually not shy, but explicitly numbered.  */
+                   shy = 0;
+                   if (regnum > bufp->re_nsub)
+                     bufp->re_nsub = regnum;
+                   else if (regnum > bufp->re_nsub
+                            /* Ideally, we'd want to check that the specified
+                               group can't have matched (i.e. all subgroups
+                               using the same regnum are in other branches of
+                               OR patterns), but we don't currently keep track
+                               of enough info to do that easily.  */
+                            || group_in_compile_stack (compile_stack, regnum))
+                     FREE_STACK_RETURN (REG_BADPAT);
                  }
+               else
+                 /* It's really shy.  */
+                 regnum = - bufp->re_nsub;
 
                if (COMPILE_STACK_FULL)
                  {
@@ -3163,12 +3184,11 @@
                COMPILE_STACK_TOP.fixup_alt_jump
                  = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
                COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
-               COMPILE_STACK_TOP.regnum = shy ? -regnum : regnum;
+               COMPILE_STACK_TOP.regnum = regnum;
 
-               /* Do not push a
-                  start_memory for groups beyond the last one we can
-                  represent in the compiled pattern.  */
-               if (regnum <= MAX_REGNUM && !shy)
+               /* Do not push a start_memory for groups beyond the last one
+                  we can represent in the compiled pattern.  */
+               if (regnum <= MAX_REGNUM && regnum > 0)
                  BUF_PUSH_2 (start_memory, regnum);
 
                compile_stack.avail++;
@@ -3213,7 +3233,7 @@
                /* We don't just want to restore into `regnum', because
                   later groups should continue to be numbered higher,
                   as in `(ab)c(de)' -- the second group is #2.  */
-               regnum_t this_group_regnum;
+               regnum_t regnum;
 
                compile_stack.avail--;
                begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
@@ -3222,7 +3242,7 @@
                    ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
                    : 0;
                laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
-               this_group_regnum = COMPILE_STACK_TOP.regnum;
+               regnum = COMPILE_STACK_TOP.regnum;
                /* If we've reached MAX_REGNUM groups, then this open
                   won't actually generate any code, so we'll have to
                   clear pending_exact explicitly.  */
@@ -3230,8 +3250,8 @@
 
                /* We're at the end of the group, so now we know how many
                   groups were inside this one.  */
-               if (this_group_regnum <= MAX_REGNUM && this_group_regnum > 0)
-                 BUF_PUSH_2 (stop_memory, this_group_regnum);
+               if (regnum <= MAX_REGNUM && regnum > 0)
+                 BUF_PUSH_2 (stop_memory, regnum);
              }
              break;
 
@@ -3557,8 +3577,9 @@
 
                reg = c - '0';
 
-               /* Can't back reference to a subexpression before its end.  */
-               if (reg > regnum || group_in_compile_stack (compile_stack, reg))
+               if (reg > bufp->re_nsub || reg < 1
+                   /* Can't back reference to a subexp before its end.  */
+                   || group_in_compile_stack (compile_stack, reg))
                  FREE_STACK_RETURN (REG_ESUBREG);
 
                laststart = b;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]