bug-gnu-utils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: php concatenation unknown to xgettext


From: Bruno Haible
Subject: Re: php concatenation unknown to xgettext
Date: Thu, 5 Jul 2007 02:09:39 +0200
User-agent: KMail/1.5.4

Hello,

Jan Engelhardt wrote:

> the following minimal php source is not correctly parsed by xgettext:
> 
>       <?php
>       _("foo"."bar");
>       ?>
> 
> ran through `xgettext -o /dev/stdout test.php` gives:
> 
>       msgid "foo"
>       msgstr ""
> 
> expected result:
> 
>       msgid "foobar"
>       msgstr ""
> 
> The dot is the concatenation operator and is very handy when dealing 
> with longer strings (esp when they take up more than 80 columns).
> 
> 
> $ xgettext --version
> xgettext (GNU gettext-tools) 0.15

Thank you for the precise report. I am applying this fix (relative to the
gettext CVS).

2007-07-04  Bruno Haible  <address@hidden>

        Recognize the PHP string concatenation operator.
        * x-php.c (enum token_type_ty): New elements token_type_dot,
        token_type_operator1, token_type_operator2.
        (struct token_ty): Add comment field.
        (free_token): Drop reference to comment field.
        (phase4_pushback, phase4_pushback_length): New variables.
        (phase4_get): Renamed from x_php_lex. Return last pushed-back token if
        available. Recognize tokens '.', '+', '-', '*', '/', '%', '++', '--',
        '!', '~', '@'. Fill in tp->comment.
        (phase4_unget): New function.
        (phase5_last): New variable.
        (x_php_lex): New function.
        (extract_balanced): Handle the new token types. Pass token's comment
        to remember_a_message.
        (extract_php): Initialize phase5_last.
        Reported by Jan Engelhardt <address@hidden>.

*** gettext-tools/src/x-php.c   17 Mar 2007 12:00:22 -0000      1.23
--- gettext-tools/src/x-php.c   4 Jul 2007 23:55:36 -0000
***************
*** 42,48 ****
  
  
  /* The PHP syntax is defined in phpdoc/manual/langref.html.
!    See also php-4.1.0/Zend/zend_language_scanner.l.
     Note that variable and function names can contain bytes in the range
     0x7f..0xff; see
       http://www.php.net/manual/en/language.variables.php
--- 42,49 ----
  
  
  /* The PHP syntax is defined in phpdoc/manual/langref.html.
!    See also php-4.1.0/Zend/zend_language_scanner.l
!    and      php-4.1.0/Zend/zend_language_parser.y.
     Note that variable and function names can contain bytes in the range
     0x7f..0xff; see
       http://www.php.net/manual/en/language.variables.php
***************
*** 741,746 ****
--- 742,750 ----
    token_type_comma,           /* , */
    token_type_lbracket,                /* [ */
    token_type_rbracket,                /* ] */
+   token_type_dot,             /* . */
+   token_type_operator1,               /* * / % ++ -- */
+   token_type_operator2,               /* + - ! ~ @ */
    token_type_string_literal,  /* "abc" */
    token_type_symbol,          /* symbol, number */
    token_type_other            /* misc. operator */
***************
*** 752,757 ****
--- 756,762 ----
  {
    token_type_ty type;
    char *string;               /* for token_type_string_literal, token_type_symbol */
+   refcounted_string_list_ty *comment; /* for token_type_string_literal */
    int line_number;
  };
  
***************
*** 762,780 ****
  {
    if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
      free (tp->string);
  }
  
  
  /* 4. Combine characters into tokens.  Discard whitespace.  */
  
  static void
! x_php_lex (token_ty *tp)
  {
    static char *buffer;
    static int bufmax;
    int bufpos;
    int c;
  
    tp->string = NULL;
  
    for (;;)
--- 767,795 ----
  {
    if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
      free (tp->string);
+   if (tp->type == token_type_string_literal)
+     drop_reference (tp->comment);
  }
  
  
  /* 4. Combine characters into tokens.  Discard whitespace.  */
  
+ static token_ty phase4_pushback[3];
+ static int phase4_pushback_length;
+ 
  static void
! phase4_get (token_ty *tp)
  {
    static char *buffer;
    static int bufmax;
    int bufpos;
    int c;
  
+   if (phase4_pushback_length)
+     {
+       *tp = phase4_pushback[--phase4_pushback_length];
+       return;
+     }
    tp->string = NULL;
  
    for (;;)
***************
*** 927,932 ****
--- 942,948 ----
          buffer[bufpos] = 0;
          tp->type = token_type_string_literal;
          tp->string = xstrdup (buffer);
+         tp->comment = add_reference (savable_comment);
          return;
  
        case '"':
***************
*** 1063,1069 ****
            }
          buffer[bufpos] = 0;
          if (tp->type == token_type_string_literal)
!           tp->string = xstrdup (buffer);
          return;
  
        case '?':
--- 1079,1088 ----
            }
          buffer[bufpos] = 0;
          if (tp->type == token_type_string_literal)
!           {
!             tp->string = xstrdup (buffer);
!             tp->comment = add_reference (savable_comment);
!           }
          return;
  
        case '?':
***************
*** 1075,1084 ****
                /* ?> and %> terminate PHP mode and switch back to HTML
                   mode.  */
                skip_html ();
              }
            else
!             phase1_ungetc (c2);
!           tp->type = token_type_other;
            return;
          }
  
--- 1094,1106 ----
                /* ?> and %> terminate PHP mode and switch back to HTML
                   mode.  */
                skip_html ();
+               tp->type = token_type_other;
              }
            else
!             {
!               phase1_ungetc (c2);
!               tp->type = (c == '%' ? token_type_operator1 : token_type_other);
!             }
            return;
          }
  
***************
*** 1102,1107 ****
--- 1124,1160 ----
          tp->type = token_type_rbracket;
          return;
  
+       case '.':
+         tp->type = token_type_dot;
+         return;
+ 
+       case '*':
+       case '/':
+         tp->type = token_type_operator1;
+         return;
+ 
+       case '+':
+       case '-':
+         {
+           int c2 = phase1_getc ();
+           if (c2 == c)
+             /* ++ or -- */
+             tp->type = token_type_operator1;
+           else
+             /* + or - */
+             {
+               phase1_ungetc (c2);
+               tp->type = token_type_operator2;
+             }
+           return;
+         }
+ 
+       case '!':
+       case '~':
+       case '@':
+         tp->type = token_type_operator2;
+         return;
+ 
        case '<':
          {
            int c2 = phase1_getc ();
***************
*** 1248,1253 ****
--- 1301,1388 ----
      }
  }
  
+ /* Supports 3 tokens of pushback.  */
+ static void
+ phase4_unget (token_ty *tp)
+ {
+   if (tp->type != token_type_eof)
+     {
+       if (phase4_pushback_length == SIZEOF (phase4_pushback))
+       abort ();
+       phase4_pushback[phase4_pushback_length++] = *tp;
+     }
+ }
+ 
+ 
+ /* 5. Compile-time optimization of string literal concatenation.
+    Combine "string1" . ... . "stringN" to the concatenated string if
+      - the token before this expression is none of
+        '+' '-' '.' '*' '/' '%' '!' '~' '++' '--' ')' '@'
+        (because then the first string could be part of an expression with
+        the same or higher precedence as '.', such as an additive,
+        multiplicative, negation, preincrement, or cast expression),
+      - the token after this expression is none of
+        '*' '/' '%' '++' '--'
+        (because then the last string could be part of an expression with
+        higher precedence than '.', such as a multiplicative or postincrement
+        expression).  */
+ 
+ static token_type_ty phase5_last;
+ 
+ static void
+ x_php_lex (token_ty *tp)
+ {
+   phase4_get (tp);
+   if (tp->type == token_type_string_literal
+       && !(phase5_last == token_type_dot
+          || phase5_last == token_type_operator1
+          || phase5_last == token_type_operator2
+          || phase5_last == token_type_rparen))
+     {
+       char *sum = tp->string;
+       size_t sum_len = strlen (sum);
+ 
+       for (;;)
+       {
+         token_ty token2;
+ 
+         phase4_get (&token2);
+         if (token2.type == token_type_dot)
+           {
+             token_ty token3;
+ 
+             phase4_get (&token3);
+             if (token3.type == token_type_string_literal)
+               {
+                 token_ty token_after;
+ 
+                 phase4_get (&token_after);
+                 if (token_after.type != token_type_operator1)
+                   {
+                     char *addend = token3.string;
+                     size_t addend_len = strlen (addend);
+ 
+                     sum = (char *) xrealloc (sum, sum_len + addend_len + 1);
+                     memcpy (sum + sum_len, addend, addend_len + 1);
+                     sum_len += addend_len;
+ 
+                     phase4_unget (&token_after);
+                     free_token (&token3);
+                     free_token (&token2);
+                     continue;
+                   }
+                 phase4_unget (&token_after);
+               }
+             phase4_unget (&token3);
+           }
+         phase4_unget (&token2);
+         break;
+       }
+       tp->string = sum;
+     }
+   phase5_last = tp->type;
+ }
+ 
  
  /* ========================= Extracting strings.  ========================== */
  
***************
*** 1389,1405 ****
  
            if (extract_all)
              remember_a_message (mlp, NULL, token.string, inner_context,
!                                 &pos, savable_comment);
            else
              arglist_parser_remember (argparser, arg, token.string,
                                       inner_context,
                                       pos.file_name, pos.line_number,
!                                      savable_comment);
          }
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;
  
        case token_type_other:
          next_context_iter = null_context_list_iterator;
          state = 0;
--- 1524,1544 ----
  
            if (extract_all)
              remember_a_message (mlp, NULL, token.string, inner_context,
!                                 &pos, token.comment);
            else
              arglist_parser_remember (argparser, arg, token.string,
                                       inner_context,
                                       pos.file_name, pos.line_number,
!                                      token.comment);
!           drop_reference (token.comment);
          }
          next_context_iter = null_context_list_iterator;
          state = 0;
          continue;
  
+       case token_type_dot:
+       case token_type_operator1:
+       case token_type_operator2:
        case token_type_other:
          next_context_iter = null_context_list_iterator;
          state = 0;
***************
*** 1432,1437 ****
--- 1571,1578 ----
    last_comment_line = -1;
    last_non_comment_line = -1;
  
+   phase5_last = token_type_eof;
+ 
    flag_context_list_table = flag_table;
  
    init_keywords ();





reply via email to

[Prev in Thread] Current Thread [Next in Thread]