texinfo/tp Texinfo/Convert/Converter.pm Texinfo...

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
texinfo/tp Texinfo/Convert/Converter.pm Texinfo...

From:	Patrice Dumas
Subject:	texinfo/tp Texinfo/Convert/Converter.pm Texinfo...
Date:	Thu, 29 Sep 2011 20:13:07 +0000
CVSROOT:        /sources/texinfo
Module name:    texinfo
Changes by:     Patrice Dumas <pertusus>        11/09/29 20:13:07

Modified files:
        tp/Texinfo/Convert: Converter.pm Text.pm Unicode.pm 
        tp/t           : accents.t 

Log message:
        Put eight_bit_accents and unicode_accents in Texinfo::Convert::Unicode.

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Converter.pm?cvsroot=texinfo&r1=1.47&r2=1.48
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Text.pm?cvsroot=texinfo&r1=1.68&r2=1.69
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Unicode.pm?cvsroot=texinfo&r1=1.18&r2=1.19
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/t/accents.t?cvsroot=texinfo&r1=1.14&r2=1.15

Patches:
Index: Texinfo/Convert/Converter.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Converter.pm,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -b -r1.47 -r1.48
--- Texinfo/Convert/Converter.pm        29 Sep 2011 20:01:38 -0000      1.47
+++ Texinfo/Convert/Converter.pm        29 Sep 2011 20:13:07 -0000      1.48
@@ -838,11 +838,11 @@
 
   if ($self->get_conf('ENABLE_ENCODING')) {
     if ($self->{'encoding_name'} and $self->{'encoding_name'} eq 'utf-8') {
-      return Texinfo::Convert::Text::unicode_accents($result, $stack,
+      return Texinfo::Convert::Unicode::unicode_accents($result, $stack,
                                              $format_accents, $in_upper_case);
     } elsif ($self->{'encoding_name'}
            and 
$Texinfo::Encoding::eight_bit_encoding_aliases{$self->{'encoding_name'}}) {
-      return Texinfo::Convert::Text::eight_bit_accents($result, $stack, 
+      return Texinfo::Convert::Unicode::eight_bit_accents($result, $stack, 
                                                   $self->{'encoding_name'}, 
                                                   $format_accents,
                                                   $in_upper_case);

Index: Texinfo/Convert/Text.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Text.pm,v
retrieving revision 1.68
retrieving revision 1.69
diff -u -b -r1.68 -r1.69
--- Texinfo/Convert/Text.pm     29 Sep 2011 20:01:39 -0000      1.68
+++ Texinfo/Convert/Text.pm     29 Sep 2011 20:13:07 -0000      1.69
@@ -156,119 +156,6 @@
   $ignored_types{$type} = 1;
 }
 
-sub eight_bit_accents($$$$;$)
-{
-  my $unicode_formatted = shift;
-  my $stack = shift;
-  my $encoding = shift;
-  my $convert_accent = shift;
-  my $in_upper_case = shift;
-
-  my $result = $unicode_formatted;
-
-  my $debug;
-  #$debug = 1;
-
-  if ($debug) {
-    print STDERR "STACK: ".join('|', map {$_->{'cmdname'}} @$stack)."\n";
-  }
-
-  # accents are formatted and the intermediate results are kept, such
-  # that we can return the maximum of multiaccented letters that can be
-  # rendered with a given eight bit formatting.  undef is stored when 
-  # there is no corresponding unicode anymore.
-  my @results_stack = ([$unicode_formatted, undef]);
-
-  while (@$stack) {
-    if (defined($unicode_formatted)) {
-      $unicode_formatted
-        = Texinfo::Convert::Unicode::unicode_accent($unicode_formatted, 
$stack->[-1]);
-      $unicode_formatted = uc($unicode_formatted)
-        if ($in_upper_case and defined($unicode_formatted));
-    }
-    push @results_stack, [$unicode_formatted, $stack->[-1]];
-    pop @$stack;
-  }
-
-  if ($debug) {
-    print STDERR "PARTIAL_RESULTS_STACK:\n";
-    foreach my $partial_result (@results_stack) {
-      my $command = 'TEXT';
-      $command = $partial_result->[1]->{'cmdname'} if ($partial_result->[1]);
-      if (defined($partial_result->[0])) {
-        print STDERR "   -> ".Encode::encode('utf8', $partial_result->[0])
-                            ."|$command\n";
-      } else {
-        print STDERR "   -> NO UTF8 |$command\n";
-      }
-    }
-  }
-
-  # At this point we have the utf8 encoded results for the accent
-  # commands stack, with all the intermediate results.
-  # For each one we'll check if it is possible to encode it in the 
-  # current eight bit output encoding table and, if so set the result
-  # to the character.
-
-  my $eight_bit = '';
-
-  while (@results_stack) {
-    my $char = $results_stack[0]->[0];
-    last if (!defined($char));
-
-    my ($new_eight_bit, $new_codepoint)
-      = Texinfo::Convert::Unicode::eight_bit_and_unicode_point($char,
-                                                              $encoding);
-    if ($debug) {
-      my $command = 'TEXT';
-      $command = $results_stack[0]->[1]->{'cmdname'} 
-        if ($results_stack[0]->[1]);
-      print STDERR "" . Encode::encode('utf8', $char) 
-        . " ($command) new_codepoint: $new_codepoint 8bit: $new_eight_bit old: 
$eight_bit\n";
-    }
-
-    # no corresponding eight bit character found for a composed character
-    last if (!$new_eight_bit);
-
-    # in that case, the new eight bit character is the same than the one 
-    # found with one less character (and it isn't a @dotless{i}). It may
-    # hapen in 2 case, both meaning that there is no corresponding 8bit char:
-    #
-    # -> there are 2 characters in accent. This could happen, for example
-    #    if an accent that cannot be rendered is found and it leads to 
-    #    appending or prepending a character. For example this happens for
-    #    @={@,address@hidden, where @,address@hidden is expanded to a 2 
character:
-    #    n with a tilde, followed by a , 
-    #    In that case, the additional utf8 diacritic is appended, which 
-    #    means that it is composed with the , and leaves n with a tilde 
-    #    untouched. 
-    # -> the diacritic is appended but the normal form doesn't lead
-    #    to a composed character, such that the first character
-    #    of the string is unchanged. This, for example, happens for 
-    #    @ubaraccent{a} since there is no composed accent with a and an 
-    #    underbar.
-    last if ($new_eight_bit eq $eight_bit
-             and !($results_stack[0]->[1]->{'cmdname'} eq 'dotless' 
-                   and $char eq 'i'));
-    $result = $results_stack[0]->[0];
-    $eight_bit = $new_eight_bit;
-    shift @results_stack;
-  }
-
-  # handle the remaining accents, that have not been converted to 8bit
-  # compatible unicode
-  shift @results_stack if (!defined($results_stack[0]->[1]));
-  while (@results_stack) {
-    $result = &$convert_accent($result,
-                               $results_stack[0]->[1],
-                               $in_upper_case);
-    shift @results_stack;
-  }
-
-  # An important remark is that the final conversion to 8bit is left to
-  # perl.
-  return $result;
-}
 
 sub ascii_accent($$)
 {
@@ -302,30 +189,7 @@
   return $result;
 }
 
-# format a stack of accents as unicode
-sub unicode_accents ($$$;$)
-{
-  my $result = shift;
-  my $stack = shift;
-  my $format_accent = shift;
-  my $in_upper_case = shift;
-
-  while (@$stack) {
-    my $formatted_result
-     = Texinfo::Convert::Unicode::unicode_accent($result, $stack->[-1]);
-    last if (!defined($formatted_result));
-
-    $result = $formatted_result;
-    pop @$stack;
-  }
-  $result = uc ($result) if ($in_upper_case);
-  while (@$stack) {
-    my $accent_command = pop @$stack;
-    $result = &$format_accent($result, $accent_command, $in_upper_case);
-  }
-  return $result;
-}
-
+# format an accent command and nested accents within as Text.
 sub text_accents($$;$)
 {
   my $accent = shift;
@@ -343,11 +207,12 @@
   my $text = convert({'contents' => $contents}, $options);
 
   if ($encoding and $encoding eq 'utf-8') {
-    return unicode_accents($text, $stack, \&ascii_accent, $in_upper_case);
+    return Texinfo::Convert::Unicode::unicode_accents($text, $stack, 
+                                              \&ascii_accent, $in_upper_case);
   } elsif ($encoding 
            and $Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}) {
-    return eight_bit_accents($text, $stack, $encoding, \&ascii_accent, 
-                             $in_upper_case);
+    return Texinfo::Convert::Unicode::eight_bit_accents($text, $stack, 
+                                     $encoding, \&ascii_accent, 
$in_upper_case);
   } else {
     my $result = ascii_accents($text, $stack, $in_upper_case);
   }

Index: Texinfo/Convert/Unicode.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Unicode.pm,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -b -r1.18 -r1.19
--- Texinfo/Convert/Unicode.pm  29 Sep 2011 14:36:30 -0000      1.18
+++ Texinfo/Convert/Unicode.pm  29 Sep 2011 20:13:07 -0000      1.19
@@ -1223,7 +1223,7 @@
 }
 
 # return the 8 bit, if it exists, and the unicode codepoint
-sub eight_bit_and_unicode_point($$)
+sub _eight_bit_and_unicode_point($$)
 {
   my $char = shift;
   my $encoding = shift;
@@ -1245,6 +1245,142 @@
   return ($eight_bit, $codepoint);
 }
 
+# format a stack of accents as unicode
+sub unicode_accents ($$$;$)
+{
+  my $result = shift;
+  my $stack = shift;
+  my $format_accent = shift;
+  my $in_upper_case = shift;
+
+  while (@$stack) {
+    my $formatted_result = unicode_accent($result, $stack->[-1]);
+    last if (!defined($formatted_result));
+
+    $result = $formatted_result;
+    pop @$stack;
+  }
+  $result = uc ($result) if ($in_upper_case);
+  while (@$stack) {
+    my $accent_command = pop @$stack;
+    $result = &$format_accent($result, $accent_command, $in_upper_case);
+  }
+  return $result;
+}
+
+sub eight_bit_accents($$$$;$)
+{
+  my $unicode_formatted = shift;
+  my $stack = shift;
+  my $encoding = shift;
+  my $convert_accent = shift;
+  my $in_upper_case = shift;
+
+  my $result = $unicode_formatted;
+
+  my $debug;
+  #$debug = 1;
+
+  if ($debug) {
+    print STDERR "STACK: ".join('|', map {$_->{'cmdname'}} @$stack)."\n";
+  }
+
+  # accents are formatted and the intermediate results are kept, such
+  # that we can return the maximum of multiaccented letters that can be
+  # rendered with a given eight bit formatting.  undef is stored when 
+  # there is no corresponding unicode anymore.
+  my @results_stack = ([$unicode_formatted, undef]);
+
+  while (@$stack) {
+    if (defined($unicode_formatted)) {
+      $unicode_formatted 
+         = unicode_accent($unicode_formatted, $stack->[-1]);
+      $unicode_formatted = uc($unicode_formatted)
+        if ($in_upper_case and defined($unicode_formatted));
+    }
+    push @results_stack, [$unicode_formatted, $stack->[-1]];
+    pop @$stack;
+  }
+
+  if ($debug) {
+    print STDERR "PARTIAL_RESULTS_STACK:\n";
+    foreach my $partial_result (@results_stack) {
+      my $command = 'TEXT';
+      $command = $partial_result->[1]->{'cmdname'} if ($partial_result->[1]);
+      if (defined($partial_result->[0])) {
+        print STDERR "   -> ".Encode::encode('utf8', $partial_result->[0])
+                            ."|$command\n";
+      } else {
+        print STDERR "   -> NO UTF8 |$command\n";
+      }
+    }
+  }
+
+  # At this point we have the utf8 encoded results for the accent
+  # commands stack, with all the intermediate results.
+  # For each one we'll check if it is possible to encode it in the 
+  # current eight bit output encoding table and, if so set the result
+  # to the character.
+
+  my $eight_bit = '';
+
+  while (@results_stack) {
+    my $char = $results_stack[0]->[0];
+    last if (!defined($char));
+
+    my ($new_eight_bit, $new_codepoint)
+      = _eight_bit_and_unicode_point($char, $encoding);
+    if ($debug) {
+      my $command = 'TEXT';
+      $command = $results_stack[0]->[1]->{'cmdname'} 
+        if ($results_stack[0]->[1]);
+      print STDERR "" . Encode::encode('utf8', $char) 
+        . " ($command) new_codepoint: $new_codepoint 8bit: $new_eight_bit old: 
$eight_bit\n";
+    }
+
+    # no corresponding eight bit character found for a composed character
+    last if (!$new_eight_bit);
+
+    # in that case, the new eight bit character is the same than the one 
+    # found with one less character (and it isn't a @dotless{i}). It may
+    # hapen in 2 case, both meaning that there is no corresponding 8bit char:
+    #
+    # -> there are 2 characters in accent. This could happen, for example
+    #    if an accent that cannot be rendered is found and it leads to 
+    #    appending or prepending a character. For example this happens for
+    #    @={@,address@hidden, where @,address@hidden is expanded to a 2 
character:
+    #    n with a tilde, followed by a , 
+    #    In that case, the additional utf8 diacritic is appended, which 
+    #    means that it is composed with the , and leaves n with a tilde 
+    #    untouched. 
+    # -> the diacritic is appended but the normal form doesn't lead
+    #    to a composed character, such that the first character
+    #    of the string is unchanged. This, for example, happens for 
+    #    @ubaraccent{a} since there is no composed accent with a and an 
+    #    underbar.
+    last if ($new_eight_bit eq $eight_bit
+             and !($results_stack[0]->[1]->{'cmdname'} eq 'dotless' 
+                   and $char eq 'i'));
+    $result = $results_stack[0]->[0];
+    $eight_bit = $new_eight_bit;
+    shift @results_stack;
+  }
+
+  # handle the remaining accents, that have not been converted to 8bit
+  # compatible unicode
+  shift @results_stack if (!defined($results_stack[0]->[1]));
+  while (@results_stack) {
+    $result = &$convert_accent($result,
+                               $results_stack[0]->[1],
+                               $in_upper_case);
+    shift @results_stack;
+  }
+
+  # An important remark is that the final conversion to 8bit is left to
+  # perl.
+  return $result;
+}
+
 
 # returns the unicode for a command with brace and no arg
 # if it is known that it is present for the encoding

Index: t/accents.t
===================================================================
RCS file: /sources/texinfo/texinfo/tp/t/accents.t,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -b -r1.14 -r1.15
--- t/accents.t 29 Sep 2011 20:01:39 -0000      1.14
+++ t/accents.t 29 Sep 2011 20:13:07 -0000      1.15
@@ -77,7 +77,7 @@
   my $text = Texinfo::Convert::Text::convert({'contents' => $contents});
 
   my $result = 
-       Texinfo::Convert::Text::eight_bit_accents($text, $commands_stack, 
+       Texinfo::Convert::Unicode::eight_bit_accents($text, $commands_stack, 
     'iso-8859-1', \&Texinfo::Convert::Text::ascii_accent);
 
   my $html_converter = Texinfo::Convert::HTML->converter();
@@ -93,7 +93,7 @@
   $text = Texinfo::Convert::Text::convert({'contents' => $contents},
                                {'enabled_encoding' => 'utf-8'});
 
-  my $result_unicode = Texinfo::Convert::Text::unicode_accents($text, 
+  my $result_unicode = Texinfo::Convert::Unicode::unicode_accents($text, 
               $commands_stack, \&Texinfo::Convert::Text::ascii_accent);
   if (defined($reference)) {
     #ok (Encode::decode('iso-8859-1', $reference) eq $result, $name);
[Prev in Thread]
Current Thread
[Next in Thread]
texinfo/tp Texinfo/Convert/Converter.pm Texinfo..., Patrice Dumas, 2011/09/04
- texinfo/tp Texinfo/Convert/Converter.pm Texinfo..., Patrice Dumas, 2011/09/17
- texinfo/tp Texinfo/Convert/Converter.pm Texinfo..., Patrice Dumas <=
Prev by Date: texinfo/tp Texinfo/Common.pm Texinfo/Convert/Co...
Next by Date: texinfo/tp Texinfo/Convert/Text.pm Texinfo/Conv...
Previous by thread: texinfo/tp Texinfo/Convert/Converter.pm Texinfo...
Next by thread: texinfo/tp Texinfo/Convert/HTML.pm t/30sectioni...
Index(es):
- Date
- Thread