[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
texinfo/tp Texinfo/Convert/Converter.pm Texinfo...
From: |
Patrice Dumas |
Subject: |
texinfo/tp Texinfo/Convert/Converter.pm Texinfo... |
Date: |
Thu, 29 Sep 2011 20:13:07 +0000 |
CVSROOT: /sources/texinfo
Module name: texinfo
Changes by: Patrice Dumas <pertusus> 11/09/29 20:13:07
Modified files:
tp/Texinfo/Convert: Converter.pm Text.pm Unicode.pm
tp/t : accents.t
Log message:
Put eight_bit_accents and unicode_accents in Texinfo::Convert::Unicode.
CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Converter.pm?cvsroot=texinfo&r1=1.47&r2=1.48
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Text.pm?cvsroot=texinfo&r1=1.68&r2=1.69
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/Texinfo/Convert/Unicode.pm?cvsroot=texinfo&r1=1.18&r2=1.19
http://cvs.savannah.gnu.org/viewcvs/texinfo/tp/t/accents.t?cvsroot=texinfo&r1=1.14&r2=1.15
Patches:
Index: Texinfo/Convert/Converter.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Converter.pm,v
retrieving revision 1.47
retrieving revision 1.48
diff -u -b -r1.47 -r1.48
--- Texinfo/Convert/Converter.pm 29 Sep 2011 20:01:38 -0000 1.47
+++ Texinfo/Convert/Converter.pm 29 Sep 2011 20:13:07 -0000 1.48
@@ -838,11 +838,11 @@
if ($self->get_conf('ENABLE_ENCODING')) {
if ($self->{'encoding_name'} and $self->{'encoding_name'} eq 'utf-8') {
- return Texinfo::Convert::Text::unicode_accents($result, $stack,
+ return Texinfo::Convert::Unicode::unicode_accents($result, $stack,
$format_accents, $in_upper_case);
} elsif ($self->{'encoding_name'}
and
$Texinfo::Encoding::eight_bit_encoding_aliases{$self->{'encoding_name'}}) {
- return Texinfo::Convert::Text::eight_bit_accents($result, $stack,
+ return Texinfo::Convert::Unicode::eight_bit_accents($result, $stack,
$self->{'encoding_name'},
$format_accents,
$in_upper_case);
Index: Texinfo/Convert/Text.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Text.pm,v
retrieving revision 1.68
retrieving revision 1.69
diff -u -b -r1.68 -r1.69
--- Texinfo/Convert/Text.pm 29 Sep 2011 20:01:39 -0000 1.68
+++ Texinfo/Convert/Text.pm 29 Sep 2011 20:13:07 -0000 1.69
@@ -156,119 +156,6 @@
$ignored_types{$type} = 1;
}
-sub eight_bit_accents($$$$;$)
-{
- my $unicode_formatted = shift;
- my $stack = shift;
- my $encoding = shift;
- my $convert_accent = shift;
- my $in_upper_case = shift;
-
- my $result = $unicode_formatted;
-
- my $debug;
- #$debug = 1;
-
- if ($debug) {
- print STDERR "STACK: ".join('|', map {$_->{'cmdname'}} @$stack)."\n";
- }
-
- # accents are formatted and the intermediate results are kept, such
- # that we can return the maximum of multiaccented letters that can be
- # rendered with a given eight bit formatting. undef is stored when
- # there is no corresponding unicode anymore.
- my @results_stack = ([$unicode_formatted, undef]);
-
- while (@$stack) {
- if (defined($unicode_formatted)) {
- $unicode_formatted
- = Texinfo::Convert::Unicode::unicode_accent($unicode_formatted,
$stack->[-1]);
- $unicode_formatted = uc($unicode_formatted)
- if ($in_upper_case and defined($unicode_formatted));
- }
- push @results_stack, [$unicode_formatted, $stack->[-1]];
- pop @$stack;
- }
-
- if ($debug) {
- print STDERR "PARTIAL_RESULTS_STACK:\n";
- foreach my $partial_result (@results_stack) {
- my $command = 'TEXT';
- $command = $partial_result->[1]->{'cmdname'} if ($partial_result->[1]);
- if (defined($partial_result->[0])) {
- print STDERR " -> ".Encode::encode('utf8', $partial_result->[0])
- ."|$command\n";
- } else {
- print STDERR " -> NO UTF8 |$command\n";
- }
- }
- }
-
- # At this point we have the utf8 encoded results for the accent
- # commands stack, with all the intermediate results.
- # For each one we'll check if it is possible to encode it in the
- # current eight bit output encoding table and, if so set the result
- # to the character.
-
- my $eight_bit = '';
-
- while (@results_stack) {
- my $char = $results_stack[0]->[0];
- last if (!defined($char));
-
- my ($new_eight_bit, $new_codepoint)
- = Texinfo::Convert::Unicode::eight_bit_and_unicode_point($char,
- $encoding);
- if ($debug) {
- my $command = 'TEXT';
- $command = $results_stack[0]->[1]->{'cmdname'}
- if ($results_stack[0]->[1]);
- print STDERR "" . Encode::encode('utf8', $char)
- . " ($command) new_codepoint: $new_codepoint 8bit: $new_eight_bit old:
$eight_bit\n";
- }
-
- # no corresponding eight bit character found for a composed character
- last if (!$new_eight_bit);
-
- # in that case, the new eight bit character is the same than the one
- # found with one less character (and it isn't a @dotless{i}). It may
- # hapen in 2 case, both meaning that there is no corresponding 8bit char:
- #
- # -> there are 2 characters in accent. This could happen, for example
- # if an accent that cannot be rendered is found and it leads to
- # appending or prepending a character. For example this happens for
- # @={@,address@hidden, where @,address@hidden is expanded to a 2
character:
- # n with a tilde, followed by a ,
- # In that case, the additional utf8 diacritic is appended, which
- # means that it is composed with the , and leaves n with a tilde
- # untouched.
- # -> the diacritic is appended but the normal form doesn't lead
- # to a composed character, such that the first character
- # of the string is unchanged. This, for example, happens for
- # @ubaraccent{a} since there is no composed accent with a and an
- # underbar.
- last if ($new_eight_bit eq $eight_bit
- and !($results_stack[0]->[1]->{'cmdname'} eq 'dotless'
- and $char eq 'i'));
- $result = $results_stack[0]->[0];
- $eight_bit = $new_eight_bit;
- shift @results_stack;
- }
-
- # handle the remaining accents, that have not been converted to 8bit
- # compatible unicode
- shift @results_stack if (!defined($results_stack[0]->[1]));
- while (@results_stack) {
- $result = &$convert_accent($result,
- $results_stack[0]->[1],
- $in_upper_case);
- shift @results_stack;
- }
-
- # An important remark is that the final conversion to 8bit is left to
- # perl.
- return $result;
-}
sub ascii_accent($$)
{
@@ -302,30 +189,7 @@
return $result;
}
-# format a stack of accents as unicode
-sub unicode_accents ($$$;$)
-{
- my $result = shift;
- my $stack = shift;
- my $format_accent = shift;
- my $in_upper_case = shift;
-
- while (@$stack) {
- my $formatted_result
- = Texinfo::Convert::Unicode::unicode_accent($result, $stack->[-1]);
- last if (!defined($formatted_result));
-
- $result = $formatted_result;
- pop @$stack;
- }
- $result = uc ($result) if ($in_upper_case);
- while (@$stack) {
- my $accent_command = pop @$stack;
- $result = &$format_accent($result, $accent_command, $in_upper_case);
- }
- return $result;
-}
-
+# format an accent command and nested accents within as Text.
sub text_accents($$;$)
{
my $accent = shift;
@@ -343,11 +207,12 @@
my $text = convert({'contents' => $contents}, $options);
if ($encoding and $encoding eq 'utf-8') {
- return unicode_accents($text, $stack, \&ascii_accent, $in_upper_case);
+ return Texinfo::Convert::Unicode::unicode_accents($text, $stack,
+ \&ascii_accent, $in_upper_case);
} elsif ($encoding
and $Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}) {
- return eight_bit_accents($text, $stack, $encoding, \&ascii_accent,
- $in_upper_case);
+ return Texinfo::Convert::Unicode::eight_bit_accents($text, $stack,
+ $encoding, \&ascii_accent,
$in_upper_case);
} else {
my $result = ascii_accents($text, $stack, $in_upper_case);
}
Index: Texinfo/Convert/Unicode.pm
===================================================================
RCS file: /sources/texinfo/texinfo/tp/Texinfo/Convert/Unicode.pm,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -b -r1.18 -r1.19
--- Texinfo/Convert/Unicode.pm 29 Sep 2011 14:36:30 -0000 1.18
+++ Texinfo/Convert/Unicode.pm 29 Sep 2011 20:13:07 -0000 1.19
@@ -1223,7 +1223,7 @@
}
# return the 8 bit, if it exists, and the unicode codepoint
-sub eight_bit_and_unicode_point($$)
+sub _eight_bit_and_unicode_point($$)
{
my $char = shift;
my $encoding = shift;
@@ -1245,6 +1245,142 @@
return ($eight_bit, $codepoint);
}
+# format a stack of accents as unicode
+sub unicode_accents ($$$;$)
+{
+ my $result = shift;
+ my $stack = shift;
+ my $format_accent = shift;
+ my $in_upper_case = shift;
+
+ while (@$stack) {
+ my $formatted_result = unicode_accent($result, $stack->[-1]);
+ last if (!defined($formatted_result));
+
+ $result = $formatted_result;
+ pop @$stack;
+ }
+ $result = uc ($result) if ($in_upper_case);
+ while (@$stack) {
+ my $accent_command = pop @$stack;
+ $result = &$format_accent($result, $accent_command, $in_upper_case);
+ }
+ return $result;
+}
+
+sub eight_bit_accents($$$$;$)
+{
+ my $unicode_formatted = shift;
+ my $stack = shift;
+ my $encoding = shift;
+ my $convert_accent = shift;
+ my $in_upper_case = shift;
+
+ my $result = $unicode_formatted;
+
+ my $debug;
+ #$debug = 1;
+
+ if ($debug) {
+ print STDERR "STACK: ".join('|', map {$_->{'cmdname'}} @$stack)."\n";
+ }
+
+ # accents are formatted and the intermediate results are kept, such
+ # that we can return the maximum of multiaccented letters that can be
+ # rendered with a given eight bit formatting. undef is stored when
+ # there is no corresponding unicode anymore.
+ my @results_stack = ([$unicode_formatted, undef]);
+
+ while (@$stack) {
+ if (defined($unicode_formatted)) {
+ $unicode_formatted
+ = unicode_accent($unicode_formatted, $stack->[-1]);
+ $unicode_formatted = uc($unicode_formatted)
+ if ($in_upper_case and defined($unicode_formatted));
+ }
+ push @results_stack, [$unicode_formatted, $stack->[-1]];
+ pop @$stack;
+ }
+
+ if ($debug) {
+ print STDERR "PARTIAL_RESULTS_STACK:\n";
+ foreach my $partial_result (@results_stack) {
+ my $command = 'TEXT';
+ $command = $partial_result->[1]->{'cmdname'} if ($partial_result->[1]);
+ if (defined($partial_result->[0])) {
+ print STDERR " -> ".Encode::encode('utf8', $partial_result->[0])
+ ."|$command\n";
+ } else {
+ print STDERR " -> NO UTF8 |$command\n";
+ }
+ }
+ }
+
+ # At this point we have the utf8 encoded results for the accent
+ # commands stack, with all the intermediate results.
+ # For each one we'll check if it is possible to encode it in the
+ # current eight bit output encoding table and, if so set the result
+ # to the character.
+
+ my $eight_bit = '';
+
+ while (@results_stack) {
+ my $char = $results_stack[0]->[0];
+ last if (!defined($char));
+
+ my ($new_eight_bit, $new_codepoint)
+ = _eight_bit_and_unicode_point($char, $encoding);
+ if ($debug) {
+ my $command = 'TEXT';
+ $command = $results_stack[0]->[1]->{'cmdname'}
+ if ($results_stack[0]->[1]);
+ print STDERR "" . Encode::encode('utf8', $char)
+ . " ($command) new_codepoint: $new_codepoint 8bit: $new_eight_bit old:
$eight_bit\n";
+ }
+
+ # no corresponding eight bit character found for a composed character
+ last if (!$new_eight_bit);
+
+ # in that case, the new eight bit character is the same than the one
+ # found with one less character (and it isn't a @dotless{i}). It may
+ # hapen in 2 case, both meaning that there is no corresponding 8bit char:
+ #
+ # -> there are 2 characters in accent. This could happen, for example
+ # if an accent that cannot be rendered is found and it leads to
+ # appending or prepending a character. For example this happens for
+ # @={@,address@hidden, where @,address@hidden is expanded to a 2
character:
+ # n with a tilde, followed by a ,
+ # In that case, the additional utf8 diacritic is appended, which
+ # means that it is composed with the , and leaves n with a tilde
+ # untouched.
+ # -> the diacritic is appended but the normal form doesn't lead
+ # to a composed character, such that the first character
+ # of the string is unchanged. This, for example, happens for
+ # @ubaraccent{a} since there is no composed accent with a and an
+ # underbar.
+ last if ($new_eight_bit eq $eight_bit
+ and !($results_stack[0]->[1]->{'cmdname'} eq 'dotless'
+ and $char eq 'i'));
+ $result = $results_stack[0]->[0];
+ $eight_bit = $new_eight_bit;
+ shift @results_stack;
+ }
+
+ # handle the remaining accents, that have not been converted to 8bit
+ # compatible unicode
+ shift @results_stack if (!defined($results_stack[0]->[1]));
+ while (@results_stack) {
+ $result = &$convert_accent($result,
+ $results_stack[0]->[1],
+ $in_upper_case);
+ shift @results_stack;
+ }
+
+ # An important remark is that the final conversion to 8bit is left to
+ # perl.
+ return $result;
+}
+
# returns the unicode for a command with brace and no arg
# if it is known that it is present for the encoding
Index: t/accents.t
===================================================================
RCS file: /sources/texinfo/texinfo/tp/t/accents.t,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -b -r1.14 -r1.15
--- t/accents.t 29 Sep 2011 20:01:39 -0000 1.14
+++ t/accents.t 29 Sep 2011 20:13:07 -0000 1.15
@@ -77,7 +77,7 @@
my $text = Texinfo::Convert::Text::convert({'contents' => $contents});
my $result =
- Texinfo::Convert::Text::eight_bit_accents($text, $commands_stack,
+ Texinfo::Convert::Unicode::eight_bit_accents($text, $commands_stack,
'iso-8859-1', \&Texinfo::Convert::Text::ascii_accent);
my $html_converter = Texinfo::Convert::HTML->converter();
@@ -93,7 +93,7 @@
$text = Texinfo::Convert::Text::convert({'contents' => $contents},
{'enabled_encoding' => 'utf-8'});
- my $result_unicode = Texinfo::Convert::Text::unicode_accents($text,
+ my $result_unicode = Texinfo::Convert::Unicode::unicode_accents($text,
$commands_stack, \&Texinfo::Convert::Text::ascii_accent);
if (defined($reference)) {
#ok (Encode::decode('iso-8859-1', $reference) eq $result, $name);