[Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files.

emacs-diffs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files.

From:	K. Handa
Subject:	[Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files.
Date:	Fri, 05 Apr 2013 23:19:51 +0900
User-agent:	Bazaar (2.5.0)

------------------------------------------------------------
revno: 112229 [merge]
committer: K. Handa <address@hidden>
branch nick: trunk
timestamp: Fri 2013-04-05 23:19:51 +0900
message:
  Optimize the code for reading UTF-8 files.
modified:
  src/ChangeLog
  src/coding.c
  src/insdel.c

=== modified file 'src/ChangeLog'
--- a/src/ChangeLog     2013-04-05 14:07:02 +0000
+++ b/src/ChangeLog     2013-04-05 14:17:55 +0000
@@ -1,3 +1,23 @@
+2013-04-03  Kenichi Handa  <address@hidden>
+
+       The following changes are to optimize the code for reading UTF-8
+       files.
+
+       * coding.c (check_ascii): Renamed from detect_ascii.  Return value
+       changed.  Check EOL format.  Do not call adjust_coding_eol_type
+       here.
+       (check_utf_8): New function.
+       (adjust_coding_eol_type): Do nothing if already adjusted.
+       (detect_coding): Compare the return value of check_ascii with
+       coding->src_bytes.  Call adjust_coding_eol_type if necessary.
+       (decode_coding_gap): Optimize for valid UTF-8.
+
+2013-03-21  Kenichi Handa  <address@hidden>
+
+       * coding.c (syms_of_coding): Cancel previous change.
+
+       * insdel.c (insert_from_gap): Fix previous change.
+
 2013-04-05  Dmitry Antipov  <address@hidden>
 
        Consistently use platform-specific function to detect window system.
@@ -484,7 +504,7 @@
 
        * coding.c (decode_coding_gap): Fix typo caught by static checking.
 
-2013-03-15  handa  <address@hidden>
+2013-03-15  Kenichi Handa  <address@hidden>
 
        * insdel.c (insert_from_gap): New arg text_at_gap_tail.
        (adjust_after_replace): Make it back to static.  Delete the third

=== modified file 'src/coding.c'
--- a/src/coding.c      2013-03-20 08:08:34 +0000
+++ b/src/coding.c      2013-04-05 14:08:56 +0000
@@ -6072,17 +6072,18 @@
 #define EOL_SEEN_CRLF  4
 
 
-static Lisp_Object adjust_coding_eol_type (struct coding_system *coding, int eol_seen);
-
-
-/* Return true iff all the source bytes are ASCII.
+static Lisp_Object adjust_coding_eol_type (struct coding_system *coding,
+                                          int eol_seen);
+
+
+/* Return the number of ASCII characters at the head of the source.
    By side effects, set coding->head_ascii and coding->eol_seen.  The
    value of coding->eol_seen is "logical or" of EOL_SEEN_LF,
    EOL_SEEN_CR, and EOL_SEEN_CRLF, but the value is reliable only when
    all the source bytes are ASCII.  */
 
-static bool
-detect_ascii (struct coding_system *coding)
+static int
+check_ascii (struct coding_system *coding)
 {
   const unsigned char *src, *end;
   Lisp_Object eol_type = CODING_ID_EOL_TYPE (coding->id);
@@ -6096,21 +6097,20 @@
   src = coding->source;
   end = src + coding->src_bytes;
 
-  if (inhibit_eol_conversion)
+  if (inhibit_eol_conversion
+      || eol_seen != EOL_SEEN_NONE)
     {
       /* We don't have to check EOL format.  */
       while (src < end && !( *src & 0x80)) src++;
-      eol_seen = EOL_SEEN_LF;
-      adjust_coding_eol_type (coding, eol_seen);
-    }
-  else if (eol_seen != EOL_SEEN_NONE)
-    {
-      /* We don't have to check EOL format either.  */
-      while (src < end && !(*src & 0x80)) src++;
+      if (inhibit_eol_conversion)
+       {
+         eol_seen = EOL_SEEN_LF;
+         adjust_coding_eol_type (coding, eol_seen);
+       }
     }
   else
     {
-      end--;                   /* We look ahead one byte.  */
+      end--;               /* We look ahead one byte for "CR LF".  */
       while (src < end)
        {
          int c = *src;
@@ -6118,6 +6118,69 @@
          if (c & 0x80)
            break;
          src++;
+         if (c == '\r')
+           {
+             if (*src == '\n')
+               {
+                 eol_seen |= EOL_SEEN_CRLF;
+                 src++;
+               }
+             else
+               eol_seen |= EOL_SEEN_CR;
+           }
+         else if (c == '\n')
+           eol_seen |= EOL_SEEN_LF;
+       }
+      if (src == end)
+       {
+         int c = *src;
+
+         /* All bytes but the last one C are ASCII.  */
+         if (! (c & 0x80))
+           {
+             if (c == '\r')
+               eol_seen |= EOL_SEEN_CR;
+             else if (c  == '\n')
+               eol_seen |= EOL_SEEN_LF;
+             src++;
+           }
+       }
+    }
+  coding->head_ascii = src - coding->source;
+  coding->eol_seen = eol_seen;
+  return (coding->head_ascii);
+}
+
+
+/* Return the number of characters at the source if all the bytes are
+   valid UTF-8 (of Unicode range).  Otherwise, return -1.  By side
+   effects, update coding->eol_seen.  The value of coding->eol_seen is
+   "logical or" of EOL_SEEN_LF, EOL_SEEN_CR, and EOL_SEEN_CRLF, but
+   the value is reliable only when all the source bytes are valid
+   UTF-8.  */
+
+static int
+check_utf_8 (struct coding_system *coding)
+{
+  const unsigned char *src, *end;
+  int eol_seen = coding->eol_seen;
+  int nchars = coding->head_ascii;
+
+  if (coding->head_ascii < 0)
+    check_ascii (coding);
+  else
+    coding_set_source (coding);
+  src = coding->source + coding->head_ascii;
+  /* We look ahead one byte for CR LF.  */
+  end = coding->source + coding->src_bytes - 1;
+
+  while (src < end)
+    {
+      int c = *src;
+
+      if (UTF_8_1_OCTET_P (*src))
+       {
+         src++;
          if (c < 0x20)
            {
              if (c == '\r')
@@ -6126,6 +6189,7 @@
                    {
                      eol_seen |= EOL_SEEN_CRLF;
                      src++;
+                     nchars++;
                    }
                  else
                    eol_seen |= EOL_SEEN_CR;
@@ -6134,27 +6198,58 @@
                eol_seen |= EOL_SEEN_LF;
            }
        }
-      if (src > end)
-       /* The last two bytes are CR LF, which means that we have
-          scanned all bytes. */
-       end++;
-      else if (src == end)
-       {
-         end++;
-         if (! (*src & 0x80))
-           {
-             if (*src == '\r')
-               eol_seen |= EOL_SEEN_CR;
-             else if (*src  == '\n')
-               eol_seen |= EOL_SEEN_LF;
-             src++;
-           }
-       }
-      adjust_coding_eol_type (coding, eol_seen);
-    }
-  coding->head_ascii = src - coding->source;
+      else if (UTF_8_2_OCTET_LEADING_P (c))
+       {
+         if (c < 0xC2          /* overlong sequence */
+             || src + 1 >= end
+             || ! UTF_8_EXTRA_OCTET_P (src[1]))
+           return -1;
+         src += 2;
+       }
+      else if (UTF_8_3_OCTET_LEADING_P (c))
+       {
+         if (src + 2 >= end
+             || ! (UTF_8_EXTRA_OCTET_P (src[1])
+                   && UTF_8_EXTRA_OCTET_P (src[2])))
+           return -1;
+         c = (((c & 0xF) << 12)
+              | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
+         if (c < 0x800                       /* overlong sequence */
+             || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
+           return -1;
+         src += 3;
+       }
+      else if (UTF_8_4_OCTET_LEADING_P (c))
+       {
+         if (src + 3 >= end
+             || ! (UTF_8_EXTRA_OCTET_P (src[1])
+                   && UTF_8_EXTRA_OCTET_P (src[2])
+                   && UTF_8_EXTRA_OCTET_P (src[3])))
+           return -1;
+         c = (((c & 0x7) << 18) | ((src[1] & 0x3F) << 12)
+              | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F));
+         if (c < 0x10000       /* overlong sequence */
+             || c >= 0x110000) /* non-Unicode character  */
+           return -1;
+         src += 4;
+       }
+      else
+       return -1;
+      nchars++;
+    }
+
+  if (src == end)
+    {
+      if (! UTF_8_1_OCTET_P (*src))
+       return -1;
+      nchars++;
+      if (*src == '\r')
+       eol_seen |= EOL_SEEN_CR;
+      else if (*src  == '\n')
+       eol_seen |= EOL_SEEN_LF;
+    }
   coding->eol_seen = eol_seen;
-  return (src == end);
+  return nchars;
 }
 
 
@@ -6269,6 +6364,9 @@
   Lisp_Object eol_type;
 
   eol_type = CODING_ID_EOL_TYPE (coding->id);
+  if (! VECTORP (eol_type))
+    /* Already adjusted.  */
+    return eol_type;
   if (eol_seen & EOL_SEEN_LF)
     {
       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
@@ -6360,7 +6458,8 @@
                        {
                          coding->eol_seen |= EOL_SEEN_CRLF;
                          src++;
-                         coding->head_ascii++;
+                         if (! eight_bit_found)
+                           coding->head_ascii++;
                        }
                      else
                        coding->eol_seen |= EOL_SEEN_CR;
@@ -6461,9 +6560,14 @@
       coding_systems
        = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
       detect_info.found = detect_info.rejected = 0;
-      if (detect_ascii (coding))
+      if (check_ascii (coding) == coding->src_bytes)
        {
+         int head_ascii = coding->head_ascii;
+
+         if (coding->eol_seen != EOL_SEEN_NONE)
+           adjust_coding_eol_type (coding, coding->eol_seen);
          setup_coding_system (XCDR (coding_systems), coding);
+         coding->head_ascii = head_ascii;
        }
       else
        {
@@ -7620,15 +7724,27 @@
   if (CODING_REQUIRE_DETECTION (coding))
     detect_coding (coding);
   attrs = CODING_ID_ATTRS (coding->id);
-  if (! disable_ascii_optimization)
+  if (! disable_ascii_optimization
+      && ! coding->src_multibyte
+      && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
+      && NILP (CODING_ATTR_POST_READ (attrs))
+      && NILP (get_translation_table (attrs, 0, NULL)))
     {
-      if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
-         && NILP (CODING_ATTR_POST_READ (attrs))
-         && NILP (get_translation_table (attrs, 0, NULL))
-         && (coding->head_ascii >= 0 /* We've already called detect_coding */
-             ? coding->head_ascii == bytes
-             : detect_ascii (coding)))
-       {
+      chars = coding->head_ascii;
+      if (chars < 0)
+       chars = check_ascii (coding);
+      if (chars != bytes)
+       {
+         if (EQ (CODING_ATTR_TYPE (attrs), Qutf_8))
+           chars = check_utf_8 (coding);
+         else
+           chars = -1;
+       }
+      if (chars >= 0)
+       {
+         if (coding->eol_seen != EOL_SEEN_NONE)
+           adjust_coding_eol_type (coding, coding->eol_seen);
+
          if (coding->eol_seen == EOL_SEEN_CR)
            {
              unsigned char *src_end = GAP_END_ADDR;
@@ -7645,6 +7761,7 @@
              unsigned char *src = GAP_END_ADDR;
              unsigned char *src_beg = src - coding->src_bytes;
              unsigned char *dst = src;
+             ptrdiff_t diff;
 
              while (src_beg < src)
                {
@@ -7652,10 +7769,13 @@
                  if (*src == '\n')
                    src--;
                }
-             bytes -= dst - src;
+             diff = dst - src;
+             bytes -= diff;
+             chars -= diff;
            }
-         coding->produced_char = coding->produced = bytes;
-         insert_from_gap (bytes, bytes, 1);
+         coding->produced = bytes;
+         coding->produced_char = chars;
+         insert_from_gap (chars, bytes, 1);
          return;
        }
     }
@@ -10877,7 +10997,7 @@
   DEFVAR_BOOL ("disable-ascii-optimization", disable_ascii_optimization,
               doc: /* If non-nil, Emacs does not optimize code decoder for ASCII files.
 Internal use only.  Removed after the experimental optimizer gets stable. */);
-  disable_ascii_optimization = 1;
+  disable_ascii_optimization = 0;
 
   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
               doc: /* Char table for translating self-inserting characters.

=== modified file 'src/insdel.c'
--- a/src/insdel.c      2013-03-28 14:04:49 +0000
+++ b/src/insdel.c      2013-04-05 14:17:55 +0000
@@ -983,6 +983,9 @@
 void
 insert_from_gap (ptrdiff_t nchars, ptrdiff_t nbytes, bool text_at_gap_tail)
 {
+  int ins_charpos = GPT;
+  int ins_bytepos = GPT_BYTE;
+
   if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
     nchars = nbytes;
 
@@ -1003,18 +1006,18 @@
 
   eassert (GPT <= GPT_BYTE);
 
-  adjust_overlays_for_insert (GPT - nchars, nchars);
-  adjust_markers_for_insert (GPT - nchars, GPT_BYTE - nbytes,
-                            GPT, GPT_BYTE, 0);
+  adjust_overlays_for_insert (ins_charpos, nchars);
+  adjust_markers_for_insert (ins_charpos, ins_bytepos,
+                            ins_charpos + nchars, ins_bytepos + nbytes, 0);
 
   if (buffer_intervals (current_buffer))
     {
-      offset_intervals (current_buffer, GPT - nchars, nchars);
-      graft_intervals_into_buffer (NULL, GPT - nchars, nchars,
+      offset_intervals (current_buffer, ins_charpos, nchars);
+      graft_intervals_into_buffer (NULL, ins_charpos, nchars,
                                   current_buffer, 0);
     }
 
-  if (! text_at_gap_tail && GPT - nchars < PT)
+  if (ins_charpos < PT)
     adjust_point (nchars, nbytes);
 
   check_markers ();

[Prev in Thread]

Current Thread

[Next in Thread]

[Emacs-diffs] /srv/bzr/emacs/trunk r112229: Optimize the code for reading UTF-8 files., K. Handa <=

Prev by Date: [Emacs-diffs] /srv/bzr/emacs/trunk r112228: Consistently use platform-specific function to detect window system.
Next by Date: [Emacs-diffs] /srv/bzr/emacs/trunk r112230: ispell.el (ispell-set-spellchecker-params): Really set `ispell-args' for all equivs.
Previous by thread: [Emacs-diffs] /srv/bzr/emacs/trunk r112228: Consistently use platform-specific function to detect window system.
Next by thread: [Emacs-diffs] /srv/bzr/emacs/trunk r112230: ispell.el (ispell-set-spellchecker-params): Really set `ispell-args' for all equivs.
Index(es):
- Date
- Thread