[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH v3 1/4] Fix width computation
From: |
Robbie Harwood |
Subject: |
Re: [PATCH v3 1/4] Fix width computation |
Date: |
Wed, 12 Jan 2022 14:48:11 -0500 |
Bruno Haible <bruno@clisp.org> writes:
> Robbie Harwood wrote:
>> From: Vladimir 'phcoder' Serbinenko <phcoder@gmail.com>
>>
>> [rharwood@redhat.com: merge with later fix from pjones@redhat.com]
>> Signed-off-by: Robbie Harwood <rharwood@redhat.com>
>> ---
>> lib/argp-fmtstream.c | 80 +++++++++++++++++++++++++++++++++++++-------
>> lib/argp-help.c | 3 +-
>> lib/mbswidth.c | 15 +++++++++
>> lib/mbswidth.h | 4 +++
>> 4 files changed, 88 insertions(+), 14 deletions(-)
>
> This patch will need several more iterations, as it mixes two good
> ideas, with - so far - imperfect implementations:
>
> * The need to use wcwidth in argp-fmtstream arises because the
> help strings are internationalized. But for internationalized strings,
> line breaking by looking for spaces is just wrong. (It works for
> Russian and Greek but not for Chinese.) A much better algorithm
> (that works for most languages, except Thai) is found in
> GNU libunistring 1.0. But the code in glibc cannot use libunistring;
> so there likely will need to be some '#ifdef _LIBC'.
>
> * The idea to parse escape sequences in a special way, before invoking
> wcwidth, it nice. But's it's far from complete. IMO one needs to look
> at GNU teseq, the ISO 2022 standard, and other de-facto standards
> when implementing that. And it should then be implemented in wcswidth,
> mbswidth, etc. uniformly.
I've put together at using libunistring (attached). Unfortunately it
doesn't seem to find ulc_width_linebreaks() at link time, and I'm not
familiar enough with gnulib to understand why. Feedback on that (or
other parts of it) appreciated.
Be well,
--Robbie
signature.asc
Description: PGP signature
>From 12c09fc6632e295101ee6957c952bbf1b5c0165a Mon Sep 17 00:00:00 2001
From: Robbie Harwood <rharwood@redhat.com>
Date: Mon, 10 Jan 2022 17:17:39 -0500
Subject: [PATCH] argp-fmtstream: use ulc_width_linebreaks()
ulc_width_linebreaks() can't be used in _LIBC, so fall back to a
best-effort solution there that won't work with Chinese.
Signed-off-by: Robbie Harwood <rharwood@redhat.com>
---
lib/argp-fmtstream.c | 361 +++++++++++++++++++------------------------
modules/argp | 1 +
2 files changed, 161 insertions(+), 201 deletions(-)
diff --git a/lib/argp-fmtstream.c b/lib/argp-fmtstream.c
index 78c29e38c..12d26effb 100644
--- a/lib/argp-fmtstream.c
+++ b/lib/argp-fmtstream.c
@@ -28,9 +28,11 @@
#include <errno.h>
#include <stdarg.h>
#include <ctype.h>
+#include <wchar.h>
#include "argp-fmtstream.h"
#include "argp-namefrob.h"
+#include "mbswidth.h"
#ifndef ARGP_FMTSTREAM_USE_LINEWRAP
@@ -42,6 +44,8 @@
# include <wchar.h>
# include <libio/libioP.h>
# define __vsnprintf(s, l, f, a) _IO_vsnprintf (s, l, f, a)
+#else
+# include <unilbrk.h>
#endif
#define INIT_BUF_SIZE 200
@@ -115,233 +119,188 @@ weak_alias (__argp_fmtstream_free, argp_fmtstream_free)
#endif
#endif
-/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the
- end of its buffer. This code is mostly from glibc stdio/linewrap.c. */
-void
-__argp_fmtstream_update (argp_fmtstream_t fs)
+/* Insert suffix and left margin at POINT_OFFS, flushing as needed. */
+static void
+line_at_point (argp_fmtstream_t fs, const char *suffix)
{
- char *buf, *nl;
- size_t len;
+ size_t i, space_needed = fs->wmargin;
+ char *nl, *queue = fs->buf + fs->point_offs;
- /* Scan the buffer for newlines. */
- buf = fs->buf + fs->point_offs;
- while (buf < fs->p)
+ if (suffix)
+ space_needed += strlen(suffix);
+
+ while (fs->p + space_needed > fs->end)
{
- size_t r;
-
- if (fs->point_col == 0 && fs->lmargin != 0)
+ /* Output the first line so we can use the space. */
+ nl = memchr (fs->buf, '\n', fs->point_offs);
+ if (nl == NULL)
{
- /* We are starting a new line. Print spaces to the left margin. */
- const size_t pad = fs->lmargin;
- if (fs->p + pad < fs->end)
- {
- /* We can fit in them in the buffer by moving the
- buffer text up and filling in the beginning. */
- memmove (buf + pad, buf, fs->p - buf);
- fs->p += pad; /* Compensate for bigger buffer. */
- memset (buf, ' ', pad); /* Fill in the spaces. */
- buf += pad; /* Don't bother searching them. */
- }
- else
- {
- /* No buffer space for spaces. Must flush. */
- size_t i;
- for (i = 0; i < pad; i++)
- {
+ /* Line longer than buffer - shouldn't happen. Truncate. */
+ nl = queue - 1;
+ *nl = '\n';
+ }
+
#ifdef _LIBC
- if (_IO_fwide (fs->stream, 0) > 0)
- putwc_unlocked (L' ', fs->stream);
- else
+ __fxprintf (fs->stream, "%.*s\n",
+ (int) (nl - fs->buf), fs->buf);
+#else
+ if (nl > fs->buf)
+ fwrite_unlocked (fs->buf, 1, nl - fs->buf, fs->stream);
+ putc_unlocked ('\n', fs->stream);
#endif
- putc_unlocked (' ', fs->stream);
- }
- }
- fs->point_col = pad;
- }
- len = fs->p - buf;
- nl = memchr (buf, '\n', len);
+ memmove (fs->buf, nl + 1, nl + 1 - fs->buf);
+ fs->p -= nl + 1 - fs->buf;
+ fs->point_offs -= nl + 1 - fs->buf;
+ }
- if (fs->point_col < 0)
- fs->point_col = 0;
+ memmove (queue + space_needed, queue, fs->p - queue);
+ if (suffix)
+ {
+ memcpy (queue, suffix, strlen (suffix));
+ fs->point_offs += strlen (suffix);
+ }
+ for (i = 0; i < fs->lmargin && fs->point_col != -1; i++)
+ fs->buf[fs->point_offs++] = ' ';
+ fs->point_col = fs->lmargin;
+}
- if (!nl)
+#ifdef _LIBC
+# define UC_BREAK_UNDEFINED 0
+# define UC_BREAK_POSSIBLE 1
+# define UC_BREAK_HYPHENATION 2
+# define UC_BREAK_MANDATORY 3
+static int
+uwl_shim(const char *s, size_t n, int width, int start_column,
+ const char *encoding, char *p)
+{
+ size_t i = 0, cur_col = start_column, last_option = 0, c_len;
+ mbstate_t ps;
+ wchar_t c;
+
+ memset(&ps, 0, sizeof (ps));
+ memset(p, UC_BREAK_UNDEFINED, n);
+
+ while (i < n)
+ {
+ c_len = mbrtowc (&c, s + i, n, &ps);
+ if (c_len == 0 || c_len == -2)
{
- /* The buffer ends in a partial line. */
-
- if (fs->point_col + len < fs->rmargin)
- {
- /* The remaining buffer text is a partial line and fits
- within the maximum line width. Advance point for the
- characters to be written and stop scanning. */
- fs->point_col += len;
- break;
- }
- else
- /* Set the end-of-line pointer for the code below to
- the end of the buffer. */
- nl = fs->p;
+ break;
}
- else if (fs->point_col + (nl - buf) < (ssize_t) fs->rmargin)
+ else if (c_len == -1)
{
- /* The buffer contains a full line that fits within the maximum
- line width. Reset point and scan the next line. */
- fs->point_col = 0;
- buf = nl + 1;
+ /* Malformed. Walk forward until things make sense again. */
+ i++;
+ cur_col++;
continue;
}
- /* This line is too long. */
- r = fs->rmargin - 1;
-
- if (fs->wmargin < 0)
+ if (c == L'\n')
{
- /* Truncate the line by overwriting the excess with the
- newline and anything after it in the buffer. */
- if (nl < fs->p)
+ p[i] = UC_BREAK_MANDATORY;
+ last_option = 0;
+ cur_col = 0;
+ }
+ else if (c == L' ' || c == '\t')
+ {
+ /* Best effort - won't help with Chinese or Thai. */
+ last_option = i;
+ }
+ else if (cur_col >= width && last_option != 0)
+ {
+ p[last_option] = UC_BREAK_POSSIBLE;
+ last_option = 0;
+ cur_col = 0;
+ }
+
+ i += c_len;
+ cur_col += wcwidth(c);
+ }
+
+ return n - last_break;
+}
+#define ulc_width_linebreaks(s, n, width, start_column, at_end_columns, \
+ o, encoding, p) \
+ uwl_shim(s, n, width, start_column, encoding, p)
+#endif
+
+/* Process FS's buffer so that line wrapping is done from POINT_OFFS to the
+ end of its buffer. */
+void
+__argp_fmtstream_update (argp_fmtstream_t fs)
+{
+ char *lns, *nl, *c, *queue = fs->buf + fs->point_offs;
+ int col;
+ size_t i, proc_len = fs->p - queue;
+
+ if (queue >= fs->p)
+ return;
+
+ if (fs->wmargin == -1)
+ {
+ while (fs->buf + fs->point_offs < fs->p)
+ {
+ nl = memchr (fs->buf + fs->point_offs, '\n',
+ fs->rmargin - fs->point_col);
+ if (nl)
{
- memmove (buf + (r - fs->point_col), nl, fs->p - nl);
- fs->p -= buf + (r - fs->point_col) - nl;
- /* Reset point for the next line and start scanning it. */
- fs->point_col = 0;
- buf += r + 1; /* Skip full line plus \n. */
+ fs->point_offs = nl - queue;
+ line_at_point (fs, NULL);
+ continue;
}
- else
+
+ /* Truncate until end of line. */
+ fs->point_offs += fs->rmargin - fs->point_col;
+ nl = memchr (fs->buf + fs->point_offs + fs->rmargin - fs->point_col,
+ '\n', fs->p - fs->buf - fs->point_offs);
+ if (! nl)
{
- /* The buffer ends with a partial line that is beyond the
- maximum line width. Advance point for the characters
- written, and discard those past the max from the buffer. */
- fs->point_col += len;
- fs->p -= fs->point_col - r;
- break;
+ /* This is the last line. */
+ fs->buf[fs->point_offs++] = '\n';
+ fs->p = fs->buf + fs->point_offs;
+ line_at_point(fs, NULL);
+ return;
}
+
+ fs->buf[fs->point_offs++] = '\n';
+ memmove (fs->buf + fs->point_offs, nl + 1,
+ nl + 1 - fs->buf - fs->point_offs);
+ line_at_point(fs, NULL);
+ }
+ return;
+ }
+
+ lns = (char *) malloc (proc_len);
+ if (! lns)
+ return;
+
+ col = ulc_width_linebreaks (queue, proc_len, fs->rmargin - fs->wmargin,
+ fs->point_col == -1 ? 0 : fs->point_col, 0,
+ NULL, locale_charset (), lns);
+ for (i = 0; i < proc_len; i++)
+ {
+ if (lns[i] == UC_BREAK_HYPHENATION)
+ {
+ line_at_point (fs, "-\n");
+ }
+ else if (lns[i] == UC_BREAK_POSSIBLE)
+ {
+ line_at_point (fs, "\n");
+ }
+ else if (lns[i] == UC_BREAK_MANDATORY)
+ {
+ fs->point_offs++;
+ line_at_point (fs, NULL);
}
else
{
- /* Do word wrap. Go to the column just past the maximum line
- width and scan back for the beginning of the word there.
- Then insert a line break. */
-
- char *p, *nextline;
- int i;
-
- p = buf + (r + 1 - fs->point_col);
- while (p >= buf && !isblank ((unsigned char) *p))
- --p;
- nextline = p + 1; /* This will begin the next line. */
-
- if (nextline > buf)
- {
- /* Swallow separating blanks. */
- if (p >= buf)
- do
- --p;
- while (p >= buf && isblank ((unsigned char) *p));
- nl = p + 1; /* The newline will replace the first blank. */
- }
- else
- {
- /* A single word that is greater than the maximum line width.
- Oh well. Put it on an overlong line by itself. */
- p = buf + (r + 1 - fs->point_col);
- /* Find the end of the long word. */
- if (p < nl)
- do
- ++p;
- while (p < nl && !isblank ((unsigned char) *p));
- if (p == nl)
- {
- /* It already ends a line. No fussing required. */
- fs->point_col = 0;
- buf = nl + 1;
- continue;
- }
- /* We will move the newline to replace the first blank. */
- nl = p;
- /* Swallow separating blanks. */
- do
- ++p;
- while (isblank ((unsigned char) *p));
- /* The next line will start here. */
- nextline = p;
- }
-
- /* Note: There are a bunch of tests below for
- NEXTLINE == BUF + LEN + 1; this case is where NL happens to fall
- at the end of the buffer, and NEXTLINE is in fact empty (and so
- we need not be careful to maintain its contents). */
-
- if ((nextline == buf + len + 1
- ? fs->end - nl < fs->wmargin + 1
- : nextline - (nl + 1) < fs->wmargin)
- && fs->p > nextline)
- {
- /* The margin needs more blanks than we removed. */
- if (fs->end - fs->p > fs->wmargin + 1)
- /* Make some space for them. */
- {
- size_t mv = fs->p - nextline;
- memmove (nl + 1 + fs->wmargin, nextline, mv);
- nextline = nl + 1 + fs->wmargin;
- len = nextline + mv - buf;
- *nl++ = '\n';
- }
- else
- /* Output the first line so we can use the space. */
- {
-#ifdef _LIBC
- __fxprintf (fs->stream, "%.*s\n",
- (int) (nl - fs->buf), fs->buf);
-#else
- if (nl > fs->buf)
- fwrite_unlocked (fs->buf, 1, nl - fs->buf, fs->stream);
- putc_unlocked ('\n', fs->stream);
-#endif
-
- len += buf - fs->buf;
- nl = buf = fs->buf;
- }
- }
- else
- /* We can fit the newline and blanks in before
- the next word. */
- *nl++ = '\n';
-
- if (nextline - nl >= fs->wmargin
- || (nextline == buf + len + 1 && fs->end - nextline >=
fs->wmargin))
- /* Add blanks up to the wrap margin column. */
- for (i = 0; i < fs->wmargin; ++i)
- *nl++ = ' ';
- else
- for (i = 0; i < fs->wmargin; ++i)
-#ifdef _LIBC
- if (_IO_fwide (fs->stream, 0) > 0)
- putwc_unlocked (L' ', fs->stream);
- else
-#endif
- putc_unlocked (' ', fs->stream);
-
- /* Copy the tail of the original buffer into the current buffer
- position. */
- if (nl < nextline)
- memmove (nl, nextline, buf + len - nextline);
- len -= nextline - buf;
-
- /* Continue the scan on the remaining lines in the buffer. */
- buf = nl;
-
- /* Restore bufp to include all the remaining text. */
- fs->p = nl + len;
-
- /* Reset the counter of what has been output this line. If wmargin
- is 0, we want to avoid the lmargin getting added, so we set
- point_col to a magic value of -1 in that case. */
- fs->point_col = fs->wmargin ? fs->wmargin : -1;
+ fs->point_offs++;
}
}
+ fs->point_col = col;
- /* Remember that we've scanned as far as the end of the buffer. */
- fs->point_offs = fs->p - fs->buf;
+ free (lns);
}
/* Ensure that FS has space for AMOUNT more bytes in its buffer, either by
diff --git a/modules/argp b/modules/argp
index 85eefe2b1..1a958b1f1 100644
--- a/modules/argp
+++ b/modules/argp
@@ -36,6 +36,7 @@ stdio
strerror
memchr
memmove
+libunistring
configure.ac:
gl_ARGP
--
2.34.1