[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 3/5] str: Add some functions for handling UTF-8.
From: Ben Pfaff
Subject: [PATCH 3/5] str: Add some functions for handling UTF-8.
Date: Thu, 18 Nov 2010 22:21:30 -0800
---
Smake | 2 +
src/libpspp/str.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
src/libpspp/str.h | 8 +++++
3 files changed, 86 insertions(+), 0 deletions(-)
diff --git a/Smake b/Smake
index 7855ce1..6e6b702 100644
--- a/Smake
+++ b/Smake
@@ -71,8 +71,10 @@ GNULIB_MODULES = \
unilbrk/ulc-width-linebreaks \
unistd \
unistr/u8-cpy \
+ unistr/u8-mbtouc \
unistr/u8-strlen \
unistr/u8-strncat \
+ unitypes \
unlocked-io \
vasprintf-posix \
version-etc \
diff --git a/src/libpspp/str.c b/src/libpspp/str.c
index cd2363a..7b67722 100644
--- a/src/libpspp/str.c
+++ b/src/libpspp/str.c
@@ -22,6 +22,7 @@
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
+#include <unistr.h>
#include "libpspp/cast.h"
#include "libpspp/message.h"
@@ -740,6 +741,81 @@ ss_xstrdup (struct substring ss)
s[ss.length] = '\0';
return s;
}
+/* UTF-8. */
+
+/* Decodes the UTF-8 sequence at the start of S by delegating to ss_at_mb()
+ with an offset of 0. The return value is either a Unicode code point in the
+ range 0 to 0x10ffff, or UINT32_MAX if S is empty. */
+ucs4_t
+ss_first_mb (struct substring s)
+{
+ return ss_at_mb (s, 0);
+}
+
+/* Returns the length, in bytes, of the UTF-8 character at the beginning of S,
+ as computed by ss_at_mblen() with an offset of 0.
+ The return value is 0 if S is empty, otherwise between 1 and 4. */
+int
+ss_first_mblen (struct substring s)
+{
+ return ss_at_mblen (s, 0);
+}
+
+/* Removes the UTF-8 character at the beginning of S by advancing S's string
+ pointer and reducing its length. Returns the Unicode code point removed (in
+ the range 0 to 0x10ffff), or UINT32_MAX, leaving S unmodified, if S was empty. */
+ucs4_t
+ss_get_mb (struct substring *s)
+{
+ if (s->length > 0)
+ {
+ ucs4_t uc;
+ int n; /* Value returned by u8_mbtouc(); S is advanced by this many bytes. */
+
+ n = u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s->string), s->length);
+ s->string += n;
+ s->length -= n;
+ return uc;
+ }
+ else
+ return UINT32_MAX;
+}
+
+/* Returns the character represented by the UTF-8 sequence starting OFS bytes
+ into S, decoding with u8_mbtouc() over the S.length - OFS bytes that remain.
+ The result is a Unicode code point in the range 0 to 0x10ffff, or UINT32_MAX
+ if OFS is at or beyond the end of S (that is, if OFS >= S.length).
+ (Returns 0xfffd if OFS points into the middle, not the beginning, of a UTF-8
+ sequence.) */
+ucs4_t
+ss_at_mb (struct substring s, size_t ofs)
+{
+ if (s.length > ofs)
+ {
+ ucs4_t uc;
+ u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s.string + ofs),
+ s.length - ofs);
+ return uc;
+ }
+ else
+ return UINT32_MAX;
+}
+
+/* Returns the length, in bytes, of the UTF-8 sequence starting OFS bytes into
+ S, as reported by u8_mbtouc(). The return value is 0 if OFS is at or beyond
+ the end of S (that is, if OFS >= S.length), otherwise between 1 and 4. */
+int
+ss_at_mblen (struct substring s, size_t ofs)
+{
+ if (s.length > ofs)
+ {
+ ucs4_t uc; /* Decoded code point is never read; only the length is used. */
+ return u8_mbtouc (&uc, CHAR_CAST (const uint8_t *, s.string + ofs),
+ s.length - ofs);
+ }
+ else
+ return 0;
+}
/* Initializes ST as an empty string. */
void
diff --git a/src/libpspp/str.h b/src/libpspp/str.h
index ecf9e6e..ddfd2f8 100644
--- a/src/libpspp/str.h
+++ b/src/libpspp/str.h
@@ -23,6 +23,7 @@
#include <stdint.h>
#include <stdio.h>
#include <string.h>
+#include <unitypes.h>
#include "compiler.h"
#include "memcasecmp.h"
@@ -127,6 +128,13 @@ int ss_equals (struct substring, struct substring);
int ss_equals_case (struct substring, struct substring);
size_t ss_pointer_to_position (struct substring, const char *);
char *ss_xstrdup (struct substring);
+
+/* UTF-8. */
+ucs4_t ss_first_mb (struct substring);
+int ss_first_mblen (struct substring);
+ucs4_t ss_get_mb (struct substring *);
+ucs4_t ss_at_mb (struct substring, size_t ofs);
+int ss_at_mblen (struct substring, size_t ofs);
/* Variable length strings. */
--
1.7.1
- [PATCH 0/5] refactor command name parsing, Ben Pfaff, 2010/11/19
- [PATCH 1/5] str: Inline some trivial functions., Ben Pfaff, 2010/11/19
- [PATCH 3/5] str: Add some functions for handling UTF-8.,
Ben Pfaff <=
- [PATCH 4/5] identifier: Add some functions for Unicode syntax., Ben Pfaff, 2010/11/19
- [PATCH 5/5] command: Factor command name matching out of command.c., Ben Pfaff, 2010/11/19
- [PATCH 2/5] str: Change "char" to "byte" in function names., Ben Pfaff, 2010/11/19
- Re: [PATCH 0/5] refactor command name parsing, John Darrington, 2010/11/19