[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
bug#24603: [PATCHv5 08/11] Implement rules for title-casing Dutch ij ‘letter’ (bug#24603)
From: |
Michal Nazarewicz |
Subject: |
bug#24603: [PATCHv5 08/11] Implement rules for title-casing Dutch ij ‘letter’ (bug#24603) |
Date: |
Thu, 9 Mar 2017 22:51:47 +0100 |
Dutch treats ‘ij’ as a single letter and when capitalising a word it
should be capitalised as such (i.e. ‘ij’ becomes ‘IJ’). Implement that.
* src/casefiddle.c (struct casing_context): Add a ‘special’ field which
determines if any special casing rules are in effect.
(prepare_casing_context): Interpret ‘buffer-language’ variable and set
ctx->special accordingly. This allows for per-language special rules.
For now only Dutch (‘nl’) is handled specially.
(case_character_impl): Add handling of a Dutch ‘ij’ letter.
* test/src/casefiddle-tests.el (casefiddle-tests--test-casing): Add
test cases for Dutch ‘ij’.
---
src/casefiddle.c | 56 ++++++++++++++++++++++++++++++++++++++++++++
test/src/casefiddle-tests.el | 7 +++++-
2 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/src/casefiddle.c b/src/casefiddle.c
index 2f573782115..d59684c7b8e 100644
--- a/src/casefiddle.c
+++ b/src/casefiddle.c
@@ -49,6 +49,32 @@ struct casing_context {
bool inbuffer;
/* Whether we are inside of a word. */
bool inword;
+
+ /* Determines which special casing rules need to be applied as well as tracks
+ state for some of the transformations. */
+ enum {
+ /* No special casing rules need to be applied. */
+ SPECIAL_NONE,
+
+ /* In Dutch, ‘ij’ is a digraph and when capitalised the whole thing is upper
+ cased. Unicode has ‘ĳ’ and ‘Ĳ’ (with proper casing mappings) but they
+ aren’t always used so we cannot/should not rely on them.
+
+ Note that the rule for capitalising ‘ij’ as a single letter is not present in
+ Unicode 9.0’s SpecialCasing.txt. On the flip side, Firefox implements
+ this as well so we’re not completely alone.
+
+ There are words where ‘ij’ are two separate letters (such as bijectie or
+ bijoux) in which case the capitalisation rules do not apply. I (mina86)
+ have googled this a little and couldn’t find a Dutch word which begins
+ with ‘ij’ that is not a digraph so we should be in the clear since we
+ only care about the initial. */
+ /* Apply Dutch rules for capitalising ‘ij’. */
+ SPECIAL_NL,
+ /* As above and the previous character was upcased ‘i’ so if we now see ‘j’
+ it needs to be upcased as well. */
+ SPECIAL_NL_UPCASE_J
+ } special;
};
/* Initialise CTX structure and prepares related global data for casing
@@ -57,6 +83,8 @@ static void
prepare_casing_context (struct casing_context *ctx,
enum case_action flag, bool inbuffer)
{
+ Lisp_Object lang;
+
ctx->flag = flag;
ctx->inbuffer = inbuffer;
ctx->inword = false;
@@ -65,6 +93,7 @@ prepare_casing_context (struct casing_context *ctx,
: Qnil;
ctx->specialcase_char_table =
uniprop_table (intern_c_string ("special-casing"));
+ ctx->special = SPECIAL_NONE;
/* If the case table is flagged as modified, rescan it. */
if (NILP (XCHAR_TABLE (BVAR (current_buffer, downcase_table))->extras[1]))
@@ -72,6 +101,14 @@ prepare_casing_context (struct casing_context *ctx,
if (inbuffer && (int) flag >= (int) CASE_CAPITALIZE)
SETUP_BUFFER_SYNTAX_TABLE (); /* For syntax_prefix_flag_p. */
+
+ lang = BVAR(current_buffer, language);
+ if (STRINGP (lang) && SCHARS (lang) >= 2)
+ switch ((SREF(lang, 0) << 8) | SREF(lang, 1) | 0x2020u) {
+ case ('n' << 8) | 'l': /* Dutch */
+ if ((int) flag >= (int) CASE_CAPITALIZE)
+ ctx->special = SPECIAL_NL;
+ }
}
struct casing_str_buf {
@@ -95,6 +132,25 @@ case_character_impl (struct casing_str_buf *buf,
bool was_inword;
int cased;
+ /* Handle Dutch ij. Note that SPECIAL_NL and SPECIAL_NL_UPCASE_J imply that
+ ctx->flag ≥ CASE_CAPITALIZE. */
+ if (ctx->special == SPECIAL_NL && ch == 'i' && !ctx->inword)
+ {
+ ctx->special = SPECIAL_NL_UPCASE_J;
+ ctx->inword = true;
+ cased = 'I';
+ goto done;
+ }
+ else if (ctx->special == SPECIAL_NL_UPCASE_J)
+ {
+ ctx->special = SPECIAL_NL;
+ if (ch == 'j')
+ {
+ cased = 'J';
+ goto done;
+ }
+ }
+
/* Update inword state */
was_inword = ctx->inword;
ctx->inword = SYNTAX (ch) == Sword &&
diff --git a/test/src/casefiddle-tests.el b/test/src/casefiddle-tests.el
index 10450360eab..5e38a97d256 100644
--- a/test/src/casefiddle-tests.el
+++ b/test/src/casefiddle-tests.el
@@ -135,6 +135,7 @@ casefiddle-tests--test-casing
(lambda (errors test)
(let* ((input (car test))
(expected (cdr test))
+ (buffer-language (or (nth 5 test) "en_GB"))
(func-pairs '((upcase upcase-region)
(downcase downcase-region)
(capitalize capitalize-region)
@@ -200,7 +201,11 @@ casefiddle-tests--test-casing
("Σ Σ" "Σ Σ" "σ σ" "Σ Σ" "Σ Σ")
("όσος" "ΌΣΟΣ" "όσος" "Όσος" "Όσος")
;; If sigma is already lower case, we don’t want to change it.
- ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ"))))))
+ ("όσοσ" "ΌΣΟΣ" "όσοσ" "Όσοσ" "Όσοσ")
+
+ ;; Dutch 'ij' is capitalised as single digraph.
+ ("ijsland" "IJSLAND" "ijsland" "Ijsland" "Ijsland")
+ ("ijsland" "IJSLAND" "ijsland" "IJsland" "IJsland" "nl"))))))
(ert-deftest casefiddle-tests-casing-byte8 ()
(should-not
--
2.12.0.246.ga2ecc84866-goog
- bug#24603: [PATCHv5 03/11] Add support for title-casing letters (bug#24603), (continued)
- bug#24603: [PATCHv5 03/11] Add support for title-casing letters (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 06/11] Implement special sigma casing rule (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 04/11] Split up casify_region function (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 07/11] Introduce ‘buffer-language’ buffer-locar variable, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 02/11] Introduce case_character function, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 01/11] Split casify_object into multiple functions, Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 10/11] Implement casing rules for Lithuanian (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 08/11] Implement rules for title-casing Dutch ij ‘letter’ (bug#24603),
Michal Nazarewicz <=
- bug#24603: [PATCHv5 09/11] Implement Turkic dotless and dotted i casing rules (bug#24603), Michal Nazarewicz, 2017/03/09
- bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603), Michal Nazarewicz, 2017/03/09
bug#24603: [PATCHv5 05/11] Support casing characters which map into multiple code points (bug#24603), Michal Nazarewicz, 2017/03/09