bug-libunistring
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [bug-libunistring] Incorrect NFKC case folding


From: Daurnimator
Subject: Re: [bug-libunistring] Incorrect NFKC case folding
Date: Sat, 19 Nov 2016 14:38:29 +1100

For reference, here is a list of codepoints that seem to have the wrong result. It was generated with the following Lua 5.3 program:

```
-- Compare unistring's NFKC case folding against the NFKC_CF mappings listed
-- in DerivedNormalizationProps.txt (read from the current directory) and
-- print every data line for which the two disagree.
local unistring = require "unistring" -- From https://github.com/daurnimator/lua-unistring

for line in io.lines("DerivedNormalizationProps.txt") do
    -- Data lines come in two forms:
    --   XXXX       ; NFKC_CF; <mapping>   (single code point)
    --   XXXX..YYYY ; NFKC_CF; <mapping>   (inclusive range sharing one mapping)
    -- Try the range form first; matching only the single form would silently
    -- skip every range line.
    local first, last, to = line:match("^(%x+)%.%.(%x+) *; NFKC_CF;([%x ]*)")
    if not first then
        first, to = line:match("^(%x+) *; NFKC_CF;([%x ]*)")
        last = first
    end
    if first then
        -- Build the expected string from the space-separated hex code points.
        -- An empty mapping yields "" because utf8.char() with no arguments
        -- returns the empty string.
        local t = {}
        for cp in to:gmatch("%x+") do
            t[#t + 1] = tonumber(cp, 16)
        end
        local expected = utf8.char(table.unpack(t))
        for codepoint = tonumber(first, 16), tonumber(last, 16) do
            -- NOTE(review): the nil second argument is presumably the
            -- language and "NFKC" the normalization form -- confirm against
            -- the lua-unistring documentation.
            if expected ~= unistring.casefold(utf8.char(codepoint), nil, "NFKC") then
                -- Report each data line at most once, even for a range.
                print("FAILED", line)
                break
            end
        end
    end
end
```

FAILED 00AD          ; NFKC_CF;                # Cf       SOFT HYPHEN
FAILED 034F          ; NFKC_CF;                # Mn       COMBINING GRAPHEME JOINER
FAILED 061C          ; NFKC_CF;                # Cf       ARABIC LETTER MARK
FAILED 180E          ; NFKC_CF;                # Cf       MONGOLIAN VOWEL SEPARATOR
FAILED 1C80          ; NFKC_CF; 0432           # L&       CYRILLIC SMALL LETTER ROUNDED VE
FAILED 1C81          ; NFKC_CF; 0434           # L&       CYRILLIC SMALL LETTER LONG-LEGGED DE
FAILED 1C82          ; NFKC_CF; 043E           # L&       CYRILLIC SMALL LETTER NARROW O
FAILED 1C83          ; NFKC_CF; 0441           # L&       CYRILLIC SMALL LETTER WIDE ES
FAILED 1C86          ; NFKC_CF; 044A           # L&       CYRILLIC SMALL LETTER TALL HARD SIGN
FAILED 1C87          ; NFKC_CF; 0463           # L&       CYRILLIC SMALL LETTER TALL YAT
FAILED 1C88          ; NFKC_CF; A64B           # L&       CYRILLIC SMALL LETTER UNBLENDED UK
FAILED 2065          ; NFKC_CF;                # Cn       <reserved-2065>
FAILED 3164          ; NFKC_CF;                # Lo       HANGUL FILLER
FAILED A7AE          ; NFKC_CF; 026A           # L&       LATIN CAPITAL LETTER SMALL CAPITAL I
FAILED FEFF          ; NFKC_CF;                # Cf       ZERO WIDTH NO-BREAK SPACE
FAILED FFA0          ; NFKC_CF;                # Lo       HALFWIDTH HANGUL FILLER
FAILED 104B0         ; NFKC_CF; 104D8          # L&       OSAGE CAPITAL LETTER A
FAILED 104B1         ; NFKC_CF; 104D9          # L&       OSAGE CAPITAL LETTER AI
FAILED 104B2         ; NFKC_CF; 104DA          # L&       OSAGE CAPITAL LETTER AIN
FAILED 104B3         ; NFKC_CF; 104DB          # L&       OSAGE CAPITAL LETTER AH
FAILED 104B4         ; NFKC_CF; 104DC          # L&       OSAGE CAPITAL LETTER BRA
FAILED 104B5         ; NFKC_CF; 104DD          # L&       OSAGE CAPITAL LETTER CHA
FAILED 104B6         ; NFKC_CF; 104DE          # L&       OSAGE CAPITAL LETTER EHCHA
FAILED 104B7         ; NFKC_CF; 104DF          # L&       OSAGE CAPITAL LETTER E
FAILED 104B8         ; NFKC_CF; 104E0          # L&       OSAGE CAPITAL LETTER EIN
FAILED 104B9         ; NFKC_CF; 104E1          # L&       OSAGE CAPITAL LETTER HA
FAILED 104BA         ; NFKC_CF; 104E2          # L&       OSAGE CAPITAL LETTER HYA
FAILED 104BB         ; NFKC_CF; 104E3          # L&       OSAGE CAPITAL LETTER I
FAILED 104BC         ; NFKC_CF; 104E4          # L&       OSAGE CAPITAL LETTER KA
FAILED 104BD         ; NFKC_CF; 104E5          # L&       OSAGE CAPITAL LETTER EHKA
FAILED 104BE         ; NFKC_CF; 104E6          # L&       OSAGE CAPITAL LETTER KYA
FAILED 104BF         ; NFKC_CF; 104E7          # L&       OSAGE CAPITAL LETTER LA
FAILED 104C0         ; NFKC_CF; 104E8          # L&       OSAGE CAPITAL LETTER MA
FAILED 104C1         ; NFKC_CF; 104E9          # L&       OSAGE CAPITAL LETTER NA
FAILED 104C2         ; NFKC_CF; 104EA          # L&       OSAGE CAPITAL LETTER O
FAILED 104C3         ; NFKC_CF; 104EB          # L&       OSAGE CAPITAL LETTER OIN
FAILED 104C4         ; NFKC_CF; 104EC          # L&       OSAGE CAPITAL LETTER PA
FAILED 104C5         ; NFKC_CF; 104ED          # L&       OSAGE CAPITAL LETTER EHPA
FAILED 104C6         ; NFKC_CF; 104EE          # L&       OSAGE CAPITAL LETTER SA
FAILED 104C7         ; NFKC_CF; 104EF          # L&       OSAGE CAPITAL LETTER SHA
FAILED 104C8         ; NFKC_CF; 104F0          # L&       OSAGE CAPITAL LETTER TA
FAILED 104C9         ; NFKC_CF; 104F1          # L&       OSAGE CAPITAL LETTER EHTA
FAILED 104CA         ; NFKC_CF; 104F2          # L&       OSAGE CAPITAL LETTER TSA
FAILED 104CB         ; NFKC_CF; 104F3          # L&       OSAGE CAPITAL LETTER EHTSA
FAILED 104CC         ; NFKC_CF; 104F4          # L&       OSAGE CAPITAL LETTER TSHA
FAILED 104CD         ; NFKC_CF; 104F5          # L&       OSAGE CAPITAL LETTER DHA
FAILED 104CE         ; NFKC_CF; 104F6          # L&       OSAGE CAPITAL LETTER U
FAILED 104CF         ; NFKC_CF; 104F7          # L&       OSAGE CAPITAL LETTER WA
FAILED 104D0         ; NFKC_CF; 104F8          # L&       OSAGE CAPITAL LETTER KHA
FAILED 104D1         ; NFKC_CF; 104F9          # L&       OSAGE CAPITAL LETTER GHA
FAILED 104D2         ; NFKC_CF; 104FA          # L&       OSAGE CAPITAL LETTER ZA
FAILED 104D3         ; NFKC_CF; 104FB          # L&       OSAGE CAPITAL LETTER ZHA
FAILED 1E900         ; NFKC_CF; 1E922          # L&       ADLAM CAPITAL LETTER ALIF
FAILED 1E901         ; NFKC_CF; 1E923          # L&       ADLAM CAPITAL LETTER DAALI
FAILED 1E902         ; NFKC_CF; 1E924          # L&       ADLAM CAPITAL LETTER LAAM
FAILED 1E903         ; NFKC_CF; 1E925          # L&       ADLAM CAPITAL LETTER MIIM
FAILED 1E904         ; NFKC_CF; 1E926          # L&       ADLAM CAPITAL LETTER BA
FAILED 1E905         ; NFKC_CF; 1E927          # L&       ADLAM CAPITAL LETTER SINNYIIYHE
FAILED 1E906         ; NFKC_CF; 1E928          # L&       ADLAM CAPITAL LETTER PE
FAILED 1E907         ; NFKC_CF; 1E929          # L&       ADLAM CAPITAL LETTER BHE
FAILED 1E908         ; NFKC_CF; 1E92A          # L&       ADLAM CAPITAL LETTER RA
FAILED 1E909         ; NFKC_CF; 1E92B          # L&       ADLAM CAPITAL LETTER E
FAILED 1E90A         ; NFKC_CF; 1E92C          # L&       ADLAM CAPITAL LETTER FA
FAILED 1E90B         ; NFKC_CF; 1E92D          # L&       ADLAM CAPITAL LETTER I
FAILED 1E90C         ; NFKC_CF; 1E92E          # L&       ADLAM CAPITAL LETTER O
FAILED 1E90D         ; NFKC_CF; 1E92F          # L&       ADLAM CAPITAL LETTER DHA
FAILED 1E90E         ; NFKC_CF; 1E930          # L&       ADLAM CAPITAL LETTER YHE
FAILED 1E90F         ; NFKC_CF; 1E931          # L&       ADLAM CAPITAL LETTER WAW
FAILED 1E910         ; NFKC_CF; 1E932          # L&       ADLAM CAPITAL LETTER NUN
FAILED 1E911         ; NFKC_CF; 1E933          # L&       ADLAM CAPITAL LETTER KAF
FAILED 1E912         ; NFKC_CF; 1E934          # L&       ADLAM CAPITAL LETTER YA
FAILED 1E913         ; NFKC_CF; 1E935          # L&       ADLAM CAPITAL LETTER U
FAILED 1E914         ; NFKC_CF; 1E936          # L&       ADLAM CAPITAL LETTER JIIM
FAILED 1E915         ; NFKC_CF; 1E937          # L&       ADLAM CAPITAL LETTER CHI
FAILED 1E916         ; NFKC_CF; 1E938          # L&       ADLAM CAPITAL LETTER HA
FAILED 1E917         ; NFKC_CF; 1E939          # L&       ADLAM CAPITAL LETTER QAAF
FAILED 1E918         ; NFKC_CF; 1E93A          # L&       ADLAM CAPITAL LETTER GA
FAILED 1E919         ; NFKC_CF; 1E93B          # L&       ADLAM CAPITAL LETTER NYA
FAILED 1E91A         ; NFKC_CF; 1E93C          # L&       ADLAM CAPITAL LETTER TU
FAILED 1E91B         ; NFKC_CF; 1E93D          # L&       ADLAM CAPITAL LETTER NHA
FAILED 1E91C         ; NFKC_CF; 1E93E          # L&       ADLAM CAPITAL LETTER VA
FAILED 1E91D         ; NFKC_CF; 1E93F          # L&       ADLAM CAPITAL LETTER KHA
FAILED 1E91E         ; NFKC_CF; 1E940          # L&       ADLAM CAPITAL LETTER GBE
FAILED 1E91F         ; NFKC_CF; 1E941          # L&       ADLAM CAPITAL LETTER ZAL
FAILED 1E920         ; NFKC_CF; 1E942          # L&       ADLAM CAPITAL LETTER KPO
FAILED 1E921         ; NFKC_CF; 1E943          # L&       ADLAM CAPITAL LETTER SHA
FAILED 1F23B         ; NFKC_CF; 914D           # So       SQUARED CJK UNIFIED IDEOGRAPH-914D
FAILED E0000         ; NFKC_CF;                # Cn       <reserved-E0000>
FAILED E0001         ; NFKC_CF;                # Cf       LANGUAGE TAG

reply via email to

[Prev in Thread] Current Thread [Next in Thread]