diff options
Diffstat (limited to 'src/dged/utf8.c')
| -rw-r--r-- | src/dged/utf8.c | 152 |
1 files changed, 101 insertions, 51 deletions
diff --git a/src/dged/utf8.c b/src/dged/utf8.c index 52de2da..ede4fb1 100644 --- a/src/dged/utf8.c +++ b/src/dged/utf8.c @@ -1,5 +1,6 @@ #include "utf8.h" +#include <assert.h> #include <stdio.h> #include <wchar.h> @@ -10,76 +11,125 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte) { bool utf8_byte_is_unicode(uint8_t byte) { return (byte & 0x80) != 0x0; } bool utf8_byte_is_ascii(uint8_t byte) { return !utf8_byte_is_unicode(byte); } -uint32_t utf8_nbytes_in_char(uint8_t byte) { - // length of char is the number of leading ones - // flip it and count number of leading zeros - uint8_t invb = ~byte; - return __builtin_clz((uint32_t)invb) - 24; +enum utf8_state { + Utf8_Accept = 0, + Utf8_Reject = 1, +}; + +// clang-format off +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; +// clang-format on + +/* + * emoji decoding algorithm from + * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + */ +static enum utf8_state decode(enum utf8_state *state, uint32_t *codep, + uint32_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != Utf8_Accept) ? (byte & 0x3fu) | (*codep << 6) + : (0xff >> type) & (byte); + + *state = utf8d[256 + *state * 16 + type]; + return *state; +} + +static struct codepoint next_utf8_codepoint(uint8_t *bytes, uint64_t nbytes) { + uint32_t codepoint = 0; + enum utf8_state state = Utf8_Accept; + uint32_t bi = 0; + while (bi < nbytes) { + enum utf8_state res = decode(&state, &codepoint, bytes[bi]); + ++bi; + + if (res == Utf8_Accept || res == Utf8_Reject) { + break; + } + } + + if (state == Utf8_Reject) { + codepoint = 0xfffd; + } + + return (struct codepoint){.codepoint = codepoint, .nbytes = bi}; } -// TODO: grapheme clusters, this returns the number of unicode code points +struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) { + if (iter->offset >= iter->nbytes) { + return NULL; + } + + iter->current = next_utf8_codepoint(iter->data + iter->offset, + iter->nbytes - iter->offset); + iter->offset += iter->current.nbytes; + return &iter->current; +} + +struct utf8_codepoint_iterator +create_utf8_codepoint_iterator(uint8_t *data, uint64_t len, + uint64_t initial_offset) { + return (struct utf8_codepoint_iterator){ + .data = data, + .nbytes = len, + .offset = initial_offset, + }; +} + +/* TODO: grapheme clusters and other classification, this + * returns the number of unicode code points + */ uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) { + uint32_t bi = 0; uint32_t nchars = 0; - uint32_t expected = 0; - for (uint32_t bi = 0; bi < nbytes; ++bi) { - uint8_t byte = bytes[bi]; - if (utf8_byte_is_unicode(byte)) { - if (utf8_byte_is_unicode_start(byte)) { - expected = utf8_nbytes_in_char(byte) - 1; - } else { // continuation byte - --expected; - if (expected == 0) { - ++nchars; - } - } - } else { // ascii - ++nchars; - } + while (bi < nbytes) { + struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi); + ++nchars; + bi += codepoint.nbytes; } + return nchars; } -// TODO: grapheme clusters, this uses the number of unicode code points +/* TODO: grapheme clusters and other classification, this + * returns the number of unicode code points + */ uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) { - uint32_t bi = 0; uint32_t chars = 0; uint32_t expected = 0; while (chars < nchars && bi < nbytes) { - uint8_t byte = bytes[bi]; - if (utf8_byte_is_unicode(byte)) { - if (utf8_byte_is_unicode_start(byte)) { - expected = utf8_nbytes_in_char(byte) - 1; - } else { // continuation char - --expected; - if (expected == 0) { - ++chars; - } - } - } else { // ascii - ++chars; - } - - ++bi; + struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi); + bi += codepoint.nbytes; + ++chars; } + // TODO: reject invalid? return bi; } -uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len) { - if (utf8_byte_is_unicode_start(*bytes)) { - wchar_t wc; - size_t nbytes = 0; - if ((nbytes = mbrtowc(&wc, (char *)bytes, len, NULL)) > 0) { - size_t w = wcwidth(wc); - return w > 0 ? w : 2; - } else { - return 1; - } - } else if (utf8_byte_is_unicode_continuation(*bytes)) { - return 0; +uint32_t unicode_visual_char_width(const struct codepoint *codepoint) { + if (codepoint->nbytes > 0) { + // TODO: use unicode classification instead + size_t w = wcwidth(codepoint->codepoint); + return w >= 0 ? w : 2; } else { - return 1; + return 0; } } |
