summary refs log tree commit diff
path: root/src/dged/utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/dged/utf8.c')
-rw-r--r-- src/dged/utf8.c 152
1 files changed, 101 insertions, 51 deletions
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index 52de2da..ede4fb1 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -1,5 +1,6 @@
#include "utf8.h"
+#include <assert.h>
#include <stdio.h>
#include <wchar.h>
@@ -10,76 +11,125 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte) {
/* True when the most significant bit of byte is set, false otherwise. */
bool utf8_byte_is_unicode(uint8_t byte) {
  const uint8_t high_bit = 0x80;
  return (byte & high_bit) != 0;
}
/* True exactly when utf8_byte_is_unicode() is false for byte. */
bool utf8_byte_is_ascii(uint8_t byte) {
  if (utf8_byte_is_unicode(byte)) {
    return false;
  }
  return true;
}
-uint32_t utf8_nbytes_in_char(uint8_t byte) {
- // length of char is the number of leading ones
- // flip it and count number of leading zeros
- uint8_t invb = ~byte;
- return __builtin_clz((uint32_t)invb) - 24;
+/*
+ * The two decoder states that callers test for. The numeric values are
+ * used as row indices into the transition part of the utf8d table, so
+ * they must not change. While a multi-byte sequence is being decoded a
+ * variable of this type also holds other (intermediate) table values.
+ */
+enum utf8_state {
+ Utf8_Accept = 0, // a complete code point has been decoded
+ Utf8_Reject = 1, // the input is not valid UTF-8
+};
+
+// clang-format off
+/*
+ * Lookup table driving decode().
+ *
+ * Entries 0..255 map an input byte to a character class. The entries
+ * from index 256 onward form the transition table: the next state is
+ * read from utf8d[256 + state * 16 + class].
+ */
+static const uint8_t utf8d[] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+// clang-format on
+
+/*
+ * UTF-8 decoding algorithm (table-driven state machine) from
+ * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ *
+ * Feeds a single byte to the decoder. *state and *codep carry the
+ * decoding progress between calls and are both updated in place.
+ * Returns the new value of *state.
+ */
+static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
+ uint32_t byte) {
+ // character class of the byte; byte indexes the first 256 table
+ // entries, so it must be in the range 0..255
+ uint32_t type = utf8d[byte];
+
+ // mid-sequence: shift the low six bits of the byte into *codep,
+ // start of a sequence: keep only the bits of the byte that survive
+ // the 0xff mask shifted right by the class
+ *codep = (*state != Utf8_Accept) ? (byte & 0x3fu) | (*codep << 6)
+ : (0xff >> type) & (byte);
+
+ // next state is looked up from the current state and the class
+ *state = utf8d[256 + *state * 16 + type];
+ return *state;
+}
+
+/*
+ * Decode the first code point found in bytes[0..nbytes).
+ *
+ * Returns the code point together with the number of bytes consumed.
+ * Invalid input, as well as input that ends in the middle of a
+ * multi-byte sequence, yields U+FFFD (the replacement character).
+ * With nbytes == 0 nothing is consumed and code point 0 is returned.
+ */
+static struct codepoint next_utf8_codepoint(uint8_t *bytes, uint64_t nbytes) {
+  uint32_t codepoint = 0;
+  enum utf8_state state = Utf8_Accept;
+  uint32_t bi = 0;
+  while (bi < nbytes) {
+    enum utf8_state res = decode(&state, &codepoint, bytes[bi]);
+    ++bi;
+
+    if (res == Utf8_Accept || res == Utf8_Reject) {
+      break;
+    }
+  }
+
+  // Any state other than Accept means the value in codepoint is not a
+  // fully decoded code point: either the decoder rejected the input or
+  // the input ran out before the sequence was complete.
+  if (state != Utf8_Accept) {
+    codepoint = 0xfffd;
+  }
+
+  return (struct codepoint){.codepoint = codepoint, .nbytes = bi};
}
-// TODO: grapheme clusters, this returns the number of unicode code points
+/*
+ * Step the iterator forward by one code point.
+ *
+ * Decodes the code point at the iterator's current offset, stores it in
+ * iter->current and moves the offset past it. Returns a pointer to
+ * iter->current, or NULL once the offset has reached the end of the data.
+ */
+struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) {
+  if (iter->offset < iter->nbytes) {
+    iter->current = next_utf8_codepoint(&iter->data[iter->offset],
+                                        iter->nbytes - iter->offset);
+    iter->offset += iter->current.nbytes;
+    return &iter->current;
+  }
+
+  return NULL;
+}
+
+/*
+ * Build an iterator over the len bytes at data, starting at byte
+ * offset initial_offset. Fields not listed here are zero-initialized.
+ */
+struct utf8_codepoint_iterator
+create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
+                               uint64_t initial_offset) {
+  struct utf8_codepoint_iterator iter = {
+      .data = data,
+      .nbytes = len,
+      .offset = initial_offset,
+  };
+
+  return iter;
+}
+
+/*
+ * Count the unicode code points in the first nbytes bytes of bytes.
+ *
+ * TODO: grapheme clusters and other classification, every code point
+ * is counted as one char
+ */
uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) {
uint32_t nchars = 0;
- uint32_t expected = 0;
- for (uint32_t bi = 0; bi < nbytes; ++bi) {
- uint8_t byte = bytes[bi];
- if (utf8_byte_is_unicode(byte)) {
- if (utf8_byte_is_unicode_start(byte)) {
- expected = utf8_nbytes_in_char(byte) - 1;
- } else { // continuation byte
- --expected;
- if (expected == 0) {
- ++nchars;
- }
- }
- } else { // ascii
- ++nchars;
- }
+  // decoding from a non-empty range consumes at least one byte, so the
+  // offset advances on every round
+  for (uint32_t offset = 0; offset < nbytes;) {
+    struct codepoint decoded =
+        next_utf8_codepoint(bytes + offset, nbytes - offset);
+    offset += decoded.nbytes;
+    ++nchars;
}
+
return nchars;
}
-// TODO: grapheme clusters, this uses the number of unicode code points
+/*
+ * Returns the number of bytes occupied by the first nchars code points
+ * in bytes, never more than nbytes.
+ *
+ * TODO: grapheme clusters and other classification, this
+ * counts unicode code points
+ */
uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) {
-
uint32_t bi = 0;
uint32_t chars = 0;
- uint32_t expected = 0;
while (chars < nchars && bi < nbytes) {
- uint8_t byte = bytes[bi];
- if (utf8_byte_is_unicode(byte)) {
- if (utf8_byte_is_unicode_start(byte)) {
- expected = utf8_nbytes_in_char(byte) - 1;
- } else { // continuation char
- --expected;
- if (expected == 0) {
- ++chars;
- }
- }
- } else { // ascii
- ++chars;
- }
-
- ++bi;
+  // skip over one code point at a time until enough have been seen
+  struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
+  bi += codepoint.nbytes;
+  ++chars;
}
+  // TODO: reject invalid?
return bi;
}
-uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len) {
- if (utf8_byte_is_unicode_start(*bytes)) {
- wchar_t wc;
- size_t nbytes = 0;
- if ((nbytes = mbrtowc(&wc, (char *)bytes, len, NULL)) > 0) {
- size_t w = wcwidth(wc);
- return w > 0 ? w : 2;
- } else {
- return 1;
- }
- } else if (utf8_byte_is_unicode_continuation(*bytes)) {
- return 0;
+/*
+ * Number of columns needed to display codepoint.
+ *
+ * Returns 0 when the code point spans no bytes (nbytes == 0). Otherwise
+ * the width reported by wcwidth() is returned; when wcwidth() reports
+ * an error (a negative value) a width of 2 is assumed.
+ */
+uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
+  if (codepoint->nbytes > 0) {
+    // TODO: use unicode classification instead
+    // wcwidth() returns a signed int; it has to stay signed here,
+    // otherwise the error value turns into a huge positive width
+    int w = wcwidth((wchar_t)codepoint->codepoint);
+    return w >= 0 ? (uint32_t)w : 2;
} else {
- return 1;
+    return 0;
}
}