summaryrefslogtreecommitdiff
path: root/src/dged/utf8.c
diff options
context:
space:
mode:
authorAlbert Cervin <albert@acervin.com>2024-08-23 17:07:27 +0200
committerAlbert Cervin <albert@acervin.com>2024-09-11 16:22:58 +0200
commit4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree4745d99e70d645a8134dafc3814dc68bf678daf4 /src/dged/utf8.c
parent991283f684c224db46fe68738470921b8c394f13 (diff)
downloaddged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip
Overhaul unicode parsing
It now iterates over the actual Unicode code points instead. This is better than what it was doing previously, but it is still not entirely correct w.r.t. Unicode sequences. This handling of Unicode code points does, however, make it slightly easier to handle UTF-16 if needed in the future. This also adds some long-needed tests for buffer methods.
Diffstat (limited to 'src/dged/utf8.c')
-rw-r--r--src/dged/utf8.c152
1 files changed, 101 insertions, 51 deletions
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index 52de2da..ede4fb1 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -1,5 +1,6 @@
#include "utf8.h"
+#include <assert.h>
#include <stdio.h>
#include <wchar.h>
@@ -10,76 +11,125 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte) {
/* True when the high bit of the byte is set. */
bool utf8_byte_is_unicode(uint8_t byte) { return (byte & 0x80u) == 0x80u; }
/* True when the high bit of the byte is clear, i.e. the value is at most 0x7f. */
bool utf8_byte_is_ascii(uint8_t byte) { return byte <= 0x7fu; }
-uint32_t utf8_nbytes_in_char(uint8_t byte) {
- // length of char is the number of leading ones
- // flip it and count number of leading zeros
- uint8_t invb = ~byte;
- return __builtin_clz((uint32_t)invb) - 24;
+enum utf8_state { // the two decoder states that callers of decode() test for
+ Utf8_Accept = 0, // a complete code point has been decoded
+ Utf8_Reject = 1, // the bytes seen do not form valid UTF-8
+};
+
+// clang-format off
+static const uint8_t utf8d[] = { // first 256 entries: byte -> character class; after that: 16 entries per decoder state, indexed by class -> next state
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+// clang-format on
+
+/*
+ * UTF-8 decoding algorithm from
+ * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ *
+ * Feeds one byte to the decoder. `state` carries the decoder state from one
+ * call to the next and `codep` accumulates the code point. When `*state` is
+ * Utf8_Accept the byte is treated as the first byte of a sequence, otherwise
+ * as a continuation byte. Returns the new value of `*state`.
+ */
+static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
+ uint32_t byte) {
+ const uint32_t char_class = utf8d[byte];
+
+ if (*state == Utf8_Accept) {
+ // first byte: keep only the low bits that the class leaves unmasked
+ *codep = (0xff >> char_class) & (byte);
+ } else {
+ // continuation byte: append its low six bits
+ *codep = (byte & 0x3fu) | (*codep << 6);
+ }
+
+ const enum utf8_state next = utf8d[256 + *state * 16 + char_class];
+ *state = next;
+ return next;
+}
+
+/*
+ * Decode the code point that starts at `bytes`, reading no more than
+ * `nbytes` bytes.
+ *
+ * Returns the code point together with the number of bytes consumed. Input
+ * that the decoder rejects, and a sequence that is cut short by the end of
+ * the buffer, both decode to U+FFFD. With `nbytes` == 0 the result is code
+ * point 0 with zero bytes consumed.
+ */
+static struct codepoint next_utf8_codepoint(uint8_t *bytes, uint64_t nbytes) {
+ uint32_t codepoint = 0;
+ enum utf8_state state = Utf8_Accept;
+ uint32_t bi = 0;
+ while (bi < nbytes) {
+ enum utf8_state res = decode(&state, &codepoint, bytes[bi]);
+ ++bi;
+
+ if (res == Utf8_Accept || res == Utf8_Reject) {
+ break;
+ }
+ }
+
+ // any state other than accept means no complete code point was decoded:
+ // either a byte was rejected or the input ended in the middle of a
+ // sequence, in which case `codepoint` only holds a partial value
+ if (state != Utf8_Accept) {
+ codepoint = 0xfffd;
+ }
+
+ return (struct codepoint){.codepoint = codepoint, .nbytes = bi};
}
-// TODO: grapheme clusters, this returns the number of unicode code points
+/*
+ * Advance the iterator by one code point.
+ *
+ * Returns a pointer to the iterator's own `current` slot, which is
+ * overwritten by the next call, or NULL once `offset` has reached `nbytes`.
+ */
+struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) {
+ if (iter->offset < iter->nbytes) {
+ struct codepoint *cur = &iter->current;
+
+ *cur = next_utf8_codepoint(&iter->data[iter->offset],
+ iter->nbytes - iter->offset);
+ iter->offset += cur->nbytes;
+ return cur;
+ }
+
+ return NULL;
+}
+
+/*
+ * Build an iterator over the `len` bytes at `data` that starts decoding at
+ * byte offset `initial_offset`. Members not listed here start out zeroed.
+ */
+struct utf8_codepoint_iterator
+create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
+ uint64_t initial_offset) {
+ struct utf8_codepoint_iterator iter = {
+ .data = data,
+ .nbytes = len,
+ .offset = initial_offset,
+ };
+
+ return iter;
+}
+
+/* Count the unicode code points in the first `nbytes` bytes of `bytes`.
+ *
+ * Every result of next_utf8_codepoint() counts as one, whatever it decoded.
+ *
+ * TODO: grapheme clusters and other classification, this
+ * returns the number of unicode code points
+ */
uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) {
+ uint32_t bi = 0;
uint32_t nchars = 0;
- uint32_t expected = 0;
- for (uint32_t bi = 0; bi < nbytes; ++bi) {
- uint8_t byte = bytes[bi];
- if (utf8_byte_is_unicode(byte)) {
- if (utf8_byte_is_unicode_start(byte)) {
- expected = utf8_nbytes_in_char(byte) - 1;
- } else { // continuation byte
- --expected;
- if (expected == 0) {
- ++nchars;
- }
- }
- } else { // ascii
- ++nchars;
- }
+ while (bi < nbytes) {
+ struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi); // decode from the current position, bounded by what is left of the buffer
+ ++nchars;
+ bi += codepoint.nbytes; // step past the bytes that were just consumed
}
+
return nchars;
}
-// TODO: grapheme clusters, this uses the number of unicode code points
+/* Return the number of bytes taken up by the first `nchars` unicode code
+ * points of `bytes`, reading no more than `nbytes` bytes.
+ *
+ * If the buffer holds fewer than `nchars` code points, the whole buffer is
+ * consumed and `nbytes` is returned.
+ *
+ * TODO: grapheme clusters and other classification, this
+ * counts unicode code points
+ */
uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) {
-
uint32_t bi = 0;
uint32_t chars = 0;
- uint32_t expected = 0;
while (chars < nchars && bi < nbytes) {
- uint8_t byte = bytes[bi];
- if (utf8_byte_is_unicode(byte)) {
- if (utf8_byte_is_unicode_start(byte)) {
- expected = utf8_nbytes_in_char(byte) - 1;
- } else { // continuation char
- --expected;
- if (expected == 0) {
- ++chars;
- }
- }
- } else { // ascii
- ++chars;
- }
-
- ++bi;
+ struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
+ bi += codepoint.nbytes;
+ ++chars;
}
+ // TODO: reject invalid?
return bi;
}
-uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len) {
- if (utf8_byte_is_unicode_start(*bytes)) {
- wchar_t wc;
- size_t nbytes = 0;
- if ((nbytes = mbrtowc(&wc, (char *)bytes, len, NULL)) > 0) {
- size_t w = wcwidth(wc);
- return w > 0 ? w : 2;
- } else {
- return 1;
- }
- } else if (utf8_byte_is_unicode_continuation(*bytes)) {
- return 0;
+/*
+ * Number of terminal columns used to display `codepoint`.
+ *
+ * Returns 0 for a code point that occupies no bytes. Otherwise returns what
+ * wcwidth() reports, or 2 when wcwidth() returns a negative value, which it
+ * does for characters that are not printable in the current locale.
+ */
+uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
+ if (codepoint->nbytes > 0) {
+ // TODO: use unicode classification instead
+ // wcwidth() returns int; keep it signed so the -1 result can be detected
+ int w = wcwidth(codepoint->codepoint);
+ return w >= 0 ? (uint32_t)w : 2;
} else {
- return 1;
+ return 0;
}
}