Overhaul unicode parsing

It now instead iterates the actual unicode code points. This is better than what it was previously doing but it is still not entirely correct w.r.t to unicode sequences. This handling of unicode code points does however make it slightly easier to handle UTF-16 if needed in the future. This also adds some long needed tests for buffer methods.
author: Albert Cervin <albert@acervin.com> 2024-08-23 17:07:27 +0200
committer: Albert Cervin <albert@acervin.com> 2024-09-11 16:22:58 +0200
commit: 4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree: 4745d99e70d645a8134dafc3814dc68bf678daf4 /src/dged/utf8.c
parent: 991283f684c224db46fe68738470921b8c394f13 (diff)
download: dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz
dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip
1 files changed, 101 insertions, 51 deletions
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index 52de2da..ede4fb1 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -1,5 +1,6 @@
 #include "utf8.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <wchar.h>
 
@@ -10,76 +11,125 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte) {
 bool utf8_byte_is_unicode(uint8_t byte) { return (byte & 0x80) != 0x0; }
 bool utf8_byte_is_ascii(uint8_t byte) { return !utf8_byte_is_unicode(byte); }
 
-uint32_t utf8_nbytes_in_char(uint8_t byte) {
-  // length of char is the number of leading ones
-  // flip it and count number of leading zeros
-  uint8_t invb = ~byte;
-  return __builtin_clz((uint32_t)invb) - 24;
+enum utf8_state {
+  Utf8_Accept = 0,
+  Utf8_Reject = 1,
+};
+
+// clang-format off
+static const uint8_t utf8d[] = {
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+// clang-format on
+
+/*
+ * emoji decoding algorithm from
+ * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ */
+static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
+                              uint32_t byte) {
+  uint32_t type = utf8d[byte];
+
+  *codep = (*state != Utf8_Accept) ? (byte & 0x3fu) | (*codep << 6)
+                                   : (0xff >> type) & (byte);
+
+  *state = utf8d[256 + *state * 16 + type];
+  return *state;
+}
+
+static struct codepoint next_utf8_codepoint(uint8_t *bytes, uint64_t nbytes) {
+  uint32_t codepoint = 0;
+  enum utf8_state state = Utf8_Accept;
+  uint32_t bi = 0;
+  while (bi < nbytes) {
+    enum utf8_state res = decode(&state, &codepoint, bytes[bi]);
+    ++bi;
+
+    if (res == Utf8_Accept || res == Utf8_Reject) {
+      break;
+    }
+  }
+
+  if (state == Utf8_Reject) {
+    codepoint = 0xfffd;
+  }
+
+  return (struct codepoint){.codepoint = codepoint, .nbytes = bi};
 }
 
-// TODO: grapheme clusters, this returns the number of unicode code points
+struct codepoint *utf8_next_codepoint(struct utf8_codepoint_iterator *iter) {
+  if (iter->offset >= iter->nbytes) {
+    return NULL;
+  }
+
+  iter->current = next_utf8_codepoint(iter->data + iter->offset,
+                                      iter->nbytes - iter->offset);
+  iter->offset += iter->current.nbytes;
+  return &iter->current;
+}
+
+struct utf8_codepoint_iterator
+create_utf8_codepoint_iterator(uint8_t *data, uint64_t len,
+                               uint64_t initial_offset) {
+  return (struct utf8_codepoint_iterator){
+      .data = data,
+      .nbytes = len,
+      .offset = initial_offset,
+  };
+}
+
+/* TODO: grapheme clusters and other classification, this
+ * returns the number of unicode code points
+ */
 uint32_t utf8_nchars(uint8_t *bytes, uint32_t nbytes) {
+  uint32_t bi = 0;
   uint32_t nchars = 0;
-  uint32_t expected = 0;
-  for (uint32_t bi = 0; bi < nbytes; ++bi) {
-    uint8_t byte = bytes[bi];
-    if (utf8_byte_is_unicode(byte)) {
-      if (utf8_byte_is_unicode_start(byte)) {
-        expected = utf8_nbytes_in_char(byte) - 1;
-      } else { // continuation byte
-        --expected;
-        if (expected == 0) {
-          ++nchars;
-        }
-      }
-    } else { // ascii
-      ++nchars;
-    }
+  while (bi < nbytes) {
+    struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
+    ++nchars;
+    bi += codepoint.nbytes;
   }
+
   return nchars;
 }
 
-// TODO: grapheme clusters, this uses the number of unicode code points
+/* TODO: grapheme clusters and other classification, this
+ * returns the number of unicode code points
+ */
 uint32_t utf8_nbytes(uint8_t *bytes, uint32_t nbytes, uint32_t nchars) {
-
   uint32_t bi = 0;
   uint32_t chars = 0;
   uint32_t expected = 0;
 
   while (chars < nchars && bi < nbytes) {
-    uint8_t byte = bytes[bi];
-    if (utf8_byte_is_unicode(byte)) {
-      if (utf8_byte_is_unicode_start(byte)) {
-        expected = utf8_nbytes_in_char(byte) - 1;
-      } else { // continuation char
-        --expected;
-        if (expected == 0) {
-          ++chars;
-        }
-      }
-    } else { // ascii
-      ++chars;
-    }
-
-    ++bi;
+    struct codepoint codepoint = next_utf8_codepoint(bytes + bi, nbytes - bi);
+    bi += codepoint.nbytes;
+    ++chars;
   }
 
+  // TODO: reject invalid?
   return bi;
 }
 
-uint32_t utf8_visual_char_width(uint8_t *bytes, uint32_t len) {
-  if (utf8_byte_is_unicode_start(*bytes)) {
-    wchar_t wc;
-    size_t nbytes = 0;
-    if ((nbytes = mbrtowc(&wc, (char *)bytes, len, NULL)) > 0) {
-      size_t w = wcwidth(wc);
-      return w > 0 ? w : 2;
-    } else {
-      return 1;
-    }
-  } else if (utf8_byte_is_unicode_continuation(*bytes)) {
-    return 0;
+uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
+  if (codepoint->nbytes > 0) {
+    // TODO: use unicode classification instead
+    size_t w = wcwidth(codepoint->codepoint);
+    return w >= 0 ? w : 2;
   } else {
-    return 1;
+    return 0;
   }
 }
author	Albert Cervin <albert@acervin.com>	2024-08-23 17:07:27 +0200
committer	Albert Cervin <albert@acervin.com>	2024-09-11 16:22:58 +0200
commit	4ab7e453e26afc6e9f4938c65f89463fbba9e267 (patch)
tree	4745d99e70d645a8134dafc3814dc68bf678daf4 /src/dged/utf8.c
parent	991283f684c224db46fe68738470921b8c394f13 (diff)
download	dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.gz dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.tar.xz dged-4ab7e453e26afc6e9f4938c65f89463fbba9e267.zip