summaryrefslogtreecommitdiff
path: root/src/dged/utf8.c
diff options
context:
space:
mode:
authorAlbert Cervin <albert@acervin.com>2025-11-26 23:34:08 +0100
committerAlbert Cervin <albert@acervin.com>2025-11-26 23:34:08 +0100
commitf06d8923e86a2af70f9c97f8484dc9e645dcefdb (patch)
treea2fce0ab790dda09bfb33544622f65d7953fc898 /src/dged/utf8.c
parentb8f2c54675cfd4c89c13941503fb23eda0ad082d (diff)
downloaddged-f06d8923e86a2af70f9c97f8484dc9e645dcefdb.tar.gz
dged-f06d8923e86a2af70f9c97f8484dc9e645dcefdb.tar.xz
dged-f06d8923e86a2af70f9c97f8484dc9e645dcefdb.zip
Handle unicode chars when unescaping JSON
The python LSP used some nbsp unicode chars when sending back documentation.
Diffstat (limited to 'src/dged/utf8.c')
-rw-r--r--src/dged/utf8.c26
1 files changed, 25 insertions, 1 deletions
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index b47f5fc..cc5a66e 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -36,7 +36,7 @@ static const uint8_t utf8d[] = {
// clang-format on
/*
- * emoji decoding algorithm from
+ * unicode decoding algorithm from
* https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*/
static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
@@ -141,3 +141,27 @@ uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
return 0;
}
}
+
/**
 * Encode a single Unicode code point as UTF-8.
 *
 * @param codepoint The code point to encode.
 * @param buf Output buffer with room for at least 4 bytes. Only the first
 *            N bytes are written, where N is the return value. No NUL
 *            terminator is appended.
 * @return The number of bytes written to @p buf (1-4), or 0 if
 *         @p codepoint cannot be represented in well-formed UTF-8, i.e. it
 *         is a UTF-16 surrogate (U+D800..U+DFFF) or lies above U+10FFFF.
 *         @p buf is left untouched when 0 is returned.
 */
size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]) {
  // Surrogates are reserved for UTF-16 and are not Unicode scalar values;
  // RFC 3629 forbids encoding them, so treat them like out-of-range input
  // instead of emitting an ill-formed three-byte sequence.
  if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
    return 0;
  }

  if (codepoint <= 0x7F) {
    // 0xxxxxxx
    buf[0] = (uint8_t)codepoint;
    return 1;
  }

  if (codepoint <= 0x7FF) {
    // 110xxxxx 10xxxxxx
    buf[0] = (uint8_t)(0xC0 | (codepoint >> 6));
    buf[1] = (uint8_t)(0x80 | (codepoint & 0x3F));
    return 2;
  }

  if (codepoint <= 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    buf[0] = (uint8_t)(0xE0 | (codepoint >> 12));
    buf[1] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
    buf[2] = (uint8_t)(0x80 | (codepoint & 0x3F));
    return 3;
  }

  if (codepoint <= 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    buf[0] = (uint8_t)(0xF0 | (codepoint >> 18));
    buf[1] = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F));
    buf[2] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
    buf[3] = (uint8_t)(0x80 | (codepoint & 0x3F));
    return 4;
  }

  // Beyond the Unicode code space.
  return 0;
}