3 files changed, 74 insertions, 6 deletions
diff --git a/src/dged/json.c b/src/dged/json.c
index a514f00..69823cb 100644
--- a/src/dged/json.c
+++ b/src/dged/json.c
@@ -2,10 +2,13 @@
 
 #include "hash.h"
 #include "hashmap.h"
+#include "utf8.h"
 #include "vec.h"
 
+#include <assert.h>
 #include <stddef.h>
 #include <stdio.h>
+#include <stdlib.h>
 
 struct json_key_value {
   struct s8 key;
@@ -49,20 +52,49 @@ static struct json_value create_object(struct json_value *parent) {
   return val;
 }
 
+/* Decode four hex digits (the XXXX of a JSON \uXXXX escape) into a code
+ * point value; a byte that is not a hex digit contributes zero. */
+static uint32_t codepoint_from_hex(uint8_t bytes[4]) {
+  uint32_t result = 0;
+  for (size_t idx = 0; idx < 4; ++idx) {
+    const uint8_t ch = bytes[idx];
+    uint32_t digit = 0;
+    if (ch >= 'a' && ch <= 'f') {
+      digit = (uint32_t)(ch - 'a') + 10;
+    } else if (ch >= 'A' && ch <= 'F') {
+      digit = (uint32_t)(ch - 'A') + 10;
+    } else if (ch >= '0' && ch <= '9') {
+      digit = (uint32_t)(ch - '0');
+    }
+
+    // digits arrive most significant first: shift in one nibble at a time
+    result = (result << 4) | digit;
+  }
+  return result;
+}
+
 struct s8 unescape_json_string(struct s8 input) {
-  /* FIXME: this is a bit funky and does not take
-  unicode characters into account and probably also
-  misses some escape codes. */
   size_t new_size = 0;
   bool escape = false;
   for (size_t bi = 0; bi < input.l; ++bi) {
     uint8_t b = input.s[bi];
+
+    size_t sz = 1;
     if (b == '\\' && !escape) {
       escape = true;
       continue;
     }
 
-    ++new_size;
+    if (b == 'u' && escape) {
+      // unicode codepoint, calculate byte-width
+      // format is \uXXXX where X is a hex digit.
+      uint8_t chars[4];
+      uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]);
+      sz = utf8_encode(codepoint, chars);
+      bi += 4;
+    }
+
+    new_size += sz;
     escape = false;
   }
 
@@ -77,6 +109,7 @@ struct s8 unescape_json_string(struct s8 input) {
       continue;
     }
 
+    size_t skip = 1;
     if (escape) {
       switch (b) {
       case 'b':
@@ -97,6 +130,14 @@ struct s8 unescape_json_string(struct s8 input) {
       case 't':
         buf[bufi] = '\t';
         break;
+      case 'u': {
+        uint8_t chars[4] = {0};
+        uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]);
+        size_t size = utf8_encode(codepoint, chars);
+        memcpy(&buf[bufi], chars, size);
+        skip = size;
+        bi += 4;
+      } break;
       case '"':
         buf[bufi] = '"';
         break;
@@ -108,7 +149,7 @@ struct s8 unescape_json_string(struct s8 input) {
     }
 
     escape = false;
-    ++bufi;
+    bufi += skip;
   }
 
   return (struct s8){
diff --git a/src/dged/utf8.c b/src/dged/utf8.c
index b47f5fc..cc5a66e 100644
--- a/src/dged/utf8.c
+++ b/src/dged/utf8.c
@@ -36,7 +36,7 @@ static const uint8_t utf8d[] = {
 // clang-format on
 
 /*
- * emoji decoding algorithm from
+ * unicode decoding algorithm from
  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  */
 static enum utf8_state decode(enum utf8_state *state, uint32_t *codep,
@@ -141,3 +141,27 @@ uint32_t unicode_visual_char_width(const struct codepoint *codepoint) {
     return 0;
   }
 }
+
+/* Encode codepoint as UTF-8 into buf; returns bytes written, 0 if invalid. */
+size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]) {
+  if (codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
+    return 0; // surrogates and > U+10FFFF have no UTF-8 form (RFC 3629)
+  } else if (codepoint <= 0x7F) {
+    buf[0] = (uint8_t)codepoint;
+    return 1;
+  } else if (codepoint <= 0x7FF) {
+    buf[0] = (uint8_t)(0xC0 | (codepoint >> 6));
+    buf[1] = (uint8_t)(0x80 | (codepoint & 0x3F));
+    return 2;
+  } else if (codepoint <= 0xFFFF) {
+    buf[0] = (uint8_t)(0xE0 | (codepoint >> 12));
+    buf[1] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
+    buf[2] = (uint8_t)(0x80 | (codepoint & 0x3F));
+    return 3;
+  }
+  buf[0] = (uint8_t)(0xF0 | (codepoint >> 18));
+  buf[1] = (uint8_t)(0x80 | ((codepoint >> 12) & 0x3F));
+  buf[2] = (uint8_t)(0x80 | ((codepoint >> 6) & 0x3F));
+  buf[3] = (uint8_t)(0x80 | (codepoint & 0x3F));
+  return 4;
+}
diff --git a/src/dged/utf8.h b/src/dged/utf8.h
index 150fe02..b91e7fd 100644
--- a/src/dged/utf8.h
+++ b/src/dged/utf8.h
@@ -2,6 +2,7 @@
 #define _UTF8_H
 
 #include <stdbool.h>
+#include <stddef.h>
 #include <stdint.h>
 
 struct codepoint {
@@ -35,4 +36,6 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte);
 bool utf8_byte_is_unicode(uint8_t byte);
 bool utf8_byte_is_ascii(uint8_t byte);
 
+size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]);
+
 #endif