diff options
Diffstat (limited to 'src/dged')
| -rw-r--r-- | src/dged/json.c | 51 | ||||
| -rw-r--r-- | src/dged/utf8.c | 26 | ||||
| -rw-r--r-- | src/dged/utf8.h | 3 |
3 files changed, 74 insertions, 6 deletions
diff --git a/src/dged/json.c b/src/dged/json.c index a514f00..69823cb 100644 --- a/src/dged/json.c +++ b/src/dged/json.c @@ -2,10 +2,13 @@ #include "hash.h" #include "hashmap.h" +#include "utf8.h" #include "vec.h" +#include <assert.h> #include <stddef.h> #include <stdio.h> +#include <stdlib.h> struct json_key_value { struct s8 key; @@ -49,20 +52,49 @@ static struct json_value create_object(struct json_value *parent) { return val; } +static uint32_t codepoint_from_hex(uint8_t bytes[4]) { + uint32_t nmbr = 0; + for (size_t i = 0; i < 4; ++i) { + uint8_t byte = bytes[i]; + uint32_t value = 0; + if (byte >= '0' && byte <= '9') { + value = byte - '0'; + } else if (byte >= 'A' && byte <= 'F') { + value = byte - 'A' + 10; + } else if (byte >= 'a' && byte <= 'f') { + value = byte - 'a' + 10; + } + + // 16 ^ (3-i) + uint32_t multiplier = 1 << (4 * (3 - i)); + nmbr += value * multiplier; + } + + return nmbr; +} + struct s8 unescape_json_string(struct s8 input) { - /* FIXME: this is a bit funky and does not take - unicode characters into account and probably also - misses some escape codes. */ size_t new_size = 0; bool escape = false; for (size_t bi = 0; bi < input.l; ++bi) { uint8_t b = input.s[bi]; + + size_t sz = 1; if (b == '\\' && !escape) { escape = true; continue; } - ++new_size; + if (b == 'u' && escape) { + // unicode codepoint, calculate byte-width + // format is \uXXXX where X is a hex digit. + uint8_t chars[4]; + uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]); + sz = utf8_encode(codepoint, chars); + bi += 4; + } + + new_size += sz; escape = false; } @@ -77,6 +109,7 @@ struct s8 unescape_json_string(struct s8 input) { continue; } + size_t skip = 1; if (escape) { switch (b) { case 'b': @@ -97,6 +130,14 @@ struct s8 unescape_json_string(struct s8 input) { case 't': buf[bufi] = '\t'; break; + case 'u': { + uint8_t chars[4] = {0}; + uint32_t codepoint = codepoint_from_hex(&input.s[bi + 1]); + size_t size = utf8_encode(codepoint, chars); + memcpy(&buf[bufi], chars, size); + skip = size; + bi += 4; + } break; case '"': buf[bufi] = '"'; break; @@ -108,7 +149,7 @@ struct s8 unescape_json_string(struct s8 input) { } escape = false; - ++bufi; + bufi += skip; } return (struct s8){ diff --git a/src/dged/utf8.c b/src/dged/utf8.c index b47f5fc..cc5a66e 100644 --- a/src/dged/utf8.c +++ b/src/dged/utf8.c @@ -36,7 +36,7 @@ static const uint8_t utf8d[] = { // clang-format on /* - * emoji decoding algorithm from + * unicode decoding algorithm from * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */ static enum utf8_state decode(enum utf8_state *state, uint32_t *codep, @@ -141,3 +141,27 @@ uint32_t unicode_visual_char_width(const struct codepoint *codepoint) { return 0; } } + +size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]) { + if (codepoint <= 0x7F) { + buf[0] = (uint8_t)codepoint & 0xff; + return 1; + } else if (codepoint <= 0x7FF) { + buf[0] = 0xC0 | (codepoint >> 6); + buf[1] = 0x80 | (codepoint & 0x3F); + return 2; + } else if (codepoint <= 0xFFFF) { + buf[0] = 0xE0 | (codepoint >> 12); + buf[1] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[2] = 0x80 | (codepoint & 0x3F); + return 3; + } else if (codepoint <= 0x10FFFF) { + buf[0] = 0xF0 | (codepoint >> 18); + buf[1] = 0x80 | ((codepoint >> 12) & 0x3F); + buf[2] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[3] = 0x80 | (codepoint & 0x3F); + return 4; + } + + return 0; +} diff --git a/src/dged/utf8.h b/src/dged/utf8.h index 150fe02..b91e7fd 100644 --- a/src/dged/utf8.h +++ b/src/dged/utf8.h @@ -2,6 +2,7 @@ #define _UTF8_H #include <stdbool.h> +#include <stddef.h> #include <stdint.h> struct codepoint { @@ -35,4 +36,6 @@ bool utf8_byte_is_unicode_continuation(uint8_t byte); bool utf8_byte_is_unicode(uint8_t byte); bool utf8_byte_is_ascii(uint8_t byte); +size_t utf8_encode(uint32_t codepoint, uint8_t buf[4]); + #endif |
