From 4ab7e453e26afc6e9f4938c65f89463fbba9e267 Mon Sep 17 00:00:00 2001
From: Albert Cervin <albert@acervin.com>
Date: Fri, 23 Aug 2024 17:07:27 +0200
Subject: Overhaul unicode parsing

It now iterates over the actual Unicode code points instead. This is
better than what it was doing previously, but it is still not entirely
correct with respect to Unicode sequences.

This handling of unicode code points does however make it slightly
easier to handle UTF-16 if needed in the future.

This also adds some long-needed tests for buffer methods.
---
 src/dged/buffer.c | 473 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 288 insertions(+), 185 deletions(-)

(limited to 'src/dged/buffer.c')

diff --git a/src/dged/buffer.c b/src/dged/buffer.c
index 6051f69..1062a47 100644
--- a/src/dged/buffer.c
+++ b/src/dged/buffer.c
@@ -157,6 +157,42 @@ void buffer_static_teardown() {
   }
 }
 
+/* Tab width from the language (or else editor) setting; 4 if unusable. */
+static uint32_t get_tab_width(struct buffer *buffer) {
+  struct setting *setting = lang_setting(&buffer->lang, "tab-width");
+  if (setting == NULL) {
+    setting = settings_get("editor.tab-width");
+  }
+
+  if (setting == NULL || setting->value.type != Setting_Number) {
+    return 4; // setting is missing or is not a number
+  }
+  return setting->value.number_value;
+}
+
+/* Whether `buffer` indents with tabs. Uses the language setting, or the
+ * editor-wide one if that is missing; false unless it holds a boolean. */
+static bool use_tabs(struct buffer *buffer) {
+  struct setting *setting = lang_setting(&buffer->lang, "use-tabs");
+  if (setting == NULL) {
+    setting = settings_get("editor.use-tabs");
+  }
+
+  if (setting == NULL || setting->value.type != Setting_Bool) {
+    return false;
+  }
+  return setting->value.bool_value;
+}
+
+/* Visual width of `codepoint` in columns; a tab counts as `tab_width`. */
+static uint32_t visual_char_width(struct codepoint *codepoint,
+                                  uint32_t tab_width) {
+  const bool is_tab = codepoint->codepoint == '\t';
+
+  // a tab is as wide as the caller says, everything else is looked up
+  return is_tab ? tab_width : unicode_visual_char_width(codepoint);
+}
+
 static struct buffer create_internal(const char *name, char *filename) {
   struct buffer b = (struct buffer){
       .filename = filename,
@@ -185,7 +221,7 @@ static struct buffer create_internal(const char *name, char *filename) {
 
 static void strip_final_newline(struct buffer *b) {
   uint32_t nlines = text_num_lines(b->text);
-  if (nlines > 0 && text_line_length(b->text, nlines - 1) == 0) {
+  if (nlines > 0 && buffer_line_length(b, nlines - 1) == 0) {
     text_delete(b->text, nlines - 1, 0, nlines - 1, 1);
   }
 }
@@ -207,7 +243,7 @@ static void buffer_read_from_file(struct buffer *b) {
       int bytes = fread(buff, 1, 4096, file);
       if (bytes > 0) {
         uint32_t ignore;
-        text_append(b->text, buff, bytes, &ignore, &ignore);
+        text_append(b->text, buff, bytes, &ignore);
       } else if (bytes == 0) {
         break; // EOF
       } else {
@@ -239,70 +275,66 @@ static void write_line(struct text_chunk *chunk, void *userdata) {
   fputc('\n', file);
 }
 
-static bool is_word_break(uint8_t c) {
+static bool is_word_break(const struct codepoint *codepoint) {
+  uint32_t c = codepoint->codepoint;
   return c == ' ' || c == '.' || c == '(' || c == ')' || c == '[' || c == ']' ||
-         c == '{' || c == '}' || c == ';' || c == '<' || c == '>' || c == ':';
+         c == '{' || c == '}' || c == ';' || c == '<' || c == '>' || c == ':' ||
+         c == '"';
 }
 
-static bool is_word_char(uint8_t c) { return !is_word_break(c); }
-
-struct match_result {
-  struct location at;
-  bool found;
-};
-
-static struct match_result find_next_in_line(struct buffer *buffer,
-                                             struct location start,
-                                             bool (*predicate)(uint8_t c)) {
-  struct text_chunk line = text_get_line(buffer->text, start.line);
-  bool found = false;
+static bool is_word_char(const struct codepoint *c) {
+  return !is_word_break(c); // anything that is not a word break
+}
 
-  if (line.nbytes == 0) {
+static struct match_result
+find_next_in_line(struct buffer *buffer, struct location start,
+                  bool (*predicate)(const struct codepoint *c)) {
+  if (text_line_size(buffer->text, start.line) == 0) {
     return (struct match_result){.at = start, .found = false};
   }
 
-  uint32_t bytei = text_col_to_byteindex(buffer->text, start.line, start.col);
-  while (bytei < line.nbytes) {
-    if (predicate(line.text[bytei])) {
+  bool found = false;
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, start.line);
+  uint32_t coli = 0, tab_width = get_tab_width(buffer);
+  struct codepoint *codepoint;
+  while ((codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    if (coli >= start.col && predicate(codepoint)) {
       found = true;
       break;
     }
-    ++bytei;
+
+    coli += visual_char_width(codepoint, tab_width);
   }
 
-  uint32_t target_col = text_byteindex_to_col(buffer->text, start.line, bytei);
   return (struct match_result){
-      .at = (struct location){.line = start.line, .col = target_col},
-      .found = found};
+      .at = (struct location){.line = start.line, .col = coli}, .found = found};
 }
 
-static struct match_result find_prev_in_line(struct buffer *buffer,
-                                             struct location start,
-                                             bool (*predicate)(uint8_t c)) {
-  struct text_chunk line = text_get_line(buffer->text, start.line);
-  bool found = false;
+static struct match_result
+find_prev_in_line(struct buffer *buffer, struct location start,
+                  bool (*predicate)(const struct codepoint *c)) {
 
-  if (line.nbytes == 0) {
+  if (text_line_size(buffer->text, start.line) == 0) {
     return (struct match_result){.at = start, .found = false};
   }
 
-  uint32_t bytei = text_col_to_byteindex(buffer->text, start.line, start.col);
-  while (bytei > 0) {
-    if (predicate(line.text[bytei])) {
+  bool found = false;
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, start.line);
+  uint32_t coli = 0, tab_width = get_tab_width(buffer), found_at;
+  struct codepoint *codepoint;
+  while (coli < start.col && (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    if (predicate(codepoint)) {
       found = true;
-      break;
+      found_at = coli;
     }
-    --bytei;
-  }
-
-  // first byte on line can also be a match
-  if (predicate(line.text[bytei])) {
-    found = true;
+    coli += visual_char_width(codepoint, tab_width);
   }
 
-  uint32_t target_col = text_byteindex_to_col(buffer->text, start.line, bytei);
   return (struct match_result){
-      .at = (struct location){.line = start.line, .col = target_col},
+      .at =
+          (struct location){.line = start.line, .col = found ? found_at : coli},
       .found = found};
 }
 
@@ -315,13 +347,52 @@ static struct text_chunk *copy_region(struct buffer *buffer,
     free(curr->text);
   }
 
+  struct location begin_bytes =
+      buffer_location_to_byte_coords(buffer, region.begin);
+  struct location end_bytes =
+      buffer_location_to_byte_coords(buffer, region.end);
+
   struct text_chunk txt =
-      text_get_region(buffer->text, region.begin.line, region.begin.col,
-                      region.end.line, region.end.col);
+      text_get_region(buffer->text, begin_bytes.line, begin_bytes.col,
+                      end_bytes.line, end_bytes.col);
   *curr = txt;
   return curr;
 }
 
+/* Insert one indentation step at `at`: a tab, or up to 16 spaces. */
+static struct location do_indent(struct buffer *buffer, struct location at,
+                                 uint32_t tab_width, bool use_tabs) {
+  if (use_tabs) {
+    return buffer_add(buffer, at, (uint8_t *)"\t", 1);
+  }
+
+  uint32_t nspaces = tab_width > 16 ? 16 : tab_width; // literal has 16 spaces
+  return buffer_add(buffer, at, (uint8_t *)"                ", nspaces);
+}
+
+/* Convert byte coordinates (line, byte column) into a byte offset from the
+ * start of the buffer, counting one extra byte per line for its newline. */
+static uint64_t to_global_offset(struct buffer *buffer,
+                                 struct location bytecoords) {
+  uint32_t nlines = buffer_num_lines(buffer);
+  if (nlines == 0) {
+    return 0;
+  }
+
+  // accumulate in 64 bits to match the return type; a 32-bit sum would
+  // wrap around for buffers larger than 4 GiB
+  uint64_t byteoff = 0;
+  for (uint32_t l = 0; l < bytecoords.line && l < nlines; ++l) {
+    byteoff += (uint64_t)text_line_size(buffer->text, l) + 1; // +1: newline
+  }
+
+  // lines past the end are clamped to the last line; a column past the end
+  // of that line counts the whole line plus its newline
+  uint32_t l = bytecoords.line < nlines ? bytecoords.line : nlines - 1;
+  uint32_t nbytes = text_line_size(buffer->text, l);
+  byteoff += bytecoords.col <= nbytes ? bytecoords.col : (uint64_t)nbytes + 1;
+  return byteoff;
+}
+
 /* --------------------- buffer methods -------------------- */
 
 struct buffer buffer_create(const char *name) {
@@ -452,18 +523,29 @@ struct location buffer_add(struct buffer *buffer, struct location at,
   struct location initial = at;
   struct location final = at;
 
-  uint32_t lines_added, cols_added;
-  text_insert_at(buffer->text, initial.line, initial.col, text, nbytes,
-                 &lines_added, &cols_added);
+  struct location at_bytes = buffer_location_to_byte_coords(buffer, at);
+
+  uint32_t lines_added;
+  text_insert_at(buffer->text, at_bytes.line, at_bytes.col, text, nbytes,
+                 &lines_added);
 
   // move to after inserted text
   if (lines_added > 0) {
     final = buffer_clamp(buffer, (int64_t)at.line + lines_added, 0);
   } else {
+    uint32_t cols_added = 0, tab_width = get_tab_width(buffer);
+    struct utf8_codepoint_iterator iter =
+        create_utf8_codepoint_iterator(text, nbytes, 0);
+    struct codepoint *codepoint;
+    while ((codepoint = utf8_next_codepoint(&iter)) != NULL) {
+      cols_added += visual_char_width(codepoint, tab_width);
+    }
     final =
         buffer_clamp(buffer, (int64_t)at.line, (int64_t)at.col + cols_added);
   }
 
+  struct location final_bytes = buffer_location_to_byte_coords(buffer, final);
+
   undo_push_add(
       &buffer->undo,
       (struct undo_add){.begin = {.row = initial.line, .col = initial.col},
@@ -474,11 +556,17 @@ struct location buffer_add(struct buffer *buffer, struct location at,
                        (struct undo_boundary){.save_point = false});
   }
 
-  uint32_t begin_idx = text_global_idx(buffer->text, initial.line, initial.col);
-  uint32_t end_idx = text_global_idx(buffer->text, final.line, final.col);
+  uint32_t begin_idx = to_global_offset(buffer, at_bytes);
+  uint32_t end_idx = to_global_offset(buffer, final_bytes);
 
   VEC_FOR_EACH(&buffer->hooks->insert_hooks, struct insert_hook * h) {
-    h->callback(buffer, region_new(initial, final), begin_idx, end_idx,
+    h->callback(buffer,
+                (struct edit_location){
+                    .coordinates = region_new(initial, final),
+                    .bytes = region_new(at_bytes, final_bytes),
+                    .global_byte_begin = begin_idx,
+                    .global_byte_end = end_idx,
+                },
                 h->userdata);
   }
 
@@ -488,15 +576,16 @@ struct location buffer_add(struct buffer *buffer, struct location at,
 
 struct location buffer_set_text(struct buffer *buffer, uint8_t *text,
                                 uint32_t nbytes) {
-  uint32_t lines, cols;
+  uint32_t lines_added;
 
   text_clear(buffer->text);
-  text_append(buffer->text, text, nbytes, &lines, &cols);
+  text_append(buffer->text, text, nbytes, &lines_added);
 
   // if last line is empty, remove it
   strip_final_newline(buffer);
 
-  return buffer_clamp(buffer, lines, cols);
+  return buffer_clamp(buffer, lines_added,
+                      buffer_line_length(buffer, lines_added));
 }
 
 void buffer_clear(struct buffer *buffer) { text_clear(buffer->text); }
@@ -524,9 +613,18 @@ struct location buffer_previous_char(struct buffer *buffer,
     }
 
     --dot.line;
-    dot.col = buffer_num_chars(buffer, dot.line);
+    dot.col = buffer_line_length(buffer, dot.line);
   } else {
-    --dot.col;
+    struct utf8_codepoint_iterator iter =
+        text_line_codepoint_iterator(buffer->text, dot.line);
+    struct codepoint *codepoint;
+    uint32_t coli = 0, tab_width = get_tab_width(buffer), last_width = 0;
+    while (coli < dot.col && (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+      last_width = visual_char_width(codepoint, tab_width);
+      coli += last_width;
+    }
+
+    dot.col = coli - last_width;
   }
 
   return dot;
@@ -571,14 +669,14 @@ struct location buffer_previous_line(struct buffer *buffer,
   }
 
   --dot.line;
-  uint32_t nchars = buffer_num_chars(buffer, dot.line);
+  uint32_t nchars = buffer_line_length(buffer, dot.line);
   uint32_t new_col = dot.col > nchars ? nchars : dot.col;
 
   return dot;
 }
 
 struct location buffer_next_char(struct buffer *buffer, struct location dot) {
-  if (dot.col == buffer_num_chars(buffer, dot.line)) {
+  if (dot.col == buffer_line_length(buffer, dot.line)) {
     uint32_t lastline = buffer->lazy_row_add ? buffer_num_lines(buffer)
                                              : buffer_num_lines(buffer) - 1;
     if (dot.line == lastline) {
@@ -588,7 +686,16 @@ struct location buffer_next_char(struct buffer *buffer, struct location dot) {
     dot.col = 0;
     ++dot.line;
   } else {
-    ++dot.col;
+    struct utf8_codepoint_iterator iter =
+        text_line_codepoint_iterator(buffer->text, dot.line);
+    struct codepoint *codepoint;
+    uint32_t coli = 0;
+    while (coli <= dot.col &&
+           (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+      coli += visual_char_width(codepoint, get_tab_width(buffer));
+    }
+
+    dot.col = coli;
   }
 
   return dot;
@@ -635,7 +742,7 @@ struct location buffer_next_line(struct buffer *buffer, struct location dot) {
 
   ++dot.line;
   uint32_t new_col = dot.col;
-  uint32_t nchars = buffer_num_chars(buffer, dot.line);
+  uint32_t nchars = buffer_line_length(buffer, dot.line);
   new_col = new_col > nchars ? nchars : new_col;
 
   return dot;
@@ -664,8 +771,8 @@ struct location buffer_clamp(struct buffer *buffer, int64_t line, int64_t col) {
   // clamp col
   if (col < 0) {
     col = 0;
-  } else if (col > buffer_num_chars(buffer, line)) {
-    col = buffer_num_chars(buffer, line);
+  } else if (col > buffer_line_length(buffer, line)) {
+    col = buffer_line_length(buffer, line);
   }
 
   location.col = col;
@@ -681,7 +788,7 @@ struct location buffer_end(struct buffer *buffer) {
     return (struct location){.line = nlines, .col = 0};
   } else {
     return (struct location){.line = nlines - 1,
-                             .col = buffer_num_chars(buffer, nlines - 1)};
+                             .col = buffer_line_length(buffer, nlines - 1)};
   }
 }
 
@@ -689,55 +796,22 @@ uint32_t buffer_num_lines(struct buffer *buffer) {
   return text_num_lines(buffer->text);
 }
 
-uint32_t buffer_num_chars(struct buffer *buffer, uint32_t line) {
-  if (line >= buffer_num_lines(buffer)) {
-    return 0;
+uint32_t buffer_line_length(struct buffer *buffer, uint32_t line) {
+  uint32_t tab_size = get_tab_width(buffer), len = 0;
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, line);
+  struct codepoint *codepoint;
+  while ((codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    len += visual_char_width(codepoint, tab_size);
   }
 
-  return text_line_length(buffer->text, line);
+  return len;
 }
 
 struct location buffer_newline(struct buffer *buffer, struct location at) {
   return buffer_add(buffer, at, (uint8_t *)"\n", 1);
 }
 
-static uint32_t get_tab_width(struct buffer *buffer) {
-  struct setting *tw = lang_setting(&buffer->lang, "tab-width");
-  if (tw == NULL) {
-    tw = settings_get("editor.tab-width");
-  }
-
-  uint32_t tab_width = 4;
-  if (tw != NULL && tw->value.type == Setting_Number) {
-    tab_width = tw->value.number_value;
-  }
-  return tab_width;
-}
-
-static bool use_tabs(struct buffer *buffer) {
-  struct setting *ut = lang_setting(&buffer->lang, "use-tabs");
-  if (ut == NULL) {
-    ut = settings_get("editor.use-tabs");
-  }
-
-  bool use_tabs = false;
-  if (ut != NULL && ut->value.type == Setting_Bool) {
-    use_tabs = ut->value.bool_value;
-  }
-
-  return use_tabs;
-}
-
-static struct location do_indent(struct buffer *buffer, struct location at,
-                                 uint32_t tab_width, bool use_tabs) {
-  if (use_tabs) {
-    return buffer_add(buffer, at, (uint8_t *)"\t", 1);
-  } else {
-    return buffer_add(buffer, at, (uint8_t *)"                ",
-                      tab_width > 16 ? 16 : tab_width);
-  }
-}
-
 struct location buffer_indent(struct buffer *buffer, struct location at) {
   return do_indent(buffer, at, get_tab_width(buffer), use_tabs(buffer));
 }
@@ -778,16 +852,13 @@ struct location buffer_undo(struct buffer *buffer, struct location dot) {
     case Undo_Add: {
       struct undo_add *add = &rec->add;
 
-      pos =
-          buffer_delete(buffer, (struct region){.begin =
-                                                    (struct location){
-                                                        .line = add->begin.row,
-                                                        .col = add->begin.col,
-                                                    },
-                                                .end = (struct location){
-                                                    .line = add->end.row,
-                                                    .col = add->end.col,
-                                                }});
+      pos = buffer_delete(buffer,
+                          (struct region){
+                              .begin = (struct location){.line = add->begin.row,
+                                                         .col = add->begin.col},
+                              .end = (struct location){.line = add->end.row,
+                                                       .col = add->end.col},
+                          });
 
       break;
     }
@@ -888,9 +959,14 @@ struct location buffer_delete(struct buffer *buffer, struct region region) {
     return region.begin;
   }
 
+  struct location begin_bytes =
+      buffer_location_to_byte_coords(buffer, region.begin);
+  struct location end_bytes =
+      buffer_location_to_byte_coords(buffer, region.end);
+
   struct text_chunk txt =
-      text_get_region(buffer->text, region.begin.line, region.begin.col,
-                      region.end.line, region.end.col);
+      text_get_region(buffer->text, begin_bytes.line, begin_bytes.col,
+                      end_bytes.line, end_bytes.col);
 
   undo_push_boundary(&buffer->undo,
                      (struct undo_boundary){.save_point = false});
@@ -903,17 +979,22 @@ struct location buffer_delete(struct buffer *buffer, struct region region) {
   undo_push_boundary(&buffer->undo,
                      (struct undo_boundary){.save_point = false});
 
-  uint32_t begin_idx =
-      text_global_idx(buffer->text, region.begin.line, region.begin.col);
-  uint32_t end_idx =
-      text_global_idx(buffer->text, region.end.line, region.end.col);
+  uint64_t begin_idx = to_global_offset(buffer, begin_bytes);
+  uint64_t end_idx = to_global_offset(buffer, end_bytes);
 
-  text_delete(buffer->text, region.begin.line, region.begin.col,
-              region.end.line, region.end.col);
+  text_delete(buffer->text, begin_bytes.line, begin_bytes.col, end_bytes.line,
+              end_bytes.col);
   buffer->modified = true;
 
   VEC_FOR_EACH(&buffer->hooks->delete_hooks, struct delete_hook * h) {
-    h->callback(buffer, region, begin_idx, end_idx, h->userdata);
+    h->callback(buffer,
+                (struct edit_location){
+                    .coordinates = region,
+                    .bytes = region_new(begin_bytes, end_bytes),
+                    .global_byte_begin = begin_idx,
+                    .global_byte_end = end_idx,
+                },
+                h->userdata);
   }
 
   return region.begin;
@@ -1035,27 +1116,6 @@ struct cmdbuf {
   struct buffer *buffer;
 };
 
-static uint32_t visual_char_width(uint8_t *byte, uint32_t maxlen) {
-  if (*byte == '\t') {
-    return 4;
-  } else {
-    return utf8_visual_char_width(byte, maxlen);
-  }
-}
-
-uint32_t visual_string_width(uint8_t *txt, uint32_t len, uint32_t start_col,
-                             uint32_t end_col) {
-  uint32_t start_byte = utf8_nbytes(txt, len, start_col);
-  uint32_t end_byte = utf8_nbytes(txt, len, end_col);
-
-  uint32_t width = 0;
-  for (uint32_t bytei = start_byte; bytei < end_byte; ++bytei) {
-    width += visual_char_width(&txt[bytei], len - bytei);
-  }
-
-  return width;
-}
-
 static void apply_properties(struct command_list *cmds,
                              struct text_property *properties[],
                              uint32_t nproperties) {
@@ -1097,65 +1157,67 @@ void render_line(struct text_chunk *line, void *userdata) {
   command_list_set_show_whitespace(cmdbuf->cmds, cmdbuf->show_ws);
 
   // calculate scroll offsets
-  uint32_t scroll_bytes =
-      utf8_nbytes(line->text, line->nbytes, cmdbuf->origin.col);
-  uint32_t text_nbytes_scroll =
-      scroll_bytes > line->nbytes ? 0 : line->nbytes - scroll_bytes;
-  uint8_t *text = line->text + scroll_bytes;
-
-  uint32_t visual_col_start = 0;
-  uint32_t cur_visual_col = 0;
-  uint32_t start_byte = 0, text_nbytes = 0;
   struct text_property *properties[32] = {0};
   uint64_t prev_properties_hash = 0;
 
-  for (uint32_t cur_byte = start_byte, coli = 0;
-       cur_byte < text_nbytes_scroll && cur_visual_col < cmdbuf->width &&
-       coli < line->nchars - cmdbuf->origin.col;
-       ++coli) {
+  uint32_t tab_width = get_tab_width(cmdbuf->buffer);
+
+  // handle scroll column offset
+  uint32_t coli = 0, bytei = 0;
+  struct utf8_codepoint_iterator iter = text_chunk_codepoint_iterator(line);
+  struct codepoint *codepoint;
+  while (coli < cmdbuf->origin.col &&
+         (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    coli += visual_char_width(codepoint, tab_width);
+    bytei += codepoint->nbytes;
+  }
 
-    uint32_t bytes_remaining = text_nbytes_scroll - cur_byte;
-    uint32_t char_nbytes = utf8_nbytes(text + cur_byte, bytes_remaining, 1);
-    uint32_t char_vwidth = visual_char_width(text + cur_byte, bytes_remaining);
+  // coli is the visual column [0..width-1]
+  coli = 0;
+  uint32_t drawn_bytei = bytei;
+  uint32_t drawn_coli = coli;
 
+  while (coli < cmdbuf->width &&
+         (codepoint = utf8_next_codepoint(&iter)) != NULL) {
     // calculate character properties
     uint32_t nproperties = 0;
-    text_get_properties(
-        cmdbuf->buffer->text,
-        (struct location){.line = line->line, .col = coli + cmdbuf->origin.col},
-        properties, 32, &nproperties);
+    text_get_properties(cmdbuf->buffer->text, line->line, bytei, properties, 32,
+                        &nproperties);
 
     // if we have any new or lost props, flush text up until now, reset
     // and re-apply current properties
     uint64_t new_properties_hash = properties_hash(properties, nproperties);
     if (new_properties_hash != prev_properties_hash) {
-      command_list_draw_text(cmdbuf->cmds, visual_col_start, visual_line,
-                             text + start_byte, cur_byte - start_byte);
+      command_list_draw_text(cmdbuf->cmds, drawn_coli, visual_line,
+                             line->text + drawn_bytei, bytei - drawn_bytei);
       command_list_reset_color(cmdbuf->cmds);
 
-      visual_col_start = cur_visual_col;
-      start_byte = cur_byte;
+      drawn_coli = coli;
+      drawn_bytei = bytei;
 
       // apply new properties
       apply_properties(cmdbuf->cmds, properties, nproperties);
     }
 
     prev_properties_hash = new_properties_hash;
-    cur_byte += char_nbytes;
-    text_nbytes += char_nbytes;
-    cur_visual_col += char_vwidth;
+    bytei += codepoint->nbytes;
+    coli += visual_char_width(codepoint, tab_width);
   }
 
   // flush remaining
-  command_list_draw_text(cmdbuf->cmds, visual_col_start, visual_line,
-                         text + start_byte, text_nbytes - start_byte);
+  command_list_draw_text(cmdbuf->cmds, drawn_coli, visual_line,
+                         line->text + drawn_bytei, bytei - drawn_bytei);
+
+  drawn_coli = coli;
+  drawn_bytei = bytei;
 
   command_list_reset_color(cmdbuf->cmds);
   command_list_set_show_whitespace(cmdbuf->cmds, false);
 
-  if (cur_visual_col < cmdbuf->width) {
-    command_list_draw_repeated(cmdbuf->cmds, cur_visual_col, visual_line, ' ',
-                               cmdbuf->width - cur_visual_col);
+  // TODO: considering the whole screen is cleared, is this really needed?
+  if (drawn_coli < cmdbuf->width) {
+    command_list_draw_repeated(cmdbuf->cmds, drawn_coli, visual_line, ' ',
+                               cmdbuf->width - drawn_coli);
   }
 }
 
@@ -1200,19 +1262,19 @@ void buffer_render(struct buffer *buffer, struct buffer_render_params *params) {
 void buffer_add_text_property(struct buffer *buffer, struct location start,
                               struct location end,
                               struct text_property property) {
-  text_add_property(
-      buffer->text, (struct location){.line = start.line, .col = start.col},
-      (struct location){.line = end.line, .col = end.col}, property);
+  struct location bytestart = buffer_location_to_byte_coords(buffer, start);
+  struct location byteend = buffer_location_to_byte_coords(buffer, end);
+  text_add_property(buffer->text, bytestart.line, bytestart.col, byteend.line,
+                    byteend.col, property);
 }
 
 void buffer_get_text_properties(struct buffer *buffer, struct location location,
                                 struct text_property **properties,
                                 uint32_t max_nproperties,
                                 uint32_t *nproperties) {
-  text_get_properties(
-      buffer->text,
-      (struct location){.line = location.line, .col = location.col}, properties,
-      max_nproperties, nproperties);
+  struct location bytecoords = buffer_location_to_byte_coords(buffer, location);
+  text_get_properties(buffer->text, bytecoords.line, bytecoords.col, properties,
+                      max_nproperties, nproperties);
 }
 
 void buffer_clear_text_properties(struct buffer *buffer) {
@@ -1244,9 +1306,12 @@ void buffer_sort_lines(struct buffer *buffer, uint32_t start_line,
                  (struct location){.line = end + 1, .col = 0});
 
   struct s8 *lines = (struct s8 *)malloc(sizeof(struct s8) * ntosort);
-  struct text_chunk txt =
-      text_get_region(buffer->text, region.begin.line, region.begin.col,
-                      region.end.line, region.end.col);
+
+  struct location bytebeg =
+      buffer_location_to_byte_coords(buffer, region.begin);
+  struct location byteend = buffer_location_to_byte_coords(buffer, region.end);
+  struct text_chunk txt = text_get_region(
+      buffer->text, bytebeg.line, bytebeg.col, byteend.line, byteend.col);
 
   uint32_t line_start = 0;
   uint32_t curr_line = 0;
@@ -1278,3 +1343,41 @@ void buffer_sort_lines(struct buffer *buffer, uint32_t start_line,
     free(txt.text);
   }
 }
+
+/* Translate a (line, visual column) location into (line, byte offset). */
+struct location buffer_location_to_byte_coords(struct buffer *buffer,
+                                               struct location coords) {
+  struct utf8_codepoint_iterator iter =
+      text_line_codepoint_iterator(buffer->text, coords.line);
+  uint32_t byteoffset = 0, col = 0, tab_width = get_tab_width(buffer);
+  struct codepoint *codepoint;
+
+  /* Walk up to and including the codepoint at the target column so that
+   * zero-width codepoints are accounted for in the byte offset.
+   */
+  while (col <= coords.col &&
+         (codepoint = utf8_next_codepoint(&iter)) != NULL) {
+    byteoffset += codepoint->nbytes;
+    col += visual_char_width(codepoint, tab_width);
+  }
+
+  /* Remove the byte width of the last codepoint again, which gives the
+   * position right before it. If the iterator ran out first, codepoint is
+   * NULL and the accumulated offset is kept as is.
+   */
+  return (struct location){.line = coords.line,
+                           .col = byteoffset -
+                                  (codepoint != NULL ? codepoint->nbytes : 0)};
+}
+
+struct match_result
+buffer_find_prev_in_line(struct buffer *buffer, struct location start,
+                         bool (*predicate)(const struct codepoint *c)) {
+  return find_prev_in_line(buffer, start, predicate); // non-static wrapper
+}
+
+struct match_result
+buffer_find_next_in_line(struct buffer *buffer, struct location start,
+                         bool (*predicate)(const struct codepoint *c)) {
+  return find_next_in_line(buffer, start, predicate); // non-static wrapper
+}
-- 
cgit v1.2.3