From 51e92f0c8e7b50c684287bea1a55edbde128053f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc=20Andr=C3=A9=20Tanner?= <mat@brain-dump.org>
Date: Mon, 23 Nov 2015 11:10:38 +0100
Subject: text: introduce functions to iterate over graphemes

They currently treat any character for which wcwidth(3)
returns 0 as a combining character.
---
 text-motions.h |  8 ++++----
 text.c         | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 text.h         |  7 ++++++-
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/text-motions.h b/text-motions.h
index d65bdf2..57aa09e 100644
--- a/text-motions.h
+++ b/text-motions.h
@@ -12,7 +12,7 @@
 size_t text_begin(Text*, size_t pos);
 size_t text_end(Text*, size_t pos);
 
-/* move to start of next / previous UTF-8 character */
+/* char refers to a grapheme (might skip over multiple Unicode codepoints) */
 size_t text_char_next(Text*, size_t pos);
 size_t text_char_prev(Text*, size_t pos);
 
@@ -39,11 +39,11 @@ size_t text_line_lastchar(Text*, size_t pos);
 size_t text_line_end(Text*, size_t pos);
 size_t text_line_next(Text*, size_t pos);
 size_t text_line_offset(Text*, size_t pos, size_t off);
-/* get character count of the line upto `pos' */
+/* get grapheme count of the line up to `pos' */
 int text_line_char_get(Text*, size_t pos);
-/* get position of the `count' character in the line containing `pos' */
+/* get position of the `count'-th grapheme in the line containing `pos' */
 size_t text_line_char_set(Text*, size_t pos, int count);
-/* move to the next/previous character on the same line */
+/* move to the next/previous grapheme on the same line */
 size_t text_line_char_next(Text*, size_t pos);
 size_t text_line_char_prev(Text*, size_t pos);
 /* move to the next/previous empty line */
diff --git a/text.c b/text.c
index 1e1c968..2de96b8 100644
--- a/text.c
+++ b/text.c
@@ -20,6 +20,7 @@
 #include <time.h>
 #include <fcntl.h>
 #include <errno.h>
+#include <wchar.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
@@ -1333,7 +1334,7 @@ bool text_iterator_byte_prev(Iterator *it, char *b) {
 	return true;
 }
 
-bool text_iterator_char_next(Iterator *it, char *c) {
+bool text_iterator_codepoint_next(Iterator *it, char *c) {
 	while (text_iterator_byte_next(it, NULL)) {
 		if (ISUTF8(*it->text)) {
 			if (c)
@@ -1344,7 +1345,7 @@ bool text_iterator_char_next(Iterator *it, char *c) {
 	return false;
 }
 
-bool text_iterator_char_prev(Iterator *it, char *c) {
+bool text_iterator_codepoint_prev(Iterator *it, char *c) {
 	while (text_iterator_byte_prev(it, NULL)) {
 		if (ISUTF8(*it->text)) {
 			if (c)
@@ -1355,6 +1356,58 @@ bool text_iterator_char_prev(Iterator *it, char *c) {
 	return false;
 }
 
+/* Move to the next grapheme: step one codepoint, then keep stepping while
+ * the codepoint at the new position has wcwidth(3) == 0. Every step passes
+ * c on to text_iterator_codepoint_next. Returns false if a step fails or
+ * the bytes at the new position form an incomplete multibyte sequence. */
+bool text_iterator_char_next(Iterator *it, char *c) {
+	if (!text_iterator_codepoint_next(it, c))
+		return false;
+	mbstate_t ps = { 0 };
+	for (;;) {
+		char buf[MB_CUR_MAX];
+		size_t len = text_bytes_get(it->piece->text, it->pos, sizeof buf, buf);
+		wchar_t wc;
+		size_t wclen = mbrtowc(&wc, buf, len, &ps);
+		/* wc is unset on any error: never let it reach wcwidth */
+		if (wclen == (size_t)-1)
+			return true;
+		if (wclen == (size_t)-2)
+			return false;
+		/* NUL or a non zero width codepoint starts a new grapheme */
+		if (wclen == 0 || wcwidth(wc) != 0)
+			return true;
+		if (!text_iterator_codepoint_next(it, c))
+			return false;
+	}
+}
+
+/* Move to the previous grapheme: step back one codepoint, then keep stepping
+ * back while the codepoint at the new position has wcwidth(3) == 0. Every
+ * step passes c on to text_iterator_codepoint_prev. Returns false if a step
+ * fails or the bytes at the new position form an incomplete sequence. */
+bool text_iterator_char_prev(Iterator *it, char *c) {
+	if (!text_iterator_codepoint_prev(it, c))
+		return false;
+	for (;;) {
+		char buf[MB_CUR_MAX];
+		size_t len = text_bytes_get(it->piece->text, it->pos, sizeof buf, buf);
+		wchar_t wc;
+		mbstate_t ps = { 0 };
+		size_t wclen = mbrtowc(&wc, buf, len, &ps);
+		/* wc is unset on any error: never let it reach wcwidth */
+		if (wclen == (size_t)-1)
+			return true;
+		if (wclen == (size_t)-2)
+			return false;
+		/* NUL or a non zero width codepoint starts a new grapheme */
+		if (wclen == 0 || wcwidth(wc) != 0)
+			return true;
+		if (!text_iterator_codepoint_prev(it, c))
+			return false;
+	}
+}
+
 bool text_byte_get(Text *txt, size_t pos, char *buf) {
 	return text_bytes_get(txt, pos, 1, buf);
 }
diff --git a/text.h b/text.h
index d338e0b..685079e 100644
--- a/text.h
+++ b/text.h
@@ -84,7 +84,12 @@ bool text_iterator_byte_prev(Iterator*, char *b);
 /* if the new position is at EOF a NUL byte (which is not actually
  * part of the file) is read. */
 bool text_iterator_byte_next(Iterator*, char *b);
-
+/* move to the next/previous UTF-8 encoded Unicode codepoint
+ * and set c (if it is non NULL) to the first byte */
+bool text_iterator_codepoint_next(Iterator *it, char *c);
+bool text_iterator_codepoint_prev(Iterator *it, char *c);
+/* move to next/previous grapheme i.e. might skip over multiple
+ * Unicode codepoints (e.g. for combining characters) */
 bool text_iterator_char_next(Iterator*, char *c);
 bool text_iterator_char_prev(Iterator*, char *c);
 
-- 
cgit v1.2.3