aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarc André Tanner <mat@brain-dump.org>2015-11-23 11:10:38 +0100
committerMarc André Tanner <mat@brain-dump.org>2015-11-23 11:39:37 +0100
commit51e92f0c8e7b50c684287bea1a55edbde128053f (patch)
tree02eaed9cd71db186569698928c7afeca457e2bd7
parent0667089d47dd0cee5bda83965ff6bbdc2e4fd288 (diff)
downloadvis-51e92f0c8e7b50c684287bea1a55edbde128053f.tar.gz
vis-51e92f0c8e7b50c684287bea1a55edbde128053f.tar.xz
text: introduce functions to iterate over graphemes
They currently consider any character for which wcwidth(3) returns 0 to be a combining character.
-rw-r--r--text-motions.h8
-rw-r--r--text.c57
-rw-r--r--text.h7
3 files changed, 65 insertions, 7 deletions
diff --git a/text-motions.h b/text-motions.h
index d65bdf2..57aa09e 100644
--- a/text-motions.h
+++ b/text-motions.h
@@ -12,7 +12,7 @@
size_t text_begin(Text*, size_t pos);
size_t text_end(Text*, size_t pos);
-/* move to start of next / previous UTF-8 character */
+/* char refers to a grapheme (might skip over multiple Unicode codepoints) */
size_t text_char_next(Text*, size_t pos);
size_t text_char_prev(Text*, size_t pos);
@@ -39,11 +39,11 @@ size_t text_line_lastchar(Text*, size_t pos);
size_t text_line_end(Text*, size_t pos);
size_t text_line_next(Text*, size_t pos);
size_t text_line_offset(Text*, size_t pos, size_t off);
-/* get character count of the line upto `pos' */
+/* get grapheme count of the line up to `pos' */
int text_line_char_get(Text*, size_t pos);
-/* get position of the `count' character in the line containing `pos' */
+/* get position of the `count' grapheme in the line containing `pos' */
size_t text_line_char_set(Text*, size_t pos, int count);
-/* move to the next/previous character on the same line */
+/* move to the next/previous grapheme on the same line */
size_t text_line_char_next(Text*, size_t pos);
size_t text_line_char_prev(Text*, size_t pos);
/* move to the next/previous empty line */
diff --git a/text.c b/text.c
index 1e1c968..2de96b8 100644
--- a/text.c
+++ b/text.c
@@ -20,6 +20,7 @@
#include <time.h>
#include <fcntl.h>
#include <errno.h>
+#include <wchar.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
@@ -1333,7 +1334,7 @@ bool text_iterator_byte_prev(Iterator *it, char *b) {
return true;
}
-bool text_iterator_char_next(Iterator *it, char *c) {
+bool text_iterator_codepoint_next(Iterator *it, char *c) {
while (text_iterator_byte_next(it, NULL)) {
if (ISUTF8(*it->text)) {
if (c)
@@ -1344,7 +1345,7 @@ bool text_iterator_char_next(Iterator *it, char *c) {
return false;
}
-bool text_iterator_char_prev(Iterator *it, char *c) {
+bool text_iterator_codepoint_prev(Iterator *it, char *c) {
while (text_iterator_byte_prev(it, NULL)) {
if (ISUTF8(*it->text)) {
if (c)
@@ -1355,6 +1356,58 @@ bool text_iterator_char_prev(Iterator *it, char *c) {
return false;
}
+/* Advance `it' to the start of the next grapheme.
+ *
+ * The iterator is first moved one codepoint forward. It then keeps moving
+ * forward for as long as the codepoint at the new position has a display
+ * width of zero according to wcwidth(3), i.e. is treated as a combining
+ * character. `c' is handed to text_iterator_codepoint_next() on every step.
+ *
+ * Returns true once the iterator rests on a codepoint which is not
+ * zero-width, on a NUL character or on a byte sequence mbrtowc(3) rejects
+ * as invalid. Returns false if no further codepoint could be reached or if
+ * the bytes at the current position form an incomplete multibyte sequence;
+ * the iterator may already have been moved in that case. */
+bool text_iterator_char_next(Iterator *it, char *c) {
+	if (!text_iterator_codepoint_next(it, c))
+		return false;
+	/* conversion state is reused for all codepoints inspected below */
+	mbstate_t ps = { 0 };
+	for (;;) {
+		char buf[MB_CUR_MAX];
+		size_t len = text_bytes_get(it->piece->text, it->pos, sizeof buf, buf);
+		wchar_t wc;
+		size_t wclen = mbrtowc(&wc, buf, len, &ps);
+		/* On any conversion error `wc' has not been written and must
+		 * not be passed to wcwidth(3), hence errno is not consulted.
+		 * A NUL character ends the grapheme as well. */
+		if (wclen == (size_t)-1 || wclen == 0)
+			return true;
+		/* incomplete multibyte sequence */
+		if (wclen == (size_t)-2)
+			return false;
+		/* anything with a non-zero width starts a new grapheme */
+		if (wcwidth(wc) != 0)
+			return true;
+		/* zero width: belongs to the current grapheme, skip it */
+		if (!text_iterator_codepoint_next(it, c))
+			return false;
+	}
+}
+
+/* Move `it' back to the start of the previous grapheme.
+ *
+ * The iterator is first moved one codepoint backwards. It then keeps moving
+ * backwards for as long as the codepoint at the new position has a display
+ * width of zero according to wcwidth(3), i.e. is treated as a combining
+ * character. `c' is handed to text_iterator_codepoint_prev() on every step.
+ *
+ * Returns true once the iterator rests on a codepoint which is not
+ * zero-width, on a NUL character or on a byte sequence mbrtowc(3) rejects
+ * as invalid. Returns false if no earlier codepoint could be reached or if
+ * the bytes at the current position form an incomplete multibyte sequence;
+ * the iterator may already have been moved in that case. */
+bool text_iterator_char_prev(Iterator *it, char *c) {
+	if (!text_iterator_codepoint_prev(it, c))
+		return false;
+	for (;;) {
+		char buf[MB_CUR_MAX];
+		size_t len = text_bytes_get(it->piece->text, it->pos, sizeof buf, buf);
+		wchar_t wc;
+		/* every codepoint is decoded with a fresh conversion state */
+		mbstate_t ps = { 0 };
+		size_t wclen = mbrtowc(&wc, buf, len, &ps);
+		/* On any conversion error `wc' has not been written and must
+		 * not be passed to wcwidth(3), hence errno is not consulted.
+		 * A NUL character ends the search as well. */
+		if (wclen == (size_t)-1 || wclen == 0)
+			return true;
+		/* incomplete multibyte sequence */
+		if (wclen == (size_t)-2)
+			return false;
+		/* a codepoint with non-zero width is the start of the grapheme */
+		if (wcwidth(wc) != 0)
+			return true;
+		/* zero width: keep looking for the base codepoint */
+		if (!text_iterator_codepoint_prev(it, c))
+			return false;
+	}
+}
+
/* Fetch a single byte at `pos' into `buf' by delegating to text_bytes_get()
 * with a length of 1; the result of that call is returned as a bool. */
bool text_byte_get(Text *txt, size_t pos, char *buf) {
return text_bytes_get(txt, pos, 1, buf);
}
diff --git a/text.h b/text.h
index d338e0b..685079e 100644
--- a/text.h
+++ b/text.h
@@ -84,7 +84,12 @@ bool text_iterator_byte_prev(Iterator*, char *b);
/* if the new position is at EOF a NUL byte (which is not actually
* part of the file) is read. */
bool text_iterator_byte_next(Iterator*, char *b);
-
+/* move to the next/previous UTF-8 encoded Unicode codepoint
+ * and set c (if it is non NULL) to the first byte */
+bool text_iterator_codepoint_next(Iterator *it, char *c);
+bool text_iterator_codepoint_prev(Iterator *it, char *c);
+/* move to next/previous grapheme i.e. might skip over multiple
+ * Unicode codepoints (e.g. for combining characters) */
bool text_iterator_char_next(Iterator*, char *c);
bool text_iterator_char_prev(Iterator*, char *c);