From 7f3d570cc23cf80163514013af2fc224eeff80c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Andr=C3=A9=20Tanner?= Date: Mon, 3 Jul 2017 14:33:04 +0200 Subject: Fix Unicode regex searches for libtre builds Fix #566 --- text-regex-tre.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/text-regex-tre.c b/text-regex-tre.c index 0ac8b38..3f02dcd 100644 --- a/text-regex-tre.c +++ b/text-regex-tre.c @@ -1,5 +1,7 @@ #include #include +#include +#include #include "text-regex.h" #include "text-motions.h" @@ -20,13 +22,60 @@ size_t text_regex_nsub(Regex *r) { static int str_next_char(tre_char_t *c, unsigned int *pos_add, void *context) { Regex *r = context; - *pos_add = 1; - if (r->it.pos < r->end && text_iterator_byte_get(&r->it, (char*)c)) { - text_iterator_byte_next(&r->it, NULL); - return 0; + Iterator *it = &r->it; + if (TRE_WCHAR) { + mbstate_t ps = { 0 }; + bool eof = false; + size_t start = it->pos; + for (;;) { + if (it->pos >= r->end) { + eof = true; + break; + } + size_t rem = r->end - it->pos; + size_t plen = it->end - it->text; + size_t len = rem < plen ? rem : plen; + size_t wclen = mbrtowc(c, it->text, len, &ps); + if (wclen == (size_t)-1 && errno == EILSEQ) { + *c = L'\0'; + text_iterator_codepoint_next(it, NULL); + break; + } else if (wclen == (size_t)-2) { + if (!text_iterator_next(it)) { + eof = true; + break; + } + } else if (wclen == 0) { + text_iterator_byte_next(it, NULL); + break; + } else { + if (wclen < plen) { + it->text += wclen; + it->pos += wclen; + } else { + text_iterator_next(it); + } + break; + } + } + + if (eof) { + *c = L'\0'; + *pos_add = 1; + return 1; + } else { + *pos_add = it->pos - start; + return 0; + } } else { - *c = '\0'; - return 1; + *pos_add = 1; + if (it->pos < r->end && text_iterator_byte_get(it, (char*)c)) { + text_iterator_byte_next(it, NULL); + return 0; + } else { + *c = '\0'; + return 1; + } } } -- cgit v1.2.3