From 2b4a550d432e5e570bb888ce06440a825c2e2e7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Andr=C3=A9=20Tanner?= Date: Sun, 7 Aug 2016 22:27:23 +0200 Subject: text-regex: improve searching in binary data The regex(3) API we currently use matches on NUL-terminated strings. Therefore it does not work for binary data. This commit adds loops to manually skip over NUL bytes. While it does not work for patterns which would match strings containing NUL bytes, it should improve the most basic cases. Binary file handling will need further improvements in the future. Fixes #359. --- text-regex.c | 63 +++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/text-regex.c b/text-regex.c index 640ead5..dd541d7 100644 --- a/text-regex.c +++ b/text-regex.c @@ -38,13 +38,25 @@ int text_search_range_forward(Text *txt, size_t pos, size_t len, Regex *r, size_ char *buf = text_bytes_alloc0(txt, pos, len); if (!buf) return REG_NOMATCH; + char *cur = buf, *end = buf + len; + int ret = REG_NOMATCH; regmatch_t match[nmatch]; - int ret = regexec(&r->regex, buf, nmatch, match, eflags); - if (!ret) { - for (size_t i = 0; i < nmatch; i++) { - pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; - pmatch[i].end = match[i].rm_eo == -1 ? EPOS : pos + match[i].rm_eo; + for (size_t junk = len; len > 0; len -= junk, pos += junk) { + ret = regexec(&r->regex, cur, nmatch, match, eflags); + if (!ret) { + for (size_t i = 0; i < nmatch; i++) { + pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; + pmatch[i].end = match[i].rm_eo == -1 ? 
EPOS : pos + match[i].rm_eo; + } + break; } + char *next = memchr(cur, 0, len); + if (!next) + break; + while (!*next && next != end) + next++; + junk = next - cur; + cur = next; } free(buf); return ret; @@ -54,25 +66,36 @@ int text_search_range_backward(Text *txt, size_t pos, size_t len, Regex *r, size char *buf = text_bytes_alloc0(txt, pos, len); if (!buf) return REG_NOMATCH; - regmatch_t match[nmatch]; - char *cur = buf; + char *cur = buf, *end = buf + len; int ret = REG_NOMATCH; - while (!regexec(&r->regex, cur, nmatch, match, eflags)) { - ret = 0; - for (size_t i = 0; i < nmatch; i++) { - pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + (size_t)(cur - buf) + match[i].rm_so; - pmatch[i].end = match[i].rm_eo == -1 ? EPOS : pos + (size_t)(cur - buf) + match[i].rm_eo; - } - if (match[0].rm_so == 0 && match[0].rm_eo == 0) { - /* empty match at the beginning of cur, advance to next line */ - if ((cur = strchr(cur, '\n'))) - cur++; - else - break; + regmatch_t match[nmatch]; + for (size_t junk = len; len > 0; len -= junk, pos += junk) { + char *next; + if (!regexec(&r->regex, cur, nmatch, match, eflags)) { + ret = 0; + for (size_t i = 0; i < nmatch; i++) { + pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; + pmatch[i].end = match[i].rm_eo == -1 ? EPOS : pos + match[i].rm_eo; + } + if (match[0].rm_so == 0 && match[0].rm_eo == 0) { + /* empty match at the beginning of cur, advance to next line */ + next = strchr(cur, '\n'); + if (!next) + break; + next++; + } else { + next = cur + match[0].rm_eo; + } } else { - cur += match[0].rm_eo; + next = memchr(cur, 0, len); + if (!next) + break; + while (!*next && next != end) + next++; } + junk = next - cur; + cur = next; } free(buf); return ret; -- cgit v1.2.3