diff options
| author | Marc André Tanner <mat@brain-dump.org> | 2016-08-07 22:27:23 +0200 |
|---|---|---|
| committer | Marc André Tanner <mat@brain-dump.org> | 2016-08-07 22:27:23 +0200 |
| commit | 2b4a550d432e5e570bb888ce06440a825c2e2e7c (patch) | |
| tree | 20ffcd1ebc90f2c7d07cad7b10581c27b69ee038 | |
| parent | bec4efe8bbc2956b026438611f9340349d980388 (diff) | |
| download | vis-2b4a550d432e5e570bb888ce06440a825c2e2e7c.tar.gz vis-2b4a550d432e5e570bb888ce06440a825c2e2e7c.tar.xz | |
text-regex: improve searching in binary data
The regex(3) API we currently use matches on NUL-terminated strings.
Therefore it does not work for binary data. This commit adds loops
to manually skip over NUL bytes. While this does not work for patterns
that would match strings containing NUL bytes, it should improve the
handling of the most basic cases.
Binary file handling will need further improvements in the future.
Fixes #359.
| -rw-r--r-- | text-regex.c | 63 |
1 files changed, 43 insertions, 20 deletions
diff --git a/text-regex.c b/text-regex.c index 640ead5..dd541d7 100644 --- a/text-regex.c +++ b/text-regex.c @@ -38,13 +38,25 @@ int text_search_range_forward(Text *txt, size_t pos, size_t len, Regex *r, size_ char *buf = text_bytes_alloc0(txt, pos, len); if (!buf) return REG_NOMATCH; + char *cur = buf, *end = buf + len; + int ret = REG_NOMATCH; regmatch_t match[nmatch]; - int ret = regexec(&r->regex, buf, nmatch, match, eflags); - if (!ret) { - for (size_t i = 0; i < nmatch; i++) { - pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; - pmatch[i].end = match[i].rm_eo == -1 ? EPOS : pos + match[i].rm_eo; + for (size_t junk = len; len > 0; len -= junk, pos += junk) { + ret = regexec(&r->regex, cur, nmatch, match, eflags); + if (!ret) { + for (size_t i = 0; i < nmatch; i++) { + pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; + pmatch[i].end = match[i].rm_eo == -1 ? EPOS : pos + match[i].rm_eo; + } + break; } + char *next = memchr(cur, 0, len); + if (!next) + break; + while (!*next && next != end) + next++; + junk = next - cur; + cur = next; } free(buf); return ret; @@ -54,25 +66,36 @@ int text_search_range_backward(Text *txt, size_t pos, size_t len, Regex *r, size char *buf = text_bytes_alloc0(txt, pos, len); if (!buf) return REG_NOMATCH; - regmatch_t match[nmatch]; - char *cur = buf; + char *cur = buf, *end = buf + len; int ret = REG_NOMATCH; - while (!regexec(&r->regex, cur, nmatch, match, eflags)) { - ret = 0; - for (size_t i = 0; i < nmatch; i++) { - pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + (size_t)(cur - buf) + match[i].rm_so; - pmatch[i].end = match[i].rm_eo == -1 ? 
EPOS : pos + (size_t)(cur - buf) + match[i].rm_eo; - } - if (match[0].rm_so == 0 && match[0].rm_eo == 0) { - /* empty match at the beginning of cur, advance to next line */ - if ((cur = strchr(cur, '\n'))) - cur++; - else - break; + regmatch_t match[nmatch]; + for (size_t junk = len; len > 0; len -= junk, pos += junk) { + char *next; + if (!regexec(&r->regex, cur, nmatch, match, eflags)) { + ret = 0; + for (size_t i = 0; i < nmatch; i++) { + pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; + pmatch[i].end = match[i].rm_eo == -1 ? EPOS : pos + match[i].rm_eo; + } + if (match[0].rm_so == 0 && match[0].rm_eo == 0) { + /* empty match at the beginning of cur, advance to next line */ + next = strchr(cur, '\n'); + if (!next) + break; + next++; + } else { + next = cur + match[0].rm_eo; + } } else { - cur += match[0].rm_eo; + next = memchr(cur, 0, len); + if (!next) + break; + while (!*next && next != end) + next++; } + junk = next - cur; + cur = next; } free(buf); return ret; |
