From 53f84f7cbafcb177406f8f7bcc890e626e72ca63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc=20Andr=C3=A9=20Tanner?= Date: Mon, 12 Dec 2016 12:02:20 +0100 Subject: text-regex: add regex backend based on libtre While memory consumption should be improved, backward searches will still be slow, because they are implemented in terms of repeated forward searches. It needs to be investigated whether the underlying automaton can have its transitions reversed and essentially run backwards, as is the case in sam. --- Makefile | 12 ++++-- README.md | 1 + configure | 52 ++++++++++++++++++++++++++ text-regex-tre.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ text-regex.h | 5 +++ vis-cmds.c | 1 + 6 files changed, 179 insertions(+), 4 deletions(-) create mode 100644 text-regex-tre.c diff --git a/Makefile b/Makefile index 18bf93d..164acae 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,11 @@ -include config.mk +REGEX_SRC ?= text-regex.c + SRC = array.c buffer.c libutf.c main.c map.c register.c ring-buffer.c \ - sam.c text.c text-motions.c text-objects.c text-regex.c text-util.c \ + sam.c text.c text-motions.c text-objects.c text-util.c \ ui-curses.c view.c vis.c vis-lua.c vis-modes.c vis-motions.c \ - vis-operators.c vis-prompt.c vis-text-objects.c + vis-operators.c vis-prompt.c vis-text-objects.c $(REGEX_SRC) EXECUTABLES = vis vis-clipboard vis-complete vis-menu vis-open @@ -19,6 +21,7 @@ VERSION = $(shell git describe --always --dirty 2>/dev/null || echo "0.2-git") CONFIG_LUA ?= 1 CONFIG_LPEG ?= 0 +CONFIG_TRE ?= 0 CONFIG_ACL ?= 0 CONFIG_SELINUX ?= 0 @@ -27,16 +30,17 @@ CFLAGS_STD += -DVERSION=\"${VERSION}\" LDFLAGS_STD ?= -lc CFLAGS_VIS = $(CFLAGS_AUTO) $(CFLAGS_TERMKEY) $(CFLAGS_CURSES) $(CFLAGS_ACL) \ - $(CFLAGS_SELINUX) $(CFLAGS_LUA) $(CFLAGS_LPEG) $(CFLAGS_STD) + $(CFLAGS_SELINUX) $(CFLAGS_TRE) $(CFLAGS_LUA) $(CFLAGS_LPEG) $(CFLAGS_STD) CFLAGS_VIS += -DVIS_PATH=\"${SHAREPREFIX}/vis\" CFLAGS_VIS += -DCONFIG_LUA=${CONFIG_LUA} CFLAGS_VIS += 
-DCONFIG_LPEG=${CONFIG_LPEG} +CFLAGS_VIS += -DCONFIG_TRE=${CONFIG_TRE} CFLAGS_VIS += -DCONFIG_SELINUX=${CONFIG_SELINUX} CFLAGS_VIS += -DCONFIG_ACL=${CONFIG_ACL} LDFLAGS_VIS = $(LDFLAGS_AUTO) $(LDFLAGS_TERMKEY) $(LDFLAGS_CURSES) $(LDFLAGS_ACL) \ - $(LDFLAGS_SELINUX) $(LDFLAGS_LUA) $(LDFLAGS_LPEG) $(LDFLAGS_STD) + $(LDFLAGS_SELINUX) $(LDFLAGS_TRE) $(LDFLAGS_LUA) $(LDFLAGS_LPEG) $(LDFLAGS_STD) STRIP?=strip diff --git a/README.md b/README.md index 7e9bfbc..ca8206a 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ compatible environment as well as: * [Lua](http://www.lua.org/) >= 5.2 (optional) * [LPeg](http://www.inf.puc-rio.br/~roberto/lpeg/) >= 0.12 (optional runtime dependency required for syntax highlighting) + * [TRE](http://laurikari.net/tre/) (optional for more memory efficient regex search) Assuming these dependencies are met, execute: diff --git a/configure b/configure index dc8f278..fe85ccf 100755 --- a/configure +++ b/configure @@ -25,6 +25,7 @@ Fine tuning of the installation directories: Optional features: --enable-lua build with Lua support [auto] --enable-lpeg build with support for statically linking to LPeg [auto] + --enable-tre build with TRE regex support [auto] --enable-selinux build with SELinux support [auto] --enable-acl build with POSIX ACL support [auto] @@ -115,6 +116,7 @@ MANDIR='$(PREFIX)/share/man' lua=auto lpeg=auto +tre=auto selinux=auto acl=auto @@ -133,6 +135,8 @@ case "$arg" in --disable-lua|--enable-lua=no) lua=no ;; --enable-lpeg|--enable-lpeg=yes) lpeg=yes ;; --disable-lpeg|--enable-lpeg=no) lpeg=no ;; +--enable-tre|--enable-tre=yes) tre=yes ;; +--disable-tre|--enable-tre=no) tre=no ;; --enable-selinux|--enable-selinux=yes) selinux=yes ;; --disable-selinux|--enable-selinux=no) selinux=no ;; --enable-acl|--enable-acl=yes) acl=yes ;; @@ -353,6 +357,50 @@ else fail "$0: cannot find libtermkey" fi +CONFIG_TRE=0 +REGEX_SRC=text-regex.c + +if test "$tre" != "no" ; then + + printf "checking for libtre... 
" + +cat > "$tmpc" <<EOF +#include <tre/tre.h> + +int main() { + regex_t preg; + tre_str_source *source = NULL; + regmatch_t pmatch[1]; + tre_regcomp(&preg, "\0", REG_EXTENDED); + tre_reguexec(&preg, source, 1, pmatch, 0); + tre_regfree(&preg); + return 0; +} +EOF + + if test "$have_pkgconfig" = "yes" ; then + CFLAGS_TRE=$(pkg-config --cflags tre 2>/dev/null) + LDFLAGS_TRE=$(pkg-config --libs tre 2>/dev/null) + fi + + if test -z "$LDFLAGS_TRE"; then + CFLAGS_TRE="" + LDFLAGS_TRE="-ltre" + fi + + if $CC $CFLAGS $CFLAGS_TRE "$tmpc" \ + $LDFLAGS $LDFLAGS_TRE -o "$tmpo" >/dev/null 2>&1; then + CONFIG_TRE=1 + REGEX_SRC=text-regex-tre.c + printf "%s\n" "yes" + else + printf "%s\n" "no" + CFLAGS_TRE="" + LDFLAGS_TRE="" + test "$tre" = "yes" && fail "$0: cannot find libtre" + fi +fi + CONFIG_LUA=0 # enabling builtin lpeg requires lua support @@ -537,6 +585,10 @@ CFLAGS_CURSES = $CFLAGS_CURSES LDFLAGS_CURSES = $LDFLAGS_CURSES CFLAGS_TERMKEY = $CFLAGS_TERMKEY LDFLAGS_TERMKEY = $LDFLAGS_TERMKEY +REGEX_SRC = $REGEX_SRC +CONFIG_TRE = $CONFIG_TRE +CFLAGS_TRE = $CFLAGS_TRE +LDFLAGS_TRE = $LDFLAGS_TRE CONFIG_LUA = $CONFIG_LUA CFLAGS_LUA = $CFLAGS_LUA LDFLAGS_LUA = $LDFLAGS_LUA diff --git a/text-regex-tre.c b/text-regex-tre.c new file mode 100644 index 0000000..d45252a --- /dev/null +++ b/text-regex-tre.c @@ -0,0 +1,112 @@ +#include <stdlib.h> +#include <string.h> + +#include "text-regex.h" +#include "text-motions.h" + +struct Regex { + regex_t regex; + tre_str_source str_source; + Text *text; + Iterator it; + size_t end; +}; + +size_t text_regex_nsub(Regex *r) { + if (!r) + return 0; + return r->regex.re_nsub; +} + +static int str_next_char(tre_char_t *c, unsigned int *pos_add, void *context) { + Regex *r = context; + text_iterator_byte_get(&r->it, (char*)c); + return r->it.pos < r->end && text_iterator_byte_next(&r->it, NULL) ? 
0 : 1; +} + +static void str_rewind(size_t pos, void *context) { + Regex *r = context; + r->it = text_iterator_get(r->text, pos); +} + +static int str_compare(size_t pos1, size_t pos2, size_t len, void *context) { + Regex *r = context; + int ret = 1; + void *buf1 = malloc(len), *buf2 = malloc(len); + if (!buf1 || !buf2) + goto err; + text_bytes_get(r->text, pos1, len, buf1); + text_bytes_get(r->text, pos2, len, buf2); + ret = memcmp(buf1, buf2, len); +err: + free(buf1); + free(buf2); + return ret; +} + +Regex *text_regex_new(void) { + Regex *r = calloc(1, sizeof(*r)); + if (!r) + return NULL; + r->str_source = (tre_str_source) { + .get_next_char = str_next_char, + .rewind = str_rewind, + .compare = str_compare, + .context = r, + }; + return r; +} + +void text_regex_free(Regex *r) { + if (!r) + return; + tre_regfree(&r->regex); + free(r); +} + +int text_regex_compile(Regex *regex, const char *string, int cflags) { + int r = tre_regcomp(&regex->regex, string, cflags); + if (r) + tre_regcomp(&regex->regex, "\0\0", 0); + return r; +} + +int text_regex_match(Regex *r, const char *data, int eflags) { + return tre_regexec(&r->regex, data, 0, NULL, eflags); +} + +int text_search_range_forward(Text *txt, size_t pos, size_t len, Regex *r, size_t nmatch, RegexMatch pmatch[], int eflags) { + r->text = txt; + r->it = text_iterator_get(txt, pos); + r->end = pos+len; + + regmatch_t match[nmatch]; + int ret = tre_reguexec(&r->regex, &r->str_source, nmatch, match, eflags); + if (!ret) { + for (size_t i = 0; i < nmatch; i++) { + pmatch[i].start = match[i].rm_so == -1 ? EPOS : pos + match[i].rm_so; + pmatch[i].end = match[i].rm_eo == -1 ? 
EPOS : pos + match[i].rm_eo; + } + } + return ret; +} + +int text_search_range_backward(Text *txt, size_t pos, size_t len, Regex *r, size_t nmatch, RegexMatch pmatch[], int eflags) { + int ret = REG_NOMATCH; + size_t end = pos + len; + + while (pos < end && !text_search_range_forward(txt, pos, len, r, nmatch, pmatch, eflags)) { + ret = 0; + // FIXME: assumes nmatch >= 1 + size_t next = pmatch[0].end; + if (next == pos) { + next = text_line_next(txt, pos); + if (next == pos) + break; + } + pos = next; + len = end - pos; + } + + return ret; +} diff --git a/text-regex.h b/text-regex.h index 1b2c382..45054c8 100644 --- a/text-regex.h +++ b/text-regex.h @@ -1,7 +1,12 @@ #ifndef TEXT_REGEX_H #define TEXT_REGEX_H +/* make the REG_* constants available */ +#if CONFIG_TRE +#include <tre/tre.h> +#else #include <regex.h> +#endif #include "text.h" typedef struct Regex Regex; diff --git a/vis-cmds.c b/vis-cmds.c index 60e5f91..879eadd 100644 --- a/vis-cmds.c +++ b/vis-cmds.c @@ -741,6 +741,7 @@ static bool cmd_help(Vis *vis, Win *win, Command *cmd, const char *argv[], Curso } configs[] = { { "Lua support: ", CONFIG_LUA }, { "Lua LPeg statically built-in: ", CONFIG_LPEG }, + { "TRE based regex support: ", CONFIG_TRE }, { "POSIX ACL support: ", CONFIG_ACL }, { "SELinux support: ", CONFIG_SELINUX }, }; -- cgit v1.2.3