replace oversized libutf with smaller version

This is taken from one of my other projects. There was no reason for there to be 2x the code. Tests checking for surrogate characters and non-characters were removed. I see no reason why the user shouldn't be allowed to insert those characters in text (they exist in the standard). Also, in the case of non-characters, only two of them (U+FFFE and U+FFFF) were being checked and not the other 64.
author: Randy Palamar <randy@rnpnr.xyz> 2025-12-05 22:36:10 -0700
committer: Randy Palamar <randy@rnpnr.xyz> 2026-01-06 16:52:56 -0700
commit: 0d9bbb74c6de959ab7c6b93b7a97f9f2e643e8e8 (patch)
tree: 25596fd04e3623571a155e9c2b1e2503aa9dd4f6
parent: 6ced61ef5f366001877823ed8aff978035fa53c8 (diff)
download: vis-0d9bbb74c6de959ab7c6b93b7a97f9f2e643e8e8.tar.gz
vis-0d9bbb74c6de959ab7c6b93b7a97f9f2e643e8e8.tar.xz
10 files changed, 49 insertions, 118 deletions
diff --git a/LICENSE b/LICENSE
index 0e027be..02db665 100644
--- a/LICENSE
+++ b/LICENSE
@@ -23,9 +23,6 @@ under terms compatible with the above ISC license:
  - map.[ch] originate from the Comprehensive C Archive Network strmap
    module and are public domain / CC0 licensed
 
- - libutf.[ch] originate from libutf a port of Plan 9's Unicode library
-   to Unix and are MIT licensed
-
  - sam.[ch] is heavily inspired (and partially based upon) the X11
    version of Rob Pike's sam text editor originally written for Plan 9
    and distributed under an ISC-like license
diff --git a/libutf.c b/libutf.c
deleted file mode 100644
index 108595e..0000000
--- a/libutf.c
+++ /dev/null
@@ -1,54 +0,0 @@
-/* libutf8 © 2012-2015 Connor Lane Smith <cls@lubutu.com> */
-#include "util.h"
-
-#include "libutf.h"
-
-int
-runelen(Rune r)
-{
-	if(r <= 0x7F)
-		return 1;
-	else if(r <= 0x07FF)
-		return 2;
-	else if(r <= 0xD7FF)
-		return 3;
-	else if(r <= 0xDFFF)
-		return 0; /* surrogate character */
-	else if(r <= 0xFFFD)
-		return 3;
-	else if(r <= 0xFFFF)
-		return 0; /* illegal character */
-	else if(r <= Runemax)
-		return 4;
-	else
-		return 0; /* rune too large */
-}
-
-int
-runetochar(char *s, const Rune *p)
-{
-	Rune r = *p;
-
-	switch(runelen(r)) {
-	case 1: /* 0aaaaaaa */
-		s[0] = r;
-		return 1;
-	case 2: /* 00000aaa aabbbbbb */
-		s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
-		s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
-		return 2;
-	case 3: /* aaaabbbb bbcccccc */
-		s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
-		s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
-		s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
-		return 3;
-	case 4: /* 000aaabb bbbbcccc ccdddddd */
-		s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
-		s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
-		s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
-		s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
-		return 4;
-	default:
-		return 0; /* error */
-	}
-}
diff --git a/libutf.h b/libutf.h
deleted file mode 100644
index 30255cc..0000000
--- a/libutf.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef LIBUTF_H
-#define LIBUTF_H
-
-/* libutf8 © 2012-2015 Connor Lane Smith <cls@lubutu.com> */
-#include <stddef.h>
-#include <stdint.h>
-
-#if __STDC_VERSION__ >= 201112L
-#include <uchar.h>
-#ifdef __STDC_UTF_32__
-#define RUNE_C INT32_C
-typedef char32_t Rune;
-#endif
-#endif
-
-#ifndef RUNE_C
-#ifdef INT32_C
-#define RUNE_C INT32_C
-typedef uint_least32_t Rune;
-#else
-#define RUNE_C(x) x##L
-typedef unsigned long Rune;
-#endif
-#endif
-
-#define UTFmax 4 /* maximum bytes per rune */
-
-#define Runeself 0x80             /* rune and utf are equal (<) */
-#define Runemax  RUNE_C(0x10FFFF) /* maximum rune value */
-
-VIS_INTERNAL int runelen(Rune r);
-VIS_INTERNAL int runetochar(char *s, const Rune *p);
-
-#endif
diff --git a/main.c b/main.c
index 1aade34..d6bbf28 100644
--- a/main.c
+++ b/main.c
@@ -844,7 +844,7 @@ static KEY_ACTION_FN(ka_replace)
 	if (!next)
 		return NULL;
 
-	char replacement[UTFmax+1];
+	char replacement[4+1];
 	if (!vis_keys_utf8(vis, keys, replacement))
 		return next;
 
@@ -897,7 +897,7 @@ static KEY_ACTION_FN(ka_movement_key)
 	const char *next = vis_keys_next(vis, keys);
 	if (!next)
 		return NULL;
-	char utf8[UTFmax+1];
+	char utf8[4+1];
 	if (vis_keys_utf8(vis, keys, utf8))
 		vis_motion(vis, arg->i, utf8);
 	return next;
@@ -1030,8 +1030,8 @@ static KEY_ACTION_FN(ka_prompt_show)
 
 static KEY_ACTION_FN(ka_insert_verbatim)
 {
-	Rune rune = 0;
-	char buf[4], type = keys[0];
+	uint32_t rune = 0;
+	unsigned char buf[4], type = keys[0];
 	const char *data = NULL;
 	int len = 0, count = 0, base = 0;
 	switch (type) {
@@ -1084,22 +1084,22 @@ static KEY_ACTION_FN(ka_insert_verbatim)
 		if (count > 0)
 			return NULL;
 		if (type == 'u' || type == 'U') {
-			len = runetochar(buf, &rune);
+			len = utf8_encode(buf, rune);
 		} else {
 			buf[0] = rune;
 			len = 1;
 		}
 
-		data = buf;
+		data = (char *)buf;
 	} else {
 		const char *next = vis_keys_next(vis, keys);
 		if (!next)
 			return NULL;
-		if ((rune = vis_keys_codepoint(vis, keys)) != (Rune)-1) {
-			len = runetochar(buf, &rune);
+		if ((rune = vis_keys_codepoint(vis, keys)) != -1) {
+			len = utf8_encode(buf, rune);
 			if (buf[0] == '\n')
 				buf[0] = '\r';
-			data = buf;
+			data = (char *)buf;
 		} else {
 			vis_info_show(vis, "Unknown key");
 		}
diff --git a/test/vis/insert-mode/verbatim.in b/test/vis/insert-mode/verbatim.in
index 6fcb11a..da2d1b4 100644
--- a/test/vis/insert-mode/verbatim.in
+++ b/test/vis/insert-mode/verbatim.in
@@ -13,9 +13,7 @@ O100 =
 64 =
 U+07FF =
 U+D7FF =
-U+DFFF = /* not really correct */
 U+FFFD =
-U+FFFF = /* not really correct */
 U+10FFFF =
 U+11000 = /* invalid */
 <Escape> =
diff --git a/test/vis/insert-mode/verbatim.keys b/test/vis/insert-mode/verbatim.keys
index ba71c21..0441e3b 100644
--- a/test/vis/insert-mode/verbatim.keys
+++ b/test/vis/insert-mode/verbatim.keys
@@ -21,12 +21,8 @@ a<Space><C-v>u07FF<Escape>
 n
 a<Space><C-v>uD7FF<Escape>
 n
-a<Space><C-v>uDFFF<Escape>
-n
 a<Space><C-v>uFFFD<Escape>
 n
-a<Space><C-v>uFFFF<Escape>
-n
 a<Space><C-v>U0010FFFF<Escape>
 n
 a<Space><C-v>U00110000<Escape>
diff --git a/test/vis/insert-mode/verbatim.ref b/test/vis/insert-mode/verbatim.ref
index 28a5c8c..98494b0 100644
--- a/test/vis/insert-mode/verbatim.ref
+++ b/test/vis/insert-mode/verbatim.ref
@@ -13,9 +13,7 @@ O100 = @
 64 = @
 U+07FF = ߿
 U+D7FF = ퟿
-U+DFFF =  /* not really correct */
 U+FFFD = �
-U+FFFF =  /* not really correct */
 U+10FFFF = 􏿿
 U+11000 =  /* invalid */
 <Escape> = 
diff --git a/util.c b/util.c
new file mode 100644
index 0000000..202c750
--- /dev/null
+++ b/util.c
@@ -0,0 +1,28 @@
+static uint32_t
+utf8_encode(uint8_t out[4], uint32_t cp)
+{
+	uint32_t result;
+	if (cp <= 0x7F) {
+		out[0] = cp & 0x7F;
+		result = 1;
+	} else if (cp <= 0x7FF) {
+		result = 2;
+		out[0] = ((cp >>  6) & 0x1F) | 0xC0;
+		out[1] = ((cp >>  0) & 0x3F) | 0x80;
+	} else if (cp <= 0xFFFF) {
+		result = 3;
+		out[0] = ((cp >> 12) & 0x0F) | 0xE0;
+		out[1] = ((cp >>  6) & 0x3F) | 0x80;
+		out[2] = ((cp >>  0) & 0x3F) | 0x80;
+	} else if (cp <= 0x10FFFF) {
+		result = 4;
+		out[0] = ((cp >> 18) & 0x07) | 0xF0;
+		out[1] = ((cp >> 12) & 0x3F) | 0x80;
+		out[2] = ((cp >>  6) & 0x3F) | 0x80;
+		out[3] = ((cp >>  0) & 0x3F) | 0x80;
+	} else {
+		//out[0] = '?';
+		result = 0;
+	}
+	return result;
+}
diff --git a/vis.c b/vis.c
index 5b5ce2f..bd7bc2a 100644
--- a/vis.c
+++ b/vis.c
@@ -11,10 +11,11 @@
 #include "ui.h"
 #include "vis-subprocess.h"
 
+#include "util.c"
+
 #include "array.c"
 #include "buffer.c"
 #include "event-basic.c"
-#include "libutf.c"
 #include "map.c"
 #include "sam.c"
 #include "text.c"
@@ -1017,13 +1018,15 @@ long vis_keys_codepoint(Vis *vis, const char *keys) {
 	return -1;
 }
 
-bool vis_keys_utf8(Vis *vis, const char *keys, char utf8[static UTFmax+1]) {
-	Rune rune = vis_keys_codepoint(vis, keys);
-	if (rune == (Rune)-1)
-		return false;
-	size_t len = runetochar(utf8, &rune);
-	utf8[len] = '\0';
-	return true;
+bool vis_keys_utf8(Vis *vis, const char *keys, char utf8[4+1])
+{
+	uint32_t cp = vis_keys_codepoint(vis, keys);
+	bool result = cp != -1;
+	if (result) {
+		size_t len = utf8_encode((unsigned char *)utf8, cp);
+		utf8[len] = 0;
+	}
+	return result;
 }
 
 typedef struct {
diff --git a/vis.h b/vis.h
index 72f6ec5..b9d6f14 100644
--- a/vis.h
+++ b/vis.h
@@ -16,7 +16,6 @@ typedef struct Win Win;
 #include "ui.h"
 #include "view.h"
 #include "text-regex.h"
-#include "libutf.h"
 #include "array.h"
 #include "buffer.h"
 
@@ -1326,7 +1325,7 @@ VIS_EXPORT long vis_keys_codepoint(Vis *vis, const char *keys);
  * .. note:: Guarantees that ``utf8`` is NUL terminated on success.
  * @endrst
  */
-VIS_EXPORT bool vis_keys_utf8(Vis *vis, const char *keys, char utf8[static UTFmax+1]);
+VIS_EXPORT bool vis_keys_utf8(Vis *vis, const char *keys, char utf8[4+1]);
 /**
  * Process symbolic keys as if they were user originated input.
  * @param vis The editor instance.
author	Randy Palamar <randy@rnpnr.xyz>	2025-12-05 22:36:10 -0700
committer	Randy Palamar <randy@rnpnr.xyz>	2026-01-06 16:52:56 -0700
commit	0d9bbb74c6de959ab7c6b93b7a97f9f2e643e8e8 (patch)
tree	25596fd04e3623571a155e9c2b1e2503aa9dd4f6
parent	6ced61ef5f366001877823ed8aff978035fa53c8 (diff)
download	vis-0d9bbb74c6de959ab7c6b93b7a97f9f2e643e8e8.tar.gz vis-0d9bbb74c6de959ab7c6b93b7a97f9f2e643e8e8.tar.xz