commit b7125d595e08b2c70cda367782d2689ef2350587
parent ea636e77ab27c87482a51ee8176e150d3e92003d
Author: Connor Lane Smith <cls@lubutu.com>
Date: Sun, 6 May 2012 20:36:37 +0100
add istyperune functions
Diffstat:
Makefile | | | 24 | +++++++++++++++++++----- |
config.mk | | | 5 | +++-- |
mkrunetype.awk | | | 75 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
rune.c | | | 296 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
runetype.c | | | 28 | ++++++++++++++++++++++++++++ |
runetypebody.h | | | 1865 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
utf.c | | | 296 | ------------------------------------------------------------------------------- |
utf.h | | | 7 | +++++++ |
utftest.c | | | 107 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------- |
9 files changed, 2378 insertions(+), 325 deletions(-)
diff --git a/Makefile b/Makefile
@@ -2,16 +2,30 @@
include config.mk
-SRC = utf.c utftest.c
+SRC = rune.c runetype.c
OBJ = $(SRC:.c=.o)
-all: utftest
+UCD = UnicodeData-6.1.0.txt
-utftest: $(OBJ)
- $(CC) $(LDFLAGS) -o $@ $(OBJ)
+all: libutf.a utftest
+
+libutf.a: $(OBJ)
+ rm -f $@
+ $(AR) rc $@ $(OBJ)
+
+utftest: utftest.o libutf.a
+ $(CC) $(LDFLAGS) -o $@ utftest.o
.c.o:
$(CC) $(CFLAGS) -c $<
+runetypebody.h: mkrunetype.awk $(UCD)
+ $(AWK) -f mkrunetype.awk $(UCD) > $@
+
+runetype.o: runetype.c runetypebody.h
+
clean:
- rm -f utftest $(OBJ)
+ rm -f libutf.a utftest utftest.o $(OBJ)
+
+purge: $(UCD) clean
+ rm -f runetypebody.h
diff --git a/config.mk b/config.mk
@@ -1,6 +1,7 @@
# See LICENSE file for copyright and license details.
CFLAGS = -ansi -pedantic -Os -Wall -Wextra
-LDFLAGS = -s
+LDFLAGS = -L. -lutf
-CC = cc
+CC = cc
+AWK = awk
diff --git a/mkrunetype.awk b/mkrunetype.awk
@@ -0,0 +1,75 @@
+# See LICENSE file for copyright and license details.
+
+BEGIN {
+ FS = ";"
+ # setup hexadecimal lookup table
+ for(i = 0; i < 16; i++)
+ hex[sprintf("%X",i)] = i;
+}
+
+$3 ~ /^L/ { alphav[alphac++] = $1; }
+$3 ~ /^Z/ { spacev[spacec++] = $1; }
+$3 == "Lu" { upperv[upperc++] = $1; }
+$3 == "Ll" { lowerv[lowerc++] = $1; }
+$3 == "Lt" { titlev[titlec++] = $1; }
+$3 == "Nd" { digitv[digitc++] = $1; }
+
+END {
+ print "/* Automatically generated from "FILENAME" by mkrunetype.awk */\n"
+
+ mkis("alpha", alphav, alphac);
+ mkis("space", spacev, spacec);
+ mkis("upper", upperv, upperc);
+ mkis("lower", lowerv, lowerc);
+ mkis("title", titlev, titlec);
+ mkis("digit", digitv, digitc);
+}
+
+function code(s) {
+ x = 0;
+ for(i = 1; i <= length(s); i++) {
+ c = substr(s, i, 1);
+ x = (x*16) + hex[c];
+ }
+ return x;
+}
+
+function mkis(name, runev, runec) {
+ rune1c = 0;
+ rune2c = 0;
+
+ for(j = k = 0; j < runec; j++) {
+ if(j+1 == runec || code(runev[j+1]) != code(runev[j])+1) {
+ if(j == k) {
+ rune1v[rune1c] = runev[j];
+ rune1c++;
+ }
+ else {
+ rune2v0[rune2c] = runev[k];
+ rune2v1[rune2c] = runev[j];
+ rune2c++;
+ }
+ k = j+1;
+ }
+ }
+ if(rune2c > 0) {
+ print "static Rune "name"2[][2] = {";
+ for(j = 0; j < rune2c; j++) {
+ print "\t{ 0x"rune2v0[j]", 0x"rune2v1[j]" },";
+ }
+ print "};\n";
+ }
+ if(rune1c > 0) {
+ print "static Rune "name"1[] = {";
+ for(j = 0; j < rune1c; j++) {
+ print "\t0x"rune1v[j]",";
+ }
+ print "};\n";
+ }
+ print "int\nis"name"rune(Rune r)\n{";
+ if(rune2c > 0)
+ print "\tif(bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &rune2cmp))\n\t\treturn 1;";
+ if(rune1c > 0)
+ print "\tif(bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &rune1cmp))\n\t\treturn 1;";
+ print "\treturn 0;\n}\n";
+}
diff --git a/rune.c b/rune.c
@@ -0,0 +1,296 @@
+/* See LICENSE file for copyright and license details. */
+#include <string.h>
+#include "utf.h"
+
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+
+#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \
+ : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \
+ : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \
+ : (((x) & 0xF0) == 0xE0) ? 3 /* 1110xxxx */ \
+ : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \
+ : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \
+ : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \
+ : 0 )
+
+/*
+ * runetochar copies one rune at p to at most UTFmax bytes starting at s and
+ * returns the number of bytes copied. UTFmax is the maximum number of bytes
+ * required to represent a legal rune.
+ *
+ * If the rune is illegal, runetochar will return 0.
+ */
+int
+runetochar(char *s, Rune *p)
+{
+ Rune r = *p;
+
+ switch(runelen(r)) {
+ case 1: /* 0aaaaaaa */
+ s[0] = r;
+ return 1;
+ case 2: /* 00000aaa aabbbbbb */
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
+ return 2;
+ case 3: /* aaaabbbb bbcccccc */
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
+ return 3;
+ case 4: /* 000aaabb bbbbcccc ccdddddd */
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
+ return 4;
+ default:
+ return 0; /* error */
+ }
+}
+
+/*
+ * chartorune copies at most UTFmax bytes starting at s to one rune at p and
+ * returns the number of bytes copied. If the input is not valid UTF-8,
+ * chartorune will convert the sequence to Runeerror (0xFFFD), returning the
+ * number of bytes in the invalid sequence.
+ */
+int
+chartorune(Rune *p, const char *s)
+{
+ return charntorune(p, s, UTFmax);
+}
+
+/*
+ * charntorune copies at most len bytes starting at s to one rune at p and
+ * returns the number of bytes copied. If the input is not valid UTF-8,
+ * charntorune will convert the sequence to Runeerror (0xFFFD), returning the
+ * number of bytes in the invalid sequence.
+ *
+ * If a potentially valid sequence is cut off by the len limit, charntorune will
+ * return 0.
+ */
+int
+charntorune(Rune *p, const char *s, size_t len)
+{
+ unsigned int i, n;
+ Rune r;
+
+ if(len == 0) /* can't even look at s[0] */
+ return 0;
+
+ switch((n = UTFSEQ(s[0]))) {
+ case 1: r = s[0]; break; /* 0xxxxxxx */
+ case 2: r = s[0] & 0x1F; break; /* 110xxxxx */
+ case 3: r = s[0] & 0x0F; break; /* 1110xxxx */
+ case 4: r = s[0] & 0x07; break; /* 11110xxx */
+ case 5: r = s[0] & 0x03; break; /* 111110xx */
+ case 6: r = s[0] & 0x01; break; /* 1111110x */
+ default: /* invalid sequence */
+ *p = Runeerror;
+ return 1;
+ }
+ /* add values from continuation bytes */
+ for(i = 1; i < MIN(n, len); i++)
+ if((s[i] & 0xC0) != 0x80) {
+ /* expected continuation */
+ *p = Runeerror;
+ return i;
+ }
+ else
+ r = (r << 6) | (s[i] & 0x3F);
+
+ if(i < n) /* must have reached len limit */
+ return 0;
+
+ /* reject invalid runes and overlong sequences */
+ if(n > UTFmax || r > 0x10FFFF || runelen(r) < (int)n || (r & 0xFFFE) == 0xFFFE
+ || (r >= 0xD800 && r <= 0xDFFF) || (r >= 0xFDD0 && r <= 0xFDEF))
+ r = Runeerror;
+
+ *p = r;
+ return n;
+}
+
+/*
+ * runelen returns the number of bytes required to convert r into UTF-8. If the
+ * rune is illegal, runelen will return 0.
+ */
+int
+runelen(Rune r)
+{
+ if(r <= 0x7F)
+ return 1;
+ else if(r <= 0x07FF)
+ return 2;
+ else if(r <= 0xFFFF)
+ return 3;
+ else if(r <= Runemax)
+ return 4;
+ else
+ return 0; /* error */
+}
+
+/*
+ * runelen returns the number of bytes required to convert the rune-string of
+ * length len pointed to by p into UTF-8.
+ */
+size_t
+runenlen(Rune *p, size_t len)
+{
+ size_t i, n = 0;
+
+ for(i = 0; i < len; i++)
+ n += runelen(p[i]);
+ return n;
+}
+
+/*
+ * fullrune returns 1 if the string s of length len is long enough to be
+ * decoded by chartorune, and 0 otherwise.
+ */
+int
+fullrune(const char *s, size_t len)
+{
+ Rune r;
+
+ return charntorune(&r, s, len) > 0;
+}
+
+/*
+ * utfecpy copies UTF-8 sequences until a null sequence has been copied, but
+ * writes no sequences beyond end. If any sequences are copied, the to string is
+ * terminated by a null sequence, and a pointer to that sequence is returned.
+ * Otherwise, the original to string is returned.
+ */
+char *
+utfecpy(char *to, char *end, const char *from)
+{
+ Rune r = Runeerror;
+ size_t i, n;
+
+ /* seek through to find final full rune */
+ for(i = 0; r != '\0' && (n = charntorune(&r, &from[i], end - &to[i])); i += n)
+ ;
+ memcpy(to, from, i); /* copy over bytes up to this rune */
+
+ if(i > 0 && r != '\0')
+ to[i] = '\0'; /* terminate if unterminated */
+ return &to[i];
+}
+
+/*
+ * utflen returns the number of runes that are represented by the UTF-8 string
+ * s.
+ */
+size_t
+utflen(const char *s)
+{
+ const char *p = s;
+ size_t i;
+ Rune r;
+
+ for(i = 0; *p != '\0'; i++)
+ p += chartorune(&r, p);
+ return i;
+}
+
+/*
+ * utfnlen returns the number of runes that are represented by the UTF-8 string
+ * s of length len. If the last few bytes contain an incompletely coded rune,
+ * utfnlen will not count them; in this way it differs from utflen, which
+ * includes every byte of the string.
+ */
+size_t
+utfnlen(const char *s, size_t len)
+{
+ const char *p = s;
+ size_t i;
+ Rune r;
+ int n;
+
+ for(i = 0; (n = charntorune(&r, p, len-(p-s))) && r != '\0'; i++)
+ p += n;
+ return i;
+}
+
+/*
+ * utfrune returns a pointer to the first ocurrence of r in the UTF-8 string s,
+ * or NULL if r does not occur in s. The null byte terminating a string is
+ * considered to be part of the string s.
+ */
+char *
+utfrune(const char *s, Rune r)
+{
+ if(r < Runeself) {
+ return strchr(s, r);
+ }
+ else if(r == Runeerror) {
+ Rune r0;
+ int n;
+
+ for(; *s != '\0'; s += n) {
+ n = chartorune(&r0, s);
+ if(r == r0)
+ return (char *)s;
+ }
+ }
+ else {
+ char buf[UTFmax+1];
+ int n;
+
+ if(!(n = runetochar(buf, &r)))
+ return NULL;
+ buf[n] = '\0';
+ return strstr(s, buf);
+ }
+ return NULL;
+}
+
+/*
+ * utfrrune returns a pointer to the last ocurrence of r in the UTF-8 string s,
+ * or NULL if r does not occur in s. The null byte terminating a string is
+ * considered to be part of the string s.
+ */
+char *
+utfrrune(const char *s, Rune r)
+{
+ const char *p = NULL;
+ Rune r0;
+ int n;
+
+ if(r < Runeself)
+ return strrchr(s, r);
+
+ for(; *s != '\0'; s += n) {
+ n = chartorune(&r0, s);
+ if(r == r0)
+ p = s;
+ }
+ return (char *)p;
+}
+
+/*
+ * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a
+ * UTF-8 substring of s, or NULL if there is none. If t is the null string,
+ * utfutf returns s.
+ */
+char *
+utfutf(const char *s, const char *t)
+{
+ const char *p, *q;
+ Rune r0, r1, r2;
+ int n, m;
+
+ for(chartorune(&r0, t); (s = utfrune(s, r0)); s++) {
+ for(p = s, q = t; *q && *p; p += n, q += m) {
+ n = chartorune(&r1, p);
+ m = chartorune(&r2, q);
+ if(r1 != r2)
+ break;
+ }
+ if(!*q)
+ return (char *)s;
+ }
+ return NULL;
+}
diff --git a/runetype.c b/runetype.c
@@ -0,0 +1,28 @@
+#include <stdlib.h>
+#include "utf.h"
+
+#define nelem(x) (sizeof (x) / sizeof *(x))
+
+static int rune1cmp(const void *, const void *);
+static int rune2cmp(const void *, const void *);
+
+#include "runetypebody.h"
+
+int
+rune1cmp(const void *v1, const void *v2)
+{
+ Rune r1 = *(Rune *)v1, r2 = *(Rune *)v2;
+
+ return r1 - r2;
+}
+
+int
+rune2cmp(const void *v1, const void *v2)
+{
+ Rune r = *(Rune *)v1, *p = (Rune *)v2;
+
+ if(r >= p[0] && r <= p[1])
+ return 0;
+ else
+ return r - p[0];
+}
diff --git a/runetypebody.h b/runetypebody.h
@@ -0,0 +1,1865 @@
+/* Automatically generated from UnicodeData-6.1.0.txt by mkrunetype.awk */
+
+static Rune alpha2[][2] = {
+ { 0x0041, 0x005A },
+ { 0x0061, 0x007A },
+ { 0x00C0, 0x00D6 },
+ { 0x00D8, 0x00F6 },
+ { 0x00F8, 0x02C1 },
+ { 0x02C6, 0x02D1 },
+ { 0x02E0, 0x02E4 },
+ { 0x0370, 0x0374 },
+ { 0x0376, 0x0377 },
+ { 0x037A, 0x037D },
+ { 0x0388, 0x038A },
+ { 0x038E, 0x03A1 },
+ { 0x03A3, 0x03F5 },
+ { 0x03F7, 0x0481 },
+ { 0x048A, 0x0527 },
+ { 0x0531, 0x0556 },
+ { 0x0561, 0x0587 },
+ { 0x05D0, 0x05EA },
+ { 0x05F0, 0x05F2 },
+ { 0x0620, 0x064A },
+ { 0x066E, 0x066F },
+ { 0x0671, 0x06D3 },
+ { 0x06E5, 0x06E6 },
+ { 0x06EE, 0x06EF },
+ { 0x06FA, 0x06FC },
+ { 0x0712, 0x072F },
+ { 0x074D, 0x07A5 },
+ { 0x07CA, 0x07EA },
+ { 0x07F4, 0x07F5 },
+ { 0x0800, 0x0815 },
+ { 0x0840, 0x0858 },
+ { 0x08A2, 0x08AC },
+ { 0x0904, 0x0939 },
+ { 0x0958, 0x0961 },
+ { 0x0971, 0x0977 },
+ { 0x0979, 0x097F },
+ { 0x0985, 0x098C },
+ { 0x098F, 0x0990 },
+ { 0x0993, 0x09A8 },
+ { 0x09AA, 0x09B0 },
+ { 0x09B6, 0x09B9 },
+ { 0x09DC, 0x09DD },
+ { 0x09DF, 0x09E1 },
+ { 0x09F0, 0x09F1 },
+ { 0x0A05, 0x0A0A },
+ { 0x0A0F, 0x0A10 },
+ { 0x0A13, 0x0A28 },
+ { 0x0A2A, 0x0A30 },
+ { 0x0A32, 0x0A33 },
+ { 0x0A35, 0x0A36 },
+ { 0x0A38, 0x0A39 },
+ { 0x0A59, 0x0A5C },
+ { 0x0A72, 0x0A74 },
+ { 0x0A85, 0x0A8D },
+ { 0x0A8F, 0x0A91 },
+ { 0x0A93, 0x0AA8 },
+ { 0x0AAA, 0x0AB0 },
+ { 0x0AB2, 0x0AB3 },
+ { 0x0AB5, 0x0AB9 },
+ { 0x0AE0, 0x0AE1 },
+ { 0x0B05, 0x0B0C },
+ { 0x0B0F, 0x0B10 },
+ { 0x0B13, 0x0B28 },
+ { 0x0B2A, 0x0B30 },
+ { 0x0B32, 0x0B33 },
+ { 0x0B35, 0x0B39 },
+ { 0x0B5C, 0x0B5D },
+ { 0x0B5F, 0x0B61 },
+ { 0x0B85, 0x0B8A },
+ { 0x0B8E, 0x0B90 },
+ { 0x0B92, 0x0B95 },
+ { 0x0B99, 0x0B9A },
+ { 0x0B9E, 0x0B9F },
+ { 0x0BA3, 0x0BA4 },
+ { 0x0BA8, 0x0BAA },
+ { 0x0BAE, 0x0BB9 },
+ { 0x0C05, 0x0C0C },
+ { 0x0C0E, 0x0C10 },
+ { 0x0C12, 0x0C28 },
+ { 0x0C2A, 0x0C33 },
+ { 0x0C35, 0x0C39 },
+ { 0x0C58, 0x0C59 },
+ { 0x0C60, 0x0C61 },
+ { 0x0C85, 0x0C8C },
+ { 0x0C8E, 0x0C90 },
+ { 0x0C92, 0x0CA8 },
+ { 0x0CAA, 0x0CB3 },
+ { 0x0CB5, 0x0CB9 },
+ { 0x0CE0, 0x0CE1 },
+ { 0x0CF1, 0x0CF2 },
+ { 0x0D05, 0x0D0C },
+ { 0x0D0E, 0x0D10 },
+ { 0x0D12, 0x0D3A },
+ { 0x0D60, 0x0D61 },
+ { 0x0D7A, 0x0D7F },
+ { 0x0D85, 0x0D96 },
+ { 0x0D9A, 0x0DB1 },
+ { 0x0DB3, 0x0DBB },
+ { 0x0DC0, 0x0DC6 },
+ { 0x0E01, 0x0E30 },
+ { 0x0E32, 0x0E33 },
+ { 0x0E40, 0x0E46 },
+ { 0x0E81, 0x0E82 },
+ { 0x0E87, 0x0E88 },
+ { 0x0E94, 0x0E97 },
+ { 0x0E99, 0x0E9F },
+ { 0x0EA1, 0x0EA3 },
+ { 0x0EAA, 0x0EAB },
+ { 0x0EAD, 0x0EB0 },
+ { 0x0EB2, 0x0EB3 },
+ { 0x0EC0, 0x0EC4 },
+ { 0x0EDC, 0x0EDF },
+ { 0x0F40, 0x0F47 },
+ { 0x0F49, 0x0F6C },
+ { 0x0F88, 0x0F8C },
+ { 0x1000, 0x102A },
+ { 0x1050, 0x1055 },
+ { 0x105A, 0x105D },
+ { 0x1065, 0x1066 },
+ { 0x106E, 0x1070 },
+ { 0x1075, 0x1081 },
+ { 0x10A0, 0x10C5 },
+ { 0x10D0, 0x10FA },
+ { 0x10FC, 0x1248 },
+ { 0x124A, 0x124D },
+ { 0x1250, 0x1256 },
+ { 0x125A, 0x125D },
+ { 0x1260, 0x1288 },
+ { 0x128A, 0x128D },
+ { 0x1290, 0x12B0 },
+ { 0x12B2, 0x12B5 },
+ { 0x12B8, 0x12BE },
+ { 0x12C2, 0x12C5 },
+ { 0x12C8, 0x12D6 },
+ { 0x12D8, 0x1310 },
+ { 0x1312, 0x1315 },
+ { 0x1318, 0x135A },
+ { 0x1380, 0x138F },
+ { 0x13A0, 0x13F4 },
+ { 0x1401, 0x166C },
+ { 0x166F, 0x167F },
+ { 0x1681, 0x169A },
+ { 0x16A0, 0x16EA },
+ { 0x1700, 0x170C },
+ { 0x170E, 0x1711 },
+ { 0x1720, 0x1731 },
+ { 0x1740, 0x1751 },
+ { 0x1760, 0x176C },
+ { 0x176E, 0x1770 },
+ { 0x1780, 0x17B3 },
+ { 0x1820, 0x1877 },
+ { 0x1880, 0x18A8 },
+ { 0x18B0, 0x18F5 },
+ { 0x1900, 0x191C },
+ { 0x1950, 0x196D },
+ { 0x1970, 0x1974 },
+ { 0x1980, 0x19AB },
+ { 0x19C1, 0x19C7 },
+ { 0x1A00, 0x1A16 },
+ { 0x1A20, 0x1A54 },
+ { 0x1B05, 0x1B33 },
+ { 0x1B45, 0x1B4B },
+ { 0x1B83, 0x1BA0 },
+ { 0x1BAE, 0x1BAF },
+ { 0x1BBA, 0x1BE5 },
+ { 0x1C00, 0x1C23 },
+ { 0x1C4D, 0x1C4F },
+ { 0x1C5A, 0x1C7D },
+ { 0x1CE9, 0x1CEC },
+ { 0x1CEE, 0x1CF1 },
+ { 0x1CF5, 0x1CF6 },
+ { 0x1D00, 0x1DBF },
+ { 0x1E00, 0x1F15 },
+ { 0x1F18, 0x1F1D },
+ { 0x1F20, 0x1F45 },
+ { 0x1F48, 0x1F4D },
+ { 0x1F50, 0x1F57 },
+ { 0x1F5F, 0x1F7D },
+ { 0x1F80, 0x1FB4 },
+ { 0x1FB6, 0x1FBC },
+ { 0x1FC2, 0x1FC4 },
+ { 0x1FC6, 0x1FCC },
+ { 0x1FD0, 0x1FD3 },
+ { 0x1FD6, 0x1FDB },
+ { 0x1FE0, 0x1FEC },
+ { 0x1FF2, 0x1FF4 },
+ { 0x1FF6, 0x1FFC },
+ { 0x2090, 0x209C },
+ { 0x210A, 0x2113 },
+ { 0x2119, 0x211D },
+ { 0x212A, 0x212D },
+ { 0x212F, 0x2139 },
+ { 0x213C, 0x213F },
+ { 0x2145, 0x2149 },
+ { 0x2183, 0x2184 },
+ { 0x2C00, 0x2C2E },
+ { 0x2C30, 0x2C5E },
+ { 0x2C60, 0x2CE4 },
+ { 0x2CEB, 0x2CEE },
+ { 0x2CF2, 0x2CF3 },
+ { 0x2D00, 0x2D25 },
+ { 0x2D30, 0x2D67 },
+ { 0x2D80, 0x2D96 },
+ { 0x2DA0, 0x2DA6 },
+ { 0x2DA8, 0x2DAE },
+ { 0x2DB0, 0x2DB6 },
+ { 0x2DB8, 0x2DBE },
+ { 0x2DC0, 0x2DC6 },
+ { 0x2DC8, 0x2DCE },
+ { 0x2DD0, 0x2DD6 },
+ { 0x2DD8, 0x2DDE },
+ { 0x3005, 0x3006 },
+ { 0x3031, 0x3035 },
+ { 0x303B, 0x303C },
+ { 0x3041, 0x3096 },
+ { 0x309D, 0x309F },
+ { 0x30A1, 0x30FA },
+ { 0x30FC, 0x30FF },
+ { 0x3105, 0x312D },
+ { 0x3131, 0x318E },
+ { 0x31A0, 0x31BA },
+ { 0x31F0, 0x31FF },
+ { 0xA000, 0xA48C },
+ { 0xA4D0, 0xA4FD },
+ { 0xA500, 0xA60C },
+ { 0xA610, 0xA61F },
+ { 0xA62A, 0xA62B },
+ { 0xA640, 0xA66E },
+ { 0xA67F, 0xA697 },
+ { 0xA6A0, 0xA6E5 },
+ { 0xA717, 0xA71F },
+ { 0xA722, 0xA788 },
+ { 0xA78B, 0xA78E },
+ { 0xA790, 0xA793 },
+ { 0xA7A0, 0xA7AA },
+ { 0xA7F8, 0xA801 },
+ { 0xA803, 0xA805 },
+ { 0xA807, 0xA80A },
+ { 0xA80C, 0xA822 },
+ { 0xA840, 0xA873 },
+ { 0xA882, 0xA8B3 },
+ { 0xA8F2, 0xA8F7 },
+ { 0xA90A, 0xA925 },
+ { 0xA930, 0xA946 },
+ { 0xA960, 0xA97C },
+ { 0xA984, 0xA9B2 },
+ { 0xAA00, 0xAA28 },
+ { 0xAA40, 0xAA42 },
+ { 0xAA44, 0xAA4B },
+ { 0xAA60, 0xAA76 },
+ { 0xAA80, 0xAAAF },
+ { 0xAAB5, 0xAAB6 },
+ { 0xAAB9, 0xAABD },
+ { 0xAADB, 0xAADD },
+ { 0xAAE0, 0xAAEA },
+ { 0xAAF2, 0xAAF4 },
+ { 0xAB01, 0xAB06 },
+ { 0xAB09, 0xAB0E },
+ { 0xAB11, 0xAB16 },
+ { 0xAB20, 0xAB26 },
+ { 0xAB28, 0xAB2E },
+ { 0xABC0, 0xABE2 },
+ { 0xD7B0, 0xD7C6 },
+ { 0xD7CB, 0xD7FB },
+ { 0xF900, 0xFA6D },
+ { 0xFA70, 0xFAD9 },
+ { 0xFB00, 0xFB06 },
+ { 0xFB13, 0xFB17 },
+ { 0xFB1F, 0xFB28 },
+ { 0xFB2A, 0xFB36 },
+ { 0xFB38, 0xFB3C },
+ { 0xFB40, 0xFB41 },
+ { 0xFB43, 0xFB44 },
+ { 0xFB46, 0xFBB1 },
+ { 0xFBD3, 0xFD3D },
+ { 0xFD50, 0xFD8F },
+ { 0xFD92, 0xFDC7 },
+ { 0xFDF0, 0xFDFB },
+ { 0xFE70, 0xFE74 },
+ { 0xFE76, 0xFEFC },
+ { 0xFF21, 0xFF3A },
+ { 0xFF41, 0xFF5A },
+ { 0xFF66, 0xFFBE },
+ { 0xFFC2, 0xFFC7 },
+ { 0xFFCA, 0xFFCF },
+ { 0xFFD2, 0xFFD7 },
+ { 0xFFDA, 0xFFDC },
+ { 0x10000, 0x1000B },
+ { 0x1000D, 0x10026 },
+ { 0x10028, 0x1003A },
+ { 0x1003C, 0x1003D },
+ { 0x1003F, 0x1004D },
+ { 0x10050, 0x1005D },
+ { 0x10080, 0x100FA },
+ { 0x10280, 0x1029C },
+ { 0x102A0, 0x102D0 },
+ { 0x10300, 0x1031E },
+ { 0x10330, 0x10340 },
+ { 0x10342, 0x10349 },
+ { 0x10380, 0x1039D },
+ { 0x103A0, 0x103C3 },
+ { 0x103C8, 0x103CF },
+ { 0x10400, 0x1049D },
+ { 0x10800, 0x10805 },
+ { 0x1080A, 0x10835 },
+ { 0x10837, 0x10838 },
+ { 0x1083F, 0x10855 },
+ { 0x10900, 0x10915 },
+ { 0x10920, 0x10939 },
+ { 0x10980, 0x109B7 },
+ { 0x109BE, 0x109BF },
+ { 0x10A10, 0x10A13 },
+ { 0x10A15, 0x10A17 },
+ { 0x10A19, 0x10A33 },
+ { 0x10A60, 0x10A7C },
+ { 0x10B00, 0x10B35 },
+ { 0x10B40, 0x10B55 },
+ { 0x10B60, 0x10B72 },
+ { 0x10C00, 0x10C48 },
+ { 0x11003, 0x11037 },
+ { 0x11083, 0x110AF },
+ { 0x110D0, 0x110E8 },
+ { 0x11103, 0x11126 },
+ { 0x11183, 0x111B2 },
+ { 0x111C1, 0x111C4 },
+ { 0x11680, 0x116AA },
+ { 0x12000, 0x1236E },
+ { 0x13000, 0x1342E },
+ { 0x16800, 0x16A38 },
+ { 0x16F00, 0x16F44 },
+ { 0x16F93, 0x16F9F },
+ { 0x1B000, 0x1B001 },
+ { 0x1D400, 0x1D454 },
+ { 0x1D456, 0x1D49C },
+ { 0x1D49E, 0x1D49F },
+ { 0x1D4A5, 0x1D4A6 },
+ { 0x1D4A9, 0x1D4AC },
+ { 0x1D4AE, 0x1D4B9 },
+ { 0x1D4BD, 0x1D4C3 },
+ { 0x1D4C5, 0x1D505 },
+ { 0x1D507, 0x1D50A },
+ { 0x1D50D, 0x1D514 },
+ { 0x1D516, 0x1D51C },
+ { 0x1D51E, 0x1D539 },
+ { 0x1D53B, 0x1D53E },
+ { 0x1D540, 0x1D544 },
+ { 0x1D54A, 0x1D550 },
+ { 0x1D552, 0x1D6A5 },
+ { 0x1D6A8, 0x1D6C0 },
+ { 0x1D6C2, 0x1D6DA },
+ { 0x1D6DC, 0x1D6FA },
+ { 0x1D6FC, 0x1D714 },
+ { 0x1D716, 0x1D734 },
+ { 0x1D736, 0x1D74E },
+ { 0x1D750, 0x1D76E },
+ { 0x1D770, 0x1D788 },
+ { 0x1D78A, 0x1D7A8 },
+ { 0x1D7AA, 0x1D7C2 },
+ { 0x1D7C4, 0x1D7CB },
+ { 0x1EE00, 0x1EE03 },
+ { 0x1EE05, 0x1EE1F },
+ { 0x1EE21, 0x1EE22 },
+ { 0x1EE29, 0x1EE32 },
+ { 0x1EE34, 0x1EE37 },
+ { 0x1EE4D, 0x1EE4F },
+ { 0x1EE51, 0x1EE52 },
+ { 0x1EE61, 0x1EE62 },
+ { 0x1EE67, 0x1EE6A },
+ { 0x1EE6C, 0x1EE72 },
+ { 0x1EE74, 0x1EE77 },
+ { 0x1EE79, 0x1EE7C },
+ { 0x1EE80, 0x1EE89 },
+ { 0x1EE8B, 0x1EE9B },
+ { 0x1EEA1, 0x1EEA3 },
+ { 0x1EEA5, 0x1EEA9 },
+ { 0x1EEAB, 0x1EEBB },
+ { 0x2F800, 0x2FA1D },
+};
+
+static Rune alpha1[] = {
+ 0x00AA,
+ 0x00B5,
+ 0x00BA,
+ 0x02EC,
+ 0x02EE,
+ 0x0386,
+ 0x038C,
+ 0x0559,
+ 0x06D5,
+ 0x06FF,
+ 0x0710,
+ 0x07B1,
+ 0x07FA,
+ 0x081A,
+ 0x0824,
+ 0x0828,
+ 0x08A0,
+ 0x093D,
+ 0x0950,
+ 0x09B2,
+ 0x09BD,
+ 0x09CE,
+ 0x0A5E,
+ 0x0ABD,
+ 0x0AD0,
+ 0x0B3D,
+ 0x0B71,
+ 0x0B83,
+ 0x0B9C,
+ 0x0BD0,
+ 0x0C3D,
+ 0x0CBD,
+ 0x0CDE,
+ 0x0D3D,
+ 0x0D4E,
+ 0x0DBD,
+ 0x0E84,
+ 0x0E8A,
+ 0x0E8D,
+ 0x0EA5,
+ 0x0EA7,
+ 0x0EBD,
+ 0x0EC6,
+ 0x0F00,
+ 0x103F,
+ 0x1061,
+ 0x108E,
+ 0x10C7,
+ 0x10CD,
+ 0x1258,
+ 0x12C0,
+ 0x17D7,
+ 0x17DC,
+ 0x18AA,
+ 0x1AA7,
+ 0x1F59,
+ 0x1F5B,
+ 0x1F5D,
+ 0x1FBE,
+ 0x2071,
+ 0x207F,
+ 0x2102,
+ 0x2107,
+ 0x2115,
+ 0x2124,
+ 0x2126,
+ 0x2128,
+ 0x214E,
+ 0x2D27,
+ 0x2D2D,
+ 0x2D6F,
+ 0x2E2F,
+ 0x3400,
+ 0x4DB5,
+ 0x4E00,
+ 0x9FCC,
+ 0xA8FB,
+ 0xA9CF,
+ 0xAA7A,
+ 0xAAB1,
+ 0xAAC0,
+ 0xAAC2,
+ 0xAC00,
+ 0xD7A3,
+ 0xFB1D,
+ 0xFB3E,
+ 0x10808,
+ 0x1083C,
+ 0x10A00,
+ 0x16F50,
+ 0x1D4A2,
+ 0x1D4BB,
+ 0x1D546,
+ 0x1EE24,
+ 0x1EE27,
+ 0x1EE39,
+ 0x1EE3B,
+ 0x1EE42,
+ 0x1EE47,
+ 0x1EE49,
+ 0x1EE4B,
+ 0x1EE54,
+ 0x1EE57,
+ 0x1EE59,
+ 0x1EE5B,
+ 0x1EE5D,
+ 0x1EE5F,
+ 0x1EE64,
+ 0x1EE7E,
+ 0x20000,
+ 0x2A6D6,
+ 0x2A700,
+ 0x2B734,
+ 0x2B740,
+ 0x2B81D,
+};
+
+int
+isalpharune(Rune r)
+{
+ if(bsearch(&r, alpha2, nelem(alpha2), sizeof *alpha2, &rune2cmp))
+ return 1;
+ if(bsearch(&r, alpha1, nelem(alpha1), sizeof *alpha1, &rune1cmp))
+ return 1;
+ return 0;
+}
+
+static Rune space2[][2] = {
+ { 0x2000, 0x200A },
+ { 0x2028, 0x2029 },
+};
+
+static Rune space1[] = {
+ 0x0020,
+ 0x00A0,
+ 0x1680,
+ 0x180E,
+ 0x202F,
+ 0x205F,
+ 0x3000,
+};
+
+int
+isspacerune(Rune r)
+{
+ if(bsearch(&r, space2, nelem(space2), sizeof *space2, &rune2cmp))
+ return 1;
+ if(bsearch(&r, space1, nelem(space1), sizeof *space1, &rune1cmp))
+ return 1;
+ return 0;
+}
+
+static Rune upper2[][2] = {
+ { 0x0041, 0x005A },
+ { 0x00C0, 0x00D6 },
+ { 0x00D8, 0x00DE },
+ { 0x0178, 0x0179 },
+ { 0x0181, 0x0182 },
+ { 0x0186, 0x0187 },
+ { 0x0189, 0x018B },
+ { 0x018E, 0x0191 },
+ { 0x0193, 0x0194 },
+ { 0x0196, 0x0198 },
+ { 0x019C, 0x019D },
+ { 0x019F, 0x01A0 },
+ { 0x01A6, 0x01A7 },
+ { 0x01AE, 0x01AF },
+ { 0x01B1, 0x01B3 },
+ { 0x01B7, 0x01B8 },
+ { 0x01F6, 0x01F8 },
+ { 0x023A, 0x023B },
+ { 0x023D, 0x023E },
+ { 0x0243, 0x0246 },
+ { 0x0388, 0x038A },
+ { 0x038E, 0x038F },
+ { 0x0391, 0x03A1 },
+ { 0x03A3, 0x03AB },
+ { 0x03D2, 0x03D4 },
+ { 0x03F9, 0x03FA },
+ { 0x03FD, 0x042F },
+ { 0x04C0, 0x04C1 },
+ { 0x0531, 0x0556 },
+ { 0x10A0, 0x10C5 },
+ { 0x1F08, 0x1F0F },
+ { 0x1F18, 0x1F1D },
+ { 0x1F28, 0x1F2F },
+ { 0x1F38, 0x1F3F },
+ { 0x1F48, 0x1F4D },
+ { 0x1F68, 0x1F6F },
+ { 0x1FB8, 0x1FBB },
+ { 0x1FC8, 0x1FCB },
+ { 0x1FD8, 0x1FDB },
+ { 0x1FE8, 0x1FEC },
+ { 0x1FF8, 0x1FFB },
+ { 0x210B, 0x210D },
+ { 0x2110, 0x2112 },
+ { 0x2119, 0x211D },
+ { 0x212A, 0x212D },
+ { 0x2130, 0x2133 },
+ { 0x213E, 0x213F },
+ { 0x2C00, 0x2C2E },
+ { 0x2C62, 0x2C64 },
+ { 0x2C6D, 0x2C70 },
+ { 0x2C7E, 0x2C80 },
+ { 0xA77D, 0xA77E },
+ { 0xFF21, 0xFF3A },
+ { 0x10400, 0x10427 },
+ { 0x1D400, 0x1D419 },
+ { 0x1D434, 0x1D44D },
+ { 0x1D468, 0x1D481 },
+ { 0x1D49E, 0x1D49F },
+ { 0x1D4A5, 0x1D4A6 },
+ { 0x1D4A9, 0x1D4AC },
+ { 0x1D4AE, 0x1D4B5 },
+ { 0x1D4D0, 0x1D4E9 },
+ { 0x1D504, 0x1D505 },
+ { 0x1D507, 0x1D50A },
+ { 0x1D50D, 0x1D514 },
+ { 0x1D516, 0x1D51C },
+ { 0x1D538, 0x1D539 },
+ { 0x1D53B, 0x1D53E },
+ { 0x1D540, 0x1D544 },
+ { 0x1D54A, 0x1D550 },
+ { 0x1D56C, 0x1D585 },
+ { 0x1D5A0, 0x1D5B9 },
+ { 0x1D5D4, 0x1D5ED },
+ { 0x1D608, 0x1D621 },
+ { 0x1D63C, 0x1D655 },
+ { 0x1D670, 0x1D689 },
+ { 0x1D6A8, 0x1D6C0 },
+ { 0x1D6E2, 0x1D6FA },
+ { 0x1D71C, 0x1D734 },
+ { 0x1D756, 0x1D76E },
+ { 0x1D790, 0x1D7A8 },
+};
+
+static Rune upper1[] = {
+ 0x0100,
+ 0x0102,
+ 0x0104,
+ 0x0106,
+ 0x0108,
+ 0x010A,
+ 0x010C,
+ 0x010E,
+ 0x0110,
+ 0x0112,
+ 0x0114,
+ 0x0116,
+ 0x0118,
+ 0x011A,
+ 0x011C,
+ 0x011E,
+ 0x0120,
+ 0x0122,
+ 0x0124,
+ 0x0126,
+ 0x0128,
+ 0x012A,
+ 0x012C,
+ 0x012E,
+ 0x0130,
+ 0x0132,
+ 0x0134,
+ 0x0136,
+ 0x0139,
+ 0x013B,
+ 0x013D,
+ 0x013F,
+ 0x0141,
+ 0x0143,
+ 0x0145,
+ 0x0147,
+ 0x014A,
+ 0x014C,
+ 0x014E,
+ 0x0150,
+ 0x0152,
+ 0x0154,
+ 0x0156,
+ 0x0158,
+ 0x015A,
+ 0x015C,
+ 0x015E,
+ 0x0160,
+ 0x0162,
+ 0x0164,
+ 0x0166,
+ 0x0168,
+ 0x016A,
+ 0x016C,
+ 0x016E,
+ 0x0170,
+ 0x0172,
+ 0x0174,
+ 0x0176,
+ 0x017B,
+ 0x017D,
+ 0x0184,
+ 0x01A2,
+ 0x01A4,
+ 0x01A9,
+ 0x01AC,
+ 0x01B5,
+ 0x01BC,
+ 0x01C4,
+ 0x01C7,
+ 0x01CA,
+ 0x01CD,
+ 0x01CF,
+ 0x01D1,
+ 0x01D3,
+ 0x01D5,
+ 0x01D7,
+ 0x01D9,
+ 0x01DB,
+ 0x01DE,
+ 0x01E0,
+ 0x01E2,
+ 0x01E4,
+ 0x01E6,
+ 0x01E8,
+ 0x01EA,
+ 0x01EC,
+ 0x01EE,
+ 0x01F1,
+ 0x01F4,
+ 0x01FA,
+ 0x01FC,
+ 0x01FE,
+ 0x0200,
+ 0x0202,
+ 0x0204,
+ 0x0206,
+ 0x0208,
+ 0x020A,
+ 0x020C,
+ 0x020E,
+ 0x0210,
+ 0x0212,
+ 0x0214,
+ 0x0216,
+ 0x0218,
+ 0x021A,
+ 0x021C,
+ 0x021E,
+ 0x0220,
+ 0x0222,
+ 0x0224,
+ 0x0226,
+ 0x0228,
+ 0x022A,
+ 0x022C,
+ 0x022E,
+ 0x0230,
+ 0x0232,
+ 0x0241,
+ 0x0248,
+ 0x024A,
+ 0x024C,
+ 0x024E,
+ 0x0370,
+ 0x0372,
+ 0x0376,
+ 0x0386,
+ 0x038C,
+ 0x03CF,
+ 0x03D8,
+ 0x03DA,
+ 0x03DC,
+ 0x03DE,
+ 0x03E0,
+ 0x03E2,
+ 0x03E4,
+ 0x03E6,
+ 0x03E8,
+ 0x03EA,
+ 0x03EC,
+ 0x03EE,
+ 0x03F4,
+ 0x03F7,
+ 0x0460,
+ 0x0462,
+ 0x0464,
+ 0x0466,
+ 0x0468,
+ 0x046A,
+ 0x046C,
+ 0x046E,
+ 0x0470,
+ 0x0472,
+ 0x0474,
+ 0x0476,
+ 0x0478,
+ 0x047A,
+ 0x047C,
+ 0x047E,
+ 0x0480,
+ 0x048A,
+ 0x048C,
+ 0x048E,
+ 0x0490,
+ 0x0492,
+ 0x0494,
+ 0x0496,
+ 0x0498,
+ 0x049A,
+ 0x049C,
+ 0x049E,
+ 0x04A0,
+ 0x04A2,
+ 0x04A4,
+ 0x04A6,
+ 0x04A8,
+ 0x04AA,
+ 0x04AC,
+ 0x04AE,
+ 0x04B0,
+ 0x04B2,
+ 0x04B4,
+ 0x04B6,
+ 0x04B8,
+ 0x04BA,
+ 0x04BC,
+ 0x04BE,
+ 0x04C3,
+ 0x04C5,
+ 0x04C7,
+ 0x04C9,
+ 0x04CB,
+ 0x04CD,
+ 0x04D0,
+ 0x04D2,
+ 0x04D4,
+ 0x04D6,
+ 0x04D8,
+ 0x04DA,
+ 0x04DC,
+ 0x04DE,
+ 0x04E0,
+ 0x04E2,
+ 0x04E4,
+ 0x04E6,
+ 0x04E8,
+ 0x04EA,
+ 0x04EC,
+ 0x04EE,
+ 0x04F0,
+ 0x04F2,
+ 0x04F4,
+ 0x04F6,
+ 0x04F8,
+ 0x04FA,
+ 0x04FC,
+ 0x04FE,
+ 0x0500,
+ 0x0502,
+ 0x0504,
+ 0x0506,
+ 0x0508,
+ 0x050A,
+ 0x050C,
+ 0x050E,
+ 0x0510,
+ 0x0512,
+ 0x0514,
+ 0x0516,
+ 0x0518,
+ 0x051A,
+ 0x051C,
+ 0x051E,
+ 0x0520,
+ 0x0522,
+ 0x0524,
+ 0x0526,
+ 0x10C7,
+ 0x10CD,
+ 0x1E00,
+ 0x1E02,
+ 0x1E04,
+ 0x1E06,
+ 0x1E08,
+ 0x1E0A,
+ 0x1E0C,
+ 0x1E0E,
+ 0x1E10,
+ 0x1E12,
+ 0x1E14,
+ 0x1E16,
+ 0x1E18,
+ 0x1E1A,
+ 0x1E1C,
+ 0x1E1E,
+ 0x1E20,
+ 0x1E22,
+ 0x1E24,
+ 0x1E26,
+ 0x1E28,
+ 0x1E2A,
+ 0x1E2C,
+ 0x1E2E,
+ 0x1E30,
+ 0x1E32,
+ 0x1E34,
+ 0x1E36,
+ 0x1E38,
+ 0x1E3A,
+ 0x1E3C,
+ 0x1E3E,
+ 0x1E40,
+ 0x1E42,
+ 0x1E44,
+ 0x1E46,
+ 0x1E48,
+ 0x1E4A,
+ 0x1E4C,
+ 0x1E4E,
+ 0x1E50,
+ 0x1E52,
+ 0x1E54,
+ 0x1E56,
+ 0x1E58,
+ 0x1E5A,
+ 0x1E5C,
+ 0x1E5E,
+ 0x1E60,
+ 0x1E62,
+ 0x1E64,
+ 0x1E66,
+ 0x1E68,
+ 0x1E6A,
+ 0x1E6C,
+ 0x1E6E,
+ 0x1E70,
+ 0x1E72,
+ 0x1E74,
+ 0x1E76,
+ 0x1E78,
+ 0x1E7A,
+ 0x1E7C,
+ 0x1E7E,
+ 0x1E80,
+ 0x1E82,
+ 0x1E84,
+ 0x1E86,
+ 0x1E88,
+ 0x1E8A,
+ 0x1E8C,
+ 0x1E8E,
+ 0x1E90,
+ 0x1E92,
+ 0x1E94,
+ 0x1E9E,
+ 0x1EA0,
+ 0x1EA2,
+ 0x1EA4,
+ 0x1EA6,
+ 0x1EA8,
+ 0x1EAA,
+ 0x1EAC,
+ 0x1EAE,
+ 0x1EB0,
+ 0x1EB2,
+ 0x1EB4,
+ 0x1EB6,
+ 0x1EB8,
+ 0x1EBA,
+ 0x1EBC,
+ 0x1EBE,
+ 0x1EC0,
+ 0x1EC2,
+ 0x1EC4,
+ 0x1EC6,
+ 0x1EC8,
+ 0x1ECA,
+ 0x1ECC,
+ 0x1ECE,
+ 0x1ED0,
+ 0x1ED2,
+ 0x1ED4,
+ 0x1ED6,
+ 0x1ED8,
+ 0x1EDA,
+ 0x1EDC,
+ 0x1EDE,
+ 0x1EE0,
+ 0x1EE2,
+ 0x1EE4,
+ 0x1EE6,
+ 0x1EE8,
+ 0x1EEA,
+ 0x1EEC,
+ 0x1EEE,
+ 0x1EF0,
+ 0x1EF2,
+ 0x1EF4,
+ 0x1EF6,
+ 0x1EF8,
+ 0x1EFA,
+ 0x1EFC,
+ 0x1EFE,
+ 0x1F59,
+ 0x1F5B,
+ 0x1F5D,
+ 0x1F5F,
+ 0x2102,
+ 0x2107,
+ 0x2115,
+ 0x2124,
+ 0x2126,
+ 0x2128,
+ 0x2145,
+ 0x2183,
+ 0x2C60,
+ 0x2C67,
+ 0x2C69,
+ 0x2C6B,
+ 0x2C72,
+ 0x2C75,
+ 0x2C82,
+ 0x2C84,
+ 0x2C86,
+ 0x2C88,
+ 0x2C8A,
+ 0x2C8C,
+ 0x2C8E,
+ 0x2C90,
+ 0x2C92,
+ 0x2C94,
+ 0x2C96,
+ 0x2C98,
+ 0x2C9A,
+ 0x2C9C,
+ 0x2C9E,
+ 0x2CA0,
+ 0x2CA2,
+ 0x2CA4,
+ 0x2CA6,
+ 0x2CA8,
+ 0x2CAA,
+ 0x2CAC,
+ 0x2CAE,
+ 0x2CB0,
+ 0x2CB2,
+ 0x2CB4,
+ 0x2CB6,
+ 0x2CB8,
+ 0x2CBA,
+ 0x2CBC,
+ 0x2CBE,
+ 0x2CC0,
+ 0x2CC2,
+ 0x2CC4,
+ 0x2CC6,
+ 0x2CC8,
+ 0x2CCA,
+ 0x2CCC,
+ 0x2CCE,
+ 0x2CD0,
+ 0x2CD2,
+ 0x2CD4,
+ 0x2CD6,
+ 0x2CD8,
+ 0x2CDA,
+ 0x2CDC,
+ 0x2CDE,
+ 0x2CE0,
+ 0x2CE2,
+ 0x2CEB,
+ 0x2CED,
+ 0x2CF2,
+ 0xA640,
+ 0xA642,
+ 0xA644,
+ 0xA646,
+ 0xA648,
+ 0xA64A,
+ 0xA64C,
+ 0xA64E,
+ 0xA650,
+ 0xA652,
+ 0xA654,
+ 0xA656,
+ 0xA658,
+ 0xA65A,
+ 0xA65C,
+ 0xA65E,
+ 0xA660,
+ 0xA662,
+ 0xA664,
+ 0xA666,
+ 0xA668,
+ 0xA66A,
+ 0xA66C,
+ 0xA680,
+ 0xA682,
+ 0xA684,
+ 0xA686,
+ 0xA688,
+ 0xA68A,
+ 0xA68C,
+ 0xA68E,
+ 0xA690,
+ 0xA692,
+ 0xA694,
+ 0xA696,
+ 0xA722,
+ 0xA724,
+ 0xA726,
+ 0xA728,
+ 0xA72A,
+ 0xA72C,
+ 0xA72E,
+ 0xA732,
+ 0xA734,
+ 0xA736,
+ 0xA738,
+ 0xA73A,
+ 0xA73C,
+ 0xA73E,
+ 0xA740,
+ 0xA742,
+ 0xA744,
+ 0xA746,
+ 0xA748,
+ 0xA74A,
+ 0xA74C,
+ 0xA74E,
+ 0xA750,
+ 0xA752,
+ 0xA754,
+ 0xA756,
+ 0xA758,
+ 0xA75A,
+ 0xA75C,
+ 0xA75E,
+ 0xA760,
+ 0xA762,
+ 0xA764,
+ 0xA766,
+ 0xA768,
+ 0xA76A,
+ 0xA76C,
+ 0xA76E,
+ 0xA779,
+ 0xA77B,
+ 0xA780,
+ 0xA782,
+ 0xA784,
+ 0xA786,
+ 0xA78B,
+ 0xA78D,
+ 0xA790,
+ 0xA792,
+ 0xA7A0,
+ 0xA7A2,
+ 0xA7A4,
+ 0xA7A6,
+ 0xA7A8,
+ 0xA7AA,
+ 0x1D49C,
+ 0x1D4A2,
+ 0x1D546,
+ 0x1D7CA,
+};
+
+int
+isupperrune(Rune r)
+{
+ if(bsearch(&r, upper2, nelem(upper2), sizeof *upper2, &rune2cmp))
+ return 1;
+ if(bsearch(&r, upper1, nelem(upper1), sizeof *upper1, &rune1cmp))
+ return 1;
+ return 0;
+}
+
+static Rune lower2[][2] = {
+ { 0x0061, 0x007A },
+ { 0x00DF, 0x00F6 },
+ { 0x00F8, 0x00FF },
+ { 0x0137, 0x0138 },
+ { 0x0148, 0x0149 },
+ { 0x017E, 0x0180 },
+ { 0x018C, 0x018D },
+ { 0x0199, 0x019B },
+ { 0x01AA, 0x01AB },
+ { 0x01B9, 0x01BA },
+ { 0x01BD, 0x01BF },
+ { 0x01DC, 0x01DD },
+ { 0x01EF, 0x01F0 },
+ { 0x0233, 0x0239 },
+ { 0x023F, 0x0240 },
+ { 0x024F, 0x0293 },
+ { 0x0295, 0x02AF },
+ { 0x037B, 0x037D },
+ { 0x03AC, 0x03CE },
+ { 0x03D0, 0x03D1 },
+ { 0x03D5, 0x03D7 },
+ { 0x03EF, 0x03F3 },
+ { 0x03FB, 0x03FC },
+ { 0x0430, 0x045F },
+ { 0x04CE, 0x04CF },
+ { 0x0561, 0x0587 },
+ { 0x1D00, 0x1D2B },
+ { 0x1D6B, 0x1D77 },
+ { 0x1D79, 0x1D9A },
+ { 0x1E95, 0x1E9D },
+ { 0x1EFF, 0x1F07 },
+ { 0x1F10, 0x1F15 },
+ { 0x1F20, 0x1F27 },
+ { 0x1F30, 0x1F37 },
+ { 0x1F40, 0x1F45 },
+ { 0x1F50, 0x1F57 },
+ { 0x1F60, 0x1F67 },
+ { 0x1F70, 0x1F7D },
+ { 0x1F80, 0x1F87 },
+ { 0x1F90, 0x1F97 },
+ { 0x1FA0, 0x1FA7 },
+ { 0x1FB0, 0x1FB4 },
+ { 0x1FB6, 0x1FB7 },
+ { 0x1FC2, 0x1FC4 },
+ { 0x1FC6, 0x1FC7 },
+ { 0x1FD0, 0x1FD3 },
+ { 0x1FD6, 0x1FD7 },
+ { 0x1FE0, 0x1FE7 },
+ { 0x1FF2, 0x1FF4 },
+ { 0x1FF6, 0x1FF7 },
+ { 0x210E, 0x210F },
+ { 0x213C, 0x213D },
+ { 0x2146, 0x2149 },
+ { 0x2C30, 0x2C5E },
+ { 0x2C65, 0x2C66 },
+ { 0x2C73, 0x2C74 },
+ { 0x2C76, 0x2C7B },
+ { 0x2CE3, 0x2CE4 },
+ { 0x2D00, 0x2D25 },
+ { 0xA72F, 0xA731 },
+ { 0xA771, 0xA778 },
+ { 0xFB00, 0xFB06 },
+ { 0xFB13, 0xFB17 },
+ { 0xFF41, 0xFF5A },
+ { 0x10428, 0x1044F },
+ { 0x1D41A, 0x1D433 },
+ { 0x1D44E, 0x1D454 },
+ { 0x1D456, 0x1D467 },
+ { 0x1D482, 0x1D49B },
+ { 0x1D4B6, 0x1D4B9 },
+ { 0x1D4BD, 0x1D4C3 },
+ { 0x1D4C5, 0x1D4CF },
+ { 0x1D4EA, 0x1D503 },
+ { 0x1D51E, 0x1D537 },
+ { 0x1D552, 0x1D56B },
+ { 0x1D586, 0x1D59F },
+ { 0x1D5BA, 0x1D5D3 },
+ { 0x1D5EE, 0x1D607 },
+ { 0x1D622, 0x1D63B },
+ { 0x1D656, 0x1D66F },
+ { 0x1D68A, 0x1D6A5 },
+ { 0x1D6C2, 0x1D6DA },
+ { 0x1D6DC, 0x1D6E1 },
+ { 0x1D6FC, 0x1D714 },
+ { 0x1D716, 0x1D71B },
+ { 0x1D736, 0x1D74E },
+ { 0x1D750, 0x1D755 },
+ { 0x1D770, 0x1D788 },
+ { 0x1D78A, 0x1D78F },
+ { 0x1D7AA, 0x1D7C2 },
+ { 0x1D7C4, 0x1D7C9 },
+};
+
+static Rune lower1[] = {
+ 0x00B5,
+ 0x0101,
+ 0x0103,
+ 0x0105,
+ 0x0107,
+ 0x0109,
+ 0x010B,
+ 0x010D,
+ 0x010F,
+ 0x0111,
+ 0x0113,
+ 0x0115,
+ 0x0117,
+ 0x0119,
+ 0x011B,
+ 0x011D,
+ 0x011F,
+ 0x0121,
+ 0x0123,
+ 0x0125,
+ 0x0127,
+ 0x0129,
+ 0x012B,
+ 0x012D,
+ 0x012F,
+ 0x0131,
+ 0x0133,
+ 0x0135,
+ 0x013A,
+ 0x013C,
+ 0x013E,
+ 0x0140,
+ 0x0142,
+ 0x0144,
+ 0x0146,
+ 0x014B,
+ 0x014D,
+ 0x014F,
+ 0x0151,
+ 0x0153,
+ 0x0155,
+ 0x0157,
+ 0x0159,
+ 0x015B,
+ 0x015D,
+ 0x015F,
+ 0x0161,
+ 0x0163,
+ 0x0165,
+ 0x0167,
+ 0x0169,
+ 0x016B,
+ 0x016D,
+ 0x016F,
+ 0x0171,
+ 0x0173,
+ 0x0175,
+ 0x0177,
+ 0x017A,
+ 0x017C,
+ 0x0183,
+ 0x0185,
+ 0x0188,
+ 0x0192,
+ 0x0195,
+ 0x019E,
+ 0x01A1,
+ 0x01A3,
+ 0x01A5,
+ 0x01A8,
+ 0x01AD,
+ 0x01B0,
+ 0x01B4,
+ 0x01B6,
+ 0x01C6,
+ 0x01C9,
+ 0x01CC,
+ 0x01CE,
+ 0x01D0,
+ 0x01D2,
+ 0x01D4,
+ 0x01D6,
+ 0x01D8,
+ 0x01DA,
+ 0x01DF,
+ 0x01E1,
+ 0x01E3,
+ 0x01E5,
+ 0x01E7,
+ 0x01E9,
+ 0x01EB,
+ 0x01ED,
+ 0x01F3,
+ 0x01F5,
+ 0x01F9,
+ 0x01FB,
+ 0x01FD,
+ 0x01FF,
+ 0x0201,
+ 0x0203,
+ 0x0205,
+ 0x0207,
+ 0x0209,
+ 0x020B,
+ 0x020D,
+ 0x020F,
+ 0x0211,
+ 0x0213,
+ 0x0215,
+ 0x0217,
+ 0x0219,
+ 0x021B,
+ 0x021D,
+ 0x021F,
+ 0x0221,
+ 0x0223,
+ 0x0225,
+ 0x0227,
+ 0x0229,
+ 0x022B,
+ 0x022D,
+ 0x022F,
+ 0x0231,
+ 0x023C,
+ 0x0242,
+ 0x0247,
+ 0x0249,
+ 0x024B,
+ 0x024D,
+ 0x0371,
+ 0x0373,
+ 0x0377,
+ 0x0390,
+ 0x03D9,
+ 0x03DB,
+ 0x03DD,
+ 0x03DF,
+ 0x03E1,
+ 0x03E3,
+ 0x03E5,
+ 0x03E7,
+ 0x03E9,
+ 0x03EB,
+ 0x03ED,
+ 0x03F5,
+ 0x03F8,
+ 0x0461,
+ 0x0463,
+ 0x0465,
+ 0x0467,
+ 0x0469,
+ 0x046B,
+ 0x046D,
+ 0x046F,
+ 0x0471,
+ 0x0473,
+ 0x0475,
+ 0x0477,
+ 0x0479,
+ 0x047B,
+ 0x047D,
+ 0x047F,
+ 0x0481,
+ 0x048B,
+ 0x048D,
+ 0x048F,
+ 0x0491,
+ 0x0493,
+ 0x0495,
+ 0x0497,
+ 0x0499,
+ 0x049B,
+ 0x049D,
+ 0x049F,
+ 0x04A1,
+ 0x04A3,
+ 0x04A5,
+ 0x04A7,
+ 0x04A9,
+ 0x04AB,
+ 0x04AD,
+ 0x04AF,
+ 0x04B1,
+ 0x04B3,
+ 0x04B5,
+ 0x04B7,
+ 0x04B9,
+ 0x04BB,
+ 0x04BD,
+ 0x04BF,
+ 0x04C2,
+ 0x04C4,
+ 0x04C6,
+ 0x04C8,
+ 0x04CA,
+ 0x04CC,
+ 0x04D1,
+ 0x04D3,
+ 0x04D5,
+ 0x04D7,
+ 0x04D9,
+ 0x04DB,
+ 0x04DD,
+ 0x04DF,
+ 0x04E1,
+ 0x04E3,
+ 0x04E5,
+ 0x04E7,
+ 0x04E9,
+ 0x04EB,
+ 0x04ED,
+ 0x04EF,
+ 0x04F1,
+ 0x04F3,
+ 0x04F5,
+ 0x04F7,
+ 0x04F9,
+ 0x04FB,
+ 0x04FD,
+ 0x04FF,
+ 0x0501,
+ 0x0503,
+ 0x0505,
+ 0x0507,
+ 0x0509,
+ 0x050B,
+ 0x050D,
+ 0x050F,
+ 0x0511,
+ 0x0513,
+ 0x0515,
+ 0x0517,
+ 0x0519,
+ 0x051B,
+ 0x051D,
+ 0x051F,
+ 0x0521,
+ 0x0523,
+ 0x0525,
+ 0x0527,
+ 0x1E01,
+ 0x1E03,
+ 0x1E05,
+ 0x1E07,
+ 0x1E09,
+ 0x1E0B,
+ 0x1E0D,
+ 0x1E0F,
+ 0x1E11,
+ 0x1E13,
+ 0x1E15,
+ 0x1E17,
+ 0x1E19,
+ 0x1E1B,
+ 0x1E1D,
+ 0x1E1F,
+ 0x1E21,
+ 0x1E23,
+ 0x1E25,
+ 0x1E27,
+ 0x1E29,
+ 0x1E2B,
+ 0x1E2D,
+ 0x1E2F,
+ 0x1E31,
+ 0x1E33,
+ 0x1E35,
+ 0x1E37,
+ 0x1E39,
+ 0x1E3B,
+ 0x1E3D,
+ 0x1E3F,
+ 0x1E41,
+ 0x1E43,
+ 0x1E45,
+ 0x1E47,
+ 0x1E49,
+ 0x1E4B,
+ 0x1E4D,
+ 0x1E4F,
+ 0x1E51,
+ 0x1E53,
+ 0x1E55,
+ 0x1E57,
+ 0x1E59,
+ 0x1E5B,
+ 0x1E5D,
+ 0x1E5F,
+ 0x1E61,
+ 0x1E63,
+ 0x1E65,
+ 0x1E67,
+ 0x1E69,
+ 0x1E6B,
+ 0x1E6D,
+ 0x1E6F,
+ 0x1E71,
+ 0x1E73,
+ 0x1E75,
+ 0x1E77,
+ 0x1E79,
+ 0x1E7B,
+ 0x1E7D,
+ 0x1E7F,
+ 0x1E81,
+ 0x1E83,
+ 0x1E85,
+ 0x1E87,
+ 0x1E89,
+ 0x1E8B,
+ 0x1E8D,
+ 0x1E8F,
+ 0x1E91,
+ 0x1E93,
+ 0x1E9F,
+ 0x1EA1,
+ 0x1EA3,
+ 0x1EA5,
+ 0x1EA7,
+ 0x1EA9,
+ 0x1EAB,
+ 0x1EAD,
+ 0x1EAF,
+ 0x1EB1,
+ 0x1EB3,
+ 0x1EB5,
+ 0x1EB7,
+ 0x1EB9,
+ 0x1EBB,
+ 0x1EBD,
+ 0x1EBF,
+ 0x1EC1,
+ 0x1EC3,
+ 0x1EC5,
+ 0x1EC7,
+ 0x1EC9,
+ 0x1ECB,
+ 0x1ECD,
+ 0x1ECF,
+ 0x1ED1,
+ 0x1ED3,
+ 0x1ED5,
+ 0x1ED7,
+ 0x1ED9,
+ 0x1EDB,
+ 0x1EDD,
+ 0x1EDF,
+ 0x1EE1,
+ 0x1EE3,
+ 0x1EE5,
+ 0x1EE7,
+ 0x1EE9,
+ 0x1EEB,
+ 0x1EED,
+ 0x1EEF,
+ 0x1EF1,
+ 0x1EF3,
+ 0x1EF5,
+ 0x1EF7,
+ 0x1EF9,
+ 0x1EFB,
+ 0x1EFD,
+ 0x1FBE,
+ 0x210A,
+ 0x2113,
+ 0x212F,
+ 0x2134,
+ 0x2139,
+ 0x214E,
+ 0x2184,
+ 0x2C61,
+ 0x2C68,
+ 0x2C6A,
+ 0x2C6C,
+ 0x2C71,
+ 0x2C81,
+ 0x2C83,
+ 0x2C85,
+ 0x2C87,
+ 0x2C89,
+ 0x2C8B,
+ 0x2C8D,
+ 0x2C8F,
+ 0x2C91,
+ 0x2C93,
+ 0x2C95,
+ 0x2C97,
+ 0x2C99,
+ 0x2C9B,
+ 0x2C9D,
+ 0x2C9F,
+ 0x2CA1,
+ 0x2CA3,
+ 0x2CA5,
+ 0x2CA7,
+ 0x2CA9,
+ 0x2CAB,
+ 0x2CAD,
+ 0x2CAF,
+ 0x2CB1,
+ 0x2CB3,
+ 0x2CB5,
+ 0x2CB7,
+ 0x2CB9,
+ 0x2CBB,
+ 0x2CBD,
+ 0x2CBF,
+ 0x2CC1,
+ 0x2CC3,
+ 0x2CC5,
+ 0x2CC7,
+ 0x2CC9,
+ 0x2CCB,
+ 0x2CCD,
+ 0x2CCF,
+ 0x2CD1,
+ 0x2CD3,
+ 0x2CD5,
+ 0x2CD7,
+ 0x2CD9,
+ 0x2CDB,
+ 0x2CDD,
+ 0x2CDF,
+ 0x2CE1,
+ 0x2CEC,
+ 0x2CEE,
+ 0x2CF3,
+ 0x2D27,
+ 0x2D2D,
+ 0xA641,
+ 0xA643,
+ 0xA645,
+ 0xA647,
+ 0xA649,
+ 0xA64B,
+ 0xA64D,
+ 0xA64F,
+ 0xA651,
+ 0xA653,
+ 0xA655,
+ 0xA657,
+ 0xA659,
+ 0xA65B,
+ 0xA65D,
+ 0xA65F,
+ 0xA661,
+ 0xA663,
+ 0xA665,
+ 0xA667,
+ 0xA669,
+ 0xA66B,
+ 0xA66D,
+ 0xA681,
+ 0xA683,
+ 0xA685,
+ 0xA687,
+ 0xA689,
+ 0xA68B,
+ 0xA68D,
+ 0xA68F,
+ 0xA691,
+ 0xA693,
+ 0xA695,
+ 0xA697,
+ 0xA723,
+ 0xA725,
+ 0xA727,
+ 0xA729,
+ 0xA72B,
+ 0xA72D,
+ 0xA733,
+ 0xA735,
+ 0xA737,
+ 0xA739,
+ 0xA73B,
+ 0xA73D,
+ 0xA73F,
+ 0xA741,
+ 0xA743,
+ 0xA745,
+ 0xA747,
+ 0xA749,
+ 0xA74B,
+ 0xA74D,
+ 0xA74F,
+ 0xA751,
+ 0xA753,
+ 0xA755,
+ 0xA757,
+ 0xA759,
+ 0xA75B,
+ 0xA75D,
+ 0xA75F,
+ 0xA761,
+ 0xA763,
+ 0xA765,
+ 0xA767,
+ 0xA769,
+ 0xA76B,
+ 0xA76D,
+ 0xA76F,
+ 0xA77A,
+ 0xA77C,
+ 0xA77F,
+ 0xA781,
+ 0xA783,
+ 0xA785,
+ 0xA787,
+ 0xA78C,
+ 0xA78E,
+ 0xA791,
+ 0xA793,
+ 0xA7A1,
+ 0xA7A3,
+ 0xA7A5,
+ 0xA7A7,
+ 0xA7A9,
+ 0xA7FA,
+ 0x1D4BB,
+ 0x1D7CB,
+};
+
+int
+islowerrune(Rune r)
+{
+ if(bsearch(&r, lower2, nelem(lower2), sizeof *lower2, &rune2cmp))
+ return 1;
+ if(bsearch(&r, lower1, nelem(lower1), sizeof *lower1, &rune1cmp))
+ return 1;
+ return 0;
+}
+
+static Rune title2[][2] = {
+ { 0x1F88, 0x1F8F },
+ { 0x1F98, 0x1F9F },
+ { 0x1FA8, 0x1FAF },
+};
+
+static Rune title1[] = {
+ 0x01C5,
+ 0x01C8,
+ 0x01CB,
+ 0x01F2,
+ 0x1FBC,
+ 0x1FCC,
+ 0x1FFC,
+};
+
+int
+istitlerune(Rune r)
+{
+ if(bsearch(&r, title2, nelem(title2), sizeof *title2, &rune2cmp))
+ return 1;
+ if(bsearch(&r, title1, nelem(title1), sizeof *title1, &rune1cmp))
+ return 1;
+ return 0;
+}
+
+static Rune digit2[][2] = {
+ { 0x0030, 0x0039 },
+ { 0x0660, 0x0669 },
+ { 0x06F0, 0x06F9 },
+ { 0x07C0, 0x07C9 },
+ { 0x0966, 0x096F },
+ { 0x09E6, 0x09EF },
+ { 0x0A66, 0x0A6F },
+ { 0x0AE6, 0x0AEF },
+ { 0x0B66, 0x0B6F },
+ { 0x0BE6, 0x0BEF },
+ { 0x0C66, 0x0C6F },
+ { 0x0CE6, 0x0CEF },
+ { 0x0D66, 0x0D6F },
+ { 0x0E50, 0x0E59 },
+ { 0x0ED0, 0x0ED9 },
+ { 0x0F20, 0x0F29 },
+ { 0x1040, 0x1049 },
+ { 0x1090, 0x1099 },
+ { 0x17E0, 0x17E9 },
+ { 0x1810, 0x1819 },
+ { 0x1946, 0x194F },
+ { 0x19D0, 0x19D9 },
+ { 0x1A80, 0x1A89 },
+ { 0x1A90, 0x1A99 },
+ { 0x1B50, 0x1B59 },
+ { 0x1BB0, 0x1BB9 },
+ { 0x1C40, 0x1C49 },
+ { 0x1C50, 0x1C59 },
+ { 0xA620, 0xA629 },
+ { 0xA8D0, 0xA8D9 },
+ { 0xA900, 0xA909 },
+ { 0xA9D0, 0xA9D9 },
+ { 0xAA50, 0xAA59 },
+ { 0xABF0, 0xABF9 },
+ { 0xFF10, 0xFF19 },
+ { 0x104A0, 0x104A9 },
+ { 0x11066, 0x1106F },
+ { 0x110F0, 0x110F9 },
+ { 0x11136, 0x1113F },
+ { 0x111D0, 0x111D9 },
+ { 0x116C0, 0x116C9 },
+ { 0x1D7CE, 0x1D7FF },
+};
+
+int
+isdigitrune(Rune r)
+{
+ if(bsearch(&r, digit2, nelem(digit2), sizeof *digit2, &rune2cmp))
+ return 1;
+ return 0;
+}
+
diff --git a/utf.c b/utf.c
@@ -1,296 +0,0 @@
-/* See LICENSE file for copyright and license details. */
-#include <string.h>
-#include "utf.h"
-
-#define MIN(x,y) ((x) < (y) ? (x) : (y))
-
-#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \
- : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \
- : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \
- : (((x) & 0xF0) == 0xE0) ? 3 /* 1110xxxx */ \
- : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \
- : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \
- : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \
- : 0 )
-
-/*
- * runetochar copies one rune at p to at most UTFmax bytes starting at s and
- * returns the number of bytes copied. UTFmax is the maximum number of bytes
- * required to represent a legal rune.
- *
- * If the rune is illegal, runetochar will return 0.
- */
-int
-runetochar(char *s, Rune *p)
-{
- Rune r = *p;
-
- switch(runelen(r)) {
- case 1: /* 0aaaaaaa */
- s[0] = r;
- return 1;
- case 2: /* 00000aaa aabbbbbb */
- s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
- s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
- return 2;
- case 3: /* aaaabbbb bbcccccc */
- s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
- s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
- s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
- return 3;
- case 4: /* 000aaabb bbbbcccc ccdddddd */
- s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
- s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
- s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
- s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
- return 4;
- default:
- return 0; /* error */
- }
-}
-
-/*
- * chartorune copies at most UTFmax bytes starting at s to one rune at p and
- * returns the number of bytes copied. If the input is not valid UTF-8,
- * chartorune will convert the sequence to Runeerror (0xFFFD), returning the
- * number of bytes in the invalid sequence.
- */
-int
-chartorune(Rune *p, const char *s)
-{
- return charntorune(p, s, UTFmax);
-}
-
-/*
- * charntorune copies at most len bytes starting at s to one rune at p and
- * returns the number of bytes copied. If the input is not valid UTF-8,
- * charntorune will convert the sequence to Runeerror (0xFFFD), returning the
- * number of bytes in the invalid sequence.
- *
- * If a potentially valid sequence is cut off by the len limit, charntorune will
- * return 0.
- */
-int
-charntorune(Rune *p, const char *s, size_t len)
-{
- unsigned int i, n;
- Rune r;
-
- if(len == 0) /* can't even look at s[0] */
- return 0;
-
- switch((n = UTFSEQ(s[0]))) {
- case 1: r = s[0]; break; /* 0xxxxxxx */
- case 2: r = s[0] & 0x1F; break; /* 110xxxxx */
- case 3: r = s[0] & 0x0F; break; /* 1110xxxx */
- case 4: r = s[0] & 0x07; break; /* 11110xxx */
- case 5: r = s[0] & 0x03; break; /* 111110xx */
- case 6: r = s[0] & 0x01; break; /* 1111110x */
- default: /* invalid sequence */
- *p = Runeerror;
- return 1;
- }
- /* add values from continuation bytes */
- for(i = 1; i < MIN(n, len); i++)
- if((s[i] & 0xC0) != 0x80) {
- /* expected continuation */
- *p = Runeerror;
- return i;
- }
- else
- r = (r << 6) | (s[i] & 0x3F);
-
- if(i < n) /* must have reached len limit */
- return 0;
-
- /* reject invalid runes and overlong sequences */
- if(n > UTFmax || r > 0x10FFFF || runelen(r) < (int)n || (r & 0xFFFE) == 0xFFFE
- || (r >= 0xD800 && r <= 0xDFFF) || (r >= 0xFDD0 && r <= 0xFDEF))
- r = Runeerror;
-
- *p = r;
- return n;
-}
-
-/*
- * runelen returns the number of bytes required to convert r into UTF-8. If the
- * rune is illegal, runelen will return 0.
- */
-int
-runelen(Rune r)
-{
- if(r <= 0x7F)
- return 1;
- else if(r <= 0x07FF)
- return 2;
- else if(r <= 0xFFFF)
- return 3;
- else if(r <= Runemax)
- return 4;
- else
- return 0; /* error */
-}
-
-/*
- * runelen returns the number of bytes required to convert the rune-string of
- * length len pointed to by p into UTF-8.
- */
-size_t
-runenlen(Rune *p, size_t len)
-{
- size_t i, n = 0;
-
- for(i = 0; i < len; i++)
- n += runelen(p[i]);
- return n;
-}
-
-/*
- * fullrune returns 1 if the string s of length len is long enough to be
- * decoded by chartorune, and 0 otherwise.
- */
-int
-fullrune(const char *s, size_t len)
-{
- Rune r;
-
- return charntorune(&r, s, len) > 0;
-}
-
-/*
- * utfecpy copies UTF-8 sequences until a null sequence has been copied, but
- * writes no sequences beyond end. If any sequences are copied, the to string is
- * terminated by a null sequence, and a pointer to that sequence is returned.
- * Otherwise, the original to string is returned.
- */
-char *
-utfecpy(char *to, char *end, const char *from)
-{
- Rune r = Runeerror;
- size_t i, n;
-
- /* seek through to find final full rune */
- for(i = 0; r != '\0' && (n = charntorune(&r, &from[i], end - &to[i])); i += n)
- ;
- memcpy(to, from, i); /* copy over bytes up to this rune */
-
- if(i > 0 && r != '\0')
- to[i] = '\0'; /* terminate if unterminated */
- return &to[i];
-}
-
-/*
- * utflen returns the number of runes that are represented by the UTF-8 string
- * s.
- */
-size_t
-utflen(const char *s)
-{
- const char *p = s;
- size_t i;
- Rune r;
-
- for(i = 0; *p != '\0'; i++)
- p += chartorune(&r, p);
- return i;
-}
-
-/*
- * utfnlen returns the number of runes that are represented by the UTF-8 string
- * s of length len. If the last few bytes contain an incompletely coded rune,
- * utfnlen will not count them; in this way it differs from utflen, which
- * includes every byte of the string.
- */
-size_t
-utfnlen(const char *s, size_t len)
-{
- const char *p = s;
- size_t i;
- Rune r;
- int n;
-
- for(i = 0; (n = charntorune(&r, p, len-(p-s))) && r != '\0'; i++)
- p += n;
- return i;
-}
-
-/*
- * utfrune returns a pointer to the first ocurrence of r in the UTF-8 string s,
- * or NULL if r does not occur in s. The null byte terminating a string is
- * considered to be part of the string s.
- */
-char *
-utfrune(const char *s, Rune r)
-{
- if(r < Runeself) {
- return strchr(s, r);
- }
- else if(r == Runeerror) {
- Rune r0;
- int n;
-
- for(; *s != '\0'; s += n) {
- n = chartorune(&r0, s);
- if(r == r0)
- return (char *)s;
- }
- }
- else {
- char buf[UTFmax+1];
- int n;
-
- if(!(n = runetochar(buf, &r)))
- return NULL;
- buf[n] = '\0';
- return strstr(s, buf);
- }
- return NULL;
-}
-
-/*
- * utfrrune returns a pointer to the last ocurrence of r in the UTF-8 string s,
- * or NULL if r does not occur in s. The null byte terminating a string is
- * considered to be part of the string s.
- */
-char *
-utfrrune(const char *s, Rune r)
-{
- const char *p = NULL;
- Rune r0;
- int n;
-
- if(r < Runeself)
- return strrchr(s, r);
-
- for(; *s != '\0'; s += n) {
- n = chartorune(&r0, s);
- if(r == r0)
- p = s;
- }
- return (char *)p;
-}
-
-/*
- * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a
- * UTF-8 substring of s, or NULL if there is none. If t is the null string,
- * utfutf returns s.
- */
-char *
-utfutf(const char *s, const char *t)
-{
- const char *p, *q;
- Rune r0, r1, r2;
- int n, m;
-
- for(chartorune(&r0, t); (s = utfrune(s, r0)); s++) {
- for(p = s, q = t; *q && *p; p += n, q += m) {
- n = chartorune(&r1, p);
- m = chartorune(&r2, q);
- if(r1 != r2)
- break;
- }
- if(!*q)
- return (char *)s;
- }
- return NULL;
-}
diff --git a/utf.h b/utf.h
@@ -25,4 +25,11 @@ char *utfrune(const char *, Rune);
char *utfrrune(const char *, Rune);
char *utfutf(const char *, const char *);
+int isalpharune(Rune);
+int isspacerune(Rune);
+int isupperrune(Rune);
+int islowerrune(Rune);
+int istitlerune(Rune);
+int isdigitrune(Rune);
+
#endif
diff --git a/utftest.c b/utftest.c
@@ -1,49 +1,112 @@
/* See LICENSE file for copyright and license details. */
+#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "utf.h"
-static void utferror(const char *, int);
+#define ARGBEGIN \
+ { \
+ Rune _argr; \
+ \
+ if(!argv0) \
+ argv0 = argv[0]; \
+ for(argc--, argv++; *argv && (*argv)[0] == '-' && (*argv)[1] != '\0'; argc--, argv++) { \
+ if((*argv)[1] == '-' && (*argv)[2] == '\0') { /* -- signifies end of flags */ \
+ argc--; argv++; \
+ break; \
+ } \
+ (*argv)++; \
+ while(**argv != '\0' && (*argv += chartorune(&_argr, *argv))) \
+ switch(_argr)
+#define ARGEND \
+ } \
+ }
+#define ARGC() _argr
+
+static void eprintf(const char *, ...);
+static void usage(void);
+
+static const char *argv0;
int
-main(void)
+main(int argc, char *argv[])
{
- char buf[BUFSIZ], buf2[UTFmax];
+ int aflag = 0, dflag = 0, lflag = 0, sflag = 0, tflag = 0, uflag = 0, vflag = 0;
+ char buf[BUFSIZ], rbuf[UTFmax];
int len, len2;
- size_t i, n;
+ size_t i, j, n;
Rune r;
+ ARGBEGIN {
+ case 'a': aflag = 1; break;
+ case 'd': dflag = 1; break;
+ case 'l': lflag = 1; break;
+ case 's': sflag = 1; break;
+ case 't': tflag = 1; break;
+ case 'u': uflag = 1; break;
+ case 'v': vflag = 1; break;
+ default:
+ r = ARGC();
+ rbuf[runetochar(rbuf, &r)] = '\0';
+ eprintf("unknown flag -%s\n", rbuf);
+ usage();
+ } ARGEND;
+
+ if(argc != 0)
+ usage();
+
for(i = 0; (n = fread(&buf[i], 1, sizeof buf - i, stdin)); i = n-i) {
for(n += i, i = 0; (len = charntorune(&r, &buf[i], n-i)); i += len) {
- if(r == Runeerror)
- utferror(&buf[i], len);
- if((len2 = runetochar(buf2, &r))) {
- if(!fwrite(buf2, len2, 1, stdout)) {
- perror("write error");
- exit(EXIT_FAILURE);
- }
+ if(r == Runeerror) {
+ fprintf(stderr, "%s: error converting char to rune:", argv0);
+ for(j = i; j < i+len; j++)
+ fprintf(stderr, " %02X", (unsigned char)buf[j]);
+ fputc('\n', stderr);
+ }
+ if(((aflag && isalpharune(r)) || (dflag && isdigitrune(r))
+ || (lflag && islowerrune(r)) || (sflag && isspacerune(r))
+ || (tflag && istitlerune(r)) || (uflag && isupperrune(r))
+ || (!aflag && !dflag && !lflag && !sflag && !tflag && !uflag)) == vflag)
+ continue;
+
+ if((len2 = runetochar(rbuf, &r))) {
+ if(!fwrite(rbuf, len2, 1, stdout))
+ eprintf("write error:");
}
else
- fprintf(stderr, "error converting rune to char: U+%02X\n", r);
+ fprintf(stderr, "%s: error converting rune to char: U+%02X\n", argv0, r);
}
if(i < n)
memcpy(buf, &buf[i], n-i);
}
- if(ferror(stdin)) {
- perror("read error");
- exit(EXIT_FAILURE);
- }
+ if(ferror(stdin))
+ eprintf("read error:");
+
return EXIT_SUCCESS;
}
void
-utferror(const char *s, int n)
+eprintf(const char *fmt, ...)
{
- int i = 0;
+ va_list ap;
+
+ fprintf(stderr, "%s: ", argv0);
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+
+ if(fmt[0] && fmt[strlen(fmt)-1] == ':') {
+ fputc(' ', stderr);
+ perror(NULL);
+ }
+ exit(EXIT_FAILURE);
+}
- fprintf(stderr, "error converting char to rune:");
- for(i = 0; i < n; i++)
- fprintf(stderr, " %02X", (unsigned char)s[i]);
- fputc('\n', stderr);
+void
+usage(void)
+{
+ fprintf(stderr, "usage: %s [-adlstu]\n", argv0);
+ exit(EXIT_FAILURE);
}