libutf

UTF-8 library
git clone git://git.suckless.org/libutf
Log | Files | Refs | README | LICENSE

commit b7125d595e08b2c70cda367782d2689ef2350587
parent ea636e77ab27c87482a51ee8176e150d3e92003d
Author: Connor Lane Smith <cls@lubutu.com>
Date:   Sun,  6 May 2012 20:36:37 +0100

add istyperune functions
Diffstat:
Makefile | 24+++++++++++++++++++-----
config.mk | 5+++--
mkrunetype.awk | 75+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
rune.c | 296+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
runetype.c | 28++++++++++++++++++++++++++++
runetypebody.h | 1865+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
utf.c | 296-------------------------------------------------------------------------------
utf.h | 7+++++++
utftest.c | 107++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
9 files changed, 2378 insertions(+), 325 deletions(-)

diff --git a/Makefile b/Makefile @@ -2,16 +2,30 @@ include config.mk -SRC = utf.c utftest.c +SRC = rune.c runetype.c OBJ = $(SRC:.c=.o) -all: utftest +UCD = UnicodeData-6.1.0.txt -utftest: $(OBJ) - $(CC) $(LDFLAGS) -o $@ $(OBJ) +all: libutf.a utftest + +libutf.a: $(OBJ) + rm -f $@ + $(AR) rc $@ $(OBJ) + +utftest: utftest.o libutf.a + $(CC) $(LDFLAGS) -o $@ utftest.o .c.o: $(CC) $(CFLAGS) -c $< +runetypebody.h: mkrunetype.awk $(UCD) + $(AWK) -f mkrunetype.awk $(UCD) > $@ + +runetype.o: runetype.c runetypebody.h + clean: - rm -f utftest $(OBJ) + rm -f libutf.a utftest utftest.o $(OBJ) + +purge: $(UCD) clean + rm -f runetypebody.h diff --git a/config.mk b/config.mk @@ -1,6 +1,7 @@ # See LICENSE file for copyright and license details. CFLAGS = -ansi -pedantic -Os -Wall -Wextra -LDFLAGS = -s +LDFLAGS = -L. -lutf -CC = cc +CC = cc +AWK = awk diff --git a/mkrunetype.awk b/mkrunetype.awk @@ -0,0 +1,75 @@ +# See LICENSE file for copyright and license details. + +BEGIN { + FS = ";" + # setup hexadecimal lookup table + for(i = 0; i < 16; i++) + hex[sprintf("%X",i)] = i; +} + +$3 ~ /^L/ { alphav[alphac++] = $1; } +$3 ~ /^Z/ { spacev[spacec++] = $1; } +$3 == "Lu" { upperv[upperc++] = $1; } +$3 == "Ll" { lowerv[lowerc++] = $1; } +$3 == "Lt" { titlev[titlec++] = $1; } +$3 == "Nd" { digitv[digitc++] = $1; } + +END { + print "/* Automatically generated from "FILENAME" by mkrunetype.awk */\n" + + mkis("alpha", alphav, alphac); + mkis("space", spacev, spacec); + mkis("upper", upperv, upperc); + mkis("lower", lowerv, lowerc); + mkis("title", titlev, titlec); + mkis("digit", digitv, digitc); +} + +function code(s) { + x = 0; + for(i = 1; i <= length(s); i++) { + c = substr(s, i, 1); + x = (x*16) + hex[c]; + } + return x; +} + +function mkis(name, runev, runec) { + rune1c = 0; + rune2c = 0; + + for(j = k = 0; j < runec; j++) { + if(j+1 == runec || code(runev[j+1]) != code(runev[j])+1) { + if(j == k) { + rune1v[rune1c] = runev[j]; + rune1c++; + } + else { + rune2v0[rune2c] = runev[k]; + rune2v1[rune2c] = runev[j]; + rune2c++; + } + k = j+1; + } + } + if(rune2c > 0) { + print "static Rune "name"2[][2] = {"; + for(j = 0; j < rune2c; j++) { + print "\t{ 0x"rune2v0[j]", 0x"rune2v1[j]" },"; + } + print "};\n"; + } + if(rune1c > 0) { + print "static Rune "name"1[] = {"; + for(j = 0; j < rune1c; j++) { + print "\t0x"rune1v[j]","; + } + print "};\n"; + } + print "int\nis"name"rune(Rune r)\n{"; + if(rune2c > 0) + print "\tif(bsearch(&r, "name"2, nelem("name"2), sizeof *"name"2, &rune2cmp))\n\t\treturn 1;"; + if(rune1c > 0) + print "\tif(bsearch(&r, "name"1, nelem("name"1), sizeof *"name"1, &rune1cmp))\n\t\treturn 1;"; + print "\treturn 0;\n}\n"; +} diff --git a/rune.c b/rune.c @@ -0,0 +1,296 @@ +/* See LICENSE file for copyright and license details. */ +#include <string.h> +#include "utf.h" + +#define MIN(x,y) ((x) < (y) ? (x) : (y)) + +#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \ + : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \ + : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \ + : (((x) & 0xF0) == 0xE0) ? 3 /* 1110xxxx */ \ + : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \ + : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \ + : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \ + : 0 ) + +/* + * runetochar copies one rune at p to at most UTFmax bytes starting at s and + * returns the number of bytes copied. UTFmax is the maximum number of bytes + * required to represent a legal rune. + * + * If the rune is illegal, runetochar will return 0. + */ +int +runetochar(char *s, Rune *p) +{ + Rune r = *p; + + switch(runelen(r)) { + case 1: /* 0aaaaaaa */ + s[0] = r; + return 1; + case 2: /* 00000aaa aabbbbbb */ + s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ + s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ + return 2; + case 3: /* aaaabbbb bbcccccc */ + s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ + s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ + s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ + return 3; + case 4: /* 000aaabb bbbbcccc ccdddddd */ + s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ + s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ + s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ + s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ + return 4; + default: + return 0; /* error */ + } +} + +/* + * chartorune copies at most UTFmax bytes starting at s to one rune at p and + * returns the number of bytes copied. If the input is not valid UTF-8, + * chartorune will convert the sequence to Runeerror (0xFFFD), returning the + * number of bytes in the invalid sequence. + */ +int +chartorune(Rune *p, const char *s) +{ + return charntorune(p, s, UTFmax); +} + +/* + * charntorune copies at most len bytes starting at s to one rune at p and + * returns the number of bytes copied. If the input is not valid UTF-8, + * charntorune will convert the sequence to Runeerror (0xFFFD), returning the + * number of bytes in the invalid sequence. + * + * If a potentially valid sequence is cut off by the len limit, charntorune will + * return 0. + */ +int +charntorune(Rune *p, const char *s, size_t len) +{ + unsigned int i, n; + Rune r; + + if(len == 0) /* can't even look at s[0] */ + return 0; + + switch((n = UTFSEQ(s[0]))) { + case 1: r = s[0]; break; /* 0xxxxxxx */ + case 2: r = s[0] & 0x1F; break; /* 110xxxxx */ + case 3: r = s[0] & 0x0F; break; /* 1110xxxx */ + case 4: r = s[0] & 0x07; break; /* 11110xxx */ + case 5: r = s[0] & 0x03; break; /* 111110xx */ + case 6: r = s[0] & 0x01; break; /* 1111110x */ + default: /* invalid sequence */ + *p = Runeerror; + return 1; + } + /* add values from continuation bytes */ + for(i = 1; i < MIN(n, len); i++) + if((s[i] & 0xC0) != 0x80) { + /* expected continuation */ + *p = Runeerror; + return i; + } + else + r = (r << 6) | (s[i] & 0x3F); + + if(i < n) /* must have reached len limit */ + return 0; + + /* reject invalid runes and overlong sequences */ + if(n > UTFmax || r > 0x10FFFF || runelen(r) < (int)n || (r & 0xFFFE) == 0xFFFE + || (r >= 0xD800 && r <= 0xDFFF) || (r >= 0xFDD0 && r <= 0xFDEF)) + r = Runeerror; + + *p = r; + return n; +} + +/* + * runelen returns the number of bytes required to convert r into UTF-8. If the + * rune is illegal, runelen will return 0. + */ +int +runelen(Rune r) +{ + if(r <= 0x7F) + return 1; + else if(r <= 0x07FF) + return 2; + else if(r <= 0xFFFF) + return 3; + else if(r <= Runemax) + return 4; + else + return 0; /* error */ +} + +/* + * runelen returns the number of bytes required to convert the rune-string of + * length len pointed to by p into UTF-8. + */ +size_t +runenlen(Rune *p, size_t len) +{ + size_t i, n = 0; + + for(i = 0; i < len; i++) + n += runelen(p[i]); + return n; +} + +/* + * fullrune returns 1 if the string s of length len is long enough to be + * decoded by chartorune, and 0 otherwise. + */ +int +fullrune(const char *s, size_t len) +{ + Rune r; + + return charntorune(&r, s, len) > 0; +} + +/* + * utfecpy copies UTF-8 sequences until a null sequence has been copied, but + * writes no sequences beyond end. If any sequences are copied, the to string is + * terminated by a null sequence, and a pointer to that sequence is returned. + * Otherwise, the original to string is returned. + */ +char * +utfecpy(char *to, char *end, const char *from) +{ + Rune r = Runeerror; + size_t i, n; + + /* seek through to find final full rune */ + for(i = 0; r != '\0' && (n = charntorune(&r, &from[i], end - &to[i])); i += n) + ; + memcpy(to, from, i); /* copy over bytes up to this rune */ + + if(i > 0 && r != '\0') + to[i] = '\0'; /* terminate if unterminated */ + return &to[i]; +} + +/* + * utflen returns the number of runes that are represented by the UTF-8 string + * s. + */ +size_t +utflen(const char *s) +{ + const char *p = s; + size_t i; + Rune r; + + for(i = 0; *p != '\0'; i++) + p += chartorune(&r, p); + return i; +} + +/* + * utfnlen returns the number of runes that are represented by the UTF-8 string + * s of length len. If the last few bytes contain an incompletely coded rune, + * utfnlen will not count them; in this way it differs from utflen, which + * includes every byte of the string. + */ +size_t +utfnlen(const char *s, size_t len) +{ + const char *p = s; + size_t i; + Rune r; + int n; + + for(i = 0; (n = charntorune(&r, p, len-(p-s))) && r != '\0'; i++) + p += n; + return i; +} + +/* + * utfrune returns a pointer to the first ocurrence of r in the UTF-8 string s, + * or NULL if r does not occur in s. The null byte terminating a string is + * considered to be part of the string s. + */ +char * +utfrune(const char *s, Rune r) +{ + if(r < Runeself) { + return strchr(s, r); + } + else if(r == Runeerror) { + Rune r0; + int n; + + for(; *s != '\0'; s += n) { + n = chartorune(&r0, s); + if(r == r0) + return (char *)s; + } + } + else { + char buf[UTFmax+1]; + int n; + + if(!(n = runetochar(buf, &r))) + return NULL; + buf[n] = '\0'; + return strstr(s, buf); + } + return NULL; +} + +/* + * utfrrune returns a pointer to the last ocurrence of r in the UTF-8 string s, + * or NULL if r does not occur in s. The null byte terminating a string is + * considered to be part of the string s. + */ +char * +utfrrune(const char *s, Rune r) +{ + const char *p = NULL; + Rune r0; + int n; + + if(r < Runeself) + return strrchr(s, r); + + for(; *s != '\0'; s += n) { + n = chartorune(&r0, s); + if(r == r0) + p = s; + } + return (char *)p; +} + +/* + * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a + * UTF-8 substring of s, or NULL if there is none. If t is the null string, + * utfutf returns s. + */ +char * +utfutf(const char *s, const char *t) +{ + const char *p, *q; + Rune r0, r1, r2; + int n, m; + + for(chartorune(&r0, t); (s = utfrune(s, r0)); s++) { + for(p = s, q = t; *q && *p; p += n, q += m) { + n = chartorune(&r1, p); + m = chartorune(&r2, q); + if(r1 != r2) + break; + } + if(!*q) + return (char *)s; + } + return NULL; +} diff --git a/runetype.c b/runetype.c @@ -0,0 +1,28 @@ +#include <stdlib.h> +#include "utf.h" + +#define nelem(x) (sizeof (x) / sizeof *(x)) + +static int rune1cmp(const void *, const void *); +static int rune2cmp(const void *, const void *); + +#include "runetypebody.h" + +int +rune1cmp(const void *v1, const void *v2) +{ + Rune r1 = *(Rune *)v1, r2 = *(Rune *)v2; + + return r1 - r2; +} + +int +rune2cmp(const void *v1, const void *v2) +{ + Rune r = *(Rune *)v1, *p = (Rune *)v2; + + if(r >= p[0] && r <= p[1]) + return 0; + else + return r - p[0]; +} diff --git a/runetypebody.h b/runetypebody.h @@ -0,0 +1,1865 @@ +/* Automatically generated from UnicodeData-6.1.0.txt by mkrunetype.awk */ + +static Rune alpha2[][2] = { + { 0x0041, 0x005A }, + { 0x0061, 0x007A }, + { 0x00C0, 0x00D6 }, + { 0x00D8, 0x00F6 }, + { 0x00F8, 0x02C1 }, + { 0x02C6, 0x02D1 }, + { 0x02E0, 0x02E4 }, + { 0x0370, 0x0374 }, + { 0x0376, 0x0377 }, + { 0x037A, 0x037D }, + { 0x0388, 0x038A }, + { 0x038E, 0x03A1 }, + { 0x03A3, 0x03F5 }, + { 0x03F7, 0x0481 }, + { 0x048A, 0x0527 }, + { 0x0531, 0x0556 }, + { 0x0561, 0x0587 }, + { 0x05D0, 0x05EA }, + { 0x05F0, 0x05F2 }, + { 0x0620, 0x064A }, + { 0x066E, 0x066F }, + { 0x0671, 0x06D3 }, + { 0x06E5, 0x06E6 }, + { 0x06EE, 0x06EF }, + { 0x06FA, 0x06FC }, + { 0x0712, 0x072F }, + { 0x074D, 0x07A5 }, + { 0x07CA, 0x07EA }, + { 0x07F4, 0x07F5 }, + { 0x0800, 0x0815 }, + { 0x0840, 0x0858 }, + { 0x08A2, 0x08AC }, + { 0x0904, 0x0939 }, + { 0x0958, 0x0961 }, + { 0x0971, 0x0977 }, + { 0x0979, 0x097F }, + { 0x0985, 0x098C }, + { 0x098F, 0x0990 }, + { 0x0993, 0x09A8 }, + { 0x09AA, 0x09B0 }, + { 0x09B6, 0x09B9 }, + { 0x09DC, 0x09DD }, + { 0x09DF, 0x09E1 }, + { 0x09F0, 0x09F1 }, + { 0x0A05, 0x0A0A }, + { 0x0A0F, 0x0A10 }, + { 0x0A13, 0x0A28 }, + { 0x0A2A, 0x0A30 }, + { 0x0A32, 0x0A33 }, + { 0x0A35, 0x0A36 }, + { 0x0A38, 0x0A39 }, + { 0x0A59, 0x0A5C }, + { 0x0A72, 0x0A74 }, + { 0x0A85, 0x0A8D }, + { 0x0A8F, 0x0A91 }, + { 0x0A93, 0x0AA8 }, + { 0x0AAA, 0x0AB0 }, + { 0x0AB2, 0x0AB3 }, + { 0x0AB5, 0x0AB9 }, + { 0x0AE0, 0x0AE1 }, + { 0x0B05, 0x0B0C }, + { 0x0B0F, 0x0B10 }, + { 0x0B13, 0x0B28 }, + { 0x0B2A, 0x0B30 }, + { 0x0B32, 0x0B33 }, + { 0x0B35, 0x0B39 }, + { 0x0B5C, 0x0B5D }, + { 0x0B5F, 0x0B61 }, + { 0x0B85, 0x0B8A }, + { 0x0B8E, 0x0B90 }, + { 0x0B92, 0x0B95 }, + { 0x0B99, 0x0B9A }, + { 0x0B9E, 0x0B9F }, + { 0x0BA3, 0x0BA4 }, + { 0x0BA8, 0x0BAA }, + { 0x0BAE, 0x0BB9 }, + { 0x0C05, 0x0C0C }, + { 0x0C0E, 0x0C10 }, + { 0x0C12, 0x0C28 }, + { 0x0C2A, 0x0C33 }, + { 0x0C35, 0x0C39 }, + { 0x0C58, 0x0C59 }, + { 0x0C60, 0x0C61 }, + { 0x0C85, 0x0C8C }, + { 0x0C8E, 0x0C90 }, + { 0x0C92, 0x0CA8 }, + { 0x0CAA, 0x0CB3 }, + { 0x0CB5, 0x0CB9 }, + { 0x0CE0, 0x0CE1 }, + { 0x0CF1, 0x0CF2 }, + { 0x0D05, 0x0D0C }, + { 0x0D0E, 0x0D10 }, + { 0x0D12, 0x0D3A }, + { 0x0D60, 0x0D61 }, + { 0x0D7A, 0x0D7F }, + { 0x0D85, 0x0D96 }, + { 0x0D9A, 0x0DB1 }, + { 0x0DB3, 0x0DBB }, + { 0x0DC0, 0x0DC6 }, + { 0x0E01, 0x0E30 }, + { 0x0E32, 0x0E33 }, + { 0x0E40, 0x0E46 }, + { 0x0E81, 0x0E82 }, + { 0x0E87, 0x0E88 }, + { 0x0E94, 0x0E97 }, + { 0x0E99, 0x0E9F }, + { 0x0EA1, 0x0EA3 }, + { 0x0EAA, 0x0EAB }, + { 0x0EAD, 0x0EB0 }, + { 0x0EB2, 0x0EB3 }, + { 0x0EC0, 0x0EC4 }, + { 0x0EDC, 0x0EDF }, + { 0x0F40, 0x0F47 }, + { 0x0F49, 0x0F6C }, + { 0x0F88, 0x0F8C }, + { 0x1000, 0x102A }, + { 0x1050, 0x1055 }, + { 0x105A, 0x105D }, + { 0x1065, 0x1066 }, + { 0x106E, 0x1070 }, + { 0x1075, 0x1081 }, + { 0x10A0, 0x10C5 }, + { 0x10D0, 0x10FA }, + { 0x10FC, 0x1248 }, + { 0x124A, 0x124D }, + { 0x1250, 0x1256 }, + { 0x125A, 0x125D }, + { 0x1260, 0x1288 }, + { 0x128A, 0x128D }, + { 0x1290, 0x12B0 }, + { 0x12B2, 0x12B5 }, + { 0x12B8, 0x12BE }, + { 0x12C2, 0x12C5 }, + { 0x12C8, 0x12D6 }, + { 0x12D8, 0x1310 }, + { 0x1312, 0x1315 }, + { 0x1318, 0x135A }, + { 0x1380, 0x138F }, + { 0x13A0, 0x13F4 }, + { 0x1401, 0x166C }, + { 0x166F, 0x167F }, + { 0x1681, 0x169A }, + { 0x16A0, 0x16EA }, + { 0x1700, 0x170C }, + { 0x170E, 0x1711 }, + { 0x1720, 0x1731 }, + { 0x1740, 0x1751 }, + { 0x1760, 0x176C }, + { 0x176E, 0x1770 }, + { 0x1780, 0x17B3 }, + { 0x1820, 0x1877 }, + { 0x1880, 0x18A8 }, + { 0x18B0, 0x18F5 }, + { 0x1900, 0x191C }, + { 0x1950, 0x196D }, + { 0x1970, 0x1974 }, + { 0x1980, 0x19AB }, + { 0x19C1, 0x19C7 }, + { 0x1A00, 0x1A16 }, + { 0x1A20, 0x1A54 }, + { 0x1B05, 0x1B33 }, + { 0x1B45, 0x1B4B }, + { 0x1B83, 0x1BA0 }, + { 0x1BAE, 0x1BAF }, + { 0x1BBA, 0x1BE5 }, + { 0x1C00, 0x1C23 }, + { 0x1C4D, 0x1C4F }, + { 0x1C5A, 0x1C7D }, + { 0x1CE9, 0x1CEC }, + { 0x1CEE, 0x1CF1 }, + { 0x1CF5, 0x1CF6 }, + { 0x1D00, 0x1DBF }, + { 0x1E00, 0x1F15 }, + { 0x1F18, 0x1F1D }, + { 0x1F20, 0x1F45 }, + { 0x1F48, 0x1F4D }, + { 0x1F50, 0x1F57 }, + { 0x1F5F, 0x1F7D }, + { 0x1F80, 0x1FB4 }, + { 0x1FB6, 0x1FBC }, + { 0x1FC2, 0x1FC4 }, + { 0x1FC6, 0x1FCC }, + { 0x1FD0, 0x1FD3 }, + { 0x1FD6, 0x1FDB }, + { 0x1FE0, 0x1FEC }, + { 0x1FF2, 0x1FF4 }, + { 0x1FF6, 0x1FFC }, + { 0x2090, 0x209C }, + { 0x210A, 0x2113 }, + { 0x2119, 0x211D }, + { 0x212A, 0x212D }, + { 0x212F, 0x2139 }, + { 0x213C, 0x213F }, + { 0x2145, 0x2149 }, + { 0x2183, 0x2184 }, + { 0x2C00, 0x2C2E }, + { 0x2C30, 0x2C5E }, + { 0x2C60, 0x2CE4 }, + { 0x2CEB, 0x2CEE }, + { 0x2CF2, 0x2CF3 }, + { 0x2D00, 0x2D25 }, + { 0x2D30, 0x2D67 }, + { 0x2D80, 0x2D96 }, + { 0x2DA0, 0x2DA6 }, + { 0x2DA8, 0x2DAE }, + { 0x2DB0, 0x2DB6 }, + { 0x2DB8, 0x2DBE }, + { 0x2DC0, 0x2DC6 }, + { 0x2DC8, 0x2DCE }, + { 0x2DD0, 0x2DD6 }, + { 0x2DD8, 0x2DDE }, + { 0x3005, 0x3006 }, + { 0x3031, 0x3035 }, + { 0x303B, 0x303C }, + { 0x3041, 0x3096 }, + { 0x309D, 0x309F }, + { 0x30A1, 0x30FA }, + { 0x30FC, 0x30FF }, + { 0x3105, 0x312D }, + { 0x3131, 0x318E }, + { 0x31A0, 0x31BA }, + { 0x31F0, 0x31FF }, + { 0xA000, 0xA48C }, + { 0xA4D0, 0xA4FD }, + { 0xA500, 0xA60C }, + { 0xA610, 0xA61F }, + { 0xA62A, 0xA62B }, + { 0xA640, 0xA66E }, + { 0xA67F, 0xA697 }, + { 0xA6A0, 0xA6E5 }, + { 0xA717, 0xA71F }, + { 0xA722, 0xA788 }, + { 0xA78B, 0xA78E }, + { 0xA790, 0xA793 }, + { 0xA7A0, 0xA7AA }, + { 0xA7F8, 0xA801 }, + { 0xA803, 0xA805 }, + { 0xA807, 0xA80A }, + { 0xA80C, 0xA822 }, + { 0xA840, 0xA873 }, + { 0xA882, 0xA8B3 }, + { 0xA8F2, 0xA8F7 }, + { 0xA90A, 0xA925 }, + { 0xA930, 0xA946 }, + { 0xA960, 0xA97C }, + { 0xA984, 0xA9B2 }, + { 0xAA00, 0xAA28 }, + { 0xAA40, 0xAA42 }, + { 0xAA44, 0xAA4B }, + { 0xAA60, 0xAA76 }, + { 0xAA80, 0xAAAF }, + { 0xAAB5, 0xAAB6 }, + { 0xAAB9, 0xAABD }, + { 0xAADB, 0xAADD }, + { 0xAAE0, 0xAAEA }, + { 0xAAF2, 0xAAF4 }, + { 0xAB01, 0xAB06 }, + { 0xAB09, 0xAB0E }, + { 0xAB11, 0xAB16 }, + { 0xAB20, 0xAB26 }, + { 0xAB28, 0xAB2E }, + { 0xABC0, 0xABE2 }, + { 0xD7B0, 0xD7C6 }, + { 0xD7CB, 0xD7FB }, + { 0xF900, 0xFA6D }, + { 0xFA70, 0xFAD9 }, + { 0xFB00, 0xFB06 }, + { 0xFB13, 0xFB17 }, + { 0xFB1F, 0xFB28 }, + { 0xFB2A, 0xFB36 }, + { 0xFB38, 0xFB3C }, + { 0xFB40, 0xFB41 }, + { 0xFB43, 0xFB44 }, + { 0xFB46, 0xFBB1 }, + { 0xFBD3, 0xFD3D }, + { 0xFD50, 0xFD8F }, + { 0xFD92, 0xFDC7 }, + { 0xFDF0, 0xFDFB }, + { 0xFE70, 0xFE74 }, + { 0xFE76, 0xFEFC }, + { 0xFF21, 0xFF3A }, + { 0xFF41, 0xFF5A }, + { 0xFF66, 0xFFBE }, + { 0xFFC2, 0xFFC7 }, + { 0xFFCA, 0xFFCF }, + { 0xFFD2, 0xFFD7 }, + { 0xFFDA, 0xFFDC }, + { 0x10000, 0x1000B }, + { 0x1000D, 0x10026 }, + { 0x10028, 0x1003A }, + { 0x1003C, 0x1003D }, + { 0x1003F, 0x1004D }, + { 0x10050, 0x1005D }, + { 0x10080, 0x100FA }, + { 0x10280, 0x1029C }, + { 0x102A0, 0x102D0 }, + { 0x10300, 0x1031E }, + { 0x10330, 0x10340 }, + { 0x10342, 0x10349 }, + { 0x10380, 0x1039D }, + { 0x103A0, 0x103C3 }, + { 0x103C8, 0x103CF }, + { 0x10400, 0x1049D }, + { 0x10800, 0x10805 }, + { 0x1080A, 0x10835 }, + { 0x10837, 0x10838 }, + { 0x1083F, 0x10855 }, + { 0x10900, 0x10915 }, + { 0x10920, 0x10939 }, + { 0x10980, 0x109B7 }, + { 0x109BE, 0x109BF }, + { 0x10A10, 0x10A13 }, + { 0x10A15, 0x10A17 }, + { 0x10A19, 0x10A33 }, + { 0x10A60, 0x10A7C }, + { 0x10B00, 0x10B35 }, + { 0x10B40, 0x10B55 }, + { 0x10B60, 0x10B72 }, + { 0x10C00, 0x10C48 }, + { 0x11003, 0x11037 }, + { 0x11083, 0x110AF }, + { 0x110D0, 0x110E8 }, + { 0x11103, 0x11126 }, + { 0x11183, 0x111B2 }, + { 0x111C1, 0x111C4 }, + { 0x11680, 0x116AA }, + { 0x12000, 0x1236E }, + { 0x13000, 0x1342E }, + { 0x16800, 0x16A38 }, + { 0x16F00, 0x16F44 }, + { 0x16F93, 0x16F9F }, + { 0x1B000, 0x1B001 }, + { 0x1D400, 0x1D454 }, + { 0x1D456, 0x1D49C }, + { 0x1D49E, 0x1D49F }, + { 0x1D4A5, 0x1D4A6 }, + { 0x1D4A9, 0x1D4AC }, + { 0x1D4AE, 0x1D4B9 }, + { 0x1D4BD, 0x1D4C3 }, + { 0x1D4C5, 0x1D505 }, + { 0x1D507, 0x1D50A }, + { 0x1D50D, 0x1D514 }, + { 0x1D516, 0x1D51C }, + { 0x1D51E, 0x1D539 }, + { 0x1D53B, 0x1D53E }, + { 0x1D540, 0x1D544 }, + { 0x1D54A, 0x1D550 }, + { 0x1D552, 0x1D6A5 }, + { 0x1D6A8, 0x1D6C0 }, + { 0x1D6C2, 0x1D6DA }, + { 0x1D6DC, 0x1D6FA }, + { 0x1D6FC, 0x1D714 }, + { 0x1D716, 0x1D734 }, + { 0x1D736, 0x1D74E }, + { 0x1D750, 0x1D76E }, + { 0x1D770, 0x1D788 }, + { 0x1D78A, 0x1D7A8 }, + { 0x1D7AA, 0x1D7C2 }, + { 0x1D7C4, 0x1D7CB }, + { 0x1EE00, 0x1EE03 }, + { 0x1EE05, 0x1EE1F }, + { 0x1EE21, 0x1EE22 }, + { 0x1EE29, 0x1EE32 }, + { 0x1EE34, 0x1EE37 }, + { 0x1EE4D, 0x1EE4F }, + { 0x1EE51, 0x1EE52 }, + { 0x1EE61, 0x1EE62 }, + { 0x1EE67, 0x1EE6A }, + { 0x1EE6C, 0x1EE72 }, + { 0x1EE74, 0x1EE77 }, + { 0x1EE79, 0x1EE7C }, + { 0x1EE80, 0x1EE89 }, + { 0x1EE8B, 0x1EE9B }, + { 0x1EEA1, 0x1EEA3 }, + { 0x1EEA5, 0x1EEA9 }, + { 0x1EEAB, 0x1EEBB }, + { 0x2F800, 0x2FA1D }, +}; + +static Rune alpha1[] = { + 0x00AA, + 0x00B5, + 0x00BA, + 0x02EC, + 0x02EE, + 0x0386, + 0x038C, + 0x0559, + 0x06D5, + 0x06FF, + 0x0710, + 0x07B1, + 0x07FA, + 0x081A, + 0x0824, + 0x0828, + 0x08A0, + 0x093D, + 0x0950, + 0x09B2, + 0x09BD, + 0x09CE, + 0x0A5E, + 0x0ABD, + 0x0AD0, + 0x0B3D, + 0x0B71, + 0x0B83, + 0x0B9C, + 0x0BD0, + 0x0C3D, + 0x0CBD, + 0x0CDE, + 0x0D3D, + 0x0D4E, + 0x0DBD, + 0x0E84, + 0x0E8A, + 0x0E8D, + 0x0EA5, + 0x0EA7, + 0x0EBD, + 0x0EC6, + 0x0F00, + 0x103F, + 0x1061, + 0x108E, + 0x10C7, + 0x10CD, + 0x1258, + 0x12C0, + 0x17D7, + 0x17DC, + 0x18AA, + 0x1AA7, + 0x1F59, + 0x1F5B, + 0x1F5D, + 0x1FBE, + 0x2071, + 0x207F, + 0x2102, + 0x2107, + 0x2115, + 0x2124, + 0x2126, + 0x2128, + 0x214E, + 0x2D27, + 0x2D2D, + 0x2D6F, + 0x2E2F, + 0x3400, + 0x4DB5, + 0x4E00, + 0x9FCC, + 0xA8FB, + 0xA9CF, + 0xAA7A, + 0xAAB1, + 0xAAC0, + 0xAAC2, + 0xAC00, + 0xD7A3, + 0xFB1D, + 0xFB3E, + 0x10808, + 0x1083C, + 0x10A00, + 0x16F50, + 0x1D4A2, + 0x1D4BB, + 0x1D546, + 0x1EE24, + 0x1EE27, + 0x1EE39, + 0x1EE3B, + 0x1EE42, + 0x1EE47, + 0x1EE49, + 0x1EE4B, + 0x1EE54, + 0x1EE57, + 0x1EE59, + 0x1EE5B, + 0x1EE5D, + 0x1EE5F, + 0x1EE64, + 0x1EE7E, + 0x20000, + 0x2A6D6, + 0x2A700, + 0x2B734, + 0x2B740, + 0x2B81D, +}; + +int +isalpharune(Rune r) +{ + if(bsearch(&r, alpha2, nelem(alpha2), sizeof *alpha2, &rune2cmp)) + return 1; + if(bsearch(&r, alpha1, nelem(alpha1), sizeof *alpha1, &rune1cmp)) + return 1; + return 0; +} + +static Rune space2[][2] = { + { 0x2000, 0x200A }, + { 0x2028, 0x2029 }, +}; + +static Rune space1[] = { + 0x0020, + 0x00A0, + 0x1680, + 0x180E, + 0x202F, + 0x205F, + 0x3000, +}; + +int +isspacerune(Rune r) +{ + if(bsearch(&r, space2, nelem(space2), sizeof *space2, &rune2cmp)) + return 1; + if(bsearch(&r, space1, nelem(space1), sizeof *space1, &rune1cmp)) + return 1; + return 0; +} + +static Rune upper2[][2] = { + { 0x0041, 0x005A }, + { 0x00C0, 0x00D6 }, + { 0x00D8, 0x00DE }, + { 0x0178, 0x0179 }, + { 0x0181, 0x0182 }, + { 0x0186, 0x0187 }, + { 0x0189, 0x018B }, + { 0x018E, 0x0191 }, + { 0x0193, 0x0194 }, + { 0x0196, 0x0198 }, + { 0x019C, 0x019D }, + { 0x019F, 0x01A0 }, + { 0x01A6, 0x01A7 }, + { 0x01AE, 0x01AF }, + { 0x01B1, 0x01B3 }, + { 0x01B7, 0x01B8 }, + { 0x01F6, 0x01F8 }, + { 0x023A, 0x023B }, + { 0x023D, 0x023E }, + { 0x0243, 0x0246 }, + { 0x0388, 0x038A }, + { 0x038E, 0x038F }, + { 0x0391, 0x03A1 }, + { 0x03A3, 0x03AB }, + { 0x03D2, 0x03D4 }, + { 0x03F9, 0x03FA }, + { 0x03FD, 0x042F }, + { 0x04C0, 0x04C1 }, + { 0x0531, 0x0556 }, + { 0x10A0, 0x10C5 }, + { 0x1F08, 0x1F0F }, + { 0x1F18, 0x1F1D }, + { 0x1F28, 0x1F2F }, + { 0x1F38, 0x1F3F }, + { 0x1F48, 0x1F4D }, + { 0x1F68, 0x1F6F }, + { 0x1FB8, 0x1FBB }, + { 0x1FC8, 0x1FCB }, + { 0x1FD8, 0x1FDB }, + { 0x1FE8, 0x1FEC }, + { 0x1FF8, 0x1FFB }, + { 0x210B, 0x210D }, + { 0x2110, 0x2112 }, + { 0x2119, 0x211D }, + { 0x212A, 0x212D }, + { 0x2130, 0x2133 }, + { 0x213E, 0x213F }, + { 0x2C00, 0x2C2E }, + { 0x2C62, 0x2C64 }, + { 0x2C6D, 0x2C70 }, + { 0x2C7E, 0x2C80 }, + { 0xA77D, 0xA77E }, + { 0xFF21, 0xFF3A }, + { 0x10400, 0x10427 }, + { 0x1D400, 0x1D419 }, + { 0x1D434, 0x1D44D }, + { 0x1D468, 0x1D481 }, + { 0x1D49E, 0x1D49F }, + { 0x1D4A5, 0x1D4A6 }, + { 0x1D4A9, 0x1D4AC }, + { 0x1D4AE, 0x1D4B5 }, + { 0x1D4D0, 0x1D4E9 }, + { 0x1D504, 0x1D505 }, + { 0x1D507, 0x1D50A }, + { 0x1D50D, 0x1D514 }, + { 0x1D516, 0x1D51C }, + { 0x1D538, 0x1D539 }, + { 0x1D53B, 0x1D53E }, + { 0x1D540, 0x1D544 }, + { 0x1D54A, 0x1D550 }, + { 0x1D56C, 0x1D585 }, + { 0x1D5A0, 0x1D5B9 }, + { 0x1D5D4, 0x1D5ED }, + { 0x1D608, 0x1D621 }, + { 0x1D63C, 0x1D655 }, + { 0x1D670, 0x1D689 }, + { 0x1D6A8, 0x1D6C0 }, + { 0x1D6E2, 0x1D6FA }, + { 0x1D71C, 0x1D734 }, + { 0x1D756, 0x1D76E }, + { 0x1D790, 0x1D7A8 }, +}; + +static Rune upper1[] = { + 0x0100, + 0x0102, + 0x0104, + 0x0106, + 0x0108, + 0x010A, + 0x010C, + 0x010E, + 0x0110, + 0x0112, + 0x0114, + 0x0116, + 0x0118, + 0x011A, + 0x011C, + 0x011E, + 0x0120, + 0x0122, + 0x0124, + 0x0126, + 0x0128, + 0x012A, + 0x012C, + 0x012E, + 0x0130, + 0x0132, + 0x0134, + 0x0136, + 0x0139, + 0x013B, + 0x013D, + 0x013F, + 0x0141, + 0x0143, + 0x0145, + 0x0147, + 0x014A, + 0x014C, + 0x014E, + 0x0150, + 0x0152, + 0x0154, + 0x0156, + 0x0158, + 0x015A, + 0x015C, + 0x015E, + 0x0160, + 0x0162, + 0x0164, + 0x0166, + 0x0168, + 0x016A, + 0x016C, + 0x016E, + 0x0170, + 0x0172, + 0x0174, + 0x0176, + 0x017B, + 0x017D, + 0x0184, + 0x01A2, + 0x01A4, + 0x01A9, + 0x01AC, + 0x01B5, + 0x01BC, + 0x01C4, + 0x01C7, + 0x01CA, + 0x01CD, + 0x01CF, + 0x01D1, + 0x01D3, + 0x01D5, + 0x01D7, + 0x01D9, + 0x01DB, + 0x01DE, + 0x01E0, + 0x01E2, + 0x01E4, + 0x01E6, + 0x01E8, + 0x01EA, + 0x01EC, + 0x01EE, + 0x01F1, + 0x01F4, + 0x01FA, + 0x01FC, + 0x01FE, + 0x0200, + 0x0202, + 0x0204, + 0x0206, + 0x0208, + 0x020A, + 0x020C, + 0x020E, + 0x0210, + 0x0212, + 0x0214, + 0x0216, + 0x0218, + 0x021A, + 0x021C, + 0x021E, + 0x0220, + 0x0222, + 0x0224, + 0x0226, + 0x0228, + 0x022A, + 0x022C, + 0x022E, + 0x0230, + 0x0232, + 0x0241, + 0x0248, + 0x024A, + 0x024C, + 0x024E, + 0x0370, + 0x0372, + 0x0376, + 0x0386, + 0x038C, + 0x03CF, + 0x03D8, + 0x03DA, + 0x03DC, + 0x03DE, + 0x03E0, + 0x03E2, + 0x03E4, + 0x03E6, + 0x03E8, + 0x03EA, + 0x03EC, + 0x03EE, + 0x03F4, + 0x03F7, + 0x0460, + 0x0462, + 0x0464, + 0x0466, + 0x0468, + 0x046A, + 0x046C, + 0x046E, + 0x0470, + 0x0472, + 0x0474, + 0x0476, + 0x0478, + 0x047A, + 0x047C, + 0x047E, + 0x0480, + 0x048A, + 0x048C, + 0x048E, + 0x0490, + 0x0492, + 0x0494, + 0x0496, + 0x0498, + 0x049A, + 0x049C, + 0x049E, + 0x04A0, + 0x04A2, + 0x04A4, + 0x04A6, + 0x04A8, + 0x04AA, + 0x04AC, + 0x04AE, + 0x04B0, + 0x04B2, + 0x04B4, + 0x04B6, + 0x04B8, + 0x04BA, + 0x04BC, + 0x04BE, + 0x04C3, + 0x04C5, + 0x04C7, + 0x04C9, + 0x04CB, + 0x04CD, + 0x04D0, + 0x04D2, + 0x04D4, + 0x04D6, + 0x04D8, + 0x04DA, + 0x04DC, + 0x04DE, + 0x04E0, + 0x04E2, + 0x04E4, + 0x04E6, + 0x04E8, + 0x04EA, + 0x04EC, + 0x04EE, + 0x04F0, + 0x04F2, + 0x04F4, + 0x04F6, + 0x04F8, + 0x04FA, + 0x04FC, + 0x04FE, + 0x0500, + 0x0502, + 0x0504, + 0x0506, + 0x0508, + 0x050A, + 0x050C, + 0x050E, + 0x0510, + 0x0512, + 0x0514, + 0x0516, + 0x0518, + 0x051A, + 0x051C, + 0x051E, + 0x0520, + 0x0522, + 0x0524, + 0x0526, + 0x10C7, + 0x10CD, + 0x1E00, + 0x1E02, + 0x1E04, + 0x1E06, + 0x1E08, + 0x1E0A, + 0x1E0C, + 0x1E0E, + 0x1E10, + 0x1E12, + 0x1E14, + 0x1E16, + 0x1E18, + 0x1E1A, + 0x1E1C, + 0x1E1E, + 0x1E20, + 0x1E22, + 0x1E24, + 0x1E26, + 0x1E28, + 0x1E2A, + 0x1E2C, + 0x1E2E, + 0x1E30, + 0x1E32, + 0x1E34, + 0x1E36, + 0x1E38, + 0x1E3A, + 0x1E3C, + 0x1E3E, + 0x1E40, + 0x1E42, + 0x1E44, + 0x1E46, + 0x1E48, + 0x1E4A, + 0x1E4C, + 0x1E4E, + 0x1E50, + 0x1E52, + 0x1E54, + 0x1E56, + 0x1E58, + 0x1E5A, + 0x1E5C, + 0x1E5E, + 0x1E60, + 0x1E62, + 0x1E64, + 0x1E66, + 0x1E68, + 0x1E6A, + 0x1E6C, + 0x1E6E, + 0x1E70, + 0x1E72, + 0x1E74, + 0x1E76, + 0x1E78, + 0x1E7A, + 0x1E7C, + 0x1E7E, + 0x1E80, + 0x1E82, + 0x1E84, + 0x1E86, + 0x1E88, + 0x1E8A, + 0x1E8C, + 0x1E8E, + 0x1E90, + 0x1E92, + 0x1E94, + 0x1E9E, + 0x1EA0, + 0x1EA2, + 0x1EA4, + 0x1EA6, + 0x1EA8, + 0x1EAA, + 0x1EAC, + 0x1EAE, + 0x1EB0, + 0x1EB2, + 0x1EB4, + 0x1EB6, + 0x1EB8, + 0x1EBA, + 0x1EBC, + 0x1EBE, + 0x1EC0, + 0x1EC2, + 0x1EC4, + 0x1EC6, + 0x1EC8, + 0x1ECA, + 0x1ECC, + 0x1ECE, + 0x1ED0, + 0x1ED2, + 0x1ED4, + 0x1ED6, + 0x1ED8, + 0x1EDA, + 0x1EDC, + 0x1EDE, + 0x1EE0, + 0x1EE2, + 0x1EE4, + 0x1EE6, + 0x1EE8, + 0x1EEA, + 0x1EEC, + 0x1EEE, + 0x1EF0, + 0x1EF2, + 0x1EF4, + 0x1EF6, + 0x1EF8, + 0x1EFA, + 0x1EFC, + 0x1EFE, + 0x1F59, + 0x1F5B, + 0x1F5D, + 0x1F5F, + 0x2102, + 0x2107, + 0x2115, + 0x2124, + 0x2126, + 0x2128, + 0x2145, + 0x2183, + 0x2C60, + 0x2C67, + 0x2C69, + 0x2C6B, + 0x2C72, + 0x2C75, + 0x2C82, + 0x2C84, + 0x2C86, + 0x2C88, + 0x2C8A, + 0x2C8C, + 0x2C8E, + 0x2C90, + 0x2C92, + 0x2C94, + 0x2C96, + 0x2C98, + 0x2C9A, + 0x2C9C, + 0x2C9E, + 0x2CA0, + 0x2CA2, + 0x2CA4, + 0x2CA6, + 0x2CA8, + 0x2CAA, + 0x2CAC, + 0x2CAE, + 0x2CB0, + 0x2CB2, + 0x2CB4, + 0x2CB6, + 0x2CB8, + 0x2CBA, + 0x2CBC, + 0x2CBE, + 0x2CC0, + 0x2CC2, + 0x2CC4, + 0x2CC6, + 0x2CC8, + 0x2CCA, + 0x2CCC, + 0x2CCE, + 0x2CD0, + 0x2CD2, + 0x2CD4, + 0x2CD6, + 0x2CD8, + 0x2CDA, + 0x2CDC, + 0x2CDE, + 0x2CE0, + 0x2CE2, + 0x2CEB, + 0x2CED, + 0x2CF2, + 0xA640, + 0xA642, + 0xA644, + 0xA646, + 0xA648, + 0xA64A, + 0xA64C, + 0xA64E, + 0xA650, + 0xA652, + 0xA654, + 0xA656, + 0xA658, + 0xA65A, + 0xA65C, + 0xA65E, + 0xA660, + 0xA662, + 0xA664, + 0xA666, + 0xA668, + 0xA66A, + 0xA66C, + 0xA680, + 0xA682, + 0xA684, + 0xA686, + 0xA688, + 0xA68A, + 0xA68C, + 0xA68E, + 0xA690, + 0xA692, + 0xA694, + 0xA696, + 0xA722, + 0xA724, + 0xA726, + 0xA728, + 0xA72A, + 0xA72C, + 0xA72E, + 0xA732, + 0xA734, + 0xA736, + 0xA738, + 0xA73A, + 0xA73C, + 0xA73E, + 0xA740, + 0xA742, + 0xA744, + 0xA746, + 0xA748, + 0xA74A, + 0xA74C, + 0xA74E, + 0xA750, + 0xA752, + 0xA754, + 0xA756, + 0xA758, + 0xA75A, + 0xA75C, + 0xA75E, + 0xA760, + 0xA762, + 0xA764, + 0xA766, + 0xA768, + 0xA76A, + 0xA76C, + 0xA76E, + 0xA779, + 0xA77B, + 0xA780, + 0xA782, + 0xA784, + 0xA786, + 0xA78B, + 0xA78D, + 0xA790, + 0xA792, + 0xA7A0, + 0xA7A2, + 0xA7A4, + 0xA7A6, + 0xA7A8, + 0xA7AA, + 0x1D49C, + 0x1D4A2, + 0x1D546, + 0x1D7CA, +}; + +int +isupperrune(Rune r) +{ + if(bsearch(&r, upper2, nelem(upper2), sizeof *upper2, &rune2cmp)) + return 1; + if(bsearch(&r, upper1, nelem(upper1), sizeof *upper1, &rune1cmp)) + return 1; + return 0; +} + +static Rune lower2[][2] = { + { 0x0061, 0x007A }, + { 0x00DF, 0x00F6 }, + { 0x00F8, 0x00FF }, + { 0x0137, 0x0138 }, + { 0x0148, 0x0149 }, + { 0x017E, 0x0180 }, + { 0x018C, 0x018D }, + { 0x0199, 0x019B }, + { 0x01AA, 0x01AB }, + { 0x01B9, 0x01BA }, + { 0x01BD, 0x01BF }, + { 0x01DC, 0x01DD }, + { 0x01EF, 0x01F0 }, + { 0x0233, 0x0239 }, + { 0x023F, 0x0240 }, + { 0x024F, 0x0293 }, + { 0x0295, 0x02AF }, + { 0x037B, 0x037D }, + { 0x03AC, 0x03CE }, + { 0x03D0, 0x03D1 }, + { 0x03D5, 0x03D7 }, + { 0x03EF, 0x03F3 }, + { 0x03FB, 0x03FC }, + { 0x0430, 0x045F }, + { 0x04CE, 0x04CF }, + { 0x0561, 0x0587 }, + { 0x1D00, 0x1D2B }, + { 0x1D6B, 0x1D77 }, + { 0x1D79, 0x1D9A }, + { 0x1E95, 0x1E9D }, + { 0x1EFF, 0x1F07 }, + { 0x1F10, 0x1F15 }, + { 0x1F20, 0x1F27 }, + { 0x1F30, 0x1F37 }, + { 0x1F40, 0x1F45 }, + { 0x1F50, 0x1F57 }, + { 0x1F60, 0x1F67 }, + { 0x1F70, 0x1F7D }, + { 0x1F80, 0x1F87 }, + { 0x1F90, 0x1F97 }, + { 0x1FA0, 0x1FA7 }, + { 0x1FB0, 0x1FB4 }, + { 0x1FB6, 0x1FB7 }, + { 0x1FC2, 0x1FC4 }, + { 0x1FC6, 0x1FC7 }, + { 0x1FD0, 0x1FD3 }, + { 0x1FD6, 0x1FD7 }, + { 0x1FE0, 0x1FE7 }, + { 0x1FF2, 0x1FF4 }, + { 0x1FF6, 0x1FF7 }, + { 0x210E, 0x210F }, + { 0x213C, 0x213D }, + { 0x2146, 0x2149 }, + { 0x2C30, 0x2C5E }, + { 0x2C65, 0x2C66 }, + { 0x2C73, 0x2C74 }, + { 0x2C76, 0x2C7B }, + { 0x2CE3, 0x2CE4 }, + { 0x2D00, 0x2D25 }, + { 0xA72F, 0xA731 }, + { 0xA771, 0xA778 }, + { 0xFB00, 0xFB06 }, + { 0xFB13, 0xFB17 }, + { 0xFF41, 0xFF5A }, + { 0x10428, 0x1044F }, + { 0x1D41A, 0x1D433 }, + { 0x1D44E, 0x1D454 }, + { 0x1D456, 0x1D467 }, + { 0x1D482, 0x1D49B }, + { 0x1D4B6, 0x1D4B9 }, + { 0x1D4BD, 0x1D4C3 }, + { 0x1D4C5, 0x1D4CF }, + { 0x1D4EA, 0x1D503 }, + { 0x1D51E, 0x1D537 }, + { 0x1D552, 0x1D56B }, + { 0x1D586, 0x1D59F }, + { 0x1D5BA, 0x1D5D3 }, + { 0x1D5EE, 0x1D607 }, + { 0x1D622, 0x1D63B }, + { 0x1D656, 0x1D66F }, + { 0x1D68A, 0x1D6A5 }, + { 0x1D6C2, 0x1D6DA }, + { 0x1D6DC, 0x1D6E1 }, + { 0x1D6FC, 0x1D714 }, + { 0x1D716, 0x1D71B }, + { 0x1D736, 0x1D74E }, + { 0x1D750, 0x1D755 }, + { 0x1D770, 0x1D788 }, + { 0x1D78A, 0x1D78F }, + { 0x1D7AA, 0x1D7C2 }, + { 0x1D7C4, 0x1D7C9 }, +}; + +static Rune lower1[] = { + 0x00B5, + 0x0101, + 0x0103, + 0x0105, + 0x0107, + 0x0109, + 0x010B, + 0x010D, + 0x010F, + 0x0111, + 0x0113, + 0x0115, + 0x0117, + 0x0119, + 0x011B, + 0x011D, + 0x011F, + 0x0121, + 0x0123, + 0x0125, + 0x0127, + 0x0129, + 0x012B, + 0x012D, + 0x012F, + 0x0131, + 0x0133, + 0x0135, + 0x013A, + 0x013C, + 0x013E, + 0x0140, + 0x0142, + 0x0144, + 0x0146, + 0x014B, + 0x014D, + 0x014F, + 0x0151, + 0x0153, + 0x0155, + 0x0157, + 0x0159, + 0x015B, + 0x015D, + 0x015F, + 0x0161, + 0x0163, + 0x0165, + 0x0167, + 0x0169, + 0x016B, + 0x016D, + 0x016F, + 0x0171, + 0x0173, + 0x0175, + 0x0177, + 0x017A, + 0x017C, + 0x0183, + 0x0185, + 0x0188, + 0x0192, + 0x0195, + 0x019E, + 0x01A1, + 0x01A3, + 0x01A5, + 0x01A8, + 0x01AD, + 0x01B0, + 0x01B4, + 0x01B6, + 0x01C6, + 0x01C9, + 0x01CC, + 0x01CE, + 0x01D0, + 0x01D2, + 0x01D4, + 0x01D6, + 0x01D8, + 0x01DA, + 0x01DF, + 0x01E1, + 0x01E3, + 0x01E5, + 0x01E7, + 0x01E9, + 0x01EB, + 0x01ED, + 0x01F3, + 0x01F5, + 0x01F9, + 0x01FB, + 0x01FD, + 0x01FF, + 0x0201, + 0x0203, + 0x0205, + 0x0207, + 0x0209, + 0x020B, + 0x020D, + 0x020F, + 0x0211, + 0x0213, + 0x0215, + 0x0217, + 0x0219, + 0x021B, + 0x021D, + 0x021F, + 0x0221, + 0x0223, + 0x0225, + 0x0227, + 0x0229, + 0x022B, + 0x022D, + 0x022F, + 0x0231, + 0x023C, + 0x0242, + 0x0247, + 0x0249, + 0x024B, + 0x024D, + 0x0371, + 0x0373, + 0x0377, + 0x0390, + 0x03D9, + 0x03DB, + 0x03DD, + 0x03DF, + 0x03E1, + 0x03E3, + 0x03E5, + 0x03E7, + 0x03E9, + 0x03EB, + 0x03ED, + 0x03F5, + 0x03F8, + 0x0461, + 0x0463, + 0x0465, + 0x0467, + 0x0469, + 0x046B, + 0x046D, + 0x046F, + 0x0471, + 0x0473, + 0x0475, + 0x0477, + 0x0479, + 0x047B, + 0x047D, + 0x047F, + 0x0481, + 0x048B, + 0x048D, + 0x048F, + 0x0491, + 0x0493, + 0x0495, + 0x0497, + 0x0499, + 0x049B, + 0x049D, + 0x049F, + 0x04A1, + 0x04A3, + 0x04A5, + 0x04A7, + 0x04A9, + 0x04AB, + 0x04AD, + 0x04AF, + 0x04B1, + 0x04B3, + 0x04B5, + 0x04B7, + 0x04B9, + 0x04BB, + 0x04BD, + 0x04BF, + 0x04C2, + 0x04C4, + 0x04C6, + 0x04C8, + 0x04CA, + 0x04CC, + 0x04D1, + 0x04D3, + 0x04D5, + 0x04D7, + 0x04D9, + 0x04DB, + 0x04DD, + 0x04DF, + 0x04E1, + 0x04E3, + 0x04E5, + 0x04E7, + 0x04E9, + 0x04EB, + 0x04ED, + 0x04EF, + 0x04F1, + 0x04F3, + 0x04F5, + 0x04F7, + 0x04F9, + 0x04FB, + 0x04FD, + 0x04FF, + 0x0501, + 0x0503, + 0x0505, + 0x0507, + 0x0509, + 0x050B, + 0x050D, + 0x050F, + 0x0511, + 0x0513, + 0x0515, + 0x0517, + 0x0519, + 0x051B, + 0x051D, + 0x051F, + 0x0521, + 0x0523, + 0x0525, + 0x0527, + 0x1E01, + 0x1E03, + 0x1E05, + 0x1E07, + 0x1E09, + 0x1E0B, + 0x1E0D, + 0x1E0F, + 0x1E11, + 0x1E13, + 0x1E15, + 0x1E17, + 0x1E19, + 0x1E1B, + 0x1E1D, + 0x1E1F, + 0x1E21, + 0x1E23, + 0x1E25, + 0x1E27, + 0x1E29, + 0x1E2B, + 0x1E2D, + 0x1E2F, + 0x1E31, + 0x1E33, + 0x1E35, + 0x1E37, + 0x1E39, + 0x1E3B, + 0x1E3D, + 0x1E3F, + 0x1E41, + 0x1E43, + 0x1E45, + 0x1E47, + 0x1E49, + 0x1E4B, + 0x1E4D, + 0x1E4F, + 0x1E51, + 0x1E53, + 0x1E55, + 0x1E57, + 0x1E59, + 0x1E5B, + 0x1E5D, + 0x1E5F, + 0x1E61, + 0x1E63, + 0x1E65, + 0x1E67, + 0x1E69, + 0x1E6B, + 0x1E6D, + 0x1E6F, + 0x1E71, + 0x1E73, + 0x1E75, + 0x1E77, + 0x1E79, + 0x1E7B, + 0x1E7D, + 0x1E7F, + 0x1E81, + 0x1E83, + 0x1E85, + 0x1E87, + 0x1E89, + 0x1E8B, + 0x1E8D, + 0x1E8F, + 0x1E91, + 0x1E93, + 0x1E9F, + 0x1EA1, + 0x1EA3, + 0x1EA5, + 0x1EA7, + 0x1EA9, + 0x1EAB, + 0x1EAD, + 0x1EAF, + 0x1EB1, + 0x1EB3, + 0x1EB5, + 0x1EB7, + 0x1EB9, + 0x1EBB, + 0x1EBD, + 0x1EBF, + 0x1EC1, + 0x1EC3, + 0x1EC5, + 0x1EC7, + 0x1EC9, + 0x1ECB, + 0x1ECD, + 0x1ECF, + 0x1ED1, + 0x1ED3, + 0x1ED5, + 0x1ED7, + 0x1ED9, + 0x1EDB, + 0x1EDD, + 0x1EDF, + 0x1EE1, + 0x1EE3, + 0x1EE5, + 0x1EE7, + 0x1EE9, + 0x1EEB, + 0x1EED, + 0x1EEF, + 0x1EF1, + 0x1EF3, + 0x1EF5, + 0x1EF7, + 0x1EF9, + 0x1EFB, + 0x1EFD, + 0x1FBE, + 0x210A, + 0x2113, + 0x212F, + 0x2134, + 0x2139, + 0x214E, + 0x2184, + 0x2C61, + 0x2C68, + 0x2C6A, + 0x2C6C, + 0x2C71, + 0x2C81, + 0x2C83, + 0x2C85, + 0x2C87, + 0x2C89, + 0x2C8B, + 0x2C8D, + 0x2C8F, + 0x2C91, + 0x2C93, + 0x2C95, + 0x2C97, + 0x2C99, + 0x2C9B, + 0x2C9D, + 0x2C9F, + 0x2CA1, + 0x2CA3, + 0x2CA5, + 0x2CA7, + 0x2CA9, + 0x2CAB, + 0x2CAD, + 0x2CAF, + 0x2CB1, + 0x2CB3, + 0x2CB5, + 0x2CB7, + 0x2CB9, + 0x2CBB, + 0x2CBD, + 0x2CBF, + 0x2CC1, + 0x2CC3, + 0x2CC5, + 0x2CC7, + 0x2CC9, + 0x2CCB, + 0x2CCD, + 0x2CCF, + 0x2CD1, + 0x2CD3, + 0x2CD5, + 0x2CD7, + 0x2CD9, + 0x2CDB, + 0x2CDD, + 0x2CDF, + 0x2CE1, + 0x2CEC, + 0x2CEE, + 0x2CF3, + 0x2D27, + 0x2D2D, + 0xA641, + 0xA643, + 0xA645, + 0xA647, + 0xA649, + 0xA64B, + 0xA64D, + 0xA64F, + 0xA651, + 0xA653, + 0xA655, + 0xA657, + 0xA659, + 0xA65B, + 0xA65D, + 0xA65F, + 0xA661, + 0xA663, + 0xA665, + 0xA667, + 0xA669, + 0xA66B, + 0xA66D, + 0xA681, + 0xA683, + 0xA685, + 0xA687, + 0xA689, + 0xA68B, + 0xA68D, + 0xA68F, + 0xA691, + 0xA693, + 0xA695, + 0xA697, + 0xA723, + 0xA725, + 0xA727, + 0xA729, + 0xA72B, + 0xA72D, + 0xA733, + 0xA735, + 0xA737, + 0xA739, + 0xA73B, + 0xA73D, + 0xA73F, + 0xA741, + 0xA743, + 0xA745, + 0xA747, + 0xA749, + 0xA74B, + 0xA74D, + 0xA74F, + 0xA751, + 0xA753, + 0xA755, + 0xA757, + 0xA759, + 0xA75B, + 0xA75D, + 0xA75F, + 0xA761, + 0xA763, + 0xA765, + 0xA767, + 0xA769, + 0xA76B, + 0xA76D, + 0xA76F, + 0xA77A, + 0xA77C, + 0xA77F, + 0xA781, + 0xA783, + 0xA785, + 0xA787, + 0xA78C, + 0xA78E, + 0xA791, + 0xA793, + 0xA7A1, + 0xA7A3, + 0xA7A5, + 0xA7A7, + 0xA7A9, + 0xA7FA, + 0x1D4BB, + 0x1D7CB, +}; + +int +islowerrune(Rune r) +{ + if(bsearch(&r, lower2, nelem(lower2), sizeof *lower2, &rune2cmp)) + return 1; + if(bsearch(&r, lower1, nelem(lower1), sizeof *lower1, &rune1cmp)) + return 1; + return 0; +} + +static Rune title2[][2] = { + { 0x1F88, 0x1F8F }, + { 0x1F98, 0x1F9F }, + { 0x1FA8, 0x1FAF }, +}; + +static Rune title1[] = { + 0x01C5, + 0x01C8, + 0x01CB, + 0x01F2, + 0x1FBC, + 0x1FCC, + 0x1FFC, +}; + +int +istitlerune(Rune r) +{ + if(bsearch(&r, title2, nelem(title2), sizeof *title2, &rune2cmp)) + return 1; + if(bsearch(&r, title1, nelem(title1), sizeof *title1, &rune1cmp)) + return 1; + return 0; +} + +static Rune digit2[][2] = { + { 0x0030, 0x0039 }, + { 0x0660, 0x0669 }, + { 0x06F0, 0x06F9 }, + { 0x07C0, 0x07C9 }, + { 0x0966, 0x096F }, + { 0x09E6, 0x09EF }, + { 0x0A66, 0x0A6F }, + { 0x0AE6, 0x0AEF }, + { 0x0B66, 0x0B6F }, + { 0x0BE6, 0x0BEF }, + { 0x0C66, 0x0C6F }, + { 0x0CE6, 0x0CEF }, + { 0x0D66, 0x0D6F }, + { 0x0E50, 0x0E59 }, + { 0x0ED0, 0x0ED9 }, + { 0x0F20, 0x0F29 }, + { 0x1040, 0x1049 }, + { 0x1090, 0x1099 }, + { 0x17E0, 0x17E9 }, + { 0x1810, 0x1819 }, + { 0x1946, 0x194F }, + { 0x19D0, 0x19D9 }, + { 0x1A80, 0x1A89 }, + { 0x1A90, 0x1A99 }, + { 0x1B50, 0x1B59 }, + { 0x1BB0, 0x1BB9 }, + { 0x1C40, 0x1C49 }, + { 0x1C50, 0x1C59 }, + { 0xA620, 0xA629 }, + { 0xA8D0, 0xA8D9 }, + { 0xA900, 0xA909 }, + { 0xA9D0, 0xA9D9 }, + { 0xAA50, 0xAA59 }, + { 0xABF0, 0xABF9 }, + { 0xFF10, 0xFF19 }, + { 0x104A0, 0x104A9 }, + { 0x11066, 0x1106F }, + { 0x110F0, 0x110F9 }, + { 0x11136, 0x1113F }, + { 0x111D0, 0x111D9 }, + { 0x116C0, 0x116C9 }, + { 0x1D7CE, 0x1D7FF }, +}; + +int +isdigitrune(Rune r) +{ + if(bsearch(&r, digit2, nelem(digit2), sizeof *digit2, &rune2cmp)) + return 1; + return 0; +} + diff --git a/utf.c b/utf.c @@ -1,296 +0,0 @@ -/* See LICENSE file for copyright and license details. */ -#include <string.h> -#include "utf.h" - -#define MIN(x,y) ((x) < (y) ? (x) : (y)) - -#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \ - : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \ - : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \ - : (((x) & 0xF0) == 0xE0) ? 3 /* 1110xxxx */ \ - : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \ - : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \ - : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \ - : 0 ) - -/* - * runetochar copies one rune at p to at most UTFmax bytes starting at s and - * returns the number of bytes copied. UTFmax is the maximum number of bytes - * required to represent a legal rune. - * - * If the rune is illegal, runetochar will return 0. - */ -int -runetochar(char *s, Rune *p) -{ - Rune r = *p; - - switch(runelen(r)) { - case 1: /* 0aaaaaaa */ - s[0] = r; - return 1; - case 2: /* 00000aaa aabbbbbb */ - s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */ - s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */ - return 2; - case 3: /* aaaabbbb bbcccccc */ - s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */ - s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */ - s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */ - return 3; - case 4: /* 000aaabb bbbbcccc ccdddddd */ - s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */ - s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */ - s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */ - s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */ - return 4; - default: - return 0; /* error */ - } -} - -/* - * chartorune copies at most UTFmax bytes starting at s to one rune at p and - * returns the number of bytes copied. If the input is not valid UTF-8, - * chartorune will convert the sequence to Runeerror (0xFFFD), returning the - * number of bytes in the invalid sequence. - */ -int -chartorune(Rune *p, const char *s) -{ - return charntorune(p, s, UTFmax); -} - -/* - * charntorune copies at most len bytes starting at s to one rune at p and - * returns the number of bytes copied. If the input is not valid UTF-8, - * charntorune will convert the sequence to Runeerror (0xFFFD), returning the - * number of bytes in the invalid sequence. - * - * If a potentially valid sequence is cut off by the len limit, charntorune will - * return 0. - */ -int -charntorune(Rune *p, const char *s, size_t len) -{ - unsigned int i, n; - Rune r; - - if(len == 0) /* can't even look at s[0] */ - return 0; - - switch((n = UTFSEQ(s[0]))) { - case 1: r = s[0]; break; /* 0xxxxxxx */ - case 2: r = s[0] & 0x1F; break; /* 110xxxxx */ - case 3: r = s[0] & 0x0F; break; /* 1110xxxx */ - case 4: r = s[0] & 0x07; break; /* 11110xxx */ - case 5: r = s[0] & 0x03; break; /* 111110xx */ - case 6: r = s[0] & 0x01; break; /* 1111110x */ - default: /* invalid sequence */ - *p = Runeerror; - return 1; - } - /* add values from continuation bytes */ - for(i = 1; i < MIN(n, len); i++) - if((s[i] & 0xC0) != 0x80) { - /* expected continuation */ - *p = Runeerror; - return i; - } - else - r = (r << 6) | (s[i] & 0x3F); - - if(i < n) /* must have reached len limit */ - return 0; - - /* reject invalid runes and overlong sequences */ - if(n > UTFmax || r > 0x10FFFF || runelen(r) < (int)n || (r & 0xFFFE) == 0xFFFE - || (r >= 0xD800 && r <= 0xDFFF) || (r >= 0xFDD0 && r <= 0xFDEF)) - r = Runeerror; - - *p = r; - return n; -} - -/* - * runelen returns the number of bytes required to convert r into UTF-8. If the - * rune is illegal, runelen will return 0. - */ -int -runelen(Rune r) -{ - if(r <= 0x7F) - return 1; - else if(r <= 0x07FF) - return 2; - else if(r <= 0xFFFF) - return 3; - else if(r <= Runemax) - return 4; - else - return 0; /* error */ -} - -/* - * runelen returns the number of bytes required to convert the rune-string of - * length len pointed to by p into UTF-8. - */ -size_t -runenlen(Rune *p, size_t len) -{ - size_t i, n = 0; - - for(i = 0; i < len; i++) - n += runelen(p[i]); - return n; -} - -/* - * fullrune returns 1 if the string s of length len is long enough to be - * decoded by chartorune, and 0 otherwise. - */ -int -fullrune(const char *s, size_t len) -{ - Rune r; - - return charntorune(&r, s, len) > 0; -} - -/* - * utfecpy copies UTF-8 sequences until a null sequence has been copied, but - * writes no sequences beyond end. If any sequences are copied, the to string is - * terminated by a null sequence, and a pointer to that sequence is returned. - * Otherwise, the original to string is returned. - */ -char * -utfecpy(char *to, char *end, const char *from) -{ - Rune r = Runeerror; - size_t i, n; - - /* seek through to find final full rune */ - for(i = 0; r != '\0' && (n = charntorune(&r, &from[i], end - &to[i])); i += n) - ; - memcpy(to, from, i); /* copy over bytes up to this rune */ - - if(i > 0 && r != '\0') - to[i] = '\0'; /* terminate if unterminated */ - return &to[i]; -} - -/* - * utflen returns the number of runes that are represented by the UTF-8 string - * s. - */ -size_t -utflen(const char *s) -{ - const char *p = s; - size_t i; - Rune r; - - for(i = 0; *p != '\0'; i++) - p += chartorune(&r, p); - return i; -} - -/* - * utfnlen returns the number of runes that are represented by the UTF-8 string - * s of length len. If the last few bytes contain an incompletely coded rune, - * utfnlen will not count them; in this way it differs from utflen, which - * includes every byte of the string. - */ -size_t -utfnlen(const char *s, size_t len) -{ - const char *p = s; - size_t i; - Rune r; - int n; - - for(i = 0; (n = charntorune(&r, p, len-(p-s))) && r != '\0'; i++) - p += n; - return i; -} - -/* - * utfrune returns a pointer to the first ocurrence of r in the UTF-8 string s, - * or NULL if r does not occur in s. The null byte terminating a string is - * considered to be part of the string s. - */ -char * -utfrune(const char *s, Rune r) -{ - if(r < Runeself) { - return strchr(s, r); - } - else if(r == Runeerror) { - Rune r0; - int n; - - for(; *s != '\0'; s += n) { - n = chartorune(&r0, s); - if(r == r0) - return (char *)s; - } - } - else { - char buf[UTFmax+1]; - int n; - - if(!(n = runetochar(buf, &r))) - return NULL; - buf[n] = '\0'; - return strstr(s, buf); - } - return NULL; -} - -/* - * utfrrune returns a pointer to the last ocurrence of r in the UTF-8 string s, - * or NULL if r does not occur in s. The null byte terminating a string is - * considered to be part of the string s. - */ -char * -utfrrune(const char *s, Rune r) -{ - const char *p = NULL; - Rune r0; - int n; - - if(r < Runeself) - return strrchr(s, r); - - for(; *s != '\0'; s += n) { - n = chartorune(&r0, s); - if(r == r0) - p = s; - } - return (char *)p; -} - -/* - * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a - * UTF-8 substring of s, or NULL if there is none. If t is the null string, - * utfutf returns s. - */ -char * -utfutf(const char *s, const char *t) -{ - const char *p, *q; - Rune r0, r1, r2; - int n, m; - - for(chartorune(&r0, t); (s = utfrune(s, r0)); s++) { - for(p = s, q = t; *q && *p; p += n, q += m) { - n = chartorune(&r1, p); - m = chartorune(&r2, q); - if(r1 != r2) - break; - } - if(!*q) - return (char *)s; - } - return NULL; -} diff --git a/utf.h b/utf.h @@ -25,4 +25,11 @@ char *utfrune(const char *, Rune); char *utfrrune(const char *, Rune); char *utfutf(const char *, const char *); +int isalpharune(Rune); +int isspacerune(Rune); +int isupperrune(Rune); +int islowerrune(Rune); +int istitlerune(Rune); +int isdigitrune(Rune); + #endif diff --git a/utftest.c b/utftest.c @@ -1,49 +1,112 @@ /* See LICENSE file for copyright and license details. */ +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include "utf.h" -static void utferror(const char *, int); +#define ARGBEGIN \ + { \ + Rune _argr; \ + \ + if(!argv0) \ + argv0 = argv[0]; \ + for(argc--, argv++; *argv && (*argv)[0] == '-' && (*argv)[1] != '\0'; argc--, argv++) { \ + if((*argv)[1] == '-' && (*argv)[2] == '\0') { /* -- signifies end of flags */ \ + argc--; argv++; \ + break; \ + } \ + (*argv)++; \ + while(**argv != '\0' && (*argv += chartorune(&_argr, *argv))) \ + switch(_argr) +#define ARGEND \ + } \ + } +#define ARGC() _argr + +static void eprintf(const char *, ...); +static void usage(void); + +static const char *argv0; int -main(void) +main(int argc, char *argv[]) { - char buf[BUFSIZ], buf2[UTFmax]; + int aflag = 0, dflag = 0, lflag = 0, sflag = 0, tflag = 0, uflag = 0, vflag = 0; + char buf[BUFSIZ], rbuf[UTFmax]; int len, len2; - size_t i, n; + size_t i, j, n; Rune r; + ARGBEGIN { + case 'a': aflag = 1; break; + case 'd': dflag = 1; break; + case 'l': lflag = 1; break; + case 's': sflag = 1; break; + case 't': tflag = 1; break; + case 'u': uflag = 1; break; + case 'v': vflag = 1; break; + default: + r = ARGC(); + rbuf[runetochar(rbuf, &r)] = '\0'; + eprintf("unknown flag -%s\n", rbuf); + usage(); + } ARGEND; + + if(argc != 0) + usage(); + for(i = 0; (n = fread(&buf[i], 1, sizeof buf - i, stdin)); i = n-i) { for(n += i, i = 0; (len = charntorune(&r, &buf[i], n-i)); i += len) { - if(r == Runeerror) - utferror(&buf[i], len); - if((len2 = runetochar(buf2, &r))) { - if(!fwrite(buf2, len2, 1, stdout)) { - perror("write error"); - exit(EXIT_FAILURE); - } + if(r == Runeerror) { + fprintf(stderr, "%s: error converting char to rune:", argv0); + for(j = i; j < i+len; j++) + fprintf(stderr, " %02X", (unsigned char)buf[j]); + fputc('\n', stderr); + } + if(((aflag && isalpharune(r)) || (dflag && isdigitrune(r)) + || (lflag && islowerrune(r)) || (sflag && isspacerune(r)) + || (tflag && istitlerune(r)) || (uflag && isupperrune(r)) + || (!aflag && !dflag && !lflag && !sflag && !tflag && !uflag)) == vflag) + continue; + + if((len2 = runetochar(rbuf, &r))) { + if(!fwrite(rbuf, len2, 1, stdout)) + eprintf("write error:"); } else - fprintf(stderr, "error converting rune to char: U+%02X\n", r); + fprintf(stderr, "%s: error converting rune to char: U+%02X\n", argv0, r); } if(i < n) memcpy(buf, &buf[i], n-i); } - if(ferror(stdin)) { - perror("read error"); - exit(EXIT_FAILURE); - } + if(ferror(stdin)) + eprintf("read error:"); + return EXIT_SUCCESS; } void -utferror(const char *s, int n) +eprintf(const char *fmt, ...) { - int i = 0; + va_list ap; + + fprintf(stderr, "%s: ", argv0); + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + + if(fmt[0] && fmt[strlen(fmt)-1] == ':') { + fputc(' ', stderr); + perror(NULL); + } + exit(EXIT_FAILURE); +} - fprintf(stderr, "error converting char to rune:"); - for(i = 0; i < n; i++) - fprintf(stderr, " %02X", (unsigned char)s[i]); - fputc('\n', stderr); +void +usage(void) +{ + fprintf(stderr, "usage: %s [-adlstu]\n", argv0); + exit(EXIT_FAILURE); }