libutf

UTF-8 library
git clone git://git.suckless.org/libutf
Log | Files | Refs | README | LICENSE

commit ea0ee3134966b07a683ebe14eec49644f7092964
Author: Connor Lane Smith <cls@lubutu.com>
Date:   Sat, 14 Apr 2012 17:33:50 +0100

initial commit
Diffstat:
LICENSE | 21+++++++++++++++++++++
Makefile | 18++++++++++++++++++
UTF-8-test.txt | 0
utf.c | 296+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
utf.h | 28++++++++++++++++++++++++++++
utftest.c | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 461 insertions(+), 0 deletions(-)

diff --git a/LICENSE b/LICENSE @@ -0,0 +1,21 @@ +MIT/X Consortium License + +© 2012 Connor Lane Smith <cls@lubutu.com> + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile @@ -0,0 +1,18 @@ +# See LICENSE file for copyright and license details. + +SRC = utf.c utftest.c +OBJ = $(SRC:.c=.o) + +CFLAGS = -ansi -pedantic -Wall -Wextra -D_ANSI_SOURCE +LDFLAGS = -Os -s + +all: utftest + +utftest: $(OBJ) + cc $(LDFLAGS) -o $@ $(OBJ) + +.c.o: + cc $(CFLAGS) -c $< + +clean: + rm -f utftest $(OBJ) diff --git a/UTF-8-test.txt b/UTF-8-test.txt Binary files differ. diff --git a/utf.c b/utf.c @@ -0,0 +1,296 @@ +/* See LICENSE file for copyright and license details. */ +#include <string.h> +#include "utf.h" + +#define MIN(x,y) ((x) < (y) ? (x) : (y)) + +#define UTFSEQ(x) ((((x) & 0x80) == 0x00) ? 1 /* 0xxxxxxx */ \ + : (((x) & 0xC0) == 0x80) ? 0 /* 10xxxxxx */ \ + : (((x) & 0xE0) == 0xC0) ? 2 /* 110xxxxx */ \ + : (((x) & 0xF0) == 0xE0) ? 
3 /* 1110xxxx */ \
+                  : (((x) & 0xF8) == 0xF0) ? 4 /* 11110xxx */ \
+                  : (((x) & 0xFC) == 0xF8) ? 5 /* 111110xx */ \
+                  : (((x) & 0xFE) == 0xFC) ? 6 /* 1111110x */ \
+                                           : 0 )
+
+/*
+ * runetochar encodes the rune at p as UTF-8 into the bytes starting at s and
+ * returns how many bytes it wrote: the value of runelen for that rune, which
+ * is at most 4.
+ *
+ * When runelen reports the rune as illegal, nothing is written and 0 is
+ * returned.
+ */
+int
+runetochar(char *s, rune_t *p)
+{
+    /* marker bits of the lead byte, indexed by sequence length */
+    static const unsigned char lead[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
+    rune_t bits = *p;
+    int len = runelen(bits);
+    int k;
+
+    if(len < 1 || len > 4)
+        return 0; /* error */
+
+    /* fill the continuation bytes back to front, six payload bits each */
+    for(k = len - 1; k > 0; k--) {
+        s[k] = 0x80 | (bits & 0x3F); /* 10xxxxxx */
+        bits >>= 6;
+    }
+    /* the remaining high bits share the first byte with the length marker */
+    s[0] = lead[len] | bits;
+    return len;
+}
+
+/*
+ * chartorune decodes one rune from the UTF-8 string s into p, looking at no
+ * more than UTFmax bytes, and returns the number of bytes consumed. It is
+ * charntorune with the length limit fixed at UTFmax, so input that is not
+ * valid UTF-8 yields Runeerror (0xFFFD) and the length of the bad sequence.
+ */
+int
+chartorune(rune_t *p, const char *s)
+{
+    return charntorune(p, s, UTFmax);
+}
+
+/*
+ * charntorune copies at most len bytes starting at s to one rune at p and
+ * returns the number of bytes copied. If the input is not valid UTF-8,
+ * charntorune will convert the sequence to Runeerror (0xFFFD), returning the
+ * number of bytes in the invalid sequence.
+ *
+ * If a potentially valid sequence is cut off by the len limit, charntorune will
+ * return 0.
+ */
+int
+charntorune(rune_t *p, const char *s, size_t len)
+{
+    unsigned int seqlen, pos;
+    size_t limit;
+    rune_t acc;
+    int bad;
+
+    if(len == 0) /* not allowed to read even s[0] */
+        return 0;
+
+    seqlen = UTFSEQ(s[0]);
+    if(seqlen == 0) {
+        /* s[0] is a stray continuation byte or 0xFE/0xFF */
+        *p = Runeerror;
+        return 1;
+    }
+    /* a lone byte carries 7 payload bits, a lead byte 7 - seqlen of them */
+    acc = s[0] & (seqlen == 1 ? 0x7F : 0x7F >> seqlen);
+
+    /* fold in six bits from each continuation byte that is within reach */
+    limit = MIN(seqlen, len);
+    for(pos = 1; pos < limit; pos++) {
+        if((s[pos] & 0xC0) != 0x80) {
+            /* sequence broken off early; report the bytes seen so far */
+            *p = Runeerror;
+            return pos;
+        }
+        acc = (acc << 6) | (s[pos] & 0x3F);
+    }
+    if(pos < seqlen) /* ran into the len limit first */
+        return 0;
+
+    /* anything that is not a legal rune in its shortest form is an error */
+    bad = seqlen > UTFmax                    /* longer than any sequence */
+       || acc > 0x10FFFF                     /* beyond the last code point */
+       || runelen(acc) < (int)seqlen         /* overlong encoding */
+       || (acc & 0xFFFE) == 0xFFFE           /* low 16 bits FFFE or FFFF */
+       || (acc >= 0xD800 && acc <= 0xDFFF)   /* surrogates */
+       || (acc >= 0xFDD0 && acc <= 0xFDEF);  /* U+FDD0..U+FDEF */
+    *p = bad ? Runeerror : acc;
+    return seqlen;
+}
+
+/*
+ * runelen returns the number of bytes required to convert r into UTF-8, or 0
+ * if r is greater than 0x10FFFF.
+ */
+int
+runelen(rune_t r)
+{
+    if(r > 0x10FFFF)
+        return 0; /* error */
+    if(r > 0xFFFF)
+        return 4;
+    if(r > 0x07FF)
+        return 3;
+    return (r > 0x7F) ? 2 : 1;
+}
+
+/*
+ * runenlen returns the number of bytes required to convert the rune-string of
+ * length len pointed to by p into UTF-8. It is the sum of runelen over the
+ * runes, so a rune that runelen rejects adds nothing.
+ */
+size_t
+runenlen(rune_t *p, size_t len)
+{
+    size_t total = 0;
+
+    while(len-- > 0)
+        total += runelen(*p++);
+    return total;
+}
+
+/*
+ * fullrune returns true if the string s of length len is long enough to be
+ * decoded by chartorune, and false otherwise.
+ */
+bool
+fullrune(const char *s, size_t len)
+{
+    rune_t r;
+
+    /* charntorune returns 0 only when len cuts the sequence short */
+    return charntorune(&r, s, len) > 0;
+}
+
+/*
+ * utfecpy copies whole UTF-8 sequences from from to to, stopping at the null
+ * sequence of from or when the next sequence would no longer fit in front of
+ * end. end is the last position that may be written; it is only ever used for
+ * the terminating null.
+ *
+ * If the null sequence of from was reached or at least one sequence was
+ * copied, the to string is terminated by a null sequence and a pointer to
+ * that null is returned. Otherwise nothing is written and the original to is
+ * returned.
+ */
+char *
+utfecpy(char *to, char *end, const char *from)
+{
+    rune_t r = Runeerror;
+    size_t i = 0, n;
+
+    if(to > end) /* no room at all; the length below would go negative */
+        return to;
+
+    /* advance over whole sequences that fit, stopping in front of the null */
+    while((n = charntorune(&r, &from[i], end - &to[i])) && r != '\0')
+        i += n;
+
+    if(i == 0 && n == 0) /* not even one sequence fits: leave to untouched */
+        return to;
+
+    memcpy(to, from, i); /* copy over bytes up to the final full rune */
+    to[i] = '\0';
+    return &to[i]; /* the terminator itself, not the byte after it */
+}
+
+/*
+ * utflen returns the number of runes that are represented by the UTF-8 string
+ * s. Every malformed sequence is decoded to Runeerror by chartorune and so
+ * counts as one rune.
+ */
+size_t
+utflen(const char *s)
+{
+    const char *p = s;
+    size_t i;
+    rune_t r;
+
+    for(i = 0; *p != '\0'; i++)
+        p += chartorune(&r, p);
+    return i;
+}
+
+/*
+ * utfnlen returns the number of runes that are represented by the UTF-8 string
+ * s of length len. If the last few bytes contain an incompletely coded rune,
+ * utfnlen will not count them; in this way it differs from utflen, which
+ * includes every byte of the string. Counting also stops at a null byte.
+ */
+size_t
+utfnlen(const char *s, size_t len)
+{
+    const char *p = s;
+    size_t i;
+    rune_t r;
+    int n;
+
+    /* n == 0 means the bytes that are left do not hold a complete rune */
+    for(i = 0; (n = charntorune(&r, p, len-(p-s))) && r != '\0'; i++)
+        p += n;
+    return i;
+}
+
+/*
+ * utfrune returns a pointer to the first occurrence of r in the UTF-8 string s,
+ * or NULL if r does not occur in s. The null byte terminating a string is
+ * considered to be part of the string s.
+ */
+char *
+utfrune(const char *s, rune_t r)
+{
+    if(r <= 0x7F) {
+        /* a single byte, terminating null included: strchr does the job */
+        return strchr(s, r);
+    }
+    else if(r == Runeerror) {
+        /* any sequence that decodes to Runeerror matches, so decode s */
+        rune_t r0;
+        int n;
+
+        for(; *s != '\0'; s += n) {
+            n = chartorune(&r0, s);
+            if(r == r0)
+                return (char *)s;
+        }
+    }
+    else {
+        /* search for the encoded form of r as a byte string */
+        char buf[UTFmax+1];
+        int n;
+
+        if(!(n = runetochar(buf, &r)))
+            return NULL;
+        buf[n] = '\0';
+        return strstr(s, buf);
+    }
+    return NULL;
+}
+
+/*
+ * utfrrune returns a pointer to the last occurrence of r in the UTF-8 string s,
+ * or NULL if r does not occur in s. The null byte terminating a string is
+ * considered to be part of the string s.
+ */
+char *
+utfrrune(const char *s, rune_t r)
+{
+    const char *p = NULL;
+    rune_t r0;
+    int n;
+
+    if(r <= 0x7F)
+        return strrchr(s, r);
+
+    /* decode all of s and remember the most recent match */
+    for(; *s != '\0'; s += n) {
+        n = chartorune(&r0, s);
+        if(r == r0)
+            p = s;
+    }
+    return (char *)p;
+}
+
+/*
+ * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a
+ * UTF-8 substring of s, or NULL if there is none. If t is the null string,
+ * utfutf returns s.
+ */
+char *
+utfutf(const char *s, const char *t)
+{
+    const char *p, *q;
+    rune_t r0, r1, r2;
+    int n, m;
+
+    /*
+     * Without this, the search below would look for the null rune and
+     * return the terminator of s instead of s itself.
+     */
+    if(*t == '\0')
+        return (char *)s;
+
+    /* try every occurrence of the first rune of t as a starting point */
+    for(chartorune(&r0, t); (s = utfrune(s, r0)); s++) {
+        for(p = s, q = t; *q && *p; p += n, q += m) {
+            n = chartorune(&r1, p);
+            m = chartorune(&r2, q);
+            if(r1 != r2)
+                break;
+        }
+        if(!*q) /* all of t was matched */
+            return (char *)s;
+    }
+    return NULL;
+}
diff --git a/utf.h b/utf.h
@@ -0,0 +1,28 @@
+/* See LICENSE file for copyright and license details.
*/
+#ifndef UTF_H
+#define UTF_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint32_t rune_t;
+
+enum {
+    UTFmax = 6,         /* most bytes chartorune will look at for one rune */
+    Runeerror = 0xFFFD  /* rune produced for input that cannot be decoded */
+};
+
+int runetochar(char *, rune_t *);
+int chartorune(rune_t *, const char *);
+int charntorune(rune_t *, const char *, size_t);
+int runelen(rune_t);
+size_t runenlen(rune_t *, size_t);
+bool fullrune(const char *, size_t);
+char *utfecpy(char *, char *, const char *);
+size_t utflen(const char *);
+size_t utfnlen(const char *, size_t);
+char *utfrune(const char *, rune_t);
+char *utfrrune(const char *, rune_t);
+char *utfutf(const char *, const char *);
+
+#endif
diff --git a/utftest.c b/utftest.c
@@ -0,0 +1,119 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utf.h"
+
+static void utferror(const char *, int);
+
+/*
+ * The number of arguments selects what is exercised:
+ *
+ *   (none)                       recode stdin to stdout, reporting each
+ *                                sequence that decodes to Runeerror
+ *   <string>                     print the rune and byte length of string
+ *   <string> <len>               copy string with utfecpy, limited to len
+ *   <string> <rune> <rune> [<t>] look up the runes with utfrune and utfrrune
+ *                                and, when given, t with utfutf
+ */
+int
+main(int argc, char *argv[])
+{
+    char buf[BUFSIZ], buf2[UTFmax], *p;
+    int len, len2;
+    size_t i, n;
+    rune_t r;
+
+    switch(argc) {
+    case 1:
+        /* i carries the length of an incomplete tail into the next read */
+        for(i = 0; (n = fread(&buf[i], 1, sizeof buf - i, stdin)); i = n-i) {
+            for(n += i, i = 0; (len = charntorune(&r, &buf[i], n-i)); i += len) {
+                if(r == Runeerror)
+                    utferror(&buf[i], len);
+                if((len2 = runetochar(buf2, &r))) {
+                    if(!fwrite(buf2, len2, 1, stdout)) {
+                        perror("write error");
+                        exit(EXIT_FAILURE);
+                    }
+                }
+                else
+                    fprintf(stderr, "error converting rune to char: U+%02lX\n",
+                            (unsigned long)r);
+            }
+            /* source and destination may overlap, so memcpy is not allowed */
+            if(i < n)
+                memmove(buf, &buf[i], n-i);
+        }
+        if(ferror(stdin)) {
+            perror("read error");
+            exit(EXIT_FAILURE);
+        }
+        break;
+    case 2:
+        fprintf(stderr, "\"%s\" (%lu,%lu)\n", argv[1],
+                (unsigned long)utflen(argv[1]),
+                (unsigned long)strlen(argv[1]));
+        break;
+    case 3:
+        if((n = strtol(argv[2], NULL, 0)) >= sizeof buf) {
+            fprintf(stderr, "%lu is too large (max %lu)\n",
+                    (unsigned long)n, (unsigned long)(sizeof buf - 1));
+            exit(EXIT_FAILURE);
+        }
+        if((p = utfecpy(buf, &buf[n], argv[1])) == buf)
+            buf[0] = '\0';
+        printf("\"%s\" (%lu,%lu; %lu,%lu) -> \"%s\" (%lu,%lu)\n",
+               argv[1], (unsigned long)utflen(argv[1]),
+               (unsigned long)strlen(argv[1]),
+               (unsigned long)utfnlen(argv[1], n), (unsigned long)n,
+               buf, (unsigned long)utflen(buf), (unsigned long)(p-buf));
+        break;
+    case 4:
+    case 5:
+        if(utflen(argv[2]) <= 1 && utflen(argv[3]) <= 1) {
+            n = chartorune(&r, argv[2]);
+            if(r == Runeerror)
+                utferror(argv[2], n);
+            if((p = utfrune(argv[1], r)))
+                printf("utfrune '%s': \"%s\"\n", argv[2], p);
+            else
+                printf("utfrune '%s': null\n", argv[2]);
+
+            n = chartorune(&r, argv[3]);
+            if(r == Runeerror)
+                utferror(argv[3], n); /* report the rune just decoded */
+            if((p = utfrrune(argv[1], r)))
+                printf("utfrrune '%s': \"%s\"\n", argv[3], p);
+            else
+                printf("utfrrune '%s': null\n", argv[3]);
+
+            if(argc > 4) {
+                if((p = utfutf(argv[1], argv[4])))
+                    printf("utfutf \"%s\": \"%s\"\n", argv[4], p);
+                else
+                    printf("utfutf \"%s\": null\n", argv[4]);
+            }
+            break;
+        }
+        /* a rune argument longer than one rune: fall through to usage */
+    default:
+        fprintf(stderr, "%s [<string> [<len> | <rune> <rune> [<string>]]]\n", argv[0]);
+        exit(EXIT_FAILURE);
+    }
+    return EXIT_SUCCESS;
+}
+
+/*
+ * utferror reports on stderr, as hex, the n bytes at s that failed to decode.
+ */
+static void
+utferror(const char *s, int n)
+{
+    int i;
+
+    fprintf(stderr, "error converting char to rune:");
+    for(i = 0; i < n; i++)
+        fprintf(stderr, " %02X", (unsigned int)(unsigned char)s[i]);
+    fputc('\n', stderr);
+}