libutf

UTF-8 library
git clone git://git.suckless.org/libutf
Log | Files | Refs | README | LICENSE

commit 74d431464aada2142f77fc661bd6ecc289d780ed
parent c18c526e623588d636ca179efd0ad4ff3d149634
Author: Connor Lane Smith <cls@lubutu.com>
Date:   Mon, 21 May 2012 19:00:45 +0100

add manpages rune.3, isalpharune.3
Diffstat:
Makefile | 22+++++++++++++---------
config.mk | 2++
isalpharune.3 | 29+++++++++++++++++++++++++++++
rune.3 | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
rune.c | 71+++--------------------------------------------------------------------
utf.h | 4++--
6 files changed, 206 insertions(+), 79 deletions(-)

diff --git a/Makefile b/Makefile @@ -8,8 +8,6 @@ OBJ = $(SRC:.c=.o) LIB = libutf.a INC = utf.h -UCD = UnicodeData-6.1.0.txt - all: $(LIB) utftest $(LIB): $(OBJ) @@ -21,18 +19,24 @@ utftest: utftest.o $(LIB) .c.o: $(CC) $(CFLAGS) -c $< -runetypebody.h: mkrunetype.awk $(UCD) - $(AWK) -f mkrunetype.awk $(UCD) > $@ +runetypebody.h: mkrunetype.awk UnicodeData-$(UNICODE).txt + $(AWK) -f mkrunetype.awk UnicodeData-$(UNICODE).txt > $@ -install: $(LIB) - mkdir -p $(DESTDIR)$(PREFIX)/lib - cp $(LIB) $(DESTDIR)$(PREFIX)/lib/$(LIB) - mkdir -p $(DESTDIR)$(PREFIX)/include - cp $(INC) $(DESTDIR)$(PREFIX)/include/$(INC) +install: $(LIB) $(INC) $(MAN) + @echo @ install libutf + @mkdir -p $(DESTDIR)$(PREFIX)/lib + @cp $(LIB) $(DESTDIR)$(PREFIX)/lib/$(LIB) + @mkdir -p $(DESTDIR)$(PREFIX)/include + @cp $(INC) $(DESTDIR)$(PREFIX)/include/$(INC) + @mkdir -p $(DESTDIR)$(PREFIX)/share/man/man3 + @cp rune.3 $(DESTDIR)$(PREFIX)/share/man/man3/rune.3 + @sed 's/$$UNICODE/$(UNICODE)/g' isalpharune.3 > $(DESTDIR)$(PREFIX)/share/man/man3/isalpharune.3 uninstall: rm -f $(DESTDIR)$(PREFIX)/lib/$(LIB) rm -f $(DESTDIR)$(PREFIX)/include/$(INC) + rm -f $(DESTDIR)$(PREFIX)/share/man/man3/rune.3 + rm -f $(DESTDIR)$(PREFIX)/share/man/man3/isalpharune.3 clean: rm -f $(LIB) utftest utftest.o $(OBJ) diff --git a/config.mk b/config.mk @@ -2,6 +2,8 @@ PREFIX = /usr/local +UNICODE = 6.1.0 + CFLAGS = -ansi -pedantic -Os -Wall -Wextra LDFLAGS = -s diff --git a/isalpharune.3 b/isalpharune.3 @@ -0,0 +1,29 @@ +.Dd $Mdocdate$ +.Dt ISALPHARUNE 3 +.Os +.Sh NAME +.Nm isalpharune, islowerrune, isspacerune, istitlerune, isupperrune, isdigitrune +.Nd Unicode rune classification +.Sh SYNOPSIS +.Ft int +.Fn isalpharune "Rune r" +.Ft int +.Fn islowerrune "Rune r" +.Ft int +.Fn isspacerune "Rune r" +.Ft int +.Fn istitlerune "Rune r" +.Ft int +.Fn isupperrune "Rune r" +.Ft int +.Fn isdigitrune "Rune r" +.Sh DESCRIPTION +These functions classify Unicode runes according to their properties defined in the Unicode standard, 
analogously to +.Xr ctype 3 +for ASCII. +.Sh CONFORMING TO +These functions are compatible with those defined in the Plan 9 C library, but are generated automatically from the Unicode $UNICODE Character Database, so classifications may differ. +.Sh SEE ALSO +.Xr ctype 3 , +.Xr rune 3 , +The Unicode $UNICODE Standard diff --git a/rune.3 b/rune.3 @@ -0,0 +1,157 @@ +.Dd $Mdocdate$ +.Dt RUNE 3 +.Os +.Sh NAME +.Nm runetochar, chartorune, charntorune, runelen, runenlen, fullrune, utfecpy, utflen, utfnlen, utfrune, utfrrune, utfutf +.Nd UTF-8 rune conversion +.Sh SYNOPSIS +.In utf.h +.Ft int +.Fn runetochar "char *s" "Rune *p" +.Ft int +.Fn chartorune "Rune *p" "char *s" +.Ft int +.Fn charntorune "Rune *p" "char *s" "size_t len" +.Ft int +.Fn runelen "Rune r" +.Ft int +.Fn runenlen "Rune *p" "size_t len" +.Ft int +.Fn fullrune "char *s" "size_t len" +.Ft char * +.Fn utfecpy "char *to" "char *end" "char *from" +.Ft size_t +.Fn utflen "char *s" +.Ft size_t +.Fn utfnlen "char *s" "size_t len" +.Ft char * +.Fn utfrune "char *s" "Rune r" +.Ft char * +.Fn utfrrune "char *s" "Rune r" +.Ft char * +.Fn utfutf "char *s" "char *t" +.Sh DESCRIPTION +The following functions convert to and from a UTF-8 byte stream and Unicode runes. +.Pp +.Fn runetochar +converts one rune at +.Fa p +to at most +.Dv UTFmax +bytes starting at +.Fa s , +and returns the number of bytes copied. +.Dv UTFmax +is the maximum number of bytes required to represent a rune. +If the rune is illegal, +.Fn runetochar +will return 0. +.Pp +.Fn chartorune +converts at most +.Dv UTFmax +bytes starting at +.Fa s +to one rune at +.Fa p , +and returns the number of bytes copied. +If the input is invalid UTF-8, +.Fn chartorune +will convert the sequence to +.Dv Runeerror +(0xFFFD) and return the number of bytes in the invalid sequence. +.Pp +.Fn charntorune +converts at most +.Fa len +bytes starting at +.Fa s +to one rune at +.Fa p , +and returns the number of bytes copied. 
+If the next sequence is longer than +.Fa len +bytes, +.Fn charntorune +will return 0. +.Pp +.Fn runelen +returns the number of bytes required to convert the rune +.Fa r +into UTF-8. +If the rune is illegal, +.Fn runelen +will return 0. +.Pp +.Fn runenlen +returns the number of bytes required to convert the +.Fa len +runes pointed to by +.Fa p +into UTF-8. +.Pp +.Fn fullrune +returns 1 if the first +.Fa len +bytes of the UTF-8 string +.Fa s +form a complete rune, and 0 otherwise. +.Pp +The following functions are analogous to the corresponding string routines, with `utf' substituted for `str', and `rune' for `chr'. +.Pp +.Fn utfecpy +copies UTF-8 sequences until a nul byte has been copied, but writes no sequences beyond +.Fa end . +If any sequences are copied, +.Fa to +is terminated with a nul byte and a pointer to that byte is returned. +Otherwise the original +.Fa to +is returned. +.Pp +.Fn utflen +returns the number of runes represented by the UTF-8 string +.Fa s . +.Pp +.Fn utfnlen +returns the number of runes represented by the first +.Fa len +bytes of the UTF-8 string +.Fa s . +If the final sequence is incomplete it will not be counted. +.Pp +.Fn utfrune +.Pq Fn utfrrune +returns a pointer to the first +.Pq last +occurrence of the rune +.Fa r +in the UTF-8 string +.Fa s , +or +.Dv NULL +if there is none. +The terminating nul byte is considered a part of the string +.Fa s . +.Pp +.Fn utfutf +returns a pointer to the first occurrence of the UTF-8 string +.Fa t +as a UTF-8 substring of +.Fa s , +or +.Dv NULL +if there is none. +If +.Fa t +is the null string, +.Fn utfutf +returns +.Fa s . +.Sh CONFORMING TO +These functions are compatible with those defined in the Plan 9 C library, with the exception of +.Fn charntorune , +which is an extension. +However, these functions are much stricter about UTF-8 validity than their Plan 9 counterparts. 
+.Sh SEE ALSO +.Xr isalpharune 3 diff --git a/rune.c b/rune.c @@ -18,13 +18,6 @@ || ((x) >= 0xD800 && (x) <= 0xDFFF) \ || ((x) >= 0xFDD0 && (x) <= 0xFDEF)) -/* - * runetochar copies one rune at p to at most UTFmax bytes starting at s and - * returns the number of bytes copied. UTFmax is the maximum number of bytes - * required to represent a legal rune. - * - * If the rune is illegal, runetochar will return 0. - */ int runetochar(char *s, const Rune *p) { @@ -54,27 +47,12 @@ runetochar(char *s, const Rune *p) } } -/* - * chartorune copies at most UTFmax bytes starting at s to one rune at p and - * returns the number of bytes copied. If the input is not valid UTF-8, - * chartorune will convert the sequence to Runeerror (0xFFFD), returning the - * number of bytes in the invalid sequence. - */ int chartorune(Rune *p, const char *s) { return charntorune(p, s, UTFmax); } -/* - * charntorune copies at most len bytes starting at s to one rune at p and - * returns the number of bytes copied. If the input is not valid UTF-8, - * charntorune will convert the sequence to Runeerror (0xFFFD), returning the - * number of bytes in the invalid sequence. - * - * If a potentially valid sequence is cut off by the len limit, charntorune will - * return 0. - */ int charntorune(Rune *p, const char *s, size_t len) { @@ -116,29 +94,21 @@ charntorune(Rune *p, const char *s, size_t len) return n; } -/* - * runelen returns the number of bytes required to convert r into UTF-8. If the - * rune is illegal, runelen will return 0. - */ int runelen(Rune r) { - if(BADRUNE(r)) - return 0; /* error */ - else if(r <= 0x7F) + if(r <= 0x7F) return 1; else if(r <= 0x07FF) return 2; + else if(BADRUNE(r)) + return 0; /* error */ else if(r <= 0xFFFF) return 3; else return 4; } -/* - * runelen returns the number of bytes required to convert the rune-string of - * length len pointed to by p into UTF-8. 
- */ size_t runenlen(const Rune *p, size_t len) { @@ -149,10 +119,6 @@ runenlen(const Rune *p, size_t len) return n; } -/* - * fullrune returns 1 if the string s of length len is long enough to be - * decoded by chartorune, and 0 otherwise. - */ int fullrune(const char *s, size_t len) { @@ -161,12 +127,6 @@ fullrune(const char *s, size_t len) return charntorune(&r, s, len) > 0; } -/* - * utfecpy copies UTF-8 sequences until a null sequence has been copied, but - * writes no sequences beyond end. If any sequences are copied, the to string is - * terminated by a null sequence, and a pointer to that sequence is returned. - * Otherwise, the original to string is returned. - */ char * utfecpy(char *to, char *end, const char *from) { @@ -183,10 +143,6 @@ utfecpy(char *to, char *end, const char *from) return &to[i]; } -/* - * utflen returns the number of runes that are represented by the UTF-8 string - * s. - */ size_t utflen(const char *s) { @@ -199,12 +155,6 @@ utflen(const char *s) return i; } -/* - * utfnlen returns the number of runes that are represented by the UTF-8 string - * s of length len. If the last few bytes contain an incompletely coded rune, - * utfnlen will not count them; in this way it differs from utflen, which - * includes every byte of the string. - */ size_t utfnlen(const char *s, size_t len) { @@ -218,11 +168,6 @@ utfnlen(const char *s, size_t len) return i; } -/* - * utfrune returns a pointer to the first ocurrence of r in the UTF-8 string s, - * or NULL if r does not occur in s. The null byte terminating a string is - * considered to be part of the string s. - */ char * utfrune(const char *s, Rune r) { @@ -251,11 +196,6 @@ utfrune(const char *s, Rune r) return NULL; } -/* - * utfrrune returns a pointer to the last ocurrence of r in the UTF-8 string s, - * or NULL if r does not occur in s. The null byte terminating a string is - * considered to be part of the string s. 
- */ char * utfrrune(const char *s, Rune r) { @@ -274,11 +214,6 @@ utfrrune(const char *s, Rune r) return (char *)p; } -/* - * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a - * UTF-8 substring of s, or NULL if there is none. If t is the null string, - * utfutf returns s. - */ char * utfutf(const char *s, const char *t) { diff --git a/utf.h b/utf.h @@ -27,10 +27,10 @@ char *utfrrune(const char *, Rune); char *utfutf(const char *, const char *); int isalpharune(Rune); -int isspacerune(Rune); -int isupperrune(Rune); int islowerrune(Rune); +int isspacerune(Rune); int istitlerune(Rune); +int isupperrune(Rune); int isdigitrune(Rune); #endif