commit 74d431464aada2142f77fc661bd6ecc289d780ed
parent c18c526e623588d636ca179efd0ad4ff3d149634
Author: Connor Lane Smith <cls@lubutu.com>
Date: Mon, 21 May 2012 19:00:45 +0100
add manpages rune.3, isalpharune.3
Diffstat:
Makefile | | | 22 | +++++++++++++--------- |
config.mk | | | 2 | ++ |
isalpharune.3 | | | 29 | +++++++++++++++++++++++++++++ |
rune.3 | | | 157 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
rune.c | | | 71 | +++-------------------------------------------------------------------- |
utf.h | | | 4 | ++-- |
6 files changed, 206 insertions(+), 79 deletions(-)
diff --git a/Makefile b/Makefile
@@ -8,8 +8,6 @@ OBJ = $(SRC:.c=.o)
LIB = libutf.a
INC = utf.h
-UCD = UnicodeData-6.1.0.txt
-
all: $(LIB) utftest
$(LIB): $(OBJ)
@@ -21,18 +19,24 @@ utftest: utftest.o $(LIB)
.c.o:
$(CC) $(CFLAGS) -c $<
-runetypebody.h: mkrunetype.awk $(UCD)
- $(AWK) -f mkrunetype.awk $(UCD) > $@
+runetypebody.h: mkrunetype.awk UnicodeData-$(UNICODE).txt
+ $(AWK) -f mkrunetype.awk UnicodeData-$(UNICODE).txt > $@
-install: $(LIB)
- mkdir -p $(DESTDIR)$(PREFIX)/lib
- cp $(LIB) $(DESTDIR)$(PREFIX)/lib/$(LIB)
- mkdir -p $(DESTDIR)$(PREFIX)/include
- cp $(INC) $(DESTDIR)$(PREFIX)/include/$(INC)
+install: $(LIB) $(INC) $(MAN)
+ @echo @ install libutf
+ @mkdir -p $(DESTDIR)$(PREFIX)/lib
+ @cp $(LIB) $(DESTDIR)$(PREFIX)/lib/$(LIB)
+ @mkdir -p $(DESTDIR)$(PREFIX)/include
+ @cp $(INC) $(DESTDIR)$(PREFIX)/include/$(INC)
+ @mkdir -p $(DESTDIR)$(PREFIX)/share/man/man3
+ @cp rune.3 $(DESTDIR)$(PREFIX)/share/man/man3/rune.3
+ @sed 's/$$UNICODE/$(UNICODE)/g' isalpharune.3 > $(DESTDIR)$(PREFIX)/share/man/man3/isalpharune.3
uninstall:
rm -f $(DESTDIR)$(PREFIX)/lib/$(LIB)
rm -f $(DESTDIR)$(PREFIX)/include/$(INC)
+ rm -f $(DESTDIR)$(PREFIX)/share/man/man3/rune.3
+ rm -f $(DESTDIR)$(PREFIX)/share/man/man3/isalpharune.3
clean:
rm -f $(LIB) utftest utftest.o $(OBJ)
diff --git a/config.mk b/config.mk
@@ -2,6 +2,8 @@
PREFIX = /usr/local
+UNICODE = 6.1.0
+
CFLAGS = -ansi -pedantic -Os -Wall -Wextra
LDFLAGS = -s
diff --git a/isalpharune.3 b/isalpharune.3
@@ -0,0 +1,29 @@
+.Dd $Mdocdate$
+.Dt ISALPHARUNE 3
+.Os
+.Sh NAME
+.Nm isalpharune, islowerrune, isspacerune, istitlerune, isupperrune, isdigitrune
+.Nd Unicode rune classification
+.Sh SYNOPSIS
+.Ft int
+.Fn isalpharune "Rune r"
+.Ft int
+.Fn islowerrune "Rune r"
+.Ft int
+.Fn isspacerune "Rune r"
+.Ft int
+.Fn istitlerune "Rune r"
+.Ft int
+.Fn isupperrune "Rune r"
+.Ft int
+.Fn isdigitrune "Rune r"
+.Sh DESCRIPTION
+These functions classify Unicode runes according to their properties defined in the Unicode standard, analogously to
+.Xr ctype 3
+for ASCII.
+.Sh CONFORMING TO
+These functions are compatible with those defined in the Plan 9 C library, but are generated automatically from the Unicode $UNICODE Character Database, so classifications may differ.
+.Sh SEE ALSO
+.Xr ctype 3 ,
+.Xr rune 3 ,
+The Unicode $UNICODE Standard
diff --git a/rune.3 b/rune.3
@@ -0,0 +1,157 @@
+.Dd $Mdocdate$
+.Dt RUNE 3
+.Os
+.Sh NAME
+.Nm runetochar, chartorune, charntorune, runelen, runenlen, fullrune, utfecpy, utflen, utfnlen, utfrune, utfrrune, utfutf
+.Nd UTF-8 rune conversion
+.Sh SYNOPSIS
+.In utf.h
+.Ft int
+.Fn runetochar "char *s" "Rune *p"
+.Ft int
+.Fn chartorune "Rune *p" "char *s"
+.Ft int
+.Fn charntorune "Rune *p" "char *s" "size_t len"
+.Ft int
+.Fn runelen "Rune r"
+.Ft size_t
+.Fn runenlen "Rune *p" "size_t len"
+.Ft int
+.Fn fullrune "char *s" "size_t len"
+.Ft char *
+.Fn utfecpy "char *to" "char *end" "char *from"
+.Ft size_t
+.Fn utflen "char *s"
+.Ft size_t
+.Fn utfnlen "char *s" "size_t len"
+.Ft char *
+.Fn utfrune "char *s" "Rune r"
+.Ft char *
+.Fn utfrrune "char *s" "Rune r"
+.Ft char *
+.Fn utfutf "char *s" "char *t"
+.Sh DESCRIPTION
+The following functions convert to and from a UTF-8 byte stream and Unicode runes.
+.Pp
+.Fn runetochar
+converts one rune at
+.Fa p
+to at most
+.Dv UTFmax
+bytes starting at
+.Fa s ,
+and returns the number of bytes copied.
+.Dv UTFmax
+is the maximum number of bytes required to represent a rune.
+If the rune is illegal,
+.Fn runetochar
+will return 0.
+.Pp
+.Fn chartorune
+converts at most
+.Dv UTFmax
+bytes starting at
+.Fa s
+to one rune at
+.Fa p ,
+and returns the number of bytes copied.
+If the input is invalid UTF-8,
+.Fn chartorune
+will convert the sequence to
+.Dv Runeerror
+(0xFFFD) and return the number of bytes in the invalid sequence.
+.Pp
+.Fn charntorune
+converts at most
+.Fa len
+bytes starting at
+.Fa s
+to one rune at
+.Fa p ,
+and returns the number of bytes copied.
+If the next sequence is longer than
+.Fa len
+bytes,
+.Fn charntorune
+will return 0.
+.Pp
+.Fn runelen
+returns the number of bytes required to convert the rune
+.Fa r
+into UTF-8.
+If the rune is illegal,
+.Fn runelen
+will return 0.
+.Pp
+.Fn runenlen
+returns the number of bytes required to convert the
+.Fa len
+runes pointed to by
+.Fa p
+into UTF-8.
+.Pp
+.Fn fullrune
+returns 1 if the first
+.Fa len
+bytes of the UTF-8 string
+.Fa s
+form a complete rune, and 0 otherwise.
+.Pp
+The following functions are analogous to the corresponding string routines, with `utf' substituted for `str', and `rune' for `chr'.
+.Pp
+.Fn utfecpy
+copies UTF-8 sequences until a nul byte has been copied, but writes no sequences beyond
+.Fa end .
+If any sequences are copied,
+.Fa to
+is terminated with a nul byte and a pointer to that byte is returned.
+Otherwise the original
+.Fa to
+is returned.
+.Pp
+.Fn utflen
+returns the number of runes represented by the UTF-8 string
+.Fa s .
+.Pp
+.Fn utfnlen
+returns the number of runes represented by the first
+.Fa len
+bytes of the UTF-8 string
+.Fa s .
+If the final sequence is incomplete it will not be counted.
+.Pp
+.Fn utfrune
+.Pq Fn utfrrune
+returns a pointer to the first
+.Pq last
+occurrence of the rune
+.Fa r
+in the UTF-8 string
+.Fa s ,
+or
+.Dv NULL
+if there is none.
+The terminating nul byte is considered a part of the string
+.Fa s .
+.Pp
+.Fn utfutf
+returns a pointer to the first occurrence of the UTF-8 string
+.Fa t
+as a UTF-8 substring of
+.Fa s ,
+or
+.Dv NULL
+if there is none.
+If
+.Fa t
+is the null string,
+.Fn utfutf
+returns
+.Fa s .
+.Sh CONFORMING TO
+These functions are compatible with those defined in the Plan 9 C library, with the exception of
+.Fn charntorune ,
+which is an extension.
+However, these functions are much stricter about UTF-8 validity than their Plan 9 counterparts.
+.Sh SEE ALSO
+.Xr isalpharune 3
diff --git a/rune.c b/rune.c
@@ -18,13 +18,6 @@
|| ((x) >= 0xD800 && (x) <= 0xDFFF) \
|| ((x) >= 0xFDD0 && (x) <= 0xFDEF))
-/*
- * runetochar copies one rune at p to at most UTFmax bytes starting at s and
- * returns the number of bytes copied. UTFmax is the maximum number of bytes
- * required to represent a legal rune.
- *
- * If the rune is illegal, runetochar will return 0.
- */
int
runetochar(char *s, const Rune *p)
{
@@ -54,27 +47,12 @@ runetochar(char *s, const Rune *p)
}
}
-/*
- * chartorune copies at most UTFmax bytes starting at s to one rune at p and
- * returns the number of bytes copied. If the input is not valid UTF-8,
- * chartorune will convert the sequence to Runeerror (0xFFFD), returning the
- * number of bytes in the invalid sequence.
- */
int
chartorune(Rune *p, const char *s)
{
return charntorune(p, s, UTFmax);
}
-/*
- * charntorune copies at most len bytes starting at s to one rune at p and
- * returns the number of bytes copied. If the input is not valid UTF-8,
- * charntorune will convert the sequence to Runeerror (0xFFFD), returning the
- * number of bytes in the invalid sequence.
- *
- * If a potentially valid sequence is cut off by the len limit, charntorune will
- * return 0.
- */
int
charntorune(Rune *p, const char *s, size_t len)
{
@@ -116,29 +94,21 @@ charntorune(Rune *p, const char *s, size_t len)
return n;
}
-/*
- * runelen returns the number of bytes required to convert r into UTF-8. If the
- * rune is illegal, runelen will return 0.
- */
int
runelen(Rune r)
{
- if(BADRUNE(r))
- return 0; /* error */
- else if(r <= 0x7F)
+ if(r <= 0x7F)
return 1;
else if(r <= 0x07FF)
return 2;
+ else if(BADRUNE(r))
+ return 0; /* error */
else if(r <= 0xFFFF)
return 3;
else
return 4;
}
-/*
- * runelen returns the number of bytes required to convert the rune-string of
- * length len pointed to by p into UTF-8.
- */
size_t
runenlen(const Rune *p, size_t len)
{
@@ -149,10 +119,6 @@ runenlen(const Rune *p, size_t len)
return n;
}
-/*
- * fullrune returns 1 if the string s of length len is long enough to be
- * decoded by chartorune, and 0 otherwise.
- */
int
fullrune(const char *s, size_t len)
{
@@ -161,12 +127,6 @@ fullrune(const char *s, size_t len)
return charntorune(&r, s, len) > 0;
}
-/*
- * utfecpy copies UTF-8 sequences until a null sequence has been copied, but
- * writes no sequences beyond end. If any sequences are copied, the to string is
- * terminated by a null sequence, and a pointer to that sequence is returned.
- * Otherwise, the original to string is returned.
- */
char *
utfecpy(char *to, char *end, const char *from)
{
@@ -183,10 +143,6 @@ utfecpy(char *to, char *end, const char *from)
return &to[i];
}
-/*
- * utflen returns the number of runes that are represented by the UTF-8 string
- * s.
- */
size_t
utflen(const char *s)
{
@@ -199,12 +155,6 @@ utflen(const char *s)
return i;
}
-/*
- * utfnlen returns the number of runes that are represented by the UTF-8 string
- * s of length len. If the last few bytes contain an incompletely coded rune,
- * utfnlen will not count them; in this way it differs from utflen, which
- * includes every byte of the string.
- */
size_t
utfnlen(const char *s, size_t len)
{
@@ -218,11 +168,6 @@ utfnlen(const char *s, size_t len)
return i;
}
-/*
- * utfrune returns a pointer to the first ocurrence of r in the UTF-8 string s,
- * or NULL if r does not occur in s. The null byte terminating a string is
- * considered to be part of the string s.
- */
char *
utfrune(const char *s, Rune r)
{
@@ -251,11 +196,6 @@ utfrune(const char *s, Rune r)
return NULL;
}
-/*
- * utfrrune returns a pointer to the last ocurrence of r in the UTF-8 string s,
- * or NULL if r does not occur in s. The null byte terminating a string is
- * considered to be part of the string s.
- */
char *
utfrrune(const char *s, Rune r)
{
@@ -274,11 +214,6 @@ utfrrune(const char *s, Rune r)
return (char *)p;
}
-/*
- * utfutf returns a pointer to the first occurrence of the UTF-8 string t as a
- * UTF-8 substring of s, or NULL if there is none. If t is the null string,
- * utfutf returns s.
- */
char *
utfutf(const char *s, const char *t)
{
diff --git a/utf.h b/utf.h
@@ -27,10 +27,10 @@ char *utfrrune(const char *, Rune);
char *utfutf(const char *, const char *);
int isalpharune(Rune);
-int isspacerune(Rune);
-int isupperrune(Rune);
int islowerrune(Rune);
+int isspacerune(Rune);
int istitlerune(Rune);
+int isupperrune(Rune);
int isdigitrune(Rune);
#endif