--- /n/sources/plan9/sys/include/ape/utf.h Tue Dec 2 01:09:19 2003 +++ /sys/include/ape/utf.h Sat Jan 19 00:00:00 2013 @@ -14,7 +14,8 @@ UTFmax = 3, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* --- /n/sources/plan9/sys/include/libc.h Fri Jun 29 18:22:31 2012 +++ /sys/include/libc.h Sat Jan 19 00:00:00 2013 @@ -45,6 +45,7 @@ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* --- /n/sources/plan9/sys/src/9/port/lib.h Mon Oct 4 22:16:43 2010 +++ /sys/src/9/port/lib.h Sat Jan 19 00:00:00 2013 @@ -36,9 +36,10 @@ enum { UTFmax = 3, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0x80, /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0xFFFF, /* 16 bit rune */ }; /* --- /n/sources/plan9/sys/src/ape/lib/ap/gen/mbwc.c Thu Feb 28 18:46:27 2002 +++ /sys/src/ape/lib/ap/gen/mbwc.c Sat Jan 19 00:00:00 2013 @@ -1,4 +1,5 @@ #include +#include /* * Use the FSS-UTF transformation proposed by posix. @@ -7,12 +8,14 @@ * Tx 10xxxxxx 6 free bits * T1 110xxxxx 5 free bits * T2 1110xxxx 4 free bits + * T3 11110xxx 3 free bits * * Encoding is as follows. 
* From hex Thru hex Sequence Bits * 00000000 0000007F T0 7 * 00000080 000007FF T1 Tx 11 * 00000800 0000FFFF T2 Tx Tx 16 + * 00010000 0010FFFF T3 Tx Tx Tx 21 */ int @@ -25,7 +28,7 @@ int mbtowc(wchar_t *pwc, const char *s, size_t n) { - int c, c1, c2; + int c, c1, c2, c3; long l; if(!s) @@ -70,6 +73,24 @@ return 3; } + if(n < 4) + goto bad; + if(UTFmax >= 4) { + c3 = (s[3] ^ 0x80) & 0xff; /* clear the 10xxxxxx tag; anything left in the top two bits is not a continuation byte */ + if(c3 & 0xC0) + goto bad; + if(c < 0xf8) { + l = ((((((c << 6) | c1) << 6) | c2) << 6) | c3) & 0x3fffff; + if(l < 0x10000) /* overlong form; 0x10000 itself is the first valid 4-byte value */ + goto bad; + if(l > Runemax) + goto bad; + if(pwc) + *pwc = l; + return 4; + } + } + /* * bad decoding */ @@ -86,7 +107,9 @@ if(!s) return 0; - c = wchar & 0xFFFF; + c = wchar; + if(c > Runemax) + c = Runeerror; if(c < 0x80) { s[0] = c; return 1; @@ -98,10 +121,18 @@ return 2; } - s[0] = 0xE0 | (c >> 12); - s[1] = 0x80 | ((c >> 6) & 0x3F); - s[2] = 0x80 | (c & 0x3F); - return 3; + if(c < 0x10000){ + s[0] = 0xE0 | (c >> 12); + s[1] = 0x80 | ((c >> 6) & 0x3F); + s[2] = 0x80 | (c & 0x3F); + return 3; + } + + s[0] = 0xf0 | (c >> 18); + s[1] = 0x80 | ((c >> 12) & 0x3F); + s[2] = 0x80 | ((c >> 6) & 0x3F); + s[3] = 0x80 | (c & 0x3F); + return 4; } size_t @@ -117,7 +148,7 @@ break; s++; } else { - d = mbtowc(pwcs, s, 3); + d = mbtowc(pwcs, s, UTFmax); if(d <= 0) return (size_t)((d<0) ?
-1 : i); s += d; @@ -133,10 +164,10 @@ int i, d; long c; char *p, *pe; - char buf[3]; + char buf[UTFmax]; p = s; - pe = p+n-3; + pe = p+n-UTFmax; while(p < pe) { c = *pwcs++; if(c < 0x80) @@ -146,16 +177,13 @@ if(c == 0) return p-s; } - while(p < pe+3) { + while(p < pe+UTFmax) { c = *pwcs++; d = wctomb(buf, c); - if(p+d <= pe+3) { - *p++ = buf[0]; - if(d > 1) { - *p++ = buf[2]; - if(d > 2) - *p++ = buf[3]; - } + if(p+d <= pe+UTFmax) { + for(i = 0; i < d; i++) + p[i] = buf[i]; + p += d; } if(c == 0) break; --- /n/sources/plan9/sys/src/ape/lib/utf/rune.c Tue Dec 2 01:04:50 2003 +++ /sys/src/ape/lib/utf/rune.c Sat Jan 19 00:00:00 2013 @@ -23,16 +23,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* + * bad decoding */ bad: @@ -113,7 +135,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -123,12 +145,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx +
*/ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -155,7 +191,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -165,13 +204,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /n/sources/plan9/sys/src/cmd/acme/regx.c Sat Jan 12 21:22:05 2008 +++ /sys/src/cmd/acme/regx.c Sat Jan 19 00:00:00 2013 @@ -487,7 +487,7 @@ exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -509,7 +509,7 @@ p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; --- /n/sources/plan9/sys/src/cmd/cc/cc.h Mon Mar 4 21:15:21 2013 +++ /sys/src/cmd/cc/cc.h Tue Apr 2 00:00:00 2013 @@ -51,7 +51,7 @@ double fconst; /* fp constant */ vlong vconst; /* non fp const */ char* cstring; /* character string */ - ushort* rstring; /* rune string */ + Rune* rstring; /* rune string */ Sym* sym; Type* type; @@ -336,6 +336,8 @@ TFILE, TOLD, NALLTYPES, + + TRUNE = sizeof(Rune)==4? 
TUINT: TUSHORT, }; enum { @@ -740,7 +742,7 @@ void gextern(Sym*, Node*, long, long); void ginit(void); long outstring(char*, long); -long outlstring(ushort*, long); +long outlstring(Rune*, long); void xcom(Node*); long exreg(Type*); long align(long, Type*, int); --- /n/sources/plan9/sys/src/cmd/cc/cc.y Wed Nov 16 23:34:38 2011 +++ /sys/src/cmd/cc/cc.y Sat Jan 19 00:00:00 2013 @@ -855,9 +855,9 @@ LLSTRING { $$ = new(OLSTRING, Z, Z); - $$->type = typ(TARRAY, types[TUSHORT]); - $$->type->width = $1.l + sizeof(ushort); - $$->rstring = (ushort*)$1.s; + $$->type = typ(TARRAY, types[TRUNE]); + $$->type->width = $1.l + sizeof(Rune); + $$->rstring = (Rune*)$1.s; $$->sym = symstring; $$->etype = TARRAY; $$->class = CSTATIC; @@ -867,16 +867,16 @@ char *s; int n; - n = $1->type->width - sizeof(ushort); + n = $1->type->width - sizeof(Rune); s = alloc(n+$2.l+MAXALIGN); memcpy(s, $1->rstring, n); memcpy(s+n, $2.s, $2.l); - *(ushort*)(s+n+$2.l) = 0; + *(Rune*)(s+n+$2.l) = 0; $$ = $1; $$->type->width += $2.l; - $$->rstring = (ushort*)s; + $$->rstring = (Rune*)s; } zelist: --- /n/sources/plan9/sys/src/cmd/cc/com.c Wed Nov 16 23:39:03 2011 +++ /sys/src/cmd/cc/com.c Sat Jan 19 00:00:00 2013 @@ -633,10 +633,12 @@ break; case OLSTRING: - if(n->type->link != types[TUSHORT]) { + if(n->type->link != types[TRUNE]) { o = outstring(0, 0); while(o & 3) { - outlstring(L"", sizeof(ushort)); + /* outlstring(L"", sizeof(Rune)); */ + Rune str[1] = {0}; + outlstring(str, sizeof(Rune)); o = outlstring(0, 0); } } --- /n/sources/plan9/sys/src/cmd/cc/lex.c Wed Oct 3 15:56:45 2012 +++ /sys/src/cmd/cc/lex.c Sat Jan 19 00:00:00 2013 @@ -465,7 +465,7 @@ yyerror("missing '"); peekc = c1; } - yylval.vval = convvtox(c, TUSHORT); + yylval.vval = convvtox(c, TRUNE); return LUCONST; } if(c == '"') { @@ -539,15 +539,15 @@ c = escchar('"', 1, 0); if(c == EOF) break; - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = c; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = c; 
+ c1 += sizeof(Rune); } yylval.sval.l = c1; do { - cp = allocn(cp, c1, sizeof(ushort)); - *(ushort*)(cp + c1) = 0; - c1 += sizeof(ushort); + cp = allocn(cp, c1, sizeof(Rune)); + *(Rune*)(cp + c1) = 0; + c1 += sizeof(Rune); } while(c1 & MAXALIGN); yylval.sval.s = cp; return LLSTRING; @@ -1025,7 +1025,7 @@ } else c = GETC(); for(;;) { - if(!isspace(c)) + if(c >= Runeself || !isspace(c)) return c; if(c == '\n') { lineno++; --- /n/sources/plan9/sys/src/cmd/cc/pswt.c Thu Mar 8 05:37:02 2012 +++ /sys/src/cmd/cc/pswt.c Sat Jan 19 00:00:00 2013 @@ -132,28 +132,28 @@ } long -outlstring(ushort *s, long n) +outlstring(Rune *s, long n) { - char buf[2]; - int c; + char buf[sizeof(Rune)]; + int c, i; long r; if(suppress) return nstring; - while(nstring & 1) + while(nstring & (sizeof(Rune)-1)) outstring("", 1); r = nstring; while(n > 0) { c = *s++; if(align(0, types[TCHAR], Aarg1)) { - buf[0] = c>>8; - buf[1] = c; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*(sizeof(Rune) - i - 1); } else { - buf[0] = c; - buf[1] = c>>8; + for(i = 0; i < sizeof(Rune); i++) + buf[i] = c>>8*i; } - outstring(buf, 2); - n -= sizeof(ushort); + outstring(buf, sizeof(Rune)); + n -= sizeof(Rune); } return r; } --- /n/sources/plan9/sys/src/cmd/ed.c Fri Oct 15 20:53:45 2010 +++ /sys/src/cmd/ed.c Sat Jan 19 00:00:00 2013 @@ -54,7 +54,7 @@ int peekc; int pflag; int rescuing; -Rune rhsbuf[LBSIZE/2]; +Rune rhsbuf[LBSIZE/sizeof(Rune)]; char savedfile[FNSIZE]; jmp_buf savej; int subnewa; @@ -1010,7 +1010,7 @@ tl = tline; bp = getblock(tl, OWRITE); nl = nleft; - tl &= ~((BLKSIZE/2)-1); + tl &= ~((BLKSIZE/sizeof(Rune))-1); while(*bp = *lp++) { if(*bp++ == '\n') { bp[-1] = 0; @@ -1019,7 +1019,7 @@ } nl -= sizeof(Rune); if(nl == 0) { - tl += BLKSIZE/2; + tl += BLKSIZE/sizeof(Rune); bp = getblock(tl, OWRITE); nl = nleft; } @@ -1046,8 +1046,8 @@ static uchar ibuff[BLKSIZE]; static uchar obuff[BLKSIZE]; - bno = atl / (BLKSIZE/2); - off = (atl<<1) & (BLKSIZE-1) & ~03; + bno = atl / (BLKSIZE/sizeof(Rune)); + off = 
(atl*sizeof(Rune)) & (BLKSIZE-1) & ~03; if(bno >= NBLK) { lastc = '\n'; error(T); @@ -1238,7 +1238,7 @@ if(c == '\\') { c = getchr(); *p++ = ESCFLG; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[LBSIZE/sizeof(Rune)]) error(Q); } else if(c == '\n' && (!globp || !globp[0])) { @@ -1249,7 +1249,7 @@ if(c == seof) break; *p++ = c; - if(p >= &rhsbuf[LBSIZE/2]) + if(p >= &rhsbuf[LBSIZE/sizeof(Rune)]) error(Q); } *p = 0; --- /n/sources/plan9/sys/src/cmd/sam/cmd.c Sun Nov 20 01:09:35 2005 +++ /sys/src/cmd/sam/cmd.c Sat Jan 19 00:00:00 2013 @@ -71,7 +71,7 @@ inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: --- /n/sources/plan9/sys/src/cmd/sam/regexp.c Sat Jan 12 21:20:47 2008 +++ /sys/src/cmd/sam/regexp.c Sat Jan 19 00:00:00 2013 @@ -494,7 +494,7 @@ exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -516,7 +516,7 @@ p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; --- /n/sources/plan9/sys/src/cmd/sed.c Thu Feb 5 04:24:16 2009 +++ /sys/src/cmd/sed.c Sat Jan 19 00:00:00 2013 @@ -623,7 +623,7 @@ while ((r = *cp++) != '\0') { if(r == '\\') { if (rhs < end) - *rhs++ = 0xFFFF; + *rhs++ = Runemax; else return 0; r = *cp++; @@ -1050,7 +1050,7 @@ sp = place(sp, loc1, loc2); continue; } - if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB + '0') { + if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB + '0') { n = c-'0'; if (subexp[n].rsp && subexp[n].rep) { sp = place(sp, subexp[n].rsp, subexp[n].rep); --- /n/sources/plan9/sys/src/cmd/tr.c Thu Feb 19 16:33:23 2009 +++ /sys/src/cmd/tr.c Sat Jan 19 00:00:00 2013 @@ -15,7 +15,7 @@ #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE 0xFFFF +#define MAXRUNE Runemax uchar f[(MAXRUNE+1)/8]; uchar t[(MAXRUNE+1)/8]; --- /n/sources/plan9/sys/src/cmd/unicode.c Sun Dec 12 
01:15:51 1999 +++ /sys/src/cmd/unicode.c Sat Jan 19 00:00:00 2013 @@ -51,13 +51,13 @@ return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || maxRunemax || max0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) --- /n/sources/plan9/sys/src/cmd/unix/drawterm/libc/rune.c Thu Dec 29 23:56:08 2005 +++ /sys/src/cmd/unix/drawterm/libc/rune.c Sat Jan 19 00:00:00 2013 @@ -8,16 +8,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] =
T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +176,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +189,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /n/sources/plan9/sys/src/cmd/unix/u9fs/rune.c Sat Mar 2 18:05:53 2002 +++ /sys/src/cmd/unix/u9fs/rune.c Sat Jan 19 00:00:00 2013 @@ -8,27 +8,30 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) &
Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -127,22 +163,40 @@ } int -utflen(char *s) +runenlen(Rune *r, int nrune) { - int c; - long n; - Rune rune; + int nb, c; - n = 0; - for(;;) { - c = *(uchar*)s; - if(c < Runeself) { - if(c == 0) - return n; - s++; - } else - s += chartorune(&rune, s); - n++; + nb = 0; + while(nrune--) { + c = *r++; + if(c <= Rune1) + nb++; + else + if(c <= Rune2) + nb += 2; + else + if(c <= Rune3 || c > Runemax) + nb += 3; + else + nb += 4; } - return 0; + return nb; +} + +int +fullrune(char *str, int n) +{ + int c; + + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /n/sources/plan9/sys/src/libbio/bgetrune.c Sun Dec 12 01:21:47 1999 +++ /sys/src/libbio/bgetrune.c Sat Jan 19 00:00:00 2013 @@ -7,7 +7,7 @@ { int c, i; Rune rune; - char str[4]; + char str[UTFmax]; c = Bgetc(bp); if(c < Runeself) { /* one char */ --- /n/sources/plan9/sys/src/libbio/bputrune.c Sun Dec 12 01:21:47 1999 +++ /sys/src/libbio/bputrune.c Sat Jan 19 00:00:00 2013 @@ -6,7 +6,7 @@ Bputrune(Biobufhdr *bp, long c) { 
Rune rune; - char str[4]; + char str[UTFmax]; int n; rune = c; --- /n/sources/plan9/sys/src/libc/port/rune.c Thu Feb 28 20:16:42 2002 +++ /sys/src/libc/port/rune.c Sat Jan 19 00:00:00 2013 @@ -8,16 +8,19 @@ Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -98,7 +120,7 @@ /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -108,12 +130,26 @@ /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx + */ + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 
1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -140,7 +176,10 @@ if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -150,13 +189,14 @@ { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } --- /n/sources/plan9/sys/src/libdraw/buildfont.c Mon Jul 25 21:19:18 2011 +++ /sys/src/libdraw/buildfont.c Sat Jan 19 00:00:00 2013 @@ -70,7 +70,7 @@ } max = strtol(s, &s, 0); s = skip(s); - if(*s==0 || min>=65536 || max>=65536 || min>max){ + if(*s==0 || min>Runemax || max>Runemax || min>max){ werrstr("illegal subfont range"); Err3: freefont(fnt); --- /n/sources/plan9/sys/src/libdraw/event.c Fri May 9 22:00:25 2008 +++ /sys/src/libdraw/event.c Sat Jan 19 00:00:00 2013 @@ -4,6 +4,10 @@ #include #include +enum { + Kbdmsgsz = 1 + 4 /* allow for 32-bit runes */ +}; + typedef struct Slave Slave; typedef struct Ebuf Ebuf; @@ -199,7 +203,7 @@ ekeyslave(int fd) { Rune r; - char t[3], k[10]; + char t[Kbdmsgsz], k[10]; int kr, kn, w; if(eforkslave(Ekeyboard) < MAXSLAVE) @@ -218,7 +222,9 @@ memmove(k, &k[w], kn); t[1] = r; t[2] = r>>8; - if(write(epipe[1], t, 3) != 3) + t[3] = r>>16; + t[4] = r>>24; + if(write(epipe[1], t, sizeof t) != sizeof t) break; } breakout:; @@ -302,7 +308,7 @@ s->head = (Ebuf *)1; return; } - if(i == Skeyboard && n != 3) + if(i == Skeyboard && n != Kbdmsgsz) drawerror(display, "events: protocol error: keyboard"); if(i == Smouse){ if(n < 1+1+2*12) @@ -417,13 +423,15 @@ int ekbd(void) { - Ebuf *eb; + uchar *t; int c; + Ebuf *eb; if(Skeyboard < 0) drawerror(display, "events: keyboard not initialzed"); eb = ebread(&eslave[Skeyboard]); - c = eb->buf[0] + (eb->buf[1]<<8); + t = eb->buf; + c = t[0] | t[1]<<8 | t[2]<<16 | t[3]<<24; free(eb); return c; } --- 
/n/sources/plan9/sys/src/cmd/freq.c Tue Jan 19 21:57:26 2010 +++ /sys/src/cmd/freq.c Sat Jan 19 00:00:00 2013 @@ -2,7 +2,7 @@ #include #include -uvlong count[1<<16]; +uvlong count[Runemax+1]; Biobuf bout; void usage(void);