--- /dev/null +++ /sys/src/9/port64/alloc.c @@ -0,0 +1,315 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" +#include + +static void poolprint(Pool*, char*, ...); +static void ppanic(Pool*, char*, ...); +static void plock(Pool*); +static void punlock(Pool*); + +typedef struct Private Private; +struct Private { + Lock lk; + char msg[256]; /* a rock for messages to be printed at unlock */ +}; + +static Private pmainpriv; +static Pool pmainmem = { + .name= "Main", + .maxsize= 4*1024*1024, + .minarena= 128*1024, + .quantum= 32, + .alloc= xalloc, + .merge= xmerge, + .flags= POOL_TOLERANCE, + + .lock= plock, + .unlock= punlock, + .print= poolprint, + .panic= ppanic, + + .private= &pmainpriv, +}; + +static Private pimagpriv; +static Pool pimagmem = { + .name= "Image", + .maxsize= 16*1024*1024, + .minarena= 2*1024*1024, + .quantum= 32, + .alloc= xalloc, + .merge= xmerge, + .flags= 0, + + .lock= plock, + .unlock= punlock, + .print= poolprint, + .panic= ppanic, + + .private= &pimagpriv, +}; + +Pool* mainmem = &pmainmem; +Pool* imagmem = &pimagmem; + +/* + * because we can't print while we're holding the locks, + * we have the save the message and print it once we let go. + */ +static void +poolprint(Pool *p, char *fmt, ...) +{ + va_list v; + Private *pv; + + pv = p->private; + va_start(v, fmt); + vseprint(pv->msg+strlen(pv->msg), pv->msg+sizeof pv->msg, fmt, v); + va_end(v); +} + +static void +ppanic(Pool *p, char *fmt, ...) 
+{ + va_list v; + Private *pv; + char msg[sizeof pv->msg]; + + pv = p->private; + va_start(v, fmt); + vseprint(pv->msg+strlen(pv->msg), pv->msg+sizeof pv->msg, fmt, v); + va_end(v); + memmove(msg, pv->msg, sizeof msg); + iunlock(&pv->lk); + panic("%s", msg); +} + +static void +plock(Pool *p) +{ + Private *pv; + + pv = p->private; + ilock(&pv->lk); + pv->lk.pc = getcallerpc(&p); + pv->msg[0] = 0; +} + +static void +punlock(Pool *p) +{ + Private *pv; + char msg[sizeof pv->msg]; + + pv = p->private; + if(pv->msg[0] == 0){ + iunlock(&pv->lk); + return; + } + + memmove(msg, pv->msg, sizeof msg); + iunlock(&pv->lk); + iprint("%.*s", sizeof pv->msg, msg); +} + +void +poolsummary(Pool *p) +{ + print("%s max %lud cur %lud free %lud alloc %lud\n", p->name, + p->maxsize, p->cursize, p->curfree, p->curalloc); +} + +void +mallocsummary(void) +{ + poolsummary(mainmem); + poolsummary(imagmem); +} + +/* everything from here down should be the same in libc, libdebugmalloc, and the kernel */ +/* - except the code for malloc(), which alternately doesn't clear or does. */ +/* - except the code for smalloc(), which lives only in the kernel. */ + +/* + * Npadlong is the number of 32-bit longs to leave at the beginning of + * each allocated buffer for our own bookkeeping. We return to the callers + * a pointer that points immediately after our bookkeeping area. Incoming pointers + * must be decremented by that much, and outgoing pointers incremented. + * The malloc tag is stored at MallocOffset from the beginning of the block, + * and the realloc tag at ReallocOffset. The offsets are from the true beginning + * of the block, not the beginning the caller sees. + * + * The extra if(Npadlong != 0) in various places is a hint for the compiler to + * compile out function calls that would otherwise be no-ops. 
+ */ + +/* non tracing + * +enum { + Npadlong = 0, + MallocOffset = 0, + ReallocOffset = 0, +}; + * + */ + +/* tracing */ +enum { + Npadlong = 2, + MallocOffset = 0, + ReallocOffset = 1 +}; + + +void* +smalloc(ulong size) +{ + void *v; + + for(;;) { + v = poolalloc(mainmem, size+Npadlong*sizeof(ulong)); + if(v != nil) + break; + tsleep(&up->sleep, return0, 0, 100); + } + if(Npadlong){ + v = (ulong*)v+Npadlong; + setmalloctag(v, getcallerpc(&size)); + } + memset(v, 0, size); + return v; +} + +void* +malloc(ulong size) +{ + void *v; + + v = poolalloc(mainmem, size+Npadlong*sizeof(ulong)); + if(v == nil) + return nil; + if(Npadlong){ + v = (ulong*)v+Npadlong; + setmalloctag(v, getcallerpc(&size)); + setrealloctag(v, 0); + } + memset(v, 0, size); + return v; +} + +void* +mallocz(ulong size, int clr) +{ + void *v; + + v = poolalloc(mainmem, size+Npadlong*sizeof(ulong)); + if(Npadlong && v != nil){ + v = (ulong*)v+Npadlong; + setmalloctag(v, getcallerpc(&size)); + setrealloctag(v, 0); + } + if(clr && v != nil) + memset(v, 0, size); + return v; +} + +void* +mallocalign(ulong size, ulong align, long offset, ulong span) +{ + void *v; + + v = poolallocalign(mainmem, size+Npadlong*sizeof(ulong), align, offset-Npadlong*sizeof(ulong), span); + if(Npadlong && v != nil){ + v = (ulong*)v+Npadlong; + setmalloctag(v, getcallerpc(&size)); + setrealloctag(v, 0); + } + if(v) + memset(v, 0, size); + return v; +} + +void +free(void *v) +{ + if(v != nil) + poolfree(mainmem, (ulong*)v-Npadlong); +} + +void* +realloc(void *v, ulong size) +{ + void *nv; + + if(v != nil) + v = (ulong*)v-Npadlong; + if(Npadlong !=0 && size != 0) + size += Npadlong*sizeof(ulong); + + if(nv = poolrealloc(mainmem, v, size)){ + nv = (ulong*)nv+Npadlong; + setrealloctag(nv, getcallerpc(&v)); + if(v == nil) + setmalloctag(nv, getcallerpc(&v)); + } + return nv; +} + +ulong +msize(void *v) +{ + return poolmsize(mainmem, (ulong*)v-Npadlong)-Npadlong*sizeof(ulong); +} + +void* +calloc(ulong n, ulong szelem) +{ + void 
*v; + if(v = mallocz(n*szelem, 1)) + setmalloctag(v, getcallerpc(&n)); + return v; +} + +void +setmalloctag(void *v, uintptr pc) +{ + ulong *u; + USED(v, pc); + if(Npadlong <= MallocOffset || v == nil) + return; + u = v; + u[-Npadlong+MallocOffset] = (ulong)pc; +} + +void +setrealloctag(void *v, uintptr pc) +{ + ulong *u; + USED(v, pc); + if(Npadlong <= ReallocOffset || v == nil) + return; + u = v; + u[-Npadlong+ReallocOffset] = (ulong)pc; +} + +uintptr +getmalloctag(void *v) +{ + USED(v); + if(Npadlong <= MallocOffset) + return ~0; + return (int)((ulong*)v)[-Npadlong+MallocOffset]; +} + +uintptr +getrealloctag(void *v) +{ + USED(v); + if(Npadlong <= ReallocOffset) + return (int)((ulong*)v)[-Npadlong+ReallocOffset]; + return ~0; +} --- /dev/null +++ /sys/src/9/port64/allocb.c @@ -0,0 +1,218 @@ +/* Block allocation */ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "error.h" + +#define ALIGNUP(a) ROUND((uintptr)(a), BLOCKALIGN) + +enum +{ + Hdrspc = 64, /* leave room for high-level headers */ + Bdead = 0x51494F42, /* "QIOB" */ + Bmagic = 0x0910b10c, +}; + +struct +{ + Lock; + ulong bytes; +} ialloc; + +/* + * convert the size of a desired buffer to the size needed + * to include Block overhead and alignment. + */ +ulong +blocksize(ulong size) +{ + return ALIGNUP(sizeof(Block)) + Hdrspc + ALIGNUP(size); +} + +/* + * convert malloced or non-malloced buffer to a Block. + * used to build custom Block allocators. + * + * buf must be at least blocksize(usable) bytes. + */ +Block * +mem2block(void *buf, ulong usable, int malloced) +{ + Block *b; + + if(buf == nil) + return nil; + + b = (Block *)buf; + b->next = nil; + b->list = nil; + b->free = 0; + b->flag = 0; + b->ref = 0; + b->magic = Bmagic; + _xinc(&b->ref); + + /* align start of data portion by rounding up */ + b->base = (uchar*)ALIGNUP((uintptr)b + sizeof(Block)); + + /* align end of data portion by rounding down */ + b->lim = (uchar*)b + (malloced? 
msize(b): blocksize(usable)); + b->lim = (uchar*)((uintptr)b->lim & ~(BLOCKALIGN-1)); + + /* leave sluff at beginning for added headers */ + b->wp = b->rp = b->lim - ALIGNUP(usable); + if(b->rp < b->base) + panic("mem2block: b->rp < b->base"); + if(b->lim > (uchar*)b + (malloced? msize(b): blocksize(usable))) + panic("mem2block: b->lim beyond Block end"); + return b; +} + +static Block* +_allocb(int size) +{ + return mem2block(mallocz(blocksize(size), 0), size, 1); +} + +Block* +allocb(int size) +{ + Block *b; + + /* + * Check in a process and wait until successful. + * Can still error out of here, though. + */ + if(up == nil) + panic("allocb without up: %#p", getcallerpc(&size)); + if((b = _allocb(size)) == nil){ + splhi(); + xsummary(); + mallocsummary(); + delay(500); + panic("allocb: no memory for %d bytes; caller %#p", size, + getcallerpc(&size)); + } + setmalloctag(b, getcallerpc(&size)); + + return b; +} + +Block* +iallocb(int size) +{ + Block *b; + static int m1, m2, mp; + + if(ialloc.bytes > conf.ialloc){ + if((m1++%10000)==0){ + if(mp++ > 1000){ + active.exiting = 1; + exit(0); + } + iprint("iallocb: limited %lud/%lud\n", + ialloc.bytes, conf.ialloc); + } + return nil; + } + + if((b = _allocb(size)) == nil){ + if((m2++%10000)==0){ + if(mp++ > 1000){ + active.exiting = 1; + exit(0); + } + iprint("iallocb: no memory %lud/%lud\n", + ialloc.bytes, conf.ialloc); + } + return nil; + } + setmalloctag(b, getcallerpc(&size)); + b->flag = BINTR; + + ilock(&ialloc); + ialloc.bytes += b->lim - b->base; + iunlock(&ialloc); + + return b; +} + +void +freeb(Block *b) +{ + void *dead = (void*)Bdead; + long ref; + + if(b == nil) + return; + if(Bmagic && b->magic != Bmagic) + panic("freeb: bad magic %#lux in Block %#p; caller pc %#p", + b->magic, b, getcallerpc(&b)); + + if((ref = _xdec(&b->ref)) > 0) + return; + if(ref < 0){ + dumpstack(); + panic("freeb: ref %ld; caller pc %#p", ref, getcallerpc(&b)); + } + + /* + * drivers which perform non cache coherent DMA manage 
their own buffer + * pool of uncached buffers and provide their own free routine. + */ + if(b->free) { + b->free(b); + return; + } + if(b->flag & BINTR) { + ilock(&ialloc); + ialloc.bytes -= b->lim - b->base; + iunlock(&ialloc); + } + + /* poison the block in case someone is still holding onto it */ + b->next = dead; + b->rp = dead; + b->wp = dead; + b->lim = dead; + b->base = dead; + b->magic = 0; + + free(b); +} + +void +checkb(Block *b, char *msg) +{ + void *dead = (void*)Bdead; + + if(b == dead) + panic("checkb b %s %#p", msg, b); + if(b->base == dead || b->lim == dead || b->next == dead + || b->rp == dead || b->wp == dead){ + print("checkb: base %#p lim %#p next %#p\n", + b->base, b->lim, b->next); + print("checkb: rp %#p wp %#p\n", b->rp, b->wp); + panic("checkb dead: %s", msg); + } + if(Bmagic && b->magic != Bmagic) + panic("checkb: bad magic %#lux in Block %#p", b->magic, b); + if(b->base > b->lim) + panic("checkb 0 %s %#p %#p", msg, b->base, b->lim); + if(b->rp < b->base) + panic("checkb 1 %s %#p %#p", msg, b->base, b->rp); + if(b->wp < b->base) + panic("checkb 2 %s %#p %#p", msg, b->base, b->wp); + if(b->rp > b->lim) + panic("checkb 3 %s %#p %#p", msg, b->rp, b->lim); + if(b->wp > b->lim) + panic("checkb 4 %s %#p %#p", msg, b->wp, b->lim); +} + +void +iallocsummary(void) +{ + print("ialloc %lud/%lud\n", ialloc.bytes, conf.ialloc); +} --- /dev/null +++ /sys/src/9/port64/auth.c @@ -0,0 +1,157 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" + +#include + +char *eve; +char hostdomain[DOMLEN]; + +/* + * return true if current user is eve + */ +int +iseve(void) +{ + return strcmp(eve, up->user) == 0; +} + +uintptr +sysfversion(uintptr *arg) +{ + char *vers; + uint arglen, m, msize; + Chan *c; + + msize = arg[1]; + vers = (char*)arg[2]; + arglen = arg[3]; + validaddr(arg[2], arglen, 1); + /* check there's a NUL in the version string */ + if(arglen==0 || memchr(vers, 0, arglen)==0) + 
error(Ebadarg); + c = fdtochan(arg[0], ORDWR, 0, 1); + if(waserror()){ + cclose(c); + nexterror(); + } + + m = mntversion(c, vers, msize, arglen); + + cclose(c); + poperror(); + return m; +} + +uintptr +sys_fsession(uintptr *arg) +{ + /* deprecated; backwards compatibility only */ + + if(arg[2] == 0) + error(Ebadarg); + validaddr(arg[1], arg[2], 1); + ((uchar*)arg[1])[0] = '\0'; + return 0; +} + +uintptr +sysfauth(uintptr *arg) +{ + Chan *c, *ac; + char *aname; + int fd; + + validaddr(arg[1], 1, 0); + aname = validnamedup((char*)arg[1], 1); + if(waserror()){ + free(aname); + nexterror(); + } + c = fdtochan(arg[0], ORDWR, 0, 1); + if(waserror()){ + cclose(c); + nexterror(); + } + + ac = mntauth(c, aname); + /* at this point ac is responsible for keeping c alive */ + poperror(); /* c */ + cclose(c); + poperror(); /* aname */ + free(aname); + + if(waserror()){ + cclose(ac); + nexterror(); + } + + fd = newfd(ac); + if(fd < 0) + error(Enofd); + poperror(); /* ac */ + + /* always mark it close on exec */ + ac->flag |= CCEXEC; + + return fd; +} + +/* + * called by devcons() for user device + * + * anyone can become none + */ +long +userwrite(char *a, int n) +{ + if(n!=4 || strncmp(a, "none", 4)!=0) + error(Eperm); + kstrdup(&up->user, "none"); + up->basepri = PriNormal; + return n; +} + +/* + * called by devcons() for host owner/domain + * + * writing hostowner also sets user + */ +long +hostownerwrite(char *a, int n) +{ + char buf[128]; + + if(!iseve()) + error(Eperm); + if(n <= 0 || n >= sizeof buf) + error(Ebadarg); + memmove(buf, a, n); + buf[n] = 0; + + renameuser(eve, buf); + kstrdup(&eve, buf); + kstrdup(&up->user, buf); + up->basepri = PriNormal; + return n; +} + +long +hostdomainwrite(char *a, int n) +{ + char buf[DOMLEN]; + + if(!iseve()) + error(Eperm); + if(n >= DOMLEN) + error(Ebadarg); + memset(buf, 0, DOMLEN); + strncpy(buf, a, n); + if(buf[0] == 0) + error(Ebadarg); + memmove(hostdomain, buf, DOMLEN); + return n; +} --- /dev/null +++ 
/sys/src/9/port64/chan.c @@ -0,0 +1,1795 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" + +int chandebug=0; /* toggled by sysr1 */ +#define DBG if(chandebug)iprint + +enum +{ + PATHSLOP = 20, + PATHMSLOP = 20, +}; + +struct +{ + Lock; + int fid; + Chan *free; + Chan *list; +}chanalloc; + +typedef struct Elemlist Elemlist; + +struct Elemlist +{ + char *aname; /* original name */ + char *name; /* copy of name, so '/' can be overwritten */ + int nelems; + char **elems; + int *off; + int mustbedir; + int nerror; + int prefix; +}; + +#define SEP(c) ((c) == 0 || (c) == '/') + +static void +dumpmount(void) /* DEBUGGING */ +{ + Pgrp *pg; + Mount *t; + Mhead **h, **he, *f; + + if(up == nil){ + print("no process for dumpmount\n"); + return; + } + pg = up->pgrp; + if(pg == nil){ + print("no pgrp for dumpmount\n"); + return; + } + rlock(&pg->ns); + if(waserror()){ + runlock(&pg->ns); + nexterror(); + } + + he = &pg->mnthash[MNTHASH]; + for(h = pg->mnthash; h < he; h++){ + for(f = *h; f; f = f->hash){ + print("head: %#p: %s %#llux.%lud %C %lud -> \n", f, + f->from->path->s, f->from->qid.path, + f->from->qid.vers, devtab[f->from->type]->dc, + f->from->dev); + for(t = f->mount; t; t = t->next) + print("\t%#p: %s (umh %#p) (path %#.8llux dev %C %lud)\n", t, t->to->path->s, t->to->umh, t->to->qid.path, devtab[t->to->type]->dc, t->to->dev); + } + } + poperror(); + runlock(&pg->ns); +} + +char* +chanpath(Chan *c) +{ + if(c == nil) + return ""; + if(c->path == nil) + return ""; + if(c->path->s == nil) + return ""; + return c->path->s; +} + +int +isdotdot(char *p) +{ + return p[0]=='.' && p[1]=='.' 
&& p[2]=='\0'; +} + +long +incref(Ref *r) +{ + long x; + + lock(r); + x = ++r->ref; + unlock(r); + return x; +} + +long +decref(Ref *r) +{ + long x; + + lock(r); + x = --r->ref; + unlock(r); + if(x < 0) + panic("decref pc=%#p", getcallerpc(&r)); + + return x; +} + +/* + * Rather than strncpy, which zeros the rest of the buffer, kstrcpy + * truncates if necessary, always zero terminates, does not zero fill, + * and puts ... at the end of the string if it's too long. Usually used to + * save a string in up->genbuf; + */ +void +kstrcpy(char *s, char *t, int ns) +{ + int nt; + + nt = strlen(t); + if(nt+1 <= ns){ + memmove(s, t, nt+1); + return; + } + /* too long */ + if(ns < 4){ + /* but very short! */ + strncpy(s, t, ns); + return; + } + /* truncate with ... at character boundary (very rare case) */ + memmove(s, t, ns-4); + ns -= 4; + s[ns] = '\0'; + /* look for first byte of UTF-8 sequence by skipping continuation bytes */ + while(ns>0 && (s[--ns]&0xC0)==0x80) + ; + strcpy(s+ns, "..."); +} + +int +emptystr(char *s) +{ + if(s == nil) + return 1; + if(s[0] == '\0') + return 1; + return 0; +} + +/* + * Atomically replace *p with copy of s + */ +void +kstrdup(char **p, char *s) +{ + int n; + char *t, *prev; + + n = strlen(s)+1; + /* if it's a user, we can wait for memory; if not, something's very wrong */ + if(up){ + t = smalloc(n); + setmalloctag(t, getcallerpc(&p)); + }else{ + t = malloc(n); + if(t == nil) + panic("kstrdup: no memory"); + } + memmove(t, s, n); + prev = *p; + *p = t; + free(prev); +} + +static int debugstart = 1; + +void +chandevreset(void) +{ + int i; + + todinit(); /* avoid later reentry causing infinite recursion */ + debugstart = getconf("*debugstart") != nil; + if(debugstart) + iprint("reset:"); + for(i=0; devtab[i] != nil; i++) { + if(debugstart) + iprint(" %s", devtab[i]->name); + devtab[i]->reset(); + } + if(debugstart) + iprint("\n"); +} + +void +chandevinit(void) +{ + int i; + + if(debugstart) + iprint("init:"); + for(i=0; devtab[i] != nil; 
i++) { + if(debugstart) + iprint(" %s", devtab[i]->name); + devtab[i]->init(); + } + if(debugstart) + iprint("\n"); +} + +void +chandevshutdown(void) +{ + int i; + + /* shutdown in reverse order */ + for(i=0; devtab[i] != nil; i++) + ; + for(i--; i >= 0; i--) + devtab[i]->shutdown(); +} + +Chan* +newchan(void) +{ + Chan *c; + + lock(&chanalloc); + c = chanalloc.free; + if(c != 0) + chanalloc.free = c->next; + unlock(&chanalloc); + + if(c == nil){ + c = smalloc(sizeof(Chan)); + lock(&chanalloc); + c->fid = ++chanalloc.fid; + c->link = chanalloc.list; + chanalloc.list = c; + unlock(&chanalloc); + } + + /* if you get an error before associating with a dev, + close calls rootclose, a nop */ + c->type = 0; + c->flag = 0; + c->ref = 1; + c->dev = 0; + c->offset = 0; + c->devoffset = 0; + c->iounit = 0; + c->umh = 0; + c->uri = 0; + c->dri = 0; + c->aux = 0; + c->mchan = 0; + c->mcp = 0; + c->mux = 0; + memset(&c->mqid, 0, sizeof(c->mqid)); + c->path = 0; + c->ismtpt = 0; + + return c; +} + +Ref npath; + +Path* +newpath(char *s) +{ + int i; + Path *p; + + p = smalloc(sizeof(Path)); + i = strlen(s); + p->len = i; + p->alen = i+PATHSLOP; + p->s = smalloc(p->alen); + memmove(p->s, s, i+1); + p->ref = 1; + incref(&npath); + + /* + * Cannot use newpath for arbitrary names because the mtpt + * array will not be populated correctly. The names #/ and / are + * allowed, but other names with / in them draw warnings. 
+ */ + if(strchr(s, '/') && strcmp(s, "#/") != 0 && strcmp(s, "/") != 0) + print("newpath: %s from %#p\n", s, getcallerpc(&s)); + + p->mlen = 1; + p->malen = PATHMSLOP; + p->mtpt = smalloc(p->malen*sizeof p->mtpt[0]); + return p; +} + +static Path* +copypath(Path *p) +{ + int i; + Path *pp; + + pp = smalloc(sizeof(Path)); + pp->ref = 1; + incref(&npath); + DBG("copypath %s %p => %p\n", p->s, p, pp); + + pp->len = p->len; + pp->alen = p->alen; + pp->s = smalloc(p->alen); + memmove(pp->s, p->s, p->len+1); + + pp->mlen = p->mlen; + pp->malen = p->malen; + pp->mtpt = smalloc(p->malen*sizeof pp->mtpt[0]); + for(i=0; imlen; i++){ + pp->mtpt[i] = p->mtpt[i]; + if(pp->mtpt[i]) + incref(pp->mtpt[i]); + } + + return pp; +} + +void +pathclose(Path *p) +{ + int i; + + if(p == nil) + return; +//XXX + DBG("pathclose %p %s ref=%ld =>", p, p->s, p->ref); + for(i=0; imlen; i++) + DBG(" %p", p->mtpt[i]); + DBG("\n"); + + if(decref(p)) + return; + decref(&npath); + free(p->s); + for(i=0; imlen; i++) + if(p->mtpt[i]) + cclose(p->mtpt[i]); + free(p->mtpt); + free(p); +} + +/* + * In place, rewrite name to compress multiple /, eliminate ., and process .. + * (Really only called to remove a trailing .. that has been added. + * Otherwise would need to update n->mtpt as well.) + */ +static void +fixdotdotname(Path *p) +{ + char *r; + + if(p->s[0] == '#'){ + r = strchr(p->s, '/'); + if(r == nil) + return; + cleanname(r); + + /* + * The correct name is #i rather than #i/, + * but the correct name of #/ is #/. + */ + if(strcmp(r, "/")==0 && p->s[1] != '/') + *r = '\0'; + }else + cleanname(p->s); + p->len = strlen(p->s); +} + +static Path* +uniquepath(Path *p) +{ + Path *new; + + if(p->ref > 1){ + /* copy on write */ + new = copypath(p); + pathclose(p); + p = new; + } + return p; +} + +static Path* +addelem(Path *p, char *s, Chan *from) +{ + char *t; + int a, i; + Chan *c, **tt; + + if(s[0]=='.' 
&& s[1]=='\0') + return p; + + p = uniquepath(p); + + i = strlen(s); + if(p->len+1+i+1 > p->alen){ + a = p->len+1+i+1 + PATHSLOP; + t = smalloc(a); + memmove(t, p->s, p->len+1); + free(p->s); + p->s = t; + p->alen = a; + } + /* don't insert extra slash if one is present */ + if(p->len>0 && p->s[p->len-1]!='/' && s[0]!='/') + p->s[p->len++] = '/'; + memmove(p->s+p->len, s, i+1); + p->len += i; + if(isdotdot(s)){ + fixdotdotname(p); + DBG("addelem %s .. => rm %p\n", p->s, p->mtpt[p->mlen-1]); + if(p->mlen>1 && (c = p->mtpt[--p->mlen])){ + p->mtpt[p->mlen] = nil; + cclose(c); + } + }else{ + if(p->mlen >= p->malen){ + p->malen = p->mlen+1+PATHMSLOP; + tt = smalloc(p->malen*sizeof tt[0]); + memmove(tt, p->mtpt, p->mlen*sizeof tt[0]); + free(p->mtpt); + p->mtpt = tt; + } + DBG("addelem %s %s => add %p\n", p->s, s, from); + p->mtpt[p->mlen++] = from; + if(from) + incref(from); + } + return p; +} + +void +chanfree(Chan *c) +{ + c->flag = CFREE; + + if(c->dirrock != nil){ + free(c->dirrock); + c->dirrock = 0; + c->nrock = 0; + c->mrock = 0; + } + if(c->umh != nil){ + putmhead(c->umh); + c->umh = nil; + } + if(c->umc != nil){ + cclose(c->umc); + c->umc = nil; + } + if(c->mux != nil){ + muxclose(c->mux); + c->mux = nil; + } + if(c->mchan != nil){ + cclose(c->mchan); + c->mchan = nil; + } + + pathclose(c->path); + c->path = nil; + + lock(&chanalloc); + c->next = chanalloc.free; + chanalloc.free = c; + unlock(&chanalloc); +} + +void +cclose(Chan *c) +{ + if(c->flag&CFREE) + panic("cclose %#p", getcallerpc(&c)); + + DBG("cclose %p name=%s ref=%ld\n", c, c->path->s, c->ref); + if(decref(c)) + return; + + if(!waserror()){ + devtab[c->type]->close(c); + poperror(); + } + chanfree(c); +} + +/* + * Queue a chan to be closed by one of the clunk procs. 
+ */ +struct { + Chan *head; + Chan *tail; + int nqueued; + int nclosed; + Lock l; + QLock q; + Rendez r; +} clunkq; +void closeproc(void*); + +void +ccloseq(Chan *c) +{ + if(c->flag&CFREE) + panic("cclose %#p", getcallerpc(&c)); + + DBG("ccloseq %p name=%s ref=%ld\n", c, c->path->s, c->ref); + + if(decref(c)) + return; + + lock(&clunkq.l); + clunkq.nqueued++; + c->next = nil; + if(clunkq.head) + clunkq.tail->next = c; + else + clunkq.head = c; + clunkq.tail = c; + unlock(&clunkq.l); + + if(!wakeup(&clunkq.r)) + kproc("closeproc", closeproc, nil); +} + +static int +clunkwork(void*) +{ + return clunkq.head != nil; +} + +void +closeproc(void*) +{ + Chan *c; + + for(;;){ + qlock(&clunkq.q); + if(clunkq.head == nil){ + if(!waserror()){ + tsleep(&clunkq.r, clunkwork, nil, 5000); + poperror(); + } + if(clunkq.head == nil){ + qunlock(&clunkq.q); + pexit("no work", 1); + } + } + lock(&clunkq.l); + c = clunkq.head; + clunkq.head = c->next; + clunkq.nclosed++; + unlock(&clunkq.l); + qunlock(&clunkq.q); + if(!waserror()){ + devtab[c->type]->close(c); + poperror(); + } + chanfree(c); + } +} + +/* + * Make sure we have the only copy of c. (Copy on write.) 
+ */ +Chan* +cunique(Chan *c) +{ + Chan *nc; + + if(c->ref != 1){ + nc = cclone(c); + cclose(c); + c = nc; + } + + return c; +} + +int +eqqid(Qid a, Qid b) +{ + return a.path==b.path && a.vers==b.vers; +} + +int +eqchan(Chan *a, Chan *b, int skipvers) +{ + if(a->qid.path != b->qid.path) + return 0; + if(!skipvers && a->qid.vers!=b->qid.vers) + return 0; + if(a->type != b->type) + return 0; + if(a->dev != b->dev) + return 0; + return 1; +} + +int +eqchantdqid(Chan *a, int type, int dev, Qid qid, int skipvers) +{ + if(a->qid.path != qid.path) + return 0; + if(!skipvers && a->qid.vers!=qid.vers) + return 0; + if(a->type != type) + return 0; + if(a->dev != dev) + return 0; + return 1; +} + +Mhead* +newmhead(Chan *from) +{ + Mhead *mh; + + mh = smalloc(sizeof(Mhead)); + mh->ref = 1; + mh->from = from; + incref(from); + return mh; +} + +int +cmount(Chan **newp, Chan *old, int flag, char *spec) +{ + int order, flg; + Chan *new; + Mhead *m, **l, *mh; + Mount *nm, *f, *um, **h; + Pgrp *pg; + + if(QTDIR & (old->qid.type^(*newp)->qid.type)) + error(Emount); + + if(old->umh) + print("cmount: unexpected umh, caller %#p\n", getcallerpc(&newp)); + + order = flag&MORDER; + + if((old->qid.type&QTDIR)==0 && order != MREPL) + error(Emount); + + new = *newp; + mh = new->umh; + + /* + * Not allowed to bind when the old directory is itself a union. + * (Maybe it should be allowed, but I don't see what the semantics + * would be.) + * + * We need to check mh->mount->next to tell unions apart from + * simple mount points, so that things like + * mount -c fd /root + * bind -c /root / + * work. + * + * The check of mount->mflag allows things like + * mount fd /root + * bind -c /root / + * + * This is far more complicated than it should be, but I don't + * see an easier way at the moment. 
+ */ + if((flag&MCREATE) && mh && mh->mount + && (mh->mount->next || !(mh->mount->mflag&MCREATE))) + error(Emount); + + pg = up->pgrp; + wlock(&pg->ns); + + l = &MOUNTH(pg, old->qid); + for(m = *l; m; m = m->hash){ + if(eqchan(m->from, old, 1)) + break; + l = &m->hash; + } + + if(m == nil){ + /* + * nothing mounted here yet. create a mount + * head and add to the hash table. + */ + m = newmhead(old); + *l = m; + + /* + * if this is a union mount, add the old + * node to the mount chain. + */ + if(order != MREPL) + m->mount = newmount(m, old, 0, 0); + } + wlock(&m->lock); + if(waserror()){ + wunlock(&m->lock); + nexterror(); + } + wunlock(&pg->ns); + + nm = newmount(m, new, flag, spec); + if(mh != nil && mh->mount != nil){ + /* + * copy a union when binding it onto a directory + */ + flg = order; + if(order == MREPL) + flg = MAFTER; + h = &nm->next; + um = mh->mount; + for(um = um->next; um; um = um->next){ + f = newmount(m, um->to, flg, um->spec); + *h = f; + h = &f->next; + } + } + + if(m->mount && order == MREPL){ + mountfree(m->mount); + m->mount = 0; + } + + if(flag & MCREATE) + nm->mflag |= MCREATE; + + if(m->mount && order == MAFTER){ + for(f = m->mount; f->next; f = f->next) + ; + f->next = nm; + }else{ + for(f = nm; f->next; f = f->next) + ; + f->next = m->mount; + m->mount = nm; + } + + wunlock(&m->lock); + poperror(); + return nm->mountid; +} + +void +cunmount(Chan *mnt, Chan *mounted) +{ + Pgrp *pg; + Mhead *m, **l; + Mount *f, **p; + + if(mnt->umh) /* should not happen */ + print("cunmount newp extra umh %p has %p\n", mnt, mnt->umh); + + /* + * It _can_ happen that mounted->umh is non-nil, + * because mounted is the result of namec(Aopen) + * (see sysfile.c:/^sysunmount). + * If we open a union directory, it will have a umh. + * Although surprising, this is okay, since the + * cclose will take care of freeing the umh. 
+ */ + + pg = up->pgrp; + wlock(&pg->ns); + + l = &MOUNTH(pg, mnt->qid); + for(m = *l; m; m = m->hash){ + if(eqchan(m->from, mnt, 1)) + break; + l = &m->hash; + } + + if(m == 0){ + wunlock(&pg->ns); + error(Eunmount); + } + + wlock(&m->lock); + if(mounted == 0){ + *l = m->hash; + wunlock(&pg->ns); + mountfree(m->mount); + m->mount = nil; + cclose(m->from); + wunlock(&m->lock); + putmhead(m); + return; + } + + p = &m->mount; + for(f = *p; f; f = f->next){ + /* BUG: Needs to be 2 pass */ + if(eqchan(f->to, mounted, 1) || + (f->to->mchan && eqchan(f->to->mchan, mounted, 1))){ + *p = f->next; + f->next = 0; + mountfree(f); + if(m->mount == nil){ + *l = m->hash; + cclose(m->from); + wunlock(&m->lock); + wunlock(&pg->ns); + putmhead(m); + return; + } + wunlock(&m->lock); + wunlock(&pg->ns); + return; + } + p = &f->next; + } + wunlock(&m->lock); + wunlock(&pg->ns); + error(Eunion); +} + +Chan* +cclone(Chan *c) +{ + Chan *nc; + Walkqid *wq; + + wq = devtab[c->type]->walk(c, nil, nil, 0); + if(wq == nil) + error("clone failed"); + nc = wq->clone; + free(wq); + nc->path = c->path; + if(c->path) + incref(c->path); + return nc; +} + +/* also used by sysfile.c:/^mountfix */ +int +findmount(Chan **cp, Mhead **mp, int type, int dev, Qid qid) +{ + Pgrp *pg; + Mhead *m; + + pg = up->pgrp; + rlock(&pg->ns); + for(m = MOUNTH(pg, qid); m; m = m->hash){ + rlock(&m->lock); + if(m->from == nil){ + print("m %p m->from 0\n", m); + runlock(&m->lock); + continue; + } + if(eqchantdqid(m->from, type, dev, qid, 1)){ + runlock(&pg->ns); + if(mp != nil){ + incref(m); + if(*mp != nil) + putmhead(*mp); + *mp = m; + } + if(*cp != nil) + cclose(*cp); + incref(m->mount->to); + *cp = m->mount->to; + runlock(&m->lock); + return 1; + } + runlock(&m->lock); + } + + runlock(&pg->ns); + return 0; +} + +/* + * Calls findmount but also updates path. 
+ */ +static int +domount(Chan **cp, Mhead **mp, Path **path) +{ + Chan **lc; + Path *p; + + if(findmount(cp, mp, (*cp)->type, (*cp)->dev, (*cp)->qid) == 0) + return 0; + + if(path){ + p = *path; + p = uniquepath(p); + if(p->mlen <= 0) + print("domount: path %s has mlen==%d\n", p->s, p->mlen); + else{ + lc = &p->mtpt[p->mlen-1]; +DBG("domount %p %s => add %p (was %p)\n", p, p->s, (*mp)->from, p->mtpt[p->mlen-1]); + incref((*mp)->from); + if(*lc) + cclose(*lc); + *lc = (*mp)->from; + } + *path = p; + } + return 1; +} + +/* + * If c is the right-hand-side of a mount point, returns the left hand side. + * Changes name to reflect the fact that we've uncrossed the mountpoint, + * so name had better be ours to change! + */ +static Chan* +undomount(Chan *c, Path *path) +{ + Chan *nc; + + if(path->ref != 1 || path->mlen == 0) + print("undomount: path %s ref %ld mlen %d caller %#p\n", + path->s, path->ref, path->mlen, getcallerpc(&c)); + + if(path->mlen>0 && (nc=path->mtpt[path->mlen-1]) != nil){ +DBG("undomount %p %s => remove %p\n", path, path->s, nc); + cclose(c); + path->mtpt[path->mlen-1] = nil; + c = nc; + } + return c; +} + +/* + * Call dev walk but catch errors. + */ +static Walkqid* +ewalk(Chan *c, Chan *nc, char **name, int nname) +{ + Walkqid *wq; + + if(waserror()) + return nil; + wq = devtab[c->type]->walk(c, nc, name, nname); + poperror(); + return wq; +} + +/* + * Either walks all the way or not at all. No partial results in *cp. + * *nerror is the number of names to display in an error message. + */ +static char Edoesnotexist[] = "does not exist"; +int +walk(Chan **cp, char **names, int nnames, int nomount, int *nerror) +{ + int dev, didmount, dotdot, i, n, nhave, ntry, type; + Chan *c, *nc, *mtpt; + Path *path; + Mhead *mh, *nmh; + Mount *f; + Walkqid *wq; + + c = *cp; + incref(c); + path = c->path; + incref(path); + mh = nil; + + /* + * While we haven't gotten all the way down the path: + * 1. step through a mount point, if any + * 2. 
send a walk request for initial dotdot or initial prefix without dotdot + * 3. move to the first mountpoint along the way. + * 4. repeat. + * + * Each time through the loop: + * + * If didmount==0, c is on the undomount side of the mount point. + * If didmount==1, c is on the domount side of the mount point. + * Either way, c's full path is path. + */ + didmount = 0; + for(nhave=0; nhaveqid.type&QTDIR)==0){ + if(nerror) + *nerror = nhave; + pathclose(path); + cclose(c); + strcpy(up->errstr, Enotdir); + if(mh != nil) + putmhead(mh); + return -1; + } + ntry = nnames - nhave; + if(ntry > MAXWELEM) + ntry = MAXWELEM; + dotdot = 0; + for(i=0; itype; + dev = c->dev; + + if((wq = ewalk(c, nil, names+nhave, ntry)) == nil){ + /* try a union mount, if any */ + if(mh && !nomount){ + /* + * mh->mount->to == c, so start at mh->mount->next + */ + rlock(&mh->lock); + f = mh->mount; + for(f = (f? f->next: f); f; f = f->next) + if((wq = ewalk(f->to, nil, names+nhave, ntry)) != nil) + break; + runlock(&mh->lock); + if(f != nil){ + type = f->to->type; + dev = f->to->dev; + } + } + if(wq == nil){ + cclose(c); + pathclose(path); + if(nerror) + *nerror = nhave+1; + if(mh != nil) + putmhead(mh); + return -1; + } + } + + didmount = 0; + if(dotdot){ + assert(wq->nqid == 1); + assert(wq->clone != nil); + + path = addelem(path, "..", nil); + nc = undomount(wq->clone, path); + nmh = nil; + n = 1; + }else{ + nc = nil; + nmh = nil; + if(!nomount){ + for(i=0; inqid && iqid[i])){ + didmount = 1; + break; + } + } + } + if(nc == nil){ /* no mount points along path */ + if(wq->clone == nil){ + cclose(c); + pathclose(path); + if(wq->nqid==0 || (wq->qid[wq->nqid-1].type&QTDIR)){ + if(nerror) + *nerror = nhave+wq->nqid+1; + strcpy(up->errstr, Edoesnotexist); + }else{ + if(nerror) + *nerror = nhave+wq->nqid; + strcpy(up->errstr, Enotdir); + } + free(wq); + if(mh != nil) + putmhead(mh); + return -1; + } + n = wq->nqid; + nc = wq->clone; + }else{ /* stopped early, at a mount point */ + didmount = 1; + 
if(wq->clone != nil){ + cclose(wq->clone); + wq->clone = nil; + } + n = i+1; + } + for(i=0; ifrom; + path = addelem(path, names[nhave+i], mtpt); + } + } + cclose(c); + c = nc; + putmhead(mh); + mh = nmh; + free(wq); + } + + putmhead(mh); + + c = cunique(c); + + if(c->umh != nil){ //BUG + print("walk umh\n"); + putmhead(c->umh); + c->umh = nil; + } + + pathclose(c->path); + c->path = path; + + cclose(*cp); + *cp = c; + if(nerror) + *nerror = nhave; + return 0; +} + +/* + * c is a mounted non-creatable directory. find a creatable one. + */ +Chan* +createdir(Chan *c, Mhead *m) +{ + Chan *nc; + Mount *f; + + rlock(&m->lock); + if(waserror()){ + runlock(&m->lock); + nexterror(); + } + for(f = m->mount; f; f = f->next){ + if(f->mflag&MCREATE){ + nc = cclone(f->to); + runlock(&m->lock); + poperror(); + cclose(c); + return nc; + } + } + error(Enocreate); + return 0; +} + +void +saveregisters(void) +{ +} + +static void +growparse(Elemlist *e) +{ + char **new; + int *inew; + enum { Delta = 8 }; + + if(e->nelems % Delta == 0){ + new = smalloc((e->nelems+Delta) * sizeof(char*)); + memmove(new, e->elems, e->nelems*sizeof(char*)); + free(e->elems); + e->elems = new; + inew = smalloc((e->nelems+Delta+1) * sizeof(int)); + memmove(inew, e->off, (e->nelems+1)*sizeof(int)); + free(e->off); + e->off = inew; + } +} + +/* + * The name is known to be valid. + * Copy the name so slashes can be overwritten. + * An empty string will set nelem=0. + * A path ending in / or /. or /.//./ etc. will have + * e.mustbedir = 1, so that we correctly + * reject, e.g., "/adm/users/." when /adm/users is a file + * rather than a directory. 
+ */ +static void +parsename(char *aname, Elemlist *e) +{ + char *name, *slash; + + kstrdup(&e->name, aname); + name = e->name; + e->nelems = 0; + e->elems = nil; + e->off = smalloc(sizeof(int)); + e->off[0] = skipslash(name) - name; + for(;;){ + name = skipslash(name); + if(*name == '\0'){ + e->off[e->nelems] = name+strlen(name) - e->name; + e->mustbedir = 1; + break; + } + growparse(e); + e->elems[e->nelems++] = name; + slash = utfrune(name, '/'); + if(slash == nil){ + e->off[e->nelems] = name+strlen(name) - e->name; + e->mustbedir = 0; + break; + } + e->off[e->nelems] = slash - e->name; + *slash++ = '\0'; + name = slash; + } + + if(0 && chandebug){ + int i; + + print("parsename %s:", e->name); + for(i=0; i<=e->nelems; i++) + print(" %d", e->off[i]); + print("\n"); + } +} + +void* +memrchr(void *va, int c, long n) +{ + uchar *a, *e; + + a = va; + for(e=a+n-1; e>a; e--) + if(*e == c) + return e; + return nil; +} + +void +namelenerror(char *aname, int len, char *err) +{ + char *ename, *name, *next; + int i, errlen; + + /* + * If the name is short enough, just use the whole thing. + */ + errlen = strlen(err); + if(len < ERRMAX/3 || len+errlen < 2*ERRMAX/3) + snprint(up->genbuf, sizeof up->genbuf, "%.*s", + utfnlen(aname, len), aname); + else{ + /* + * Print a suffix of the name, but try to get a little info. + */ + ename = aname+len; + next = ename; + do{ + name = next; + next = memrchr(aname, '/', name-aname); + if(next == nil) + next = aname; + len = ename-next; + }while(len < ERRMAX/3 || len + errlen < 2*ERRMAX/3); + + /* + * If the name is ridiculously long, chop it. 
+ */ + if(name == ename){ + name = ename-ERRMAX/4; + if(name <= aname) + panic("bad math in namelenerror"); + /* walk out of current UTF sequence */ + for(i=0; (*name&0xC0)==0x80 && igenbuf, sizeof up->genbuf, "...%.*s", + utfnlen(name, ename-name), name); + } + snprint(up->errstr, ERRMAX, "%#q %s", up->genbuf, err); + nexterror(); +} + +void +nameerror(char *name, char *err) +{ + namelenerror(name, strlen(name), err); +} + +/* + * Turn a name into a channel. + * &name[0] is known to be a valid address. It may be a kernel address. + * + * Opening with amode Aopen, Acreate, Aremove, or Aaccess guarantees + * that the result will be the only reference to that particular fid. + * This is necessary since we might pass the result to + * devtab[]->remove(). + * + * Opening Atodir or Amount does not guarantee this. + * + * Under certain circumstances, opening Aaccess will cause + * an unnecessary clone in order to get a cunique Chan so it + * can attach the correct name. Sysstat and sys_stat need the + * correct name so they can rewrite the stat info. + */ +Chan* +namec(char *aname, int amode, int omode, ulong perm) +{ + int len, n, t, nomount; + Chan *c, *cnew; + Path *path; + Elemlist e; + Rune r; + Mhead *m; + char *createerr, tmperrbuf[ERRMAX]; + char *name; + + if(aname[0] == '\0') + error("empty file name"); + aname = validnamedup(aname, 1); + if(waserror()){ + free(aname); + nexterror(); + } + DBG("namec %s %d %d\n", aname, amode, omode); + name = aname; + + /* + * Find the starting off point (the current slash, the root of + * a device tree, or the current dot) as well as the name to + * evaluate starting there. + */ + nomount = 0; + switch(name[0]){ + case '/': + c = up->slash; + incref(c); + break; + + case '#': + nomount = 1; + up->genbuf[0] = '\0'; + n = 0; + while(*name != '\0' && (*name != '/' || n < 2)){ + if(n >= sizeof(up->genbuf)-1) + error(Efilename); + up->genbuf[n++] = *name++; + } + up->genbuf[n] = '\0'; + /* + * noattach is sandboxing. 
+ * + * the OK exceptions are: + * | it only gives access to pipes you create + * d this process's file descriptors + * e this process's environment + * the iffy exceptions are: + * c time and pid, but also cons and consctl + * p control of your own processes (and unfortunately + * any others left unprotected) + */ + n = chartorune(&r, up->genbuf+1)+1; + /* actually / is caught by parsing earlier */ + if(utfrune("M", r)) + error(Enoattach); + if(up->pgrp->noattach && utfrune("|decp", r)==nil) + error(Enoattach); + t = devno(r, 1); + if(t == -1) + error(Ebadsharp); + if(debugstart && !devtab[t]->attached) + print("#%C...", devtab[t]->dc); + c = devtab[t]->attach(up->genbuf+n); + if(debugstart && c != nil) + devtab[t]->attached = 1; + break; + + default: + c = up->dot; + incref(c); + break; + } + + e.aname = aname; + e.prefix = name - aname; + e.name = nil; + e.elems = nil; + e.off = nil; + e.nelems = 0; + e.nerror = 0; + if(waserror()){ + cclose(c); + free(e.name); + free(e.elems); + /* + * Prepare nice error, showing first e.nerror elements of name. + */ + if(e.nerror == 0) + nexterror(); + strcpy(tmperrbuf, up->errstr); + if(e.off[e.nerror]==0) + print("nerror=%d but off=%d\n", + e.nerror, e.off[e.nerror]); + if(0 && chandebug) + print("showing %d+%d/%d (of %d) of %s (%d %d)\n", e.prefix, e.off[e.nerror], e.nerror, e.nelems, aname, e.off[0], e.off[1]); + len = e.prefix+e.off[e.nerror]; + free(e.off); + namelenerror(aname, len, tmperrbuf); + } + + /* + * Build a list of elements in the name. + */ + parsename(name, &e); + + /* + * On create, .... + */ + if(amode == Acreate){ + /* perm must have DMDIR if last element is / or /. */ + if(e.mustbedir && !(perm&DMDIR)){ + e.nerror = e.nelems; + error("create without DMDIR"); + } + + /* don't try to walk the last path element just yet. 
*/ + if(e.nelems == 0) + error(Eexist); + e.nelems--; + } + + if(walk(&c, e.elems, e.nelems, nomount, &e.nerror) < 0){ + if(e.nerror < 0 || e.nerror > e.nelems){ + print("namec %s walk error nerror=%d\n", aname, e.nerror); + e.nerror = 0; + } + nexterror(); + } + + if(e.mustbedir && !(c->qid.type&QTDIR)) + error("not a directory"); + + if(amode == Aopen && (omode&3) == OEXEC && (c->qid.type&QTDIR)) + error("cannot exec directory"); + + switch(amode){ + case Abind: + /* no need to maintain path - cannot dotdot an Abind */ + m = nil; + if(!nomount) + domount(&c, &m, nil); + if(c->umh != nil) + putmhead(c->umh); + c->umh = m; + break; + + case Aaccess: + case Aremove: + case Aopen: + Open: + /* save&update the name; domount might change c */ + path = c->path; + incref(path); + m = nil; + if(!nomount) + domount(&c, &m, &path); + + /* our own copy to open or remove */ + c = cunique(c); + + /* now it's our copy anyway, we can put the name back */ + pathclose(c->path); + c->path = path; + + /* record whether c is on a mount point */ + c->ismtpt = m!=nil; + + switch(amode){ + case Aaccess: + case Aremove: + putmhead(m); + break; + + case Aopen: + case Acreate: +if(c->umh != nil){ + print("cunique umh Open\n"); + putmhead(c->umh); + c->umh = nil; +} + /* only save the mount head if it's a multiple element union */ + if(m && m->mount && m->mount->next) + c->umh = m; + else + putmhead(m); + + /* save registers else error() in open has wrong value of c saved */ + saveregisters(); + + if(omode == OEXEC) + c->flag &= ~CCACHE; + + c = devtab[c->type]->open(c, omode&~OCEXEC); + + if(omode & OCEXEC) + c->flag |= CCEXEC; + if(omode & ORCLOSE) + c->flag |= CRCLOSE; + break; + } + break; + + case Atodir: + /* + * Directories (e.g. for cd) are left before the mount point, + * so one may mount on / or . and see the effect. 
+ */ + if(!(c->qid.type & QTDIR)) + error(Enotdir); + break; + + case Amount: + /* + * When mounting on an already mounted upon directory, + * one wants subsequent mounts to be attached to the + * original directory, not the replacement. Don't domount. + */ + break; + + case Acreate: + /* + * We've already walked all but the last element. + * If the last exists, try to open it OTRUNC. + * If omode&OEXCL is set, just give up. + */ + e.nelems++; + e.nerror++; + if(walk(&c, e.elems+e.nelems-1, 1, nomount, nil) == 0){ + if(omode&OEXCL) + error(Eexist); + omode |= OTRUNC; + goto Open; + } + + /* + * The semantics of the create(2) system call are that if the + * file exists and can be written, it is to be opened with truncation. + * On the other hand, the create(5) message fails if the file exists. + * If we get two create(2) calls happening simultaneously, + * they might both get here and send create(5) messages, but only + * one of the messages will succeed. To provide the expected create(2) + * semantics, the call with the failed message needs to try the above + * walk again, opening for truncation. This correctly solves the + * create/create race, in the sense that any observable outcome can + * be explained as one happening before the other. + * The create/create race is quite common. For example, it happens + * when two rc subshells simultaneously update the same + * environment variable. + * + * The implementation still admits a create/create/remove race: + * (A) walk to file, fails + * (B) walk to file, fails + * (A) create file, succeeds, returns + * (B) create file, fails + * (A) remove file, succeeds, returns + * (B) walk to file, return failure. + * + * This is hardly as common as the create/create race, and is really + * not too much worse than what might happen if (B) got a hold of a + * file descriptor and then the file was removed -- either way (B) can't do + * anything with the result of the create call. So we don't care about this race. 
+ * + * Applications that care about more fine-grained decision of the races + * can use the OEXCL flag to get at the underlying create(5) semantics; + * by default we provide the common case. + * + * We need to stay behind the mount point in case we + * need to do the first walk again (should the create fail). + * + * We also need to cross the mount point and find the directory + * in the union in which we should be creating. + * + * The channel staying behind is c, the one moving forward is cnew. + */ + m = nil; + cnew = nil; /* is this assignment necessary? */ + if(!waserror()){ /* try create */ + if(!nomount && findmount(&cnew, &m, c->type, c->dev, c->qid)) + cnew = createdir(cnew, m); + else{ + cnew = c; + incref(cnew); + } + + /* + * We need our own copy of the Chan because we're + * about to send a create, which will move it. Once we have + * our own copy, we can fix the name, which might be wrong + * if findmount gave us a new Chan. + */ + cnew = cunique(cnew); + pathclose(cnew->path); + cnew->path = c->path; + incref(cnew->path); + + devtab[cnew->type]->create(cnew, e.elems[e.nelems-1], omode&~(OEXCL|OCEXEC), perm); + poperror(); + if(omode & OCEXEC) + cnew->flag |= CCEXEC; + if(omode & ORCLOSE) + cnew->flag |= CRCLOSE; + if(m) + putmhead(m); + cclose(c); + c = cnew; + c->path = addelem(c->path, e.elems[e.nelems-1], nil); + break; + } + + /* create failed */ + cclose(cnew); + if(m) + putmhead(m); + if(omode & OEXCL) + nexterror(); + /* save error */ + createerr = up->errstr; + up->errstr = tmperrbuf; + /* note: we depend that walk does not error */ + if(walk(&c, e.elems+e.nelems-1, 1, nomount, nil) < 0){ + up->errstr = createerr; + error(createerr); /* report true error */ + } + up->errstr = createerr; + omode |= OTRUNC; + goto Open; + + default: + panic("unknown namec access %d\n", amode); + } + + /* place final element in genbuf for e.g. 
exec */ + if(e.nelems > 0) + kstrcpy(up->genbuf, e.elems[e.nelems-1], sizeof up->genbuf); + else + kstrcpy(up->genbuf, ".", sizeof up->genbuf); + free(e.name); + free(e.elems); + free(e.off); + poperror(); /* e c */ + free(aname); + poperror(); /* aname */ + + return c; +} + +/* + * name is valid. skip leading / and ./ as much as possible + */ +char* +skipslash(char *name) +{ + while(name[0]=='/' || (name[0]=='.' && (name[1]==0 || name[1]=='/'))) + name++; + return name; +} + +char isfrog[256]={ + /*NUL*/ 1, 1, 1, 1, 1, 1, 1, 1, + /*BKS*/ 1, 1, 1, 1, 1, 1, 1, 1, + /*DLE*/ 1, 1, 1, 1, 1, 1, 1, 1, + /*CAN*/ 1, 1, 1, 1, 1, 1, 1, 1, + ['/'] 1, + [0x7f] 1, +}; + +/* + * Check that the name + * a) is in valid memory. + * b) is shorter than 2^16 bytes, so it can fit in a 9P string field. + * c) contains no frogs. + * The first byte is known to be addressible by the requester, so the + * routine works for kernel and user memory both. + * The parameter slashok flags whether a slash character is an error + * or a valid character. + * + * The parameter dup flags whether the string should be copied + * out of user space before being scanned the second time. + * (Otherwise a malicious thread could remove the NUL, causing us + * to access unchecked addresses.) 
+ */ +static char* +validname0(char *aname, int slashok, int dup, uintptr pc) +{ + char *ename, *name, *s; + int c, n; + Rune r; + + name = aname; + if((uintptr)name < KZERO){ + if(!dup) + print("warning: validname called from %#p with user pointer", pc); + ename = vmemchr(name, 0, (1<<16)); + }else + ename = memchr(name, 0, (1<<16)); + + if(ename==nil || ename-name>=(1<<16)) + error("name too long"); + + s = nil; + if(dup){ + n = ename-name; + s = smalloc(n+1); + memmove(s, name, n); + s[n] = 0; + aname = s; + name = s; + setmalloctag(s, pc); + } + + while(*name){ + /* all characters above '~' are ok */ + c = *(uchar*)name; + if(c >= Runeself) + name += chartorune(&r, name); + else{ + if(isfrog[c]) + if(!slashok || c!='/'){ + snprint(up->genbuf, sizeof(up->genbuf), "%s: %q", Ebadchar, aname); + free(s); + error(up->genbuf); + } + name++; + } + } + return s; +} + +void +validname(char *aname, int slashok) +{ + validname0(aname, slashok, 0, getcallerpc(&aname)); +} + +char* +validnamedup(char *aname, int slashok) +{ + return validname0(aname, slashok, 1, getcallerpc(&aname)); +} + +void +isdir(Chan *c) +{ + if(c->qid.type & QTDIR) + return; + error(Enotdir); +} + +/* + * This is necessary because there are many + * pointers to the top of a given mount list: + * + * - the mhead in the namespace hash table + * - the mhead in chans returned from findmount: + * used in namec and then by unionread. + * - the mhead in chans returned from createdir: + * used in the open/create race protect, which is gone. + * + * The RWlock in the Mhead protects the mount list it contains. + * The mount list is deleted when we cunmount. + * The RWlock ensures that nothing is using the mount list at that time. + * + * It is okay to replace c->mh with whatever you want as + * long as you are sure you have a unique reference to it. + * + * This comment might belong somewhere else. 
+ */ +void +putmhead(Mhead *m) +{ + if(m && decref(m) == 0){ + m->mount = (Mount*)0xCafeBeef; + free(m); + } +} --- /dev/null +++ /sys/src/9/port64/devproc.c @@ -0,0 +1,1598 @@ +#include "u.h" +#include +#include "tos.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" +#include "ureg.h" +#include "../port/edf.h" + +enum +{ + Qdir, + Qtrace, + Qargs, + Qctl, + Qfd, + Qfpregs, + Qkregs, + Qmem, + Qnote, + Qnoteid, + Qnotepg, + Qns, + Qproc, + Qregs, + Qsegment, + Qstatus, + Qtext, + Qwait, + Qprofile, + Qsyscall, +}; + +enum +{ + CMclose, + CMclosefiles, + CMfixedpri, + CMhang, + CMkill, + CMnohang, + CMnoswap, + CMpri, + CMprivate, + CMprofile, + CMstart, + CMstartstop, + CMstartsyscall, + CMstop, + CMwaitstop, + CMwired, + CMtrace, + /* real time */ + CMperiod, + CMdeadline, + CMcost, + CMsporadic, + CMdeadlinenotes, + CMadmit, + CMextra, + CMexpel, + CMevent, +}; + +enum{ + Nevents = 0x4000, + Emask = Nevents - 1, +}; + +#define STATSIZE (2*KNAMELEN+12+9*12) +/* + * Status, fd, and ns are left fully readable (0444) because of their use in debugging, + * particularly on shared servers. 
+ * Arguably, ns and fd shouldn't be readable; if you'd prefer, change them to 0000 + */ +Dirtab procdir[] = +{ + "args", {Qargs}, 0, 0660, + "ctl", {Qctl}, 0, 0000, + "fd", {Qfd}, 0, 0444, + "fpregs", {Qfpregs}, sizeof(FPsave), 0000, + "kregs", {Qkregs}, sizeof(Ureg), 0400, + "mem", {Qmem}, 0, 0000, + "note", {Qnote}, 0, 0000, + "noteid", {Qnoteid}, 0, 0664, + "notepg", {Qnotepg}, 0, 0000, + "ns", {Qns}, 0, 0444, + "proc", {Qproc}, 0, 0400, + "regs", {Qregs}, sizeof(Ureg), 0000, + "segment", {Qsegment}, 0, 0444, + "status", {Qstatus}, STATSIZE, 0444, + "text", {Qtext}, 0, 0000, + "wait", {Qwait}, 0, 0400, + "profile", {Qprofile}, 0, 0400, + "syscall", {Qsyscall}, 0, 0400, +}; + +static +Cmdtab proccmd[] = { + CMclose, "close", 2, + CMclosefiles, "closefiles", 1, + CMfixedpri, "fixedpri", 2, + CMhang, "hang", 1, + CMnohang, "nohang", 1, + CMnoswap, "noswap", 1, + CMkill, "kill", 1, + CMpri, "pri", 2, + CMprivate, "private", 1, + CMprofile, "profile", 1, + CMstart, "start", 1, + CMstartstop, "startstop", 1, + CMstartsyscall, "startsyscall", 1, + CMstop, "stop", 1, + CMwaitstop, "waitstop", 1, + CMwired, "wired", 2, + CMtrace, "trace", 0, + CMperiod, "period", 2, + CMdeadline, "deadline", 2, + CMcost, "cost", 2, + CMsporadic, "sporadic", 1, + CMdeadlinenotes, "deadlinenotes", 1, + CMadmit, "admit", 1, + CMextra, "extra", 1, + CMexpel, "expel", 1, + CMevent, "event", 1, +}; + +/* Segment type from portdat.h */ +static char *sname[]={ "Text", "Data", "Bss", "Stack", "Shared", "Phys", }; + +/* + * Qids are, in path: + * 5 bits of file type (qids above) + * 26 bits of process slot number + 1 + * in vers, + * 32 bits of pid, for consistency checking + * If notepg, c->pgrpid.path is pgrp slot, .vers is noteid. 
+ */ +#define QSHIFT 5 /* location in qid of proc slot # */ + +#define QID(q) ((((ulong)(q).path) & ((1<> 0) +#define SLOT(q) (((((ulong)(q).path) & ~(1UL<<31)) >> QSHIFT) - 1) +#define PID(q) ((q).vers) +#define NOTEID(q) ((q).vers) + +void procctlreq(Proc*, char*, int); +int procctlmemio(Proc*, uintptr, int, void*, int); +Chan* proctext(Chan*, Proc*); +Segment* txt2data(Proc*, Segment*); +int procstopped(void*); +void mntscan(Mntwalk*, Proc*); + +static Traceevent *tevents; +static Lock tlock; +static int topens; +static int tproduced, tconsumed; +void (*proctrace)(Proc*, int, vlong); + +extern int unfair; + +static void +profclock(Ureg *ur, Timer *) +{ + Tos *tos; + + if(up == 0 || up->state != Running) + return; + + /* user profiling clock */ + if(userureg(ur)){ + tos = (Tos*)(USTKTOP-sizeof(Tos)); + tos->clock += TK2MS(1); + segclock(ur->pc); + } +} + +static int +procgen(Chan *c, char *name, Dirtab *tab, int, int s, Dir *dp) +{ + Qid qid; + Proc *p; + char *ename; + Segment *q; + ulong pid, path, perm, len; + + if(s == DEVDOTDOT){ + mkqid(&qid, Qdir, 0, QTDIR); + devdir(c, qid, "#p", 0, eve, 0555, dp); + return 1; + } + + if(c->qid.path == Qdir){ + if(s == 0){ + strcpy(up->genbuf, "trace"); + mkqid(&qid, Qtrace, -1, QTFILE); + devdir(c, qid, up->genbuf, 0, eve, 0444, dp); + return 1; + } + + if(name != nil){ + /* ignore s and use name to find pid */ + pid = strtol(name, &ename, 10); + if(pid==0 || ename[0]!='\0') + return -1; + s = procindex(pid); + if(s < 0) + return -1; + } + else if(--s >= conf.nproc) + return -1; + + p = proctab(s); + pid = p->pid; + if(pid == 0) + return 0; + snprint(up->genbuf, sizeof up->genbuf, "%lud", pid); + /* + * String comparison is done in devwalk so name must match its formatted pid + */ + if(name != nil && strcmp(name, up->genbuf) != 0) + return -1; + mkqid(&qid, (s+1)<genbuf, 0, p->user, DMDIR|0555, dp); + return 1; + } + if(c->qid.path == Qtrace){ + strcpy(up->genbuf, "trace"); + mkqid(&qid, Qtrace, -1, QTFILE); + devdir(c, 
qid, up->genbuf, 0, eve, 0444, dp); + return 1; + } + if(s >= nelem(procdir)) + return -1; + if(tab) + panic("procgen"); + + tab = &procdir[s]; + path = c->qid.path&~(((1<procmode determines default mode for files in /proc */ + p = proctab(SLOT(c->qid)); + perm = tab->perm; + if(perm == 0) + perm = p->procmode; + else /* just copy read bits */ + perm |= p->procmode & 0444; + + len = tab->length; + switch(QID(c->qid)) { + case Qwait: + len = p->nwait; /* incorrect size, but >0 means there's something to read */ + break; + case Qprofile: + q = p->seg[TSEG]; + if(q && q->profile) { + len = (q->top-q->base)>>LRESPROF; + len *= sizeof(*q->profile); + } + break; + } + + mkqid(&qid, path|tab->qid.path, c->qid.vers, QTFILE); + devdir(c, qid, tab->name, len, p->user, perm, dp); + return 1; +} + +static void +_proctrace(Proc* p, Tevent etype, vlong ts) +{ + Traceevent *te; + + if (p->trace == 0 || topens == 0 || + tproduced - tconsumed >= Nevents) + return; + + te = &tevents[tproduced&Emask]; + te->pid = p->pid; + te->etype = etype; + if (ts == 0) + te->time = todget(nil); + else + te->time = ts; + tproduced++; +} + +static void +procinit(void) +{ + if(conf.nproc >= (1<<(31-QSHIFT))-1) + print("warning: too many procs for devproc\n"); + addclock0link((void (*)(void))profclock, 113); /* Relative prime to HZ */ +} + +static Chan* +procattach(char *spec) +{ + return devattach('p', spec); +} + +static Walkqid* +procwalk(Chan *c, Chan *nc, char **name, int nname) +{ + return devwalk(c, nc, name, nname, 0, 0, procgen); +} + +static int +procstat(Chan *c, uchar *db, int n) +{ + return devstat(c, db, n, 0, 0, procgen); +} + +/* + * none can't read or write state on other + * processes. This is to contain access of + * servers running as none should they be + * subverted by, for example, a stack attack. 
+ */ +static void +nonone(Proc *p) +{ + if(p == up) + return; + if(strcmp(up->user, "none") != 0) + return; + if(iseve()) + return; + error(Eperm); +} + +static Chan* +procopen(Chan *c, int omode) +{ + Proc *p; + Pgrp *pg; + Chan *tc; + int pid; + + if(c->qid.type & QTDIR) + return devopen(c, omode, 0, 0, procgen); + + if(QID(c->qid) == Qtrace){ + if (omode != OREAD) + error(Eperm); + lock(&tlock); + if (waserror()){ + unlock(&tlock); + nexterror(); + } + if (topens > 0) + error("already open"); + topens++; + if (tevents == nil){ + tevents = (Traceevent*)malloc(sizeof(Traceevent) * Nevents); + if(tevents == nil) + error(Enomem); + tproduced = tconsumed = 0; + } + proctrace = _proctrace; + unlock(&tlock); + poperror(); + + c->mode = openmode(omode); + c->flag |= COPEN; + c->offset = 0; + return c; + } + + p = proctab(SLOT(c->qid)); + qlock(&p->debug); + if(waserror()){ + qunlock(&p->debug); + nexterror(); + } + pid = PID(c->qid); + if(p->pid != pid) + error(Eprocdied); + + omode = openmode(omode); + + switch(QID(c->qid)){ + case Qtext: + if(omode != OREAD) + error(Eperm); + tc = proctext(c, p); + tc->offset = 0; + qunlock(&p->debug); + poperror(); + cclose(c); + return tc; + + case Qproc: + case Qkregs: + case Qsegment: + case Qprofile: + case Qfd: + if(omode != OREAD) + error(Eperm); + break; + + case Qnote: + if(p->privatemem) + error(Eperm); + break; + + case Qmem: + case Qctl: + if(p->privatemem) + error(Eperm); + nonone(p); + break; + + case Qargs: + case Qnoteid: + case Qstatus: + case Qwait: + case Qregs: + case Qfpregs: + case Qsyscall: + nonone(p); + break; + + case Qns: + if(omode != OREAD) + error(Eperm); + c->aux = malloc(sizeof(Mntwalk)); + break; + + case Qnotepg: + nonone(p); + pg = p->pgrp; + if(pg == nil) + error(Eprocdied); + if(omode!=OWRITE || pg->pgrpid == 1) + error(Eperm); + c->pgrpid.path = pg->pgrpid+1; + c->pgrpid.vers = p->noteid; + break; + + default: + pprint("procopen %#lux\n", QID(c->qid)); + error(Egreg); + } + + /* Affix pid to qid 
*/ + if(p->state != Dead) + c->qid.vers = p->pid; + + /* make sure the process slot didn't get reallocated while we were playing */ + coherence(); + if(p->pid != pid) + error(Eprocdied); + + tc = devopen(c, omode, 0, 0, procgen); + qunlock(&p->debug); + poperror(); + + return tc; +} + +static int +procwstat(Chan *c, uchar *db, int n) +{ + Proc *p; + Dir *d; + + if(c->qid.type&QTDIR) + error(Eperm); + + if(QID(c->qid) == Qtrace) + return devwstat(c, db, n); + + p = proctab(SLOT(c->qid)); + nonone(p); + d = nil; + if(waserror()){ + free(d); + qunlock(&p->debug); + nexterror(); + } + qlock(&p->debug); + + if(p->pid != PID(c->qid)) + error(Eprocdied); + + if(strcmp(up->user, p->user) != 0 && strcmp(up->user, eve) != 0) + error(Eperm); + + d = smalloc(sizeof(Dir)+n); + n = convM2D(db, n, &d[0], (char*)&d[1]); + if(n == 0) + error(Eshortstat); + if(!emptystr(d->uid) && strcmp(d->uid, p->user) != 0){ + if(strcmp(up->user, eve) != 0) + error(Eperm); + else + kstrdup(&p->user, d->uid); + } + /* p->procmode determines default mode for files in /proc */ + if(d->mode != ~0UL) + p->procmode = d->mode&0777; + + poperror(); + free(d); + qunlock(&p->debug); + return n; +} + + +static long +procoffset(long offset, char *va, int *np) +{ + if(offset > 0) { + offset -= *np; + if(offset < 0) { + memmove(va, va+*np+offset, -offset); + *np = -offset; + } + else + *np = 0; + } + return offset; +} + +static int +procqidwidth(Chan *c) +{ + char buf[32]; + + return snprint(buf, sizeof buf, "%lud", c->qid.vers); +} + +int +procfdprint(Chan *c, int fd, int w, char *s, int ns) +{ + int n; + + if(w == 0) + w = procqidwidth(c); + n = snprint(s, ns, "%3d %.2s %C %4ld (%.16llux %*lud %.2ux) %5ld %8lld %s\n", + fd, + &"r w rw"[(c->mode&3)<<1], + devtab[c->type]->dc, c->dev, + c->qid.path, w, c->qid.vers, c->qid.type, + c->iounit, c->offset, c->path->s); + return n; +} + +static int +procfds(Proc *p, char *va, int count, long offset) +{ + Fgrp *f; + Chan *c; + char buf[256]; + int n, i, w, ww; + char 
*a; + + /* print to buf to avoid holding fgrp lock while writing to user space */ + if(count > sizeof buf) + count = sizeof buf; + a = buf; + + qlock(&p->debug); + f = p->fgrp; + if(f == nil){ + qunlock(&p->debug); + return 0; + } + lock(f); + if(waserror()){ + unlock(f); + qunlock(&p->debug); + nexterror(); + } + + n = readstr(0, a, count, p->dot->path->s); + n += snprint(a+n, count-n, "\n"); + offset = procoffset(offset, a, &n); + /* compute width of qid.path */ + w = 0; + for(i = 0; i <= f->maxfd; i++) { + c = f->fd[i]; + if(c == nil) + continue; + ww = procqidwidth(c); + if(ww > w) + w = ww; + } + for(i = 0; i <= f->maxfd; i++) { + c = f->fd[i]; + if(c == nil) + continue; + n += procfdprint(c, i, w, a+n, count-n); + offset = procoffset(offset, a, &n); + } + unlock(f); + qunlock(&p->debug); + poperror(); + + /* copy result to user space, now that locks are released */ + memmove(va, buf, n); + + return n; +} + +static void +procclose(Chan * c) +{ + if(QID(c->qid) == Qtrace){ + lock(&tlock); + if(topens > 0) + topens--; + if(topens == 0) + proctrace = nil; + unlock(&tlock); + } + if(QID(c->qid) == Qns && c->aux != 0) + free(c->aux); +} + +static void +int2flag(int flag, char *s) +{ + if(flag == 0){ + *s = '\0'; + return; + } + *s++ = '-'; + if(flag & MAFTER) + *s++ = 'a'; + if(flag & MBEFORE) + *s++ = 'b'; + if(flag & MCREATE) + *s++ = 'c'; + if(flag & MCACHE) + *s++ = 'C'; + *s = '\0'; +} + +static int +procargs(Proc *p, char *buf, int nbuf) +{ + int j, k, m; + char *a; + int n; + + a = p->args; + if(p->setargs){ + snprint(buf, nbuf, "%s [%s]", p->text, p->args); + return strlen(buf); + } + n = p->nargs; + for(j = 0; j < nbuf - 1; j += m){ + if(n <= 0) + break; + if(j != 0) + buf[j++] = ' '; + m = snprint(buf+j, nbuf-j, "%q", a); + k = strlen(a) + 1; + a += k; + n -= k; + } + return j; +} + +static int +eventsavailable(void *) +{ + return tproduced > tconsumed; +} + +static long +procread(Chan *c, void *va, long n, vlong off) +{ + /* NSEG*32 was too small for 
worst cases */ + char *a, flag[10], *sps, *srv, statbuf[NSEG*64]; + int i, j, m, navail, ne, pid, rsize; + long l; + uchar *rptr; + uintptr offset; + Confmem *cm; + Mntwalk *mw; + Proc *p; + Segment *sg, *s; + Ureg kur; + Waitq *wq; + + a = va; + offset = off; + + if(c->qid.type & QTDIR) + return devdirread(c, a, n, 0, 0, procgen); + + if(QID(c->qid) == Qtrace){ + if(!eventsavailable(nil)) + return 0; + + rptr = (uchar*)va; + navail = tproduced - tconsumed; + if(navail > n / sizeof(Traceevent)) + navail = n / sizeof(Traceevent); + while(navail > 0) { + ne = ((tconsumed & Emask) + navail > Nevents)? + Nevents - (tconsumed & Emask): navail; + memmove(rptr, &tevents[tconsumed & Emask], + ne * sizeof(Traceevent)); + + tconsumed += ne; + rptr += ne * sizeof(Traceevent); + navail -= ne; + } + return rptr - (uchar*)va; + } + + p = proctab(SLOT(c->qid)); + if(p->pid != PID(c->qid)) + error(Eprocdied); + + switch(QID(c->qid)){ + case Qargs: + qlock(&p->debug); + j = procargs(p, up->genbuf, sizeof up->genbuf); + qunlock(&p->debug); + if(offset >= j) + return 0; + if(offset+n > j) + n = j-offset; + memmove(a, &up->genbuf[offset], n); + return n; + case Qsyscall: + if(!p->syscalltrace) + return 0; + n = readstr(offset, a, n, p->syscalltrace); + return n; + + case Qmem: + if(offset < KZERO) + return procctlmemio(p, offset, n, va, 1); + + if(!iseve()) + error(Eperm); + + /* validate kernel addresses */ + if(offset < (uintptr)end) { + if(offset+n > (uintptr)end) + n = (uintptr)end - offset; + memmove(a, (char*)offset, n); + return n; + } + for(i=0; ikbase <= offset && offset <= cm->klimit-1){ + if(offset+n >= cm->klimit-1) + n = cm->klimit - offset; + memmove(a, (char*)offset, n); + return n; + } + } + error(Ebadarg); + + case Qprofile: + s = p->seg[TSEG]; + if(s == 0 || s->profile == 0) + error("profile is off"); + i = (s->top-s->base)>>LRESPROF; + i *= sizeof(*s->profile); + if(offset >= i) + return 0; + if(offset+n > i) + n = i - offset; + memmove(a, 
((char*)s->profile)+offset, n); + return n; + + case Qnote: + qlock(&p->debug); + if(waserror()){ + qunlock(&p->debug); + nexterror(); + } + if(p->pid != PID(c->qid)) + error(Eprocdied); + if(n < 1) /* must accept at least the '\0' */ + error(Etoosmall); + if(p->nnote == 0) + n = 0; + else { + m = strlen(p->note[0].msg) + 1; + if(m > n) + m = n; + memmove(va, p->note[0].msg, m); + ((char*)va)[m-1] = '\0'; + p->nnote--; + memmove(p->note, p->note+1, p->nnote*sizeof(Note)); + n = m; + } + if(p->nnote == 0) + p->notepending = 0; + poperror(); + qunlock(&p->debug); + return n; + + case Qproc: + if(offset >= sizeof(Proc)) + return 0; + if(offset+n > sizeof(Proc)) + n = sizeof(Proc) - offset; + memmove(a, ((char*)p)+offset, n); + return n; + + case Qregs: + rptr = (uchar*)p->dbgreg; + rsize = sizeof(Ureg); + goto regread; + + case Qkregs: + memset(&kur, 0, sizeof(Ureg)); + setkernur(&kur, p); + rptr = (uchar*)&kur; + rsize = sizeof(Ureg); + goto regread; + + case Qfpregs: + rptr = (uchar*)&p->fpsave; + rsize = sizeof(FPsave); + regread: + if(rptr == 0) + error(Enoreg); + if(offset >= rsize) + return 0; + if(offset+n > rsize) + n = rsize - offset; + memmove(a, rptr+offset, n); + return n; + + case Qstatus: + if(offset >= STATSIZE) + return 0; + if(offset+n > STATSIZE) + n = STATSIZE - offset; + + sps = p->psstate; + if(sps == 0) + sps = statename[p->state]; + memset(statbuf, ' ', sizeof statbuf); + readstr(0, statbuf+0*KNAMELEN, KNAMELEN-1, p->text); + readstr(0, statbuf+1*KNAMELEN, KNAMELEN-1, p->user); + readstr(0, statbuf+2*KNAMELEN, 11, sps); + j = 2*KNAMELEN + 12; + + for(i = 0; i < 6; i++) { + l = p->time[i]; + if(i == TReal) + l = MACHP(0)->ticks - l; + l = TK2MS(l); + readnum(0, statbuf+j+NUMSIZE*i, NUMSIZE, l, NUMSIZE); + } + /* ignore stack, which is mostly non-existent */ + l = 0; + for(i=1; iseg[i]; + if(s) + l += s->top - s->base; + } + readnum(0, statbuf+j+NUMSIZE*6, NUMSIZE, l>>10, NUMSIZE); + readnum(0, statbuf+j+NUMSIZE*7, NUMSIZE, p->basepri, NUMSIZE); + 
readnum(0, statbuf+j+NUMSIZE*8, NUMSIZE, p->priority, NUMSIZE); + memmove(a, statbuf+offset, n); + return n; + + case Qsegment: + j = 0; + for(i = 0; i < NSEG; i++) { + sg = p->seg[i]; + if(sg == 0) + continue; + j += snprint(statbuf+j, sizeof statbuf - j, + "%-6s %c%c %#p %#p %4ld\n", + sname[sg->type&SG_TYPE], + sg->type&SG_RONLY ? 'R' : ' ', + sg->profile ? 'P' : ' ', + sg->base, sg->top, sg->ref); + } + if(offset >= j) + return 0; + if(offset+n > j) + n = j-offset; + if(n == 0 && offset == 0) + exhausted("segments"); + memmove(a, &statbuf[offset], n); + return n; + + case Qwait: + if(!canqlock(&p->qwaitr)) + error(Einuse); + + if(waserror()) { + qunlock(&p->qwaitr); + nexterror(); + } + + lock(&p->exl); + if(up == p && p->nchild == 0 && p->waitq == 0) { + unlock(&p->exl); + error(Enochild); + } + pid = p->pid; + while(p->waitq == 0) { + unlock(&p->exl); + sleep(&p->waitr, haswaitq, p); + if(p->pid != pid) + error(Eprocdied); + lock(&p->exl); + } + wq = p->waitq; + p->waitq = wq->next; + p->nwait--; + unlock(&p->exl); + + qunlock(&p->qwaitr); + poperror(); + n = snprint(a, n, "%d %lud %lud %lud %q", + wq->w.pid, + wq->w.time[TUser], wq->w.time[TSys], wq->w.time[TReal], + wq->w.msg); + free(wq); + return n; + + case Qns: + qlock(&p->debug); + if(waserror()){ + qunlock(&p->debug); + nexterror(); + } + if(p->pgrp == nil || p->pid != PID(c->qid)) + error(Eprocdied); + mw = c->aux; + if(mw == nil) + error(Enomem); + if(mw->cddone){ + qunlock(&p->debug); + poperror(); + return 0; + } + mntscan(mw, p); + if(mw->mh == 0){ + mw->cddone = 1; + i = snprint(a, n, "cd %s\n", p->dot->path->s); + qunlock(&p->debug); + poperror(); + return i; + } + int2flag(mw->cm->mflag, flag); + if(strcmp(mw->cm->to->path->s, "#M") == 0){ + srv = srvname(mw->cm->to->mchan); + i = snprint(a, n, "mount %s %s %s %s\n", flag, + srv==nil? mw->cm->to->mchan->path->s : srv, + mw->mh->from->path->s, mw->cm->spec? 
mw->cm->spec : ""); + free(srv); + }else + i = snprint(a, n, "bind %s %s %s\n", flag, + mw->cm->to->path->s, mw->mh->from->path->s); + qunlock(&p->debug); + poperror(); + return i; + + case Qnoteid: + return readnum(offset, va, n, p->noteid, NUMSIZE); + case Qfd: + return procfds(p, va, n, offset); + } + error(Egreg); + return 0; /* not reached */ +} + +void +mntscan(Mntwalk *mw, Proc *p) +{ + Pgrp *pg; + Mount *t; + Mhead *f; + int nxt, i; + ulong last, bestmid; + + pg = p->pgrp; + rlock(&pg->ns); + + nxt = 0; + bestmid = ~0; + + last = 0; + if(mw->mh) + last = mw->cm->mountid; + + for(i = 0; i < MNTHASH; i++) { + for(f = pg->mnthash[i]; f; f = f->hash) { + for(t = f->mount; t; t = t->next) { + if(mw->mh == 0 || + (t->mountid > last && t->mountid < bestmid)) { + mw->cm = t; + mw->mh = f; + bestmid = mw->cm->mountid; + nxt = 1; + } + } + } + } + if(nxt == 0) + mw->mh = 0; + + runlock(&pg->ns); +} + +static long +procwrite(Chan *c, void *va, long n, vlong off) +{ + int id, m; + Proc *p, *t, *et; + char *a, *arg, buf[ERRMAX]; + ulong offset = off; + + a = va; + if(c->qid.type & QTDIR) + error(Eisdir); + + p = proctab(SLOT(c->qid)); + + /* Use the remembered noteid in the channel rather + * than the process pgrpid + */ + if(QID(c->qid) == Qnotepg) { + pgrpnote(NOTEID(c->pgrpid), va, n, NUser); + return n; + } + + qlock(&p->debug); + if(waserror()){ + qunlock(&p->debug); + nexterror(); + } + if(p->pid != PID(c->qid)) + error(Eprocdied); + + switch(QID(c->qid)){ + case Qargs: + if(n == 0) + error(Eshort); + if(n >= ERRMAX) + error(Etoobig); + arg = malloc(n+1); + if(arg == nil) + error(Enomem); + memmove(arg, va, n); + m = n; + if(arg[m-1] != 0) + arg[m++] = 0; + free(p->args); + p->nargs = m; + p->args = arg; + p->setargs = 1; + break; + + case Qmem: + if(p->state != Stopped) + error(Ebadctl); + + n = procctlmemio(p, offset, n, va, 0); + break; + + case Qregs: + if(offset >= sizeof(Ureg)) + n = 0; + else if(offset+n > sizeof(Ureg)) + n = sizeof(Ureg) - offset; + 
		/* procwrite, continued: Qregs length already clamped above */
		if(p->dbgreg == 0)
			error(Enoreg);
		setregisters(p->dbgreg, (char*)(p->dbgreg)+offset, va, n);
		break;

	case Qfpregs:
		if(offset >= sizeof(FPsave))
			n = 0;
		else if(offset+n > sizeof(FPsave))
			n = sizeof(FPsave) - offset;
		memmove((uchar*)&p->fpsave+offset, va, n);
		break;

	case Qctl:
		procctlreq(p, va, n);
		break;

	case Qnote:
		/* post a user note to the target; kernel procs take no notes */
		if(p->kp)
			error(Eperm);
		if(n >= ERRMAX-1)
			error(Etoobig);
		memmove(buf, va, n);
		buf[n] = 0;
		if(!postnote(p, 0, buf, NUser))
			error("note not posted");
		break;
	case Qnoteid:
		/* may only join a note group owned by the same user */
		id = atoi(a);
		if(id == p->pid) {
			p->noteid = id;
			break;
		}
		t = proctab(0);
		for(et = t+conf.nproc; t < et; t++) {
			if(t->state == Dead)
				continue;
			if(id == t->noteid) {
				if(strcmp(p->user, t->user) != 0)
					error(Eperm);
				p->noteid = id;
				break;
			}
		}
		if(p->noteid != id)
			error(Ebadarg);
		break;
	default:
		pprint("unknown qid in procwrite\n");
		error(Egreg);
	}
	poperror();
	qunlock(&p->debug);
	return n;
}

Dev procdevtab = {
	'p',
	"proc",

	devreset,
	procinit,
	devshutdown,
	procattach,
	procwalk,
	procstat,
	procopen,
	devcreate,
	procclose,
	procread,
	devbread,
	procwrite,
	devbwrite,
	devremove,
	procwstat,
};

/*
 * Return (with an extra reference) the channel the process's text was
 * demand-loaded from; errors if the process has died or the text image
 * has been torn down.
 */
Chan*
proctext(Chan *c, Proc *p)
{
	Chan *tc;
	Image *i;
	Segment *s;

	s = p->seg[TSEG];
	if(s == 0)
		error(Enonexist);
	if(p->state==Dead)
		error(Eprocdied);

	lock(s);
	i = s->image;
	if(i == 0) {
		unlock(s);
		error(Eprocdied);
	}
	unlock(s);

	lock(i);
	if(waserror()) {
		unlock(i);
		nexterror();
	}

	tc = i->c;
	if(tc == 0)
		error(Eprocdied);

	/* channel must already be open for read; a fresh ref alone means it is dying */
	if(incref(tc) == 1 || (tc->flag&COPEN) == 0 || tc->mode!=OREAD) {
		cclose(tc);
		error(Eprocdied);
	}

	if(p->pid != PID(c->qid)){
		cclose(tc);
		error(Eprocdied);
	}

	unlock(i);
	poperror();

	return tc;
}

/*
 * Post ctl (if non-zero) to p and sleep until p stops.
 * Called with p->debug qlocked; drops and reacquires it around the sleep.
 */
void
procstopwait(Proc *p, int ctl)
{
	int pid;

	if(p->pdbg)
		error(Einuse);
	if(procstopped(p) || p->state == Broken)
		return;

	if(ctl != 0)
		p->procctl = ctl;
	p->pdbg = up;
	pid = p->pid;
	qunlock(&p->debug);
	up->psstate = "Stopwait";
	if(waserror()) {
		p->pdbg = 0;
		qlock(&p->debug);
		nexterror();
	}
	sleep(&up->sleep, procstopped, p);
	poperror();
	qlock(&p->debug);
	if(p->pid != pid)
		error(Eprocdied);
}

/*
 * Close one fd of p's group.  Caller holds f locked and p->debug qlocked;
 * both are dropped around the (possibly blocking) cclose and retaken.
 */
static void
procctlcloseone(Proc *p, Fgrp *f, int fd)
{
	Chan *c;

	c = f->fd[fd];
	if(c == nil)
		return;
	f->fd[fd] = nil;
	unlock(f);
	qunlock(&p->debug);
	cclose(c);
	qlock(&p->debug);
	lock(f);
}

/* close fd, or every fd when all is set, in p's file descriptor group */
void
procctlclosefiles(Proc *p, int all, int fd)
{
	int i;
	Fgrp *f;

	f = p->fgrp;
	if(f == nil)
		error(Eprocdied);

	lock(f);
	f->ref++;	/* keep the group alive while debug lock is dropped */
	if(all)
		for(i = 0; i < f->maxfd; i++)
			procctlcloseone(p, f, i);
	else
		procctlcloseone(p, f, fd);
	unlock(f);
	closefgrp(f);
}

/*
 * Parse "seconds[.fraction]" with an optional s/ms/µs/us/ns suffix into
 * nanoseconds in *rt.  Returns nil on success, else an error string.
 */
static char *
parsetime(vlong *rt, char *s)
{
	uvlong ticks;
	ulong l;
	char *e, *p;
	static int p10[] = {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1};

	if (s == nil)
		return("missing value");
	ticks=strtoul(s, &e, 10);
	if (*e == '.'){
		p = e+1;
		l = strtoul(p, &e, 10);
		if(e-p > nelem(p10))
			return "too many digits after decimal point";
		if(e-p == 0)
			return "ill-formed number";
		l *= p10[e-p-1];	/* scale fraction to nanoseconds */
	}else
		l = 0;
	if (*e == '\0' || strcmp(e, "s") == 0){
		ticks = 1000000000 * ticks + l;
	}else if (strcmp(e, "ms") == 0){
		ticks = 1000000 * ticks + l/1000;
	}else if (strcmp(e, "µs") == 0 || strcmp(e, "us") == 0){
		ticks = 1000 * ticks + l/1000000;
	}else if (strcmp(e, "ns") != 0)
		return "unrecognized unit";
	*rt = ticks;
	return nil;
}

/* handle a write to /proc/n/ctl; caller holds p->debug qlocked */
void
procctlreq(Proc *p, char *va, int n)
{
	Segment *s;
	int npc, pri;
	Cmdbuf *cb;
	Cmdtab *ct;
	vlong time;
	char *e;
	void (*pt)(Proc*, int, vlong);

	if(p->kp)	/* no ctl requests to kprocs */
		error(Eperm);

	cb = parsecmd(va, n);
	if(waserror()){
		free(cb);
		nexterror();
	}

	ct = lookupcmd(cb, proccmd, nelem(proccmd));

	switch(ct->index){
	case CMclose:
		procctlclosefiles(p, 0, atoi(cb->f[1]));
		break;
	case CMclosefiles:
		procctlclosefiles(p, 1, 0);
		break;
	case CMhang:
		p->hang = 1;
		break;
	case CMkill:
		switch(p->state) {
		case Broken:
			unbreak(p);
			break;
		case Stopped:
			p->procctl = Proc_exitme;
			postnote(p, 0, "sys: killed", NExit);
			ready(p);
			break;
		default:
			p->procctl = Proc_exitme;
			postnote(p, 0, "sys: killed", NExit);
		}
		break;
	case CMnohang:
		p->hang = 0;
		break;
	case CMnoswap:
		p->noswap = 1;
		break;
	case CMpri:
		/* only eve may raise priority above PriNormal */
		pri = atoi(cb->f[1]);
		if(pri > PriNormal && !iseve())
			error(Eperm);
		procpriority(p, pri, 0);
		break;
	case CMfixedpri:
		pri = atoi(cb->f[1]);
		if(pri > PriNormal && !iseve())
			error(Eperm);
		procpriority(p, pri, 1);
		break;
	case CMprivate:
		p->privatemem = 1;
		break;
	case CMprofile:
		/* (re)allocate the pc-sampling buffer over the text segment */
		s = p->seg[TSEG];
		if(s == 0 || (s->type&SG_TYPE) != SG_TEXT)
			error(Ebadctl);
		if(s->profile != 0)
			free(s->profile);
		npc = (s->top-s->base)>>LRESPROF;
		s->profile = malloc(npc*sizeof(*s->profile));
		if(s->profile == 0)
			error(Enomem);
		break;
	case CMstart:
		if(p->state != Stopped)
			error(Ebadctl);
		ready(p);
		break;
	case CMstartstop:
		if(p->state != Stopped)
			error(Ebadctl);
		p->procctl = Proc_traceme;
		ready(p);
		procstopwait(p, Proc_traceme);
		break;
	case CMstartsyscall:
		if(p->state != Stopped)
			error(Ebadctl);
		p->procctl = Proc_tracesyscall;
		ready(p);
		procstopwait(p, Proc_tracesyscall);
		break;
	case CMstop:
		procstopwait(p, Proc_stopme);
		break;
	case CMwaitstop:
		procstopwait(p, 0);
		break;
	case CMwired:
		procwired(p, atoi(cb->f[1]));
		break;
	case CMtrace:
		switch(cb->nf){
		case 1:
			p->trace ^= 1;
			break;
		case 2:
			p->trace = (atoi(cb->f[1]) != 0);
			break;
		default:
			error("args");
		}
		break;
	/* real time */
	case CMperiod:
		if(p->edf == nil)
			edfinit(p);
		if(e=parsetime(&time, cb->f[1]))	/* time in ns */
			error(e);
		edfstop(p);
		p->edf->T = time/1000;			/* Edf times are in µs */
		break;
	case CMdeadline:
		if(p->edf == nil)
			edfinit(p);
		if(e=parsetime(&time, cb->f[1]))
			error(e);
		edfstop(p);
		p->edf->D = time/1000;
		break;
	case CMcost:
		if(p->edf == nil)
			edfinit(p);
		if(e=parsetime(&time, cb->f[1]))
			error(e);
		edfstop(p);
		p->edf->C = time/1000;
		break;
	case CMsporadic:
		if(p->edf == nil)
			edfinit(p);
		p->edf->flags |= Sporadic;
		break;
	case CMdeadlinenotes:
		if(p->edf == nil)
			edfinit(p);
		p->edf->flags |= Sendnotes;
		break;
	case CMadmit:
		if(p->edf == 0)
			error("edf params");
		if(e = edfadmit(p))
			error(e);
		break;
	case CMextra:
		if(p->edf == nil)
			edfinit(p);
		p->edf->flags |= Extratime;
		break;
	case CMexpel:
		if(p->edf)
			edfstop(p);
		break;
	case CMevent:
		pt = proctrace;
		if(up->trace && pt)
			pt(up, SUser, 0);
		break;
	}

	poperror();
	free(cb);
}

/* sleep predicate: true once p has entered Stopped */
int
procstopped(void *a)
{
	Proc *p = a;
	return p->state == Stopped;
}

/*
 * Copy up to n bytes between va and the page backing offset in p's
 * address space (read!=0: process->va, else va->process).  Faults the
 * page in first; transfer is clamped to one page.
 */
int
procctlmemio(Proc *p, uintptr offset, int n, void *va, int read)
{
	KMap *k;
	Pte *pte;
	Page *pg;
	Segment *s;
	uintptr soff, l;
	char *a = va, *b;

	for(;;) {
		s = seg(p, offset, 1);	/* returns with s->lk qlocked */
		if(s == 0)
			error(Ebadarg);

		if(offset+n >= s->top)
			n = s->top-offset;

		/* writing text: convert to a private data segment first */
		if(!read && (s->type&SG_TYPE) == SG_TEXT)
			s = txt2data(p, s);

		s->steal++;	/* keep the pager away while we work on the page */
		soff = offset-s->base;
		if(waserror()) {
			s->steal--;
			nexterror();
		}
		if(fixfault(s, offset, read, 0) == 0)
			break;
		poperror();
		s->steal--;
	}
	poperror();
	pte = s->map[soff/PTEMAPMEM];
	if(pte == 0)
		panic("procctlmemio");
	pg = pte->pages[(soff&(PTEMAPMEM-1))/BY2PG];
	if(pagedout(pg))
		panic("procctlmemio1");

	l = BY2PG - (offset&(BY2PG-1));
	if(n > l)
		n = l;

	k = kmap(pg);
	if(waserror()) {
		s->steal--;
		kunmap(k);
		nexterror();
	}
	b = (char*)VA(k);
	b += offset&(BY2PG-1);
	if(read == 1)
		memmove(a, b, n);	/* This can fault */
	else
		memmove(b, a, n);
	kunmap(k);
	poperror();

	/* Ensure the process sees text page changes */
	if(s->flushme)
		memset(pg->cachectl,
			PG_TXTFLUSH, sizeof(pg->cachectl));

	s->steal--;

	if(read == 0)
		p->newtlb = 1;	/* force the target to rebuild its mappings */

	return n;
}

/*
 * Replace text segment s in p with an equivalent private data segment
 * so the debugger can write breakpoints into it.  Enters with s->lk
 * qlocked; returns the new segment with its lk qlocked.
 */
Segment*
txt2data(Proc *p, Segment *s)
{
	int i;
	Segment *ps;

	ps = newseg(SG_DATA, s->base, s->size);
	ps->image = s->image;
	incref(ps->image);
	ps->fstart = s->fstart;
	ps->flen = s->flen;
	ps->flushme = 1;

	qlock(&p->seglock);
	for(i = 0; i < NSEG; i++)
		if(p->seg[i] == s)
			break;
	if(i == NSEG)
		panic("segment gone");

	qunlock(&s->lk);
	putseg(s);
	qlock(&ps->lk);
	p->seg[i] = ps;
	qunlock(&p->seglock);

	return ps;
}

/* build a fresh text segment sharing s's backing image (inverse of txt2data) */
Segment*
data2txt(Segment *s)
{
	Segment *ps;

	ps = newseg(SG_TEXT, s->base, s->size);
	ps->image = s->image;
	incref(ps->image);
	ps->fstart = s->fstart;
	ps->flen = s->flen;
	ps->flushme = 1;

	return ps;
}
--- /dev/null
+++ /sys/src/9/port64/fault.c
@@ -0,0 +1,404 @@
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "../port/error.h"

/*
 * Resolve a page fault at addr for the current process.
 * Returns 0 on success, -1 if the address is bad or the access
 * violates segment permissions.
 */
int
fault(uintptr addr, int read)
{
	Segment *s;
	char *sps;

	if(up == nil)
		panic("fault: nil up");
	if(up->nlocks.ref)
		print("fault: addr %#p: nlocks %ld\n", addr, up->nlocks.ref);

	sps = up->psstate;
	up->psstate = "Fault";
	spllo();

	m->pfault++;
	for(;;) {
		s = seg(up, addr, 1);	/* leaves s->lk qlocked if seg != nil */
		if(s == 0) {
			up->psstate = sps;
			return -1;
		}

		if(!read && (s->type&SG_RONLY)) {
			qunlock(&s->lk);
			up->psstate = sps;
			return -1;
		}

		if(fixfault(s, addr, read, 1) == 0)	/* qunlocks s->lk */
			break;
	}

	up->psstate = sps;
	return 0;
}

/*
 * An I/O error while demand loading: post a debug note if anyone can
 * catch it, otherwise kill the process outright.
 */
static void
faulterror(char *s, Chan *c, int freemem)
{
	char buf[ERRMAX];

	if(c && c->path){
		snprint(buf, sizeof buf, "%s accessing %s: %s", s, c->path->s, up->errstr);
		s = buf;
	}
	if(up->nerrlab) {
		postnote(up, 1, s, NDebug);
		error(s);
	}
	pexit(s, freemem);
}

/* debugging hook: called when the faulting address matches addr2check */
void (*checkaddr)(uintptr, Segment *, Page *);
uintptr addr2check;

/*
 * Install a valid page at addr in segment s: demand load, zero fill,
 * page in from swap, or copy on write as the segment type requires.
 * Enters with s->lk qlocked and always releases it; installs the mmu
 * entry when doputmmu is set.  Returns 0 on success, -1 if the segment
 * was lost while sleeping for memory.
 */
int
fixfault(Segment *s, uintptr addr, int read,
	int doputmmu)
{
	int type;
	int ref;
	Pte **p, *etp;
	uintptr mmuphys=0, soff;
	Page **pg, *lkp, *new;
	Page *(*fn)(Segment*, uintptr);

	addr &= ~(BY2PG-1);
	soff = addr-s->base;
	p = &s->map[soff/PTEMAPMEM];
	if(*p == 0)
		*p = ptealloc();

	etp = *p;
	pg = &etp->pages[(soff&(PTEMAPMEM-1))/BY2PG];
	type = s->type&SG_TYPE;

	if(pg < etp->first)
		etp->first = pg;
	if(pg > etp->last)
		etp->last = pg;

	switch(type) {
	default:
		panic("fault");
		break;

	case SG_TEXT: 			/* Demand load */
		if(pagedout(*pg))
			pio(s, addr, soff, pg);

		mmuphys = PPN((*pg)->pa) | PTERONLY|PTEVALID;
		(*pg)->modref = PG_REF;
		break;

	case SG_BSS:
	case SG_SHARED:			/* Zero fill on demand */
	case SG_STACK:
		if(*pg == 0) {
			new = newpage(1, &s, addr);
			if(s == 0)
				return -1;

			*pg = new;
		}
		goto common;

	case SG_DATA:
	common:			/* Demand load/pagein/copy on write */
		if(pagedout(*pg))
			pio(s, addr, soff, pg);

		/*
		 * It's only possible to copy on write if
		 * we're the only user of the segment.
		 */
		if(read && conf.copymode == 0 && s->ref == 1) {
			mmuphys = PPN((*pg)->pa)|PTERONLY|PTEVALID;
			(*pg)->modref |= PG_REF;
			break;
		}

		lkp = *pg;
		lock(lkp);

		if(lkp->image == &swapimage)
			ref = lkp->ref + swapcount(lkp->daddr);
		else
			ref = lkp->ref;
		if(ref == 1 && lkp->image){
			/* save a copy of the original for the image cache */
			duppage(lkp);
			ref = lkp->ref;
		}
		unlock(lkp);
		if(ref > 1){
			/* shared: give the faulter a private writable copy */
			new = newpage(0, &s, addr);
			if(s == 0)
				return -1;
			*pg = new;
			copypage(lkp, *pg);
			putpage(lkp);
		}
		mmuphys = PPN((*pg)->pa) | PTEWRITE | PTEVALID;
		(*pg)->modref = PG_MOD|PG_REF;
		break;

	case SG_PHYSICAL:
		if(*pg == 0) {
			fn = s->pseg->pgalloc;
			if(fn)
				*pg = (*fn)(s, addr);
			else {
				new = smalloc(sizeof(Page));
				new->va = addr;
				new->pa = s->pseg->pa+(addr-s->base);
				new->ref = 1;
				*pg = new;
			}
		}

		if (checkaddr && addr == addr2check)
			(*checkaddr)(addr, s, *pg);
		mmuphys = PPN((*pg)->pa) |PTEWRITE|PTEUNCACHED|PTEVALID;
		(*pg)->modref = PG_MOD|PG_REF;
		break;
	}
	qunlock(&s->lk);

	if(doputmmu)
		putmmu(addr, mmuphys, *pg);

	return 0;
}

/*
 * Page in *p for segment s at addr: either demand load from the text/
 * data image or fetch from the swap image.  Drops and retakes s->lk
 * around the device read, then re-resolves races with other fetchers
 * and the pager.
 */
void
pio(Segment *s, uintptr addr, uintptr soff, Page **p)
{
	Page *new;
	KMap *k;
	Chan *c;
	int n, ask;
	char *kaddr;
	uintptr daddr;
	Page *loadrec;

retry:
	loadrec = *p;
	if(loadrec == 0) {	/* from a text/data image */
		daddr = s->fstart+soff;
		new = lookpage(s->image, daddr);
		if(new != nil) {
			*p = new;
			return;
		}

		c = s->image->c;
		ask = s->flen-soff;
		if(ask > BY2PG)
			ask = BY2PG;
	}
	else {			/* from a swap image */
		daddr = swapaddr(loadrec);
		new = lookpage(&swapimage, daddr);
		if(new != nil) {
			putswap(loadrec);
			*p = new;
			return;
		}

		c = swapimage.c;
		ask = BY2PG;
	}
	qunlock(&s->lk);

	new = newpage(0, 0, addr);
	k = kmap(new);
	kaddr = (char*)VA(k);

	while(waserror()) {
		if(strcmp(up->errstr, Eintr) == 0)
			continue;
		kunmap(k);
		putpage(new);
		faulterror(Eioload, c, 0);
	}

	n = devtab[c->type]->read(c, kaddr, ask, daddr);
	if(n != ask)
		faulterror(Eioload, c, 0);
	if(ask < BY2PG)
		memset(kaddr+ask, 0, BY2PG-ask);

	poperror();
	kunmap(k);
	qlock(&s->lk);
	if(loadrec == 0) {	/* This is demand load */
		/*
		 * race, another proc may have gotten here first while
		 * s->lk was unlocked
		 */
		if(*p == 0) {
			new->daddr = daddr;
			cachepage(new, s->image);
			*p = new;
		}
		else
			putpage(new);
	}
	else {			/* This is paged out */
		/*
		 * race, another proc may have gotten here first
		 * (and the pager may have run on that page) while
		 * s->lk was unlocked
		 */
		if(*p != loadrec){
			if(!pagedout(*p)){
				/* another process did it for me */
				putpage(new);
				goto done;
			} else {
				/* another process and the pager got in */
				putpage(new);
				goto retry;
			}
		}

		new->daddr = daddr;
		cachepage(new, &swapimage);
		*p = new;
		putswap(loadrec);
	}

done:
	if(s->flushme)
		memset((*p)->cachectl, PG_TXTFLUSH, sizeof((*p)->cachectl));
}

/*
 * Called only in a system call
 */
int
okaddr(uintptr addr, ulong len, int write)
{
	Segment *s;

	if((long)len >= 0) {
		for(;;) {
			s = seg(up, addr, 0);
			if(s == 0 || (write && (s->type&SG_RONLY)))
				break;

			if(addr+len > s->top) {
				/* range spans segments: check the remainder */
				len -= s->top - addr;
				addr = s->top;
				continue;
			}
			return 1;
		}
	}
	pprint("suicide: invalid address %#p/%lud in sys call pc=%#p\n", addr, len, userpc());
	return 0;
}

/* error out of the system call if [addr, addr+len) is not accessible */
void
validaddr(uintptr addr, ulong len, int write)
{
	if(!okaddr(addr, len, write)){
		postnote(up, 1, "sys: bad address in syscall", NDebug);
		error(Ebadarg);
	}
}

/*
 * &s[0] is known to be a valid address.
+ */ +void* +vmemchr(void *s, int c, int n) +{ + int m; + uintptr a; + void *t; + + a = (uintptr)s; + while(PGROUND(a) != PGROUND(a+n-1)){ + /* spans pages; handle this page */ + m = BY2PG - (a & (BY2PG-1)); + t = memchr((void*)a, c, m); + if(t) + return t; + a += m; + n -= m; + if(a < KZERO) + validaddr(a, 1, 0); + } + + /* fits in one page */ + return memchr((void*)a, c, n); +} + +Segment* +seg(Proc *p, uintptr addr, int dolock) +{ + Segment **s, **et, *n; + + et = &p->seg[NSEG]; + for(s = p->seg; s < et; s++) { + n = *s; + if(n == 0) + continue; + if(addr >= n->base && addr < n->top) { + if(dolock == 0) + return n; + + qlock(&n->lk); + if(addr >= n->base && addr < n->top) + return n; + qunlock(&n->lk); + } + } + + return 0; +} + +extern void checkmmu(uintptr, uintptr); +void +checkpages(void) +{ + int checked; + uintptr addr, off; + Pte *p; + Page *pg; + Segment **sp, **ep, *s; + + if(up == nil) + return; + + checked = 0; + for(sp=up->seg, ep=&up->seg[NSEG]; splk); + for(addr=s->base; addrtop; addr+=BY2PG){ + off = addr - s->base; + p = s->map[off/PTEMAPMEM]; + if(p == 0) + continue; + pg = p->pages[(off&(PTEMAPMEM-1))/BY2PG]; + if(pg == 0 || pagedout(pg)) + continue; + checkmmu(addr, pg->pa); + checked++; + } + qunlock(&s->lk); + } + print("%ld %s: checked %d page table entries\n", up->pid, up->text, checked); +} --- /dev/null +++ /sys/src/9/port64/lib.h @@ -0,0 +1,225 @@ +/* + * functions (possibly) linked in, complete, from libc. 
+ */ +#define nelem(x) (sizeof(x)/sizeof((x)[0])) +#define offsetof(s, m) (ulong)(&(((s*)0)->m)) +#define assert(x) if(x){}else _assert("x") + +/* + * mem routines + */ +extern void* memccpy(void*, void*, int, ulong); +extern void* memset(void*, int, ulong); +extern int memcmp(void*, void*, ulong); +extern void* memmove(void*, void*, ulong); +extern void* memchr(void*, int, ulong); + +/* + * string routines + */ +extern char* strcat(char*, char*); +extern char* strchr(char*, int); +extern char* strrchr(char*, int); +extern int strcmp(char*, char*); +extern char* strcpy(char*, char*); +extern char* strecpy(char*, char*, char*); +extern char* strncat(char*, char*, long); +extern char* strncpy(char*, char*, long); +extern int strncmp(char*, char*, long); +extern long strlen(char*); +extern char* strstr(char*, char*); +extern int atoi(char*); +extern int fullrune(char*, int); +extern int cistrcmp(char*, char*); +extern int cistrncmp(char*, char*, int); + +enum +{ + UTFmax = 4, /* maximum bytes per rune */ + Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ + Runeself = 0x80, /* rune and UTF sequences are the same (<) */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF, /* 24 bit rune */ + Runemask = 0x1FFFFF, /* bits used by runes (see grep) */ +}; + +/* + * rune routines + */ +extern int runetochar(char*, Rune*); +extern int chartorune(Rune*, char*); +extern char* utfrune(char*, long); +extern int utflen(char*); +extern int utfnlen(char*, long); +extern int runelen(long); + +extern int abs(int); + +/* + * print routines + */ +typedef struct Fmt Fmt; +typedef int (*Fmts)(Fmt*); +struct Fmt{ + uchar runes; /* output buffer is runes or chars? 
*/ + void *start; /* of buffer */ + void *to; /* current place in the buffer */ + void *stop; /* end of the buffer; overwritten if flush fails */ + int (*flush)(Fmt *); /* called when to == stop */ + void *farg; /* to make flush a closure */ + int nfmt; /* num chars formatted so far */ + va_list args; /* args passed to dofmt */ + int r; /* % format Rune */ + int width; + int prec; + ulong flags; +}; +extern int print(char*, ...); +extern char* seprint(char*, char*, char*, ...); +extern char* vseprint(char*, char*, char*, va_list); +extern int snprint(char*, int, char*, ...); +extern int vsnprint(char*, int, char*, va_list); +extern int sprint(char*, char*, ...); + +#pragma varargck argpos fmtprint 2 +#pragma varargck argpos print 1 +#pragma varargck argpos seprint 3 +#pragma varargck argpos snprint 3 +#pragma varargck argpos sprint 2 + +#pragma varargck type "lld" vlong +#pragma varargck type "llx" vlong +#pragma varargck type "lld" uvlong +#pragma varargck type "llx" uvlong +#pragma varargck type "ld" long +#pragma varargck type "lx" long +#pragma varargck type "ld" ulong +#pragma varargck type "lx" ulong +#pragma varargck type "d" int +#pragma varargck type "x" int +#pragma varargck type "c" int +#pragma varargck type "C" int +#pragma varargck type "d" uint +#pragma varargck type "x" uint +#pragma varargck type "c" uint +#pragma varargck type "C" uint +#pragma varargck type "s" char* +#pragma varargck type "q" char* +#pragma varargck type "S" Rune* +#pragma varargck type "%" void +#pragma varargck type "p" uintptr +#pragma varargck type "p" void* +#pragma varargck flag ',' + +extern int fmtstrinit(Fmt*); +extern int fmtinstall(int, int (*)(Fmt*)); +extern void quotefmtinstall(void); +extern int fmtprint(Fmt*, char*, ...); +extern int fmtstrcpy(Fmt*, char*); +extern char* fmtstrflush(Fmt*); + +/* + * one-of-a-kind + */ +extern char* cleanname(char*); +extern uintptr getcallerpc(void*); + +extern long strtol(char*, char**, int); +extern ulong strtoul(char*, char**, 
int); +extern vlong strtoll(char*, char**, int); +extern uvlong strtoull(char*, char**, int); +extern char etext[]; +extern char edata[]; +extern char end[]; +extern int getfields(char*, char**, int, int, char*); +extern int tokenize(char*, char**, int); +extern int dec64(uchar*, int, char*, int); +extern int encodefmt(Fmt*); +extern void qsort(void*, long, long, int (*)(void*, void*)); + +/* + * Syscall data structures + */ +#define MORDER 0x0003 /* mask for bits defining order of mounting */ +#define MREPL 0x0000 /* mount replaces object */ +#define MBEFORE 0x0001 /* mount goes before others in union directory */ +#define MAFTER 0x0002 /* mount goes after others in union directory */ +#define MCREATE 0x0004 /* permit creation in mounted directory */ +#define MCACHE 0x0010 /* cache some data */ +#define MMASK 0x0017 /* all bits on */ + +#define OREAD 0 /* open for read */ +#define OWRITE 1 /* write */ +#define ORDWR 2 /* read and write */ +#define OEXEC 3 /* execute, == read but check execute permission */ +#define OTRUNC 16 /* or'ed in (except for exec), truncate file first */ +#define OCEXEC 32 /* or'ed in, close on exec */ +#define ORCLOSE 64 /* or'ed in, remove on close */ +#define OEXCL 0x1000 /* or'ed in, exclusive create */ + +#define NCONT 0 /* continue after note */ +#define NDFLT 1 /* terminate after note */ +#define NSAVE 2 /* clear note but hold state */ +#define NRSTR 3 /* restore saved state */ + +typedef struct Qid Qid; +typedef struct Dir Dir; +typedef struct OWaitmsg OWaitmsg; +typedef struct Waitmsg Waitmsg; + +#define ERRMAX 128 /* max length of error string */ +#define KNAMELEN 28 /* max length of name held in kernel */ + +/* bits in Qid.type */ +#define QTDIR 0x80 /* type bit for directories */ +#define QTAPPEND 0x40 /* type bit for append only files */ +#define QTEXCL 0x20 /* type bit for exclusive use files */ +#define QTMOUNT 0x10 /* type bit for mounted channel */ +#define QTAUTH 0x08 /* type bit for authentication file */ +#define QTFILE 
0x00 /* plain file */ + +/* bits in Dir.mode */ +#define DMDIR 0x80000000 /* mode bit for directories */ +#define DMAPPEND 0x40000000 /* mode bit for append only files */ +#define DMEXCL 0x20000000 /* mode bit for exclusive use files */ +#define DMMOUNT 0x10000000 /* mode bit for mounted channel */ +#define DMREAD 0x4 /* mode bit for read permission */ +#define DMWRITE 0x2 /* mode bit for write permission */ +#define DMEXEC 0x1 /* mode bit for execute permission */ + +struct Qid +{ + uvlong path; + ulong vers; + uchar type; +}; + +struct Dir { + /* system-modified data */ + ushort type; /* server type */ + uint dev; /* server subtype */ + /* file data */ + Qid qid; /* unique id from server */ + ulong mode; /* permissions */ + ulong atime; /* last read time */ + ulong mtime; /* last write time */ + vlong length; /* file length: see */ + char *name; /* last element of path */ + char *uid; /* owner name */ + char *gid; /* group name */ + char *muid; /* last modifier name */ +}; + +struct OWaitmsg +{ + char pid[12]; /* of loved one */ + char time[3*12]; /* of loved one and descendants */ + char msg[64]; /* compatibility BUG */ +}; + +struct Waitmsg +{ + int pid; /* of loved one */ + ulong time[3]; /* of loved one and descendants */ + char msg[ERRMAX]; /* actually variable-size in user mode */ +}; --- /dev/null +++ /sys/src/9/port64/mksystab @@ -0,0 +1,50 @@ +#!/bin/rc + +file=/sys/src/libc/9syscall/sys.h + +cat <<'!' +#include "/sys/src/libc/9syscall/sys.h" + +typedef uintptr Syscall(uintptr*); + +! + +sed 's/#define[ ]*([A-Z0-9_][A-Z0-9_]*).*/SYS\1/; s/SYSSYSR1/SYSR1/' $file | + tr A-Z a-z | + sed 's/.*/Syscall &;/' + +cat <<'!' +Syscall sysdeath; + +Syscall *systab[]={ +! +sam -d $file >[2] /dev/null <<'!' +,s/#define.([A-Z0-9_]+).*/ [\1] SYS\1,/g +,x/SYS[A-Z0-9_]+,/ | tr A-Z a-z +,x/syssysr1/c/sysr1 +,x/sys_x[0-9]*/c/sysdeath +,x v/\[......+\]/ s/\]/] +,p +! + +cat <<'!' +}; + +char *sysctab[]={ +! + +sam -d $file >[2] /dev/null <<'!' 
+,s/#define.([A-Z0-9_]+).*/ [\1] "\1",/g +,x/"[A-Z0-9_]+",/ y/"[A-Z]/ | tr A-Z a-z +,x/_"/c/" +,x/Sysr1/c/Running +,x/Rendezvous/c/Rendez +,x v/\[......+\]/ s/\]/] +,p +! + +cat <<'!' +}; + +int nsyscall = (sizeof systab/sizeof systab[0]); +! --- /dev/null +++ /sys/src/9/port64/page.c @@ -0,0 +1,651 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" + +#define pghash(daddr) palloc.hash[(daddr>>PGSHIFT)&(PGHSIZE-1)] + +struct Palloc palloc; + +void +pageinit(void) +{ + int color, i, j; + Page *p; + Pallocmem *pm; + ulong m, np, k, vkb, pkb; + + np = 0; + for(i=0; inpage; + } + palloc.pages = xalloc(np*sizeof(Page)); + if(palloc.pages == 0) + panic("pageinit"); + + color = 0; + palloc.head = palloc.pages; + p = palloc.head; + for(i=0; inpage; j++){ + p->prev = p-1; + p->next = p+1; + p->pa = pm->base+j*BY2PG; + p->color = color; + palloc.freecount++; + color = (color+1)%NCOLOR; + p++; + } + } + palloc.tail = p - 1; + palloc.head->prev = 0; + palloc.tail->next = 0; + + palloc.user = p - palloc.pages; + pkb = palloc.user*BY2PG/1024; + vkb = pkb + (conf.nswap*BY2PG)/1024; + + /* Paging numbers */ + swapalloc.highwater = (palloc.user*5)/100; + swapalloc.headroom = swapalloc.highwater + (swapalloc.highwater/4); + + m = 0; + for(i=0; iprev) + p->prev->next = p->next; + else + palloc.head = p->next; + if(p->next) + p->next->prev = p->prev; + else + palloc.tail = p->prev; + p->prev = p->next = nil; + palloc.freecount--; +} + +void +pagechaintail(Page *p) +{ + if(canlock(&palloc)) + panic("pagechaintail"); + if(palloc.tail) { + p->prev = palloc.tail; + palloc.tail->next = p; + } + else { + palloc.head = p; + p->prev = 0; + } + palloc.tail = p; + p->next = 0; + palloc.freecount++; +} + +void +pagechainhead(Page *p) +{ + if(canlock(&palloc)) + panic("pagechainhead"); + if(palloc.head) { + p->next = palloc.head; + palloc.head->prev = p; + } + else { + palloc.tail = p; + p->next = 0; + } + palloc.head = 
	p;	/* completes "palloc.head = p;" from pagechainhead above */
	p->prev = 0;
	palloc.freecount++;
}

/*
 * Allocate a physical page for va, preferring one of va's cache color;
 * optionally zeroed.  May sleep waiting for the pager; if *s is given
 * and the segment lock had to be dropped, returns 0 with *s cleared so
 * the caller (fault) can retry.
 */
Page*
newpage(int clear, Segment **s, uintptr va)
{
	Page *p;
	KMap *k;
	uchar ct;
	int i, hw, dontalloc, color;

	lock(&palloc);
	color = getpgcolor(va);
	hw = swapalloc.highwater;
	for(;;) {
		if(palloc.freecount > hw)
			break;
		if(up->kp && palloc.freecount > 0)
			break;	/* kernel procs may dip below the highwater mark */

		unlock(&palloc);
		dontalloc = 0;
		if(s && *s) {
			qunlock(&((*s)->lk));
			*s = 0;
			dontalloc = 1;
		}
		qlock(&palloc.pwait);	/* Hold memory requesters here */

		while(waserror())	/* Ignore interrupts */
			;

		kickpager();
		tsleep(&palloc.r, ispages, 0, 1000);

		poperror();

		qunlock(&palloc.pwait);

		/*
		 * If called from fault and we lost the segment from
		 * underneath don't waste time allocating and freeing
		 * a page. Fault will call newpage again when it has
		 * reacquired the segment locks
		 */
		if(dontalloc)
			return 0;

		lock(&palloc);
	}

	/* First try for our colour */
	for(p = palloc.head; p; p = p->next)
		if(p->color == color)
			break;

	ct = PG_NOFLUSH;
	if(p == 0) {
		/* no page of the right color: recolor the head page */
		p = palloc.head;
		p->color = color;
		ct = PG_NEWCOL;
	}

	pageunchain(p);

	lock(p);
	if(p->ref != 0)
		panic("newpage: p->ref %d != 0", p->ref);

	uncachepage(p);
	p->ref++;
	p->va = va;
	p->modref = 0;
	for(i = 0; i < MAXMACH; i++)
		p->cachectl[i] = ct;
	unlock(p);
	unlock(&palloc);

	if(clear) {
		k = kmap(p);
		memset((void*)VA(k), 0, BY2PG);
		kunmap(k);
	}

	return p;
}

/* sleep predicate for newpage: enough free pages to resume allocation */
int
ispages(void*)
{
	return palloc.freecount >= swapalloc.highwater;
}

/*
 * Drop a reference to p; when the last one goes, return the page to
 * the free list (tail if it still caches image data worth keeping,
 * head otherwise) and wake any process waiting for memory.
 */
void
putpage(Page *p)
{
	if(onswap(p)) {
		putswap(p);
		return;
	}

	lock(&palloc);
	lock(p);

	if(p->ref == 0)
		panic("putpage");

	if(--p->ref > 0) {
		unlock(p);
		unlock(&palloc);
		return;
	}

	if(p->image && p->image != &swapimage)
		pagechaintail(p);
	else
		pagechainhead(p);

	if(palloc.r.p != 0)
		wakeup(&palloc.r);

	unlock(p);
	unlock(&palloc);
}

/*
 * Take a free page for auxiliary use (e.g. swap buffers); fails
 * (returns 0) when memory is below the highwater mark.
 */
Page*
auxpage(void)
{
	Page *p;

	lock(&palloc);
	p = palloc.head;
	if(palloc.freecount < swapalloc.highwater) {
		unlock(&palloc);
		return 0;
	}
	pageunchain(p);

	lock(p);
	if(p->ref != 0)
		panic("auxpage");
	p->ref++;
	uncachepage(p);
	unlock(p);
	unlock(&palloc);

	return p;
}

static int dupretries = 15000;

/*
 * Before a copy-on-write split, stash a copy of p in the image cache
 * on a free page so the original stays available for other sharers.
 * Always called with p locked; returns non-zero when no copy was made.
 */
int
duppage(Page *p)				/* Always call with p locked */
{
	Page *np;
	int color;
	int retries;

	retries = 0;
retry:

	if(retries++ > dupretries){
		print("duppage %d, up %p\n", retries, up);
		dupretries += 100;
		if(dupretries > 100000)
			panic("duppage\n");
		uncachepage(p);
		return 1;
	}


	/* don't dup pages with no image */
	if(p->ref == 0 || p->image == nil || p->image->notext)
		return 0;

	/*
	 * normal lock ordering is to call
	 * lock(&palloc) before lock(p).
	 * To avoid deadlock, we have to drop
	 * our locks and try again.
	 */
	if(!canlock(&palloc)){
		unlock(p);
		if(up)
			sched();
		lock(p);
		goto retry;
	}

	/* No freelist cache when memory is very low */
	if(palloc.freecount < swapalloc.highwater) {
		unlock(&palloc);
		uncachepage(p);
		return 1;
	}

	color = getpgcolor(p->va);
	for(np = palloc.head; np; np = np->next)
		if(np->color == color)
			break;

	/* No page of the correct color */
	if(np == 0) {
		unlock(&palloc);
		uncachepage(p);
		return 1;
	}

	pageunchain(np);
	pagechaintail(np);
/*
* XXX - here's a bug? - np is on the freelist but it's not really free.
* when we unlock palloc someone else can come in, decide to
* use np, and then try to lock it. they succeed after we've
* run copypage and cachepage and unlock(np). then what?
* they call pageunchain before locking(np), so it's removed
* from the freelist, but still in the cache because of
* cachepage below. if someone else looks in the cache
* before they remove it, the page will have a nonzero ref
* once they finally lock(np).
*/
	lock(np);
	unlock(&palloc);

	/* Cache the new version */
	uncachepage(np);
	np->va = p->va;
	np->daddr = p->daddr;
	copypage(p, np);
	cachepage(np, p->image);
	unlock(np);
	uncachepage(p);

	return 0;
}

/* copy the contents of page f into page t via temporary kernel mappings */
void
copypage(Page *f, Page *t)
{
	KMap *ks, *kd;

	ks = kmap(f);
	kd = kmap(t);
	memmove((void*)VA(kd), (void*)VA(ks), BY2PG);
	kunmap(ks);
	kunmap(kd);
}

/* remove p from the image page cache and drop its image reference */
void
uncachepage(Page *p)			/* Always called with a locked page */
{
	Page **l, *f;

	if(p->image == 0)
		return;

	lock(&palloc.hashlock);
	l = &pghash(p->daddr);
	for(f = *l; f; f = f->hash) {
		if(f == p) {
			*l = p->hash;
			break;
		}
		l = &f->hash;
	}
	unlock(&palloc.hashlock);
	putimage(p->image);
	p->image = 0;
	p->daddr = 0;
}

/* enter p into the page cache for image i at its current daddr */
void
cachepage(Page *p, Image *i)
{
	Page **l;

	/* If this ever happens it should be fixed by calling
	 * uncachepage instead of panic. I think there is a race
	 * with pio in which this can happen. Calling uncachepage is
	 * correct - I just wanted to see if we got here.
	 */
	if(p->image)
		panic("cachepage");

	incref(i);
	lock(&palloc.hashlock);
	p->image = i;
	l = &pghash(p->daddr);
	p->hash = *l;
	*l = p;
	unlock(&palloc.hashlock);
}

/* drop any cached page for (i, daddr) from the page cache */
void
cachedel(Image *i, uintptr daddr)
{
	Page *f, **l;

	lock(&palloc.hashlock);
	l = &pghash(daddr);
	for(f = *l; f; f = f->hash) {
		if(f->image == i && f->daddr == daddr) {
			lock(f);
			/* recheck under the page lock: it may have been retargeted */
			if(f->image == i && f->daddr == daddr){
				*l = f->hash;
				putimage(f->image);
				f->image = 0;
				f->daddr = 0;
			}
			unlock(f);
			break;
		}
		l = &f->hash;
	}
	unlock(&palloc.hashlock);
}

/*
 * Look up (i, daddr) in the page cache; on a hit, take a reference
 * (pulling the page off the freelist if this is the first) and return it.
 */
Page *
lookpage(Image *i, uintptr daddr)
{
	Page *f;

	lock(&palloc.hashlock);
	for(f = pghash(daddr); f; f = f->hash) {
		if(f->image == i && f->daddr == daddr) {
			unlock(&palloc.hashlock);

			lock(&palloc);
			lock(f);
			/* recheck: the page may have been recycled since the hash probe */
			if(f->image != i || f->daddr != daddr) {
				unlock(f);
				unlock(&palloc);
				return 0;
			}
			if(++f->ref == 1)
				pageunchain(f);
			unlock(&palloc);
			unlock(f);

			return f;
		}
	}
	unlock(&palloc.hashlock);

	return 0;
}

/* duplicate a Pte for segment copy, sharing (ref++) each resident page */
Pte*
ptecpy(Pte *old)
{
	Pte *new;
	Page **src, **dst;

	new = ptealloc();
	dst = &new->pages[old->first-old->pages];
	new->first = dst;
	for(src = old->first; src <= old->last; src++, dst++)
		if(*src) {
			if(onswap(*src))
				dupswap(*src);
			else {
				lock(*src);
				(*src)->ref++;
				unlock(*src);
			}
			new->last = dst;
			*dst = *src;
		}

	return new;
}

/* allocate an empty Pte; first/last start crossed so any page narrows them */
Pte*
ptealloc(void)
{
	Pte *new;

	new = smalloc(sizeof(Pte));
	new->first = &new->pages[PTEPERTAB];
	new->last = new->pages;
	return new;
}

/*
 * Release every page referenced by p for segment s.
 * (Definition continues past this view.)
 */
void
freepte(Segment *s, Pte *p)
{
	int ref;
	void (*fn)(Page*);
	Page *pt, **pg, **ptop;

	switch(s->type&SG_TYPE) {
	case SG_PHYSICAL:
		fn = s->pseg->pgfree;
		ptop = &p->pages[PTEPERTAB];
		if(fn) {
			for(pg = p->pages; pg < ptop; pg++) {
				if(*pg == 0)
					continue;
				(*fn)(*pg);
				*pg = 0;
			}
			break;
		}
		for(pg = p->pages; pg < ptop; pg++) {
			pt = *pg;
			if(pt == 0)
				continue;
			lock(pt);
			ref = --pt->ref;
			unlock(pt);
+ if(ref == 0) + free(pt); + } + break; + default: + for(pg = p->first; pg <= p->last; pg++) + if(*pg) { + putpage(*pg); + *pg = 0; + } + } + free(p); +} + +ulong +pagenumber(Page *p) +{ + return p-palloc.pages; +} + +void +checkpagerefs(void) +{ + int s; + ulong i, np, nwrong; + ulong *ref; + + np = palloc.user; + ref = malloc(np*sizeof ref[0]); + if(ref == nil){ + print("checkpagerefs: out of memory\n"); + return; + } + + /* + * This may not be exact if there are other processes + * holding refs to pages on their stacks. The hope is + * that if you run it on a quiescent system it will still + * be useful. + */ + s = splhi(); + lock(&palloc); + countpagerefs(ref, 0); + portcountpagerefs(ref, 0); + nwrong = 0; + for(i=0; imark avoids double-counting. + */ + n = 0; + ns = 0; + for(i=0; iseg[j]; + if(s) + s->mark = 0; + } + } + for(i=0; iseg[j]; + if(s == nil || s->mark++) + continue; + ns++; + for(k=0; kmapsize; k++){ + pte = s->map[k]; + if(pte == nil) + continue; + for(pg = pte->first; pg <= pte->last; pg++){ + entry = *pg; + if(pagedout(entry)) + continue; + if(print){ + if(ref[pagenumber(entry)]) + iprint("page %#p in segment %#p\n", entry->pa, s); + continue; + } + if(ref[pagenumber(entry)]++ == 0) + n++; + } + } + } + } + if(!print){ + iprint("%lud pages in %lud segments\n", n, ns); + for(i=0; iseg[j]; + if(s == nil) + continue; + if(s->ref != s->mark){ + iprint("segment %#p (used by proc %lud pid %lud) has bad ref count %lud actual %lud\n", + s, i, p->pid, s->ref, s->mark); + } + } + } + } +} --- /dev/null +++ /sys/src/9/port64/portdat.h @@ -0,0 +1,1024 @@ +typedef struct Alarms Alarms; +typedef struct Block Block; +typedef struct Chan Chan; +typedef struct Cmdbuf Cmdbuf; +typedef struct Cmdtab Cmdtab; +typedef struct Confmem Confmem; +typedef struct Dev Dev; +typedef struct Dirtab Dirtab; +typedef struct Edf Edf; +typedef struct Egrp Egrp; +typedef struct Evalue Evalue; +typedef struct Execvals Execvals; +typedef struct Fgrp Fgrp; +typedef struct DevConf 
DevConf; +typedef struct Image Image; +typedef struct Log Log; +typedef struct Logflag Logflag; +typedef struct Mntcache Mntcache; +typedef struct Mount Mount; +typedef struct Mntrpc Mntrpc; +typedef struct Mntwalk Mntwalk; +typedef struct Mnt Mnt; +typedef struct Mhead Mhead; +typedef struct Note Note; +typedef struct Page Page; +typedef struct Path Path; +typedef struct Palloc Palloc; +typedef struct Pallocmem Pallocmem; +typedef struct Perf Perf; +typedef struct PhysUart PhysUart; +typedef struct Pgrp Pgrp; +typedef struct Physseg Physseg; +typedef struct Proc Proc; +typedef struct Pte Pte; +typedef struct QLock QLock; +typedef struct Queue Queue; +typedef struct Ref Ref; +typedef struct Rendez Rendez; +typedef struct Rgrp Rgrp; +typedef struct RWlock RWlock; +typedef struct Sargs Sargs; +typedef struct Schedq Schedq; +typedef struct Segment Segment; +typedef struct Sema Sema; +typedef struct Timer Timer; +typedef struct Timers Timers; +typedef struct Uart Uart; +typedef struct Waitq Waitq; +typedef struct Walkqid Walkqid; +typedef struct Watchdog Watchdog; +typedef struct Watermark Watermark; +typedef int Devgen(Chan*, char*, Dirtab*, int, int, Dir*); + +#pragma incomplete DevConf +#pragma incomplete Edf +#pragma incomplete Mntcache +#pragma incomplete Mntrpc +#pragma incomplete Queue +#pragma incomplete Timers + +#include + +#define HOWMANY(x, y) (((x)+((y)-1))/(y)) +#define ROUNDUP(x, y) (HOWMANY((x), (y))*(y)) /* ceiling */ +#define ROUNDDN(x, y) (((x)/(y))*(y)) /* floor */ +#define ROUND(s, sz) (((s)+(sz-1))&~(sz-1)) +#define PGROUND(s) ROUNDUP(s, BY2PG) +#define MIN(a, b) ((a) < (b)? (a): (b)) +#define MAX(a, b) ((a) > (b)? (a): (b)) + +/* + * For multi-bit fields use FIELD(v, o, w) where 'v' is the value + * of the bit-field of width 'w' with LSb at bit offset 'o'. 
+ */ +#define FIELD(v, o, w) (((v) & ((1<<(w))-1))<<(o)) + +#define FCLR(d, o, w) ((d) & ~(((1<<(w))-1)<<(o))) +#define FEXT(d, o, w) (((d)>>(o)) & ((1<<(w))-1)) +#define FINS(d, o, w, v) (FCLR((d), (o), (w))|FIELD((v), (o), (w))) +#define FSET(d, o, w) ((d)|(((1<<(w))-1)<<(o))) + +#define FMASK(o, w) (((1<<(w))-1)<<(o)) + +/* let each port override any of these */ +#ifndef KMESGSIZE +#define KMESGSIZE (16*1024) +#endif +#ifndef PCICONSSIZE +#define PCICONSSIZE (16*1024) +#endif +#ifndef STAGESIZE +#define STAGESIZE 64 +#endif +#ifndef MAXBY2PG +#define MAXBY2PG BY2PG /* rounding for UTZERO in executables */ +#endif + +struct Ref +{ + Lock; + long ref; +}; + +struct Rendez +{ + Lock; + Proc *p; +}; + +struct QLock +{ + Lock use; /* to access Qlock structure */ + Proc *head; /* next process waiting for object */ + Proc *tail; /* last process waiting for object */ + int locked; /* flag */ + uintptr qpc; /* pc of the holder */ +}; + +struct RWlock +{ + Lock use; + Proc *head; /* list of waiting processes */ + Proc *tail; + uintptr wpc; /* pc of writer */ + Proc *wproc; /* writing proc */ + int readers; /* number of readers */ + int writer; /* number of writers */ +}; + +struct Alarms +{ + QLock; + Proc *head; +}; + +struct Sargs +{ + uintptr args[MAXSYSARG]; +}; + +/* + * Access types in namec & channel flags + */ +enum +{ + Aaccess, /* as in stat, wstat */ + Abind, /* for left-hand-side of bind */ + Atodir, /* as in chdir */ + Aopen, /* for i/o */ + Amount, /* to be mounted or mounted upon */ + Acreate, /* is to be created */ + Aremove, /* will be removed by caller */ + + COPEN = 0x0001, /* for i/o */ + CMSG = 0x0002, /* the message channel for a mount */ +/*rsc CCREATE = 0x0004, /* permits creation if c->mnt */ + CCEXEC = 0x0008, /* close on exec */ + CFREE = 0x0010, /* not in use */ + CRCLOSE = 0x0020, /* remove on close */ + CCACHE = 0x0080, /* client cache */ +}; + +/* flag values */ +enum +{ + BINTR = (1<<0), + BFREE = (1<<1), + Bipck = (1<<2), /* ip checksum */ 
+ Budpck = (1<<3), /* udp checksum */ + Btcpck = (1<<4), /* tcp checksum */ + Bpktck = (1<<5), /* packet checksum */ +}; + +struct Block +{ + long ref; + Block* next; + Block* list; + uchar* rp; /* first unconsumed byte */ + uchar* wp; /* first empty byte */ + uchar* lim; /* 1 past the end of the buffer */ + uchar* base; /* start of the buffer */ + void (*free)(Block*); + ushort flag; + ushort checksum; /* IP checksum of complete packet (minus media header) */ + ulong magic; +}; + +#define BLEN(s) ((s)->wp - (s)->rp) +#define BALLOC(s) ((s)->lim - (s)->base) + +struct Chan +{ + Ref; /* the Lock in this Ref is also Chan's lock */ + Chan* next; /* allocation */ + Chan* link; + vlong offset; /* in fd */ + vlong devoffset; /* in underlying device; see read */ + ushort type; + ulong dev; + ushort mode; /* read/write */ + ushort flag; + Qid qid; + int fid; /* for devmnt */ + ulong iounit; /* chunk size for i/o; 0==default */ + Mhead* umh; /* mount point that derived Chan; used in unionread */ + Chan* umc; /* channel in union; held for union read */ + QLock umqlock; /* serialize unionreads */ + int uri; /* union read index */ + int dri; /* devdirread index */ + uchar* dirrock; /* directory entry rock for translations */ + int nrock; + int mrock; + QLock rockqlock; + int ismtpt; + Mntcache*mcp; /* Mount cache pointer */ + Mnt* mux; /* Mnt for clients using me for messages */ + union { + void* aux; + Qid pgrpid; /* for #p/notepg */ + ulong mid; /* for ns in devproc */ + }; + Chan* mchan; /* channel to mounted server */ + Qid mqid; /* qid of root of mount point */ + Path* path; +}; + +struct Path +{ + Ref; + char *s; + Chan **mtpt; /* mtpt history */ + int len; /* strlen(s) */ + int alen; /* allocated length of s */ + int mlen; /* number of path elements */ + int malen; /* allocated length of mtpt */ +}; + +struct Dev +{ + int dc; + char* name; + + void (*reset)(void); + void (*init)(void); + void (*shutdown)(void); + Chan* (*attach)(char*); + Walkqid*(*walk)(Chan*, Chan*, 
char**, int); + int (*stat)(Chan*, uchar*, int); + Chan* (*open)(Chan*, int); + void (*create)(Chan*, char*, int, ulong); + void (*close)(Chan*); + long (*read)(Chan*, void*, long, vlong); + Block* (*bread)(Chan*, long, ulong); + long (*write)(Chan*, void*, long, vlong); + long (*bwrite)(Chan*, Block*, ulong); + void (*remove)(Chan*); + int (*wstat)(Chan*, uchar*, int); + void (*power)(int); /* power mgt: power(1) => on, power (0) => off */ + int (*config)(int, char*, DevConf*); /* returns nil on error */ + + /* not initialised */ + int attached; /* debugging */ +}; + +struct Dirtab +{ + char name[KNAMELEN]; + Qid qid; + vlong length; + long perm; +}; + +struct Walkqid +{ + Chan *clone; + int nqid; + Qid qid[1]; +}; + +enum +{ + NSMAX = 1000, + NSLOG = 7, + NSCACHE = (1<ref; channels on this mount point incref(c->mchan) == Mnt.c */ + Chan *c; /* Channel to file service */ + Proc *rip; /* Reader in progress */ + Mntrpc *queue; /* Queue of pending requests on this channel */ + ulong id; /* Multiplexer id for channel check */ + Mnt *list; /* Free list */ + int flags; /* cache */ + int msize; /* data + IOHDRSZ */ + char *version; /* 9P version */ + Queue *q; /* input queue */ +}; + +enum +{ + NUser, /* note provided externally */ + NExit, /* deliver note quietly */ + NDebug, /* print debug message */ +}; + +struct Note +{ + char msg[ERRMAX]; + int flag; /* whether system posted it */ +}; + +enum +{ + PG_NOFLUSH = 0, + PG_TXTFLUSH = 1, /* flush dcache and invalidate icache */ + PG_DATFLUSH = 2, /* flush both i & d caches (UNUSED) */ + PG_NEWCOL = 3, /* page has been recolored */ + + PG_MOD = 0x01, /* software modified bit */ + PG_REF = 0x02, /* software referenced bit */ +}; + +struct Page +{ + Lock; + uintptr pa; /* Physical address in memory */ + uintptr va; /* Virtual address for user */ + uintptr daddr; /* Disc address on swap */ + ulong gen; /* Generation counter for swap */ + ushort ref; /* Reference count */ + char modref; /* Simulated modify/reference bits */ + 
char color; /* Cache coloring */ + char cachectl[MAXMACH]; /* Cache flushing control for putmmu */ + Image *image; /* Associated text or swap image */ + Page *next; /* Lru free list */ + Page *prev; + Page *hash; /* Image hash chains */ +}; + +struct Swapalloc +{ + Lock; /* Free map lock */ + int free; /* currently free swap pages */ + uchar* swmap; /* Base of swap map in memory */ + uchar* alloc; /* Round robin allocator */ + uchar* last; /* Speed swap allocation */ + uchar* top; /* Top of swap map */ + Rendez r; /* Pager kproc idle sleep */ + ulong highwater; /* Pager start threshold */ + ulong headroom; /* Space pager frees under highwater */ +}swapalloc; + +struct Image +{ + Ref; + Chan *c; /* channel to text file */ + Qid qid; /* Qid for page cache coherence */ + Qid mqid; + Chan *mchan; + ushort type; /* Device type of owning channel */ + Segment *s; /* TEXT segment for image if running */ + Image *hash; /* Qid hash chains */ + Image *next; /* Free list */ + int notext; /* no file associated */ +}; + +struct Pte +{ + Page *pages[PTEPERTAB]; /* Page map for this chunk of pte */ + Page **first; /* First used entry */ + Page **last; /* Last used entry */ +}; + +/* Segment types */ +enum +{ + SG_TYPE = 07, /* Mask type of segment */ + SG_TEXT = 00, + SG_DATA = 01, + SG_BSS = 02, + SG_STACK = 03, + SG_SHARED = 04, + SG_PHYSICAL = 05, + + SG_RONLY = 0040, /* Segment is read only */ + SG_CEXEC = 0100, /* Detach at exec */ +}; + +#define PG_ONSWAP 1 +#define onswap(s) (((uintptr)s)&PG_ONSWAP) +#define pagedout(s) (((uintptr)s)==0 || onswap(s)) +#define swapaddr(s) (((uintptr)s)&~PG_ONSWAP) + +#define SEGMAXSIZE (SEGMAPSIZE*PTEMAPMEM) + +struct Physseg +{ + ulong attr; /* Segment attributes */ + char *name; /* Attach name */ + uintptr pa; /* Physical address */ + uintptr size; /* Maximum segment size in pages */ + Page *(*pgalloc)(Segment*, uintptr); /* Allocation if we need it */ + void (*pgfree)(Page*); +}; + +struct Sema +{ + Rendez; + long *addr; + int waiting; + 
Sema *next; + Sema *prev; +}; + +struct Segment +{ + Ref; + QLock lk; + ushort steal; /* Page stealer lock */ + ushort type; /* segment type */ + uintptr base; /* virtual base */ + uintptr top; /* virtual top */ + ulong size; /* size in pages */ + uintptr fstart; /* start address in file for demand load */ + uintptr flen; /* length of segment in file */ + int flushme; /* maintain icache for this segment */ + Image *image; /* text in file attached to this segment */ + Physseg *pseg; + ulong* profile; /* Tick profile area */ + Pte **map; + int mapsize; + Pte *ssegmap[SSEGMAPSIZE]; + Lock semalock; + Sema sema; + ulong mark; /* portcountrefs */ +}; + +enum +{ + RENDLOG = 5, + RENDHASH = 1<rendhash[(s)&((1<mnthash[(qid).path&((1< variadic */ +}; + +/* + * routines to access UART hardware + */ +struct PhysUart +{ + char* name; + Uart* (*pnp)(void); + void (*enable)(Uart*, int); + void (*disable)(Uart*); + void (*kick)(Uart*); + void (*dobreak)(Uart*, int); + int (*baud)(Uart*, int); + int (*bits)(Uart*, int); + int (*stop)(Uart*, int); + int (*parity)(Uart*, int); + void (*modemctl)(Uart*, int); + void (*rts)(Uart*, int); + void (*dtr)(Uart*, int); + long (*status)(Uart*, void*, long, long); + void (*fifo)(Uart*, int); + void (*power)(Uart*, int); + int (*getc)(Uart*); /* polling versions, for iprint, rdb */ + void (*putc)(Uart*, int); +}; + +enum { + Stagesize= STAGESIZE +}; + +/* + * software UART + */ +struct Uart +{ + void* regs; /* hardware stuff */ + void* saveregs; /* place to put registers on power down */ + char* name; /* internal name */ + ulong freq; /* clock frequency */ + int bits; /* bits per character */ + int stop; /* stop bits */ + int parity; /* even, odd or no parity */ + int baud; /* baud rate */ + PhysUart*phys; + int console; /* used as a serial console */ + int special; /* internal kernel device */ + Uart* next; /* list of allocated uarts */ + + QLock; + int type; /* ?? 
*/ + int dev; + int opens; + + int enabled; + Uart *elist; /* next enabled interface */ + + int perr; /* parity errors */ + int ferr; /* framing errors */ + int oerr; /* rcvr overruns */ + int berr; /* no input buffers */ + int serr; /* input queue overflow */ + + /* buffers */ + int (*putc)(Queue*, int); + Queue *iq; + Queue *oq; + + Lock rlock; + uchar istage[Stagesize]; + uchar *iw; + uchar *ir; + uchar *ie; + + Lock tlock; /* transmit */ + uchar ostage[Stagesize]; + uchar *op; + uchar *oe; + int drain; + + int modem; /* hardware flow control on */ + int xonoff; /* software flow control on */ + int blocked; + int cts, dsr, dcd; /* keep track of modem status */ + int ctsbackoff; + int hup_dsr, hup_dcd; /* send hangup upstream? */ + int dohup; + + Rendez r; +}; + +extern Uart* consuart; + +void (*lprint)(char *, int); + +/* + * performance timers, all units in perfticks + */ +struct Perf +{ + ulong intrts; /* time of last interrupt */ + ulong inintr; /* time since last clock tick in interrupt handlers */ + ulong avg_inintr; /* avg time per clock tick in interrupt handlers */ + ulong inidle; /* time since last clock tick in idle loop */ + ulong avg_inidle; /* avg time per clock tick in idle loop */ + ulong last; /* value of perfticks() at last clock tick */ + ulong period; /* perfticks() per clock tick */ +}; + +struct Watchdog +{ + void (*enable)(void); /* watchdog enable */ + void (*disable)(void); /* watchdog disable */ + void (*restart)(void); /* watchdog restart */ + void (*stat)(char*, char*); /* watchdog statistics */ +}; + +struct Watermark +{ + int highwater; + int curr; + int max; + int hitmax; /* count: how many times hit max? 
*/ + char *name; +}; + + +/* queue state bits, Qmsg, Qcoalesce, and Qkick can be set in qopen */ +enum +{ + /* Queue.state */ + Qstarve = (1<<0), /* consumer starved */ + Qmsg = (1<<1), /* message stream */ + Qclosed = (1<<2), /* queue has been closed/hungup */ + Qflow = (1<<3), /* producer flow controlled */ + Qcoalesce = (1<<4), /* coalesce packets on read */ + Qkick = (1<<5), /* always call the kick routine after qwrite */ +}; + +#define DEVDOTDOT -1 + +#pragma varargck type "I" uchar* +#pragma varargck type "V" uchar* +#pragma varargck type "E" uchar* +#pragma varargck type "M" uchar* --- /dev/null +++ /sys/src/9/port64/portfns.h @@ -0,0 +1,402 @@ +void _assert(char*); +void accounttime(void); +Timer* addclock0link(void (*)(void), int); +int addphysseg(Physseg*); +void addbootfile(char*, uchar*, ulong); +void addwatchdog(Watchdog*); +Block* adjustblock(Block*, int); +void alarmkproc(void*); +Block* allocb(int); +int anyhigher(void); +int anyready(void); +Image* attachimage(int, Chan*, uintptr, ulong); +Page* auxpage(void); +Block* bl2mem(uchar*, Block*, int); +int blocklen(Block*); +ulong blocksize(ulong); +void bootlinks(void); +void cachedel(Image*, uintptr); +void cachepage(Page*, Image*); +void callwithureg(void(*)(Ureg*)); +char* chanpath(Chan*); +int canlock(Lock*); +int canpage(Proc*); +int canqlock(QLock*); +int canrlock(RWlock*); +void chandevinit(void); +void chandevreset(void); +void chandevshutdown(void); +void chanfree(Chan*); +void checkalarms(void); +void checkb(Block*, char*); +void cinit(void); +Chan* cclone(Chan*); +void cclose(Chan*); +void ccloseq(Chan*); +void closeegrp(Egrp*); +void closefgrp(Fgrp*); +void closepgrp(Pgrp*); +void closergrp(Rgrp*); +long clrfpintr(void); +void cmderror(Cmdbuf*, char*); +int cmount(Chan**, Chan*, int, char*); +void confinit(void); +int consactive(void); +void (*consdebug)(void); +void copen(Chan*); +Block* concatblock(Block*); +Block* copyblock(Block*, int); +void copypage(Page*, Page*); +void 
countpagerefs(ulong*, int); +int cread(Chan*, uchar*, int, vlong); +void cunmount(Chan*, Chan*); +void cupdate(Chan*, uchar*, int, vlong); +void cwrite(Chan*, uchar*, int, vlong); +uintptr dbgpc(Proc*); +long decref(Ref*); +int decrypt(void*, void*, int); +void delay(int); +Proc* dequeueproc(Schedq*, Proc*); +Chan* devattach(int, char*); +Block* devbread(Chan*, long, ulong); +long devbwrite(Chan*, Block*, ulong); +Chan* devclone(Chan*); +int devconfig(int, char *, DevConf *); +void devcreate(Chan*, char*, int, ulong); +void devdir(Chan*, Qid, char*, vlong, char*, long, Dir*); +long devdirread(Chan*, char*, long, Dirtab*, int, Devgen*); +Devgen devgen; +void devinit(void); +int devno(int, int); +Chan* devopen(Chan*, int, Dirtab*, int, Devgen*); +void devpermcheck(char*, ulong, int); +void devpower(int); +void devremove(Chan*); +void devreset(void); +void devshutdown(void); +int devstat(Chan*, uchar*, int, Dirtab*, int, Devgen*); +Walkqid* devwalk(Chan*, Chan*, char**, int, Dirtab*, int, Devgen*); +int devwstat(Chan*, uchar*, int); +void drawactive(int); +void drawcmap(void); +void dumpaproc(Proc*); +void dumpregs(Ureg*); +void dumpstack(void); +Fgrp* dupfgrp(Fgrp*); +int duppage(Page*); +void dupswap(Page*); +void edfinit(Proc*); +char* edfadmit(Proc*); +int edfready(Proc*); +void edfrecord(Proc*); +void edfrun(Proc*, int); +void edfstop(Proc*); +void edfyield(void); +int emptystr(char*); +int encrypt(void*, void*, int); +void envcpy(Egrp*, Egrp*); +int eqchan(Chan*, Chan*, int); +int eqchantdqid(Chan*, int, int, Qid, int); +int eqqid(Qid, Qid); +void error(char*); +uintptr execregs(uintptr, ulong, ulong); +void exhausted(char*); +void exit(int); +uvlong fastticks(uvlong*); +uvlong fastticks2ns(uvlong); +uvlong fastticks2us(uvlong); +int fault(uintptr, int); +void fdclose(int, int); +Chan* fdtochan(int, int, int, int); +int findmount(Chan**, Mhead**, int, int, Qid); +int fixfault(Segment*, uintptr, int, int); +void flushmmu(void); +void forceclosefgrp(void); +void 
forkchild(Proc*, Ureg*); +void forkret(void); +void free(void*); +void freeb(Block*); +void freeblist(Block*); +int freebroken(void); +void freepte(Segment*, Pte*); +void getcolor(ulong, ulong*, ulong*, ulong*); +uintptr getmalloctag(void*); +uintptr getrealloctag(void*); +void gotolabel(Label*); +char* getconfenv(void); +int haswaitq(void*); +long hostdomainwrite(char*, int); +long hostownerwrite(char*, int); +void hzsched(void); +Block* iallocb(int); +void iallocsummary(void); +long ibrk(uintptr, int); +void ilock(Lock*); +void iunlock(Lock*); +long incref(Ref*); +void initmark(Watermark *, int, char *); +void initseg(void); +int iprint(char*, ...); +void isdir(Chan*); +int iseve(void); +int islo(void); +Segment* isoverlap(Proc*, uintptr, int); +int ispages(void*); +int isphysseg(char*); +void ixsummary(void); +int kbdcr2nl(Queue*, int); +int kbdgetmap(uint, int*, int*, Rune*); +int kbdputc(Queue*, int); +void kbdputmap(ushort, ushort, Rune); +void kickpager(void); +void killbig(char*); +void kproc(char*, void(*)(void*), void*); +void kprocchild(Proc*, void (*)(void*), void*); +void (*kproftimer)(ulong); +void ksetenv(char*, char*, int); +void kstrcpy(char*, char*, int); +void kstrdup(char**, char*); +ulong l2be(long); +long latin1(Rune*, int); +int lock(Lock*); +void logopen(Log*); +void logclose(Log*); +char* logctl(Log*, int, char**, Logflag*); +void logn(Log*, int, void*, int); +long logread(Log*, void*, ulong, long); +void log(Log*, int, char*, ...); +Cmdtab* lookupcmd(Cmdbuf*, Cmdtab*, int); +Page* lookpage(Image*, uintptr); +#define MS2NS(n) (((vlong)(n))*1000000LL) +void machinit(void); +void* mallocz(ulong, int); +void* malloc(ulong); +void* mallocalign(ulong, ulong, long, ulong); +void mallocsummary(void); +Block* mem2bl(uchar*, int); +Block* mem2block(void *, ulong, int); +void mfreeseg(Segment*, uintptr, int); +void microdelay(int); +uvlong mk64fract(uvlong, uvlong); +void mkqid(Qid*, vlong, ulong, int); +void mmurelease(Proc*); +void 
mmuswitch(Proc*); +Chan* mntauth(Chan*, char*); +long mntversion(Chan*, char*, int, int); +void mouseresize(void); +void mountfree(Mount*); +ulong ms2tk(ulong); +ulong msize(void*); +ulong ms2tk(ulong); +uvlong ms2fastticks(ulong); +void mul64fract(uvlong*, uvlong, uvlong); +void muxclose(Mnt*); +Chan* namec(char*, int, int, ulong); +void nameerror(char*, char*); +Chan* newchan(void); +int newfd(Chan*); +Mhead* newmhead(Chan*); +Mount* newmount(Mhead*, Chan*, int, char*); +Page* newpage(int, Segment **, uintptr); +Path* newpath(char*); +Pgrp* newpgrp(void); +Rgrp* newrgrp(void); +Proc* newproc(void); +void nexterror(void); +void notemark(Watermark *, int); +int notify(Ureg*); +int nrand(int); +uvlong ns2fastticks(uvlong); +int okaddr(uintptr, ulong, int); +int openmode(ulong); +Block* packblock(Block*); +Block* padblock(Block*, int); +void pagechainhead(Page*); +void pageinit(void); +ulong pagenumber(Page*); +void pagersummary(void); +void panic(char*, ...); +Cmdbuf* parsecmd(char *a, int n); +void pathclose(Path*); +ulong perfticks(void); +void pexit(char*, int); +void pgrpcpy(Pgrp*, Pgrp*); +void pgrpnote(ulong, char*, long, int); +void pio(Segment *, uintptr, uintptr, Page **); +#define poperror() up->nerrlab-- +void portcountpagerefs(ulong*, int); +int postnote(Proc*, int, char*, int); +int pprint(char*, ...); +int preempted(void); +void prflush(void); +void printinit(void); +ulong procalarm(ulong); +void procctl(Proc*); +void procdump(void); +int procfdprint(Chan*, int, int, char*, int); +int procindex(ulong); +void procinit0(void); +void procflushseg(Segment*); +void procpriority(Proc*, int, int); +Proc* proctab(int); +extern void (*proctrace)(Proc*, int, vlong); +void procwired(Proc*, int); +Pte* ptealloc(void); +Pte* ptecpy(Pte*); +int pullblock(Block**, int); +Block* pullupblock(Block*, int); +Block* pullupqueue(Queue*, int); +void putimage(Image*); +void putmhead(Mhead*); +void putmmu(u64int, u64int, Page*); +void putpage(Page*); +void putseg(Segment*); 
+void putstrn(char*, int); +void putswap(Page*); +ulong pwait(Waitmsg*); +void qaddlist(Queue*, Block*); +Block* qbread(Queue*, int); +long qbwrite(Queue*, Block*); +Queue* qbypass(void (*)(void*, Block*), void*); +int qcanread(Queue*); +void qclose(Queue*); +int qconsume(Queue*, void*, int); +Block* qcopy(Queue*, int, ulong); +int qdiscard(Queue*, int); +void qflush(Queue*); +void qfree(Queue*); +int qfull(Queue*); +Block* qget(Queue*); +void qhangup(Queue*, char*); +int qisclosed(Queue*); +int qiwrite(Queue*, void*, int); +int qlen(Queue*); +void qlock(QLock*); +Queue* qopen(int, int, void (*)(void*), void*); +int qpass(Queue*, Block*); +int qpassnolim(Queue*, Block*); +int qproduce(Queue*, void*, int); +void qputback(Queue*, Block*); +long qread(Queue*, void*, int); +Block* qremove(Queue*); +void qreopen(Queue*); +void qsetlimit(Queue*, int); +void qunlock(QLock*); +int qwindow(Queue*); +int qwrite(Queue*, void*, int); +void qnoblock(Queue*, int); +int rand(void); +void randominit(void); +ulong randomread(void*, ulong); +void rdb(void); +void readn(Chan *, void *, long); +int readnum(ulong, char*, ulong, ulong, int); +int readstr(ulong, char*, ulong, char*); +void ready(Proc*); +void* realloc(void *v, ulong size); +void rebootcmd(int, char**); +void reboot(void*, void*, ulong); +void relocateseg(Segment*, uintptr); +void renameuser(char*, char*); +void resched(char*); +void resrcwait(char*); +int return0(void*); +void rlock(RWlock*); +long rtctime(void); +void runlock(RWlock*); +Proc* runproc(void); +void savefpregs(FPsave*); +void sched(void); +void scheddump(void); +void schedinit(void); +void (*screenputs)(char*, int); +long seconds(void); +uintptr segattach(Proc*, ulong, char *, uintptr, uintptr); +void segclock(uintptr); +void segpage(Segment*, Page*); +char* seprintmark(char *, char *, Watermark *); +int setcolor(ulong, ulong, ulong, ulong); +void setkernur(Ureg*, Proc*); +int setlabel(Label*); +void setmalloctag(void*, uintptr); +void setrealloctag(void*, 
uintptr); +void setregisters(Ureg*, char*, char*, int); +void setswapchan(Chan*); +char* skipslash(char*); +void sleep(Rendez*, int(*)(void*), void*); +void* smalloc(ulong); +int splhi(void); +int spllo(void); +void splx(int); +void splxpc(int); +char* srvname(Chan*); +int swapcount(uintptr); +int swapfull(void); +void swapinit(void); +void timeradd(Timer*); +void timerdel(Timer*); +void timersinit(void); +void timerintr(Ureg*, Tval); +void timerset(Tval); +ulong tk2ms(ulong); +#define TK2MS(x) ((x)*(1000/HZ)) +uvlong tod2fastticks(vlong); +vlong todget(vlong*); +void todsetfreq(vlong); +void todinit(void); +void todset(vlong, vlong, int); +Block* trimblock(Block*, int, int); +void tsleep(Rendez*, int (*)(void*), void*, ulong); +int uartctl(Uart*, char*); +int uartgetc(void); +void uartkick(void*); +void uartmouse(Uart*, int (*)(Queue*, int), int); +void uartsetmouseputc(Uart*, int (*)(Queue*, int)); +void uartputc(int); +void uartputs(char*, int); +void uartrecv(Uart*, char); +int uartstageoutput(Uart*); +void unbreak(Proc*); +void uncachepage(Page*); +long unionread(Chan*, void*, long); +void unlock(Lock*); +uvlong us2fastticks(uvlong); +void userinit(void); +uintptr userpc(void); +long userwrite(char*, int); +void validaddr(uintptr, ulong, int); +void validname(char*, int); +char* validnamedup(char*, int); +void validstat(uchar*, int); +void* vmemchr(void*, int, int); +Proc* wakeup(Rendez*); +int walk(Chan**, char**, int, int, int*); +void wlock(RWlock*); +void wunlock(RWlock*); +void* xalloc(ulong); +void* xallocz(ulong, int); +void xfree(void*); +void xhole(uintptr, uintptr); +void xinit(void); +int xmerge(void*, void*); +void* xspanalloc(ulong, int, ulong); +void xsummary(void); +void yield(void); +Segment* data2txt(Segment*); +Segment* dupseg(Segment**, int, int); +Segment* newseg(int, uintptr, ulong); +Segment* seg(Proc*, uintptr, int); +void hnputv(void*, uvlong); +void hnputl(void*, uint); +void hnputs(void*, ushort); +uvlong nhgetv(void*); +uint 
nhgetl(void*); +ushort nhgets(void*); +ulong µs(void); +void _xinc(long*); +long _xdec(long*); +long lcycles(void); + +#pragma varargck argpos iprint 1 +#pragma varargck argpos panic 1 +#pragma varargck argpos pprint 1 --- /dev/null +++ /sys/src/9/port64/proc.c @@ -0,0 +1,1634 @@ +#include +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" +#include "../port/edf.h" +#include + +int schedgain = 30; /* units in seconds */ +int nrdy; +Ref noteidalloc; + +void updatecpu(Proc*); +int reprioritize(Proc*); + +ulong delayedscheds; /* statistics */ +long skipscheds; +long preempts; +ulong load; + +static Ref pidalloc; + +static struct Procalloc +{ + Lock; + Proc* ht[128]; + Proc* arena; + Proc* free; +} procalloc; + +enum +{ + Q=10, + DQ=4, + Scaling=2, +}; + +Schedq runq[Nrq]; +ulong runvec; + +char *statename[] = +{ /* BUG: generate automatically */ + "Dead", + "Moribund", + "Ready", + "Scheding", + "Running", + "Queueing", + "QueueingR", + "QueueingW", + "Wakeme", + "Broken", + "Stopped", + "Rendez", + "Waitrelease", +}; + +static void pidhash(Proc*); +static void pidunhash(Proc*); +static void rebalance(void); + +/* + * Always splhi()'ed. + */ +void +schedinit(void) /* never returns */ +{ + Edf *e; + + setlabel(&m->sched); + if(up) { + if((e = up->edf) && (e->flags & Admitted)) + edfrecord(up); + m->proc = 0; + switch(up->state) { + case Running: + ready(up); + break; + case Moribund: + up->state = Dead; + edfstop(up); + if (up->edf) + free(up->edf); + up->edf = nil; + + /* + * Holding locks from pexit: + * procalloc + * palloc + */ + mmurelease(up); + + up->qnext = procalloc.free; + procalloc.free = up; + + unlock(&palloc); + unlock(&procalloc); + break; + } + up->mach = nil; + updatecpu(up); + up = nil; + } + sched(); +} + +/* + * If changing this routine, look also at sleep(). It + * contains a copy of the guts of sched(). 
+ */ +void +sched(void) +{ + Proc *p; + + if(m->ilockdepth) + panic("cpu%d: ilockdepth %d, last lock %#p at %#p, sched called from %#p", + m->machno, + m->ilockdepth, + up? up->lastilock: nil, + (up && up->lastilock)? up->lastilock->pc: 0, + getcallerpc(&p+2)); + if(up){ + /* + * Delay the sched until the process gives up the locks + * it is holding. This avoids dumb lock loops. + * Don't delay if the process is Moribund. + * It called sched to die. + * But do sched eventually. This avoids a missing unlock + * from hanging the entire kernel. + * But don't reschedule procs holding palloc or procalloc. + * Those are far too important to be holding while asleep. + * + * This test is not exact. There can still be a few instructions + * in the middle of taslock when a process holds a lock + * but Lock.p has not yet been initialized. + */ + if(up->nlocks.ref) + if(up->state != Moribund) + if(up->delaysched < 20 + || palloc.Lock.p == up + || procalloc.Lock.p == up){ + up->delaysched++; + delayedscheds++; + return; + } + up->delaysched = 0; + + splhi(); + + /* statistics */ + m->cs++; + + procsave(up); + if(setlabel(&up->sched)){ + procrestore(up); + spllo(); + return; + } + gotolabel(&m->sched); + } + p = runproc(); + if(!p->edf){ + updatecpu(p); + p->priority = reprioritize(p); + } + if(p != m->readied) + m->schedticks = m->ticks + HZ/10; + m->readied = 0; + up = p; + up->state = Running; + up->mach = MACHP(m->machno); + m->proc = up; + mmuswitch(up); + gotolabel(&up->sched); +} + +int +anyready(void) +{ + return runvec; +} + +int +anyhigher(void) +{ + return runvec & ~((1<<(up->priority+1))-1); +} + +/* + * here once per clock tick to see if we should resched + */ +void +hzsched(void) +{ + /* once a second, rebalance will reprioritize ready procs */ + if(m->machno == 0) + rebalance(); + + /* unless preempted, get to run for at least 100ms */ + if(anyhigher() + || (!up->fixedpri && m->ticks > m->schedticks && anyready())){ + m->readied = nil; /* avoid cooperative 
scheduling */ + up->delaysched++; + } +} + +/* + * here at the end of non-clock interrupts to see if we should preempt the + * current process. Returns 1 if preempted, 0 otherwise. + */ +int +preempted(void) +{ + if(up && up->state == Running) + if(up->preempted == 0) + if(anyhigher()) + if(!active.exiting){ + m->readied = nil; /* avoid cooperative scheduling */ + up->preempted = 1; + sched(); + splhi(); + up->preempted = 0; + return 1; + } + return 0; +} + +/* + * Update the cpu time average for this particular process, + * which is about to change from up -> not up or vice versa. + * p->lastupdate is the last time an updatecpu happened. + * + * The cpu time average is a decaying average that lasts + * about D clock ticks. D is chosen to be approximately + * the cpu time of a cpu-intensive "quick job". A job has to run + * for approximately D clock ticks before we home in on its + * actual cpu usage. Thus if you manage to get in and get out + * quickly, you won't be penalized during your burst. Once you + * start using your share of the cpu for more than about D + * clock ticks though, your p->cpu hits 1000 (1.0) and you end up + * below all the other quick jobs. Interactive tasks, because + * they basically always use less than their fair share of cpu, + * will be rewarded. + * + * If the process has not been running, then we want to + * apply the filter + * + * cpu = cpu * (D-1)/D + * + * n times, yielding + * + * cpu = cpu * ((D-1)/D)^n + * + * but D is big enough that this is approximately + * + * cpu = cpu * (D-n)/D + * + * so we use that instead. + * + * If the process has been running, we apply the filter to + * 1 - cpu, yielding a similar equation. Note that cpu is + * stored in fixed point (* 1000). + * + * Updatecpu must be called before changing up, in order + * to maintain accurate cpu usage statistics. It can be called + * at any time to bring the stats for a given proc up-to-date. 
+ */ +void +updatecpu(Proc *p) +{ + int n, t, ocpu; + int D = schedgain*HZ*Scaling; + + if(p->edf) + return; + + t = MACHP(0)->ticks*Scaling + Scaling/2; + n = t - p->lastupdate; + p->lastupdate = t; + + if(n == 0) + return; + if(n > D) + n = D; + + ocpu = p->cpu; + if(p != up) + p->cpu = (ocpu*(D-n))/D; + else{ + t = 1000 - ocpu; + t = (t*(D-n))/D; + p->cpu = 1000 - t; + } + +//iprint("pid %d %s for %d cpu %d -> %d\n", p->pid,p==up?"active":"inactive",n, ocpu,p->cpu); +} + +/* + * On average, p has used p->cpu of a cpu recently. + * Its fair share is conf.nmach/m->load of a cpu. If it has been getting + * too much, penalize it. If it has been getting not enough, reward it. + * I don't think you can get much more than your fair share that + * often, so most of the queues are for using less. Having a priority + * of 3 means you're just right. Having a higher priority (up to p->basepri) + * means you're not using as much as you could. + */ +int +reprioritize(Proc *p) +{ + int fairshare, n, load, ratio; + + load = MACHP(0)->load; + if(load == 0) + return p->basepri; + + /* + * fairshare = 1.000 * conf.nproc * 1.000/load, + * except the decimal point is moved three places + * on both load and fairshare. 
+ */ + fairshare = (conf.nmach*1000*1000)/load; + n = p->cpu; + if(n == 0) + n = 1; + ratio = (fairshare+n/2) / n; + if(ratio > p->basepri) + ratio = p->basepri; + if(ratio < 0) + panic("reprioritize"); +//iprint("pid %d cpu %d load %d fair %d pri %d\n", p->pid, p->cpu, load, fairshare, ratio); + return ratio; +} + +/* + * add a process to a scheduling queue + */ +void +queueproc(Schedq *rq, Proc *p) +{ + int pri; + + pri = rq - runq; + lock(runq); + p->priority = pri; + p->rnext = 0; + if(rq->tail) + rq->tail->rnext = p; + else + rq->head = p; + rq->tail = p; + rq->n++; + nrdy++; + runvec |= 1<head; p; p = p->rnext){ + if(p == tp) + break; + l = p; + } + + /* + * p->mach==0 only when process state is saved + */ + if(p == 0 || p->mach){ + unlock(runq); + return nil; + } + if(p->rnext == 0) + rq->tail = l; + if(l) + l->rnext = p->rnext; + else + rq->head = p->rnext; + if(rq->head == nil) + runvec &= ~(1<<(rq-runq)); + rq->n--; + nrdy--; + if(p->state != Ready) + print("dequeueproc %s %lud %s\n", p->text, p->pid, statename[p->state]); + + unlock(runq); + return p; +} + +/* + * ready(p) picks a new priority for a process and sticks it in the + * runq for that priority. + */ +void +ready(Proc *p) +{ + int s, pri; + Schedq *rq; + void (*pt)(Proc*, int, vlong); + + s = splhi(); + if(edfready(p)){ + splx(s); + return; + } + + if(up != p && (p->wired == nil || p->wired == m)) + m->readied = p; /* group scheduling */ + + updatecpu(p); + pri = reprioritize(p); + p->priority = pri; + rq = &runq[pri]; + p->state = Ready; + queueproc(rq, p); + pt = proctrace; + if(pt) + pt(p, SReady, 0); + splx(s); +} + +/* + * yield the processor and drop our priority + */ +void +yield(void) +{ + if(anyready()){ + /* pretend we just used 1/2 tick */ + up->lastupdate -= Scaling/2; + sched(); + } +} + +/* + * recalculate priorities once a second. We need to do this + * since priorities will otherwise only be recalculated when + * the running process blocks. 
+ */ +ulong balancetime; + +static void +rebalance(void) +{ + int pri, npri, t, x; + Schedq *rq; + Proc *p; + + t = m->ticks; + if(t - balancetime < HZ) + return; + balancetime = t; + + for(pri=0, rq=runq; prihead; + if(p == nil) + continue; + if(p->mp != MACHP(m->machno)) + continue; + if(pri == p->basepri) + continue; + updatecpu(p); + npri = reprioritize(p); + if(npri != pri){ + x = splhi(); + p = dequeueproc(rq, p); + if(p) + queueproc(&runq[npri], p); + splx(x); + goto another; + } + } +} + + +/* + * pick a process to run + */ +Proc* +runproc(void) +{ + Schedq *rq; + Proc *p; + ulong start, now; + int i; + void (*pt)(Proc*, int, vlong); + + start = perfticks(); + + /* cooperative scheduling until the clock ticks */ + if((p=m->readied) && p->mach==0 && p->state==Ready + && (p->wired == nil || p->wired == m) + && runq[Nrq-1].head == nil && runq[Nrq-2].head == nil){ + skipscheds++; + rq = &runq[p->priority]; + goto found; + } + + preempts++; + +loop: + /* + * find a process that last ran on this processor (affinity), + * or one that hasn't moved in a while (load balancing). Every + * time around the loop affinity goes down. + */ + spllo(); + for(i = 0;; i++){ + /* + * find the highest priority target process that this + * processor can run given affinity constraints. 
+ * + */ + for(rq = &runq[Nrq-1]; rq >= runq; rq--){ + for(p = rq->head; p; p = p->rnext){ + if(p->mp == nil || p->mp == MACHP(m->machno) + || (!p->wired && i > 0)) + goto found; + } + } + + /* waste time or halt the CPU */ + idlehands(); + + /* remember how much time we're here */ + now = perfticks(); + m->perf.inidle += now-start; + start = now; + } + +found: + splhi(); + p = dequeueproc(rq, p); + if(p == nil) + goto loop; + + p->state = Scheding; + p->mp = MACHP(m->machno); + + if(edflock(p)){ + edfrun(p, rq == &runq[PriEdf]); /* start deadline timer and do admin */ + edfunlock(); + } + pt = proctrace; + if(pt) + pt(p, SRun, 0); + return p; +} + +int +canpage(Proc *p) +{ + int ok = 0; + + splhi(); + lock(runq); + /* Only reliable way to see if we are Running */ + if(p->mach == 0) { + p->newtlb = 1; + ok = 1; + } + unlock(runq); + spllo(); + + return ok; +} + +void +noprocpanic(char *msg) +{ + /* + * setting exiting will make hzclock() on each processor call exit(0). + * clearing our bit in machs avoids calling exit(0) from hzclock() + * on this processor. + */ + lock(&active); + active.machs &= ~(1<machno); + active.exiting = 1; + unlock(&active); + + procdump(); + delay(1000); + panic(msg); +} + +Proc* +newproc(void) +{ + char msg[64]; + Proc *p; + + lock(&procalloc); + while((p = procalloc.free) == nil) { + unlock(&procalloc); + + snprint(msg, sizeof msg, "no procs; %s forking", + up? up->text: "kernel"); + /* + * the situation is unlikely to heal itself. + * dump the proc table and restart by default. + * *noprocspersist in plan9.ini will yield the old + * behaviour of trying forever. 
+ */ + if(getconf("*noprocspersist") == nil) + noprocpanic(msg); + resrcwait(msg); + lock(&procalloc); + } + procalloc.free = p->qnext; + unlock(&procalloc); + + p->state = Scheding; + p->psstate = "New"; + p->mach = 0; + p->qnext = 0; + p->nchild = 0; + p->nwait = 0; + p->waitq = 0; + p->parent = 0; + p->pgrp = 0; + p->egrp = 0; + p->fgrp = 0; + p->rgrp = 0; + p->pdbg = 0; + p->fpstate = FPinit; + p->kp = 0; + if(up && up->procctl == Proc_tracesyscall) + p->procctl = Proc_tracesyscall; + else + p->procctl = 0; + p->syscalltrace = 0; + p->notepending = 0; + p->ureg = 0; + p->privatemem = 0; + p->noswap = 0; + p->errstr = p->errbuf0; + p->syserrstr = p->errbuf1; + p->errbuf0[0] = '\0'; + p->errbuf1[0] = '\0'; + p->nlocks.ref = 0; + p->delaysched = 0; + p->trace = 0; + kstrdup(&p->user, "*nouser"); + kstrdup(&p->text, "*notext"); + kstrdup(&p->args, ""); + p->nargs = 0; + p->setargs = 0; + memset(p->seg, 0, sizeof p->seg); + p->pid = incref(&pidalloc); + pidhash(p); + p->noteid = incref(¬eidalloc); + if(p->pid==0 || p->noteid==0) + panic("pidalloc"); + if(p->kstack == 0) + p->kstack = smalloc(KSTACK); + + /* sched params */ + p->mp = 0; + p->wired = 0; + procpriority(p, PriNormal, 0); + p->cpu = 0; + p->lastupdate = MACHP(0)->ticks*Scaling; + p->edf = nil; + + return p; +} + +/* + * wire this proc to a machine + */ +void +procwired(Proc *p, int bm) +{ + Proc *pp; + int i; + char nwired[MAXMACH]; + Mach *wm; + + if(bm < 0){ + /* pick a machine to wire to */ + memset(nwired, 0, sizeof(nwired)); + p->wired = 0; + pp = proctab(0); + for(i=0; iwired; + if(wm && pp->pid) + nwired[wm->machno]++; + } + bm = 0; + for(i=0; iwired = MACHP(bm); + p->mp = p->wired; +} + +void +procpriority(Proc *p, int pri, int fixed) +{ + if(pri >= Npriq) + pri = Npriq - 1; + else if(pri < 0) + pri = 0; + p->basepri = pri; + p->priority = pri; + if(fixed){ + p->fixedpri = 1; + } else { + p->fixedpri = 0; + } +} + +void +procinit0(void) /* bad planning - clashes with devproc.c */ +{ + Proc *p; + 
int i; + + procalloc.free = xalloc(conf.nproc*sizeof(Proc)); + if(procalloc.free == nil){ + xsummary(); + panic("cannot allocate %lud procs (%ludMB)\n", conf.nproc, conf.nproc*sizeof(Proc)/(1024*1024)); + } + procalloc.arena = procalloc.free; + + p = procalloc.free; + for(i=0; iqnext = p+1; + p->qnext = 0; +} + +/* + * sleep if a condition is not true. Another process will + * awaken us after it sets the condition. When we awaken + * the condition may no longer be true. + * + * we lock both the process and the rendezvous to keep r->p + * and p->r synchronized. + */ +void +sleep(Rendez *r, int (*f)(void*), void *arg) +{ + int s; + void (*pt)(Proc*, int, vlong); + + s = splhi(); + + if(up->nlocks.ref) + print("process %lud sleeps with %lud locks held, last lock %#p locked at pc %#p, sleep called from %#p\n", + up->pid, up->nlocks.ref, up->lastlock, up->lastlock->pc, getcallerpc(&r)); + lock(r); + lock(&up->rlock); + if(r->p){ + print("double sleep called from %#p, %lud %lud\n", getcallerpc(&r), r->p->pid, up->pid); + dumpstack(); + } + + /* + * Wakeup only knows there may be something to do by testing + * r->p in order to get something to lock on. + * Flush that information out to memory in case the sleep is + * committed. + */ + r->p = up; + + if((*f)(arg) || up->notepending){ + /* + * if condition happened or a note is pending + * never mind + */ + r->p = nil; + unlock(&up->rlock); + unlock(r); + } else { + /* + * now we are committed to + * change state and call scheduler + */ + pt = proctrace; + if(pt) + pt(up, SSleep, 0); + up->state = Wakeme; + up->r = r; + + /* statistics */ + m->cs++; + + procsave(up); + if(setlabel(&up->sched)) { + /* + * here when the process is awakened + */ + procrestore(up); + spllo(); + } else { + /* + * here to go to sleep (i.e. 
stop Running) + */ + unlock(&up->rlock); + unlock(r); + gotolabel(&m->sched); + } + } + + if(up->notepending) { + up->notepending = 0; + splx(s); + if(up->procctl == Proc_exitme && up->closingfgrp) + forceclosefgrp(); + error(Eintr); + } + + splx(s); +} + +static int +tfn(void *arg) +{ + return up->trend == nil || up->tfn(arg); +} + +void +twakeup(Ureg*, Timer *t) +{ + Proc *p; + Rendez *trend; + + p = t->ta; + trend = p->trend; + p->trend = 0; + if(trend) + wakeup(trend); +} + +void +tsleep(Rendez *r, int (*fn)(void*), void *arg, ulong ms) +{ + if (up->tt){ + print("tsleep: timer active: mode %d, tf %#p\n", up->tmode, up->tf); + timerdel(up); + } + up->tns = MS2NS(ms); + up->tf = twakeup; + up->tmode = Trelative; + up->ta = up; + up->trend = r; + up->tfn = fn; + timeradd(up); + + if(waserror()){ + timerdel(up); + nexterror(); + } + sleep(r, tfn, arg); + if(up->tt) + timerdel(up); + up->twhen = 0; + poperror(); +} + +/* + * Expects that only one process can call wakeup for any given Rendez. + * We hold both locks to ensure that r->p and p->r remain consistent. + * Richard Miller has a better solution that doesn't require both to + * be held simultaneously, but I'm a paranoid - presotto. + */ +Proc* +wakeup(Rendez *r) +{ + Proc *p; + int s; + + s = splhi(); + + lock(r); + p = r->p; + + if(p != nil){ + lock(&p->rlock); + if(p->state != Wakeme || p->r != r){ + iprint("%p %p %d\n", p->r, r, p->state); + panic("wakeup: state"); + } + r->p = nil; + p->r = nil; + ready(p); + unlock(&p->rlock); + } + unlock(r); + + splx(s); + + return p; +} + +/* + * if waking a sleeping process, this routine must hold both + * p->rlock and r->lock. However, it can't know them in + * the same order as wakeup causing a possible lock ordering + * deadlock. We break the deadlock by giving up the p->rlock + * lock if we can't get the r->lock and retrying. 
+ */ +int +postnote(Proc *p, int dolock, char *n, int flag) +{ + int s, ret; + Rendez *r; + Proc *d, **l; + + if(dolock) + qlock(&p->debug); + + if(flag != NUser && (p->notify == 0 || p->notified)) + p->nnote = 0; + + ret = 0; + if(p->nnote < NNOTE) { + strcpy(p->note[p->nnote].msg, n); + p->note[p->nnote++].flag = flag; + ret = 1; + } + p->notepending = 1; + if(dolock) + qunlock(&p->debug); + + /* this loop is to avoid lock ordering problems. */ + for(;;){ + s = splhi(); + lock(&p->rlock); + r = p->r; + + /* waiting for a wakeup? */ + if(r == nil) + break; /* no */ + + /* try for the second lock */ + if(canlock(r)){ + if(p->state != Wakeme || r->p != p) + panic("postnote: state %d %d %d", r->p != p, p->r != r, p->state); + p->r = nil; + r->p = nil; + ready(p); + unlock(r); + break; + } + + /* give other process time to get out of critical section and try again */ + unlock(&p->rlock); + splx(s); + sched(); + } + unlock(&p->rlock); + splx(s); + + if(p->state != Rendezvous) + return ret; + + /* Try and pull out of a rendezvous */ + lock(p->rgrp); + if(p->state == Rendezvous) { + p->rendval = ~0; + l = &REND(p->rgrp, p->rendtag); + for(d = *l; d; d = d->rendhash) { + if(d == p) { + *l = p->rendhash; + break; + } + l = &d->rendhash; + } + ready(p); + } + unlock(p->rgrp); + return ret; +} + +/* + * weird thing: keep at most NBROKEN around + */ +#define NBROKEN 4 +struct +{ + QLock; + int n; + Proc *p[NBROKEN]; +}broken; + +void +addbroken(Proc *p) +{ + qlock(&broken); + if(broken.n == NBROKEN) { + ready(broken.p[0]); + memmove(&broken.p[0], &broken.p[1], sizeof(Proc*)*(NBROKEN-1)); + --broken.n; + } + broken.p[broken.n++] = p; + qunlock(&broken); + + edfstop(up); + p->state = Broken; + p->psstate = 0; + sched(); +} + +void +unbreak(Proc *p) +{ + int b; + + qlock(&broken); + for(b=0; b < broken.n; b++) + if(broken.p[b] == p) { + broken.n--; + memmove(&broken.p[b], &broken.p[b+1], + sizeof(Proc*)*(NBROKEN-(b+1))); + ready(p); + break; + } + qunlock(&broken); +} + +int 
+freebroken(void) +{ + int i, n; + + qlock(&broken); + n = broken.n; + for(i=0; isyscalltrace) + free(up->syscalltrace); + up->alarm = 0; + if (up->tt) + timerdel(up); + pt = proctrace; + if(pt) + pt(up, SDead, 0); + + /* nil out all the resources under lock (free later) */ + qlock(&up->debug); + fgrp = up->fgrp; + up->fgrp = nil; + egrp = up->egrp; + up->egrp = nil; + rgrp = up->rgrp; + up->rgrp = nil; + pgrp = up->pgrp; + up->pgrp = nil; + dot = up->dot; + up->dot = nil; + qunlock(&up->debug); + + if(fgrp) + closefgrp(fgrp); + if(egrp) + closeegrp(egrp); + if(rgrp) + closergrp(rgrp); + if(dot) + cclose(dot); + if(pgrp) + closepgrp(pgrp); + + /* + * if not a kernel process and have a parent, + * do some housekeeping. + */ + if(up->kp == 0) { + p = up->parent; + if(p == 0) { + if(exitstr == 0) + exitstr = "unknown"; + panic("boot process died: %s", exitstr); + } + + while(waserror()) + ; + + wq = smalloc(sizeof(Waitq)); + poperror(); + + wq->w.pid = up->pid; + utime = up->time[TUser] + up->time[TCUser]; + stime = up->time[TSys] + up->time[TCSys]; + wq->w.time[TUser] = tk2ms(utime); + wq->w.time[TSys] = tk2ms(stime); + wq->w.time[TReal] = tk2ms(MACHP(0)->ticks - up->time[TReal]); + if(exitstr && exitstr[0]) + snprint(wq->w.msg, sizeof(wq->w.msg), "%s %lud: %s", up->text, up->pid, exitstr); + else + wq->w.msg[0] = '\0'; + + lock(&p->exl); + /* + * Check that parent is still alive. + */ + if(p->pid == up->parentpid && p->state != Broken) { + p->nchild--; + p->time[TCUser] += utime; + p->time[TCSys] += stime; + /* + * If there would be more than 128 wait records + * processes for my parent, then don't leave a wait + * record behind. This helps prevent badly written + * daemon processes from accumulating lots of wait + * records. 
+ */ + if(p->nwait < 128) { + wq->next = p->waitq; + p->waitq = wq; + p->nwait++; + wq = nil; + wakeup(&p->waitr); + } + } + unlock(&p->exl); + if(wq) + free(wq); + } + + if(!freemem) + addbroken(up); + + qlock(&up->seglock); + es = &up->seg[NSEG]; + for(s = up->seg; s < es; s++) { + if(*s) { + putseg(*s); + *s = 0; + } + } + qunlock(&up->seglock); + + lock(&up->exl); /* Prevent my children from leaving waits */ + pidunhash(up); + up->pid = 0; + wakeup(&up->waitr); + unlock(&up->exl); + + for(f = up->waitq; f; f = next) { + next = f->next; + free(f); + } + + /* release debuggers */ + qlock(&up->debug); + if(up->pdbg) { + wakeup(&up->pdbg->sleep); + up->pdbg = 0; + } + qunlock(&up->debug); + + /* Sched must not loop for these locks */ + lock(&procalloc); + lock(&palloc); + + edfstop(up); + up->state = Moribund; + sched(); + panic("pexit"); +} + +int +haswaitq(void *x) +{ + Proc *p; + + p = (Proc *)x; + return p->waitq != 0; +} + +ulong +pwait(Waitmsg *w) +{ + ulong cpid; + Waitq *wq; + + if(!canqlock(&up->qwaitr)) + error(Einuse); + + if(waserror()) { + qunlock(&up->qwaitr); + nexterror(); + } + + lock(&up->exl); + if(up->nchild == 0 && up->waitq == 0) { + unlock(&up->exl); + error(Enochild); + } + unlock(&up->exl); + + sleep(&up->waitr, haswaitq, up); + + lock(&up->exl); + wq = up->waitq; + up->waitq = wq->next; + up->nwait--; + unlock(&up->exl); + + qunlock(&up->qwaitr); + poperror(); + + if(w) + memmove(w, &wq->w, sizeof(Waitmsg)); + cpid = wq->w.pid; + free(wq); + return cpid; +} + +Proc* +proctab(int i) +{ + return &procalloc.arena[i]; +} + +void +dumpaproc(Proc *p) +{ + ulong bss; + char *s; + + if(p == 0) + return; + + bss = 0; + if(p->seg[BSEG]) + bss = p->seg[BSEG]->top; + + s = p->psstate; + if(s == 0) + s = statename[p->state]; + print("%3lud:%10s pc %#p dbgpc %#p %8s (%s) ut %ld st %ld bss %lux qpc %#p nl %lud nd %lud lpc %#p pri %lud\n", + p->pid, p->text, p->pc, dbgpc(p), s, statename[p->state], + p->time[0], p->time[1], bss, p->qpc, p->nlocks.ref, 
p->delaysched, p->lastlock ? p->lastlock->pc : 0, p->priority); +} + +void +procdump(void) +{ + int i; + Proc *p; + + if(up) + print("up %lud\n", up->pid); + else + print("no current process\n"); + for(i=0; istate == Dead) + continue; + + dumpaproc(p); + } +} + +/* + * wait till all processes have flushed their mmu + * state about segement s + */ +void +procflushseg(Segment *s) +{ + int i, ns, nm, nwait; + Proc *p; + + /* + * tell all processes with this + * segment to flush their mmu's + */ + nwait = 0; + for(i=0; istate == Dead) + continue; + for(ns = 0; ns < NSEG; ns++) + if(p->seg[ns] == s){ + p->newtlb = 1; + for(nm = 0; nm < conf.nmach; nm++){ + if(MACHP(nm)->proc == p){ + MACHP(nm)->flushmmu = 1; + nwait++; + } + } + break; + } + } + + if(nwait == 0) + return; + + /* + * wait for all processors to take a clock interrupt + * and flush their mmu's + */ + for(nm = 0; nm < conf.nmach; nm++) + if(MACHP(nm) != m) + while(MACHP(nm)->flushmmu) + sched(); +} + +void +scheddump(void) +{ + Proc *p; + Schedq *rq; + + for(rq = &runq[Nrq-1]; rq >= runq; rq--){ + if(rq->head == 0) + continue; + print("rq%ld:", rq-runq); + for(p = rq->head; p; p = p->rnext) + print(" %lud(%lud)", p->pid, m->ticks - p->readytime); + print("\n"); + delay(150); + } + print("nrdy %d\n", nrdy); +} + +void +kproc(char *name, void (*func)(void *), void *arg) +{ + Proc *p; + static Pgrp *kpgrp; + + p = newproc(); + p->psstate = 0; + p->procmode = 0640; + p->kp = 1; + p->noswap = 1; + + p->fpsave = up->fpsave; + p->scallnr = up->scallnr; + p->s = up->s; + p->nerrlab = 0; + p->slash = up->slash; + p->dot = up->dot; + if(p->dot) + incref(p->dot); + + memmove(p->note, up->note, sizeof(p->note)); + p->nnote = up->nnote; + p->notified = 0; + p->lastnote = up->lastnote; + p->notify = up->notify; + p->ureg = 0; + p->dbgreg = 0; + + procpriority(p, PriKproc, 0); + + kprocchild(p, func, arg); + + kstrdup(&p->user, eve); + kstrdup(&p->text, name); + if(kpgrp == 0) + kpgrp = newpgrp(); + p->pgrp = kpgrp; + 
incref(kpgrp); + + memset(p->time, 0, sizeof(p->time)); + p->time[TReal] = MACHP(0)->ticks; + ready(p); +} + +/* + * called splhi() by notify(). See comment in notify for the + * reasoning. + */ +void +procctl(Proc *p) +{ + char *state; + ulong s; + + switch(p->procctl) { + case Proc_exitbig: + spllo(); + pexit("Killed: Insufficient physical memory", 1); + + case Proc_exitme: + spllo(); /* pexit has locks in it */ + pexit("Killed", 1); + + case Proc_traceme: + if(p->nnote == 0) + return; + /* No break */ + + case Proc_stopme: + p->procctl = 0; + state = p->psstate; + p->psstate = "Stopped"; + /* free a waiting debugger */ + s = spllo(); + qlock(&p->debug); + if(p->pdbg) { + wakeup(&p->pdbg->sleep); + p->pdbg = 0; + } + qunlock(&p->debug); + splhi(); + p->state = Stopped; + sched(); + p->psstate = state; + splx(s); + return; + } +} + +#include "errstr.h" + +void +error(char *err) +{ + spllo(); + + assert(up->nerrlab < NERR); + kstrcpy(up->errstr, err, ERRMAX); + setlabel(&up->errlab[NERR-1]); + nexterror(); +} + +void +nexterror(void) +{ + gotolabel(&up->errlab[--up->nerrlab]); +} + +void +exhausted(char *resource) +{ + char buf[ERRMAX]; + + snprint(buf, sizeof buf, "no free %s", resource); + iprint("%s\n", buf); + error(buf); +} + +void +killbig(char *why) +{ + int i; + Segment *s; + ulong l, max; + Proc *p, *ep, *kp; + + max = 0; + kp = 0; + ep = procalloc.arena+conf.nproc; + for(p = procalloc.arena; p < ep; p++) { + if(p->state == Dead || p->kp) + continue; + l = 0; + for(i=1; iseg[i]; + if(s != 0) + l += s->top - s->base; + } + if(l > max && ((p->procmode&0222) || strcmp(eve, p->user)!=0)) { + kp = p; + max = l; + } + } + + print("%lud: %s killed: %s\n", kp->pid, kp->text, why); + for(p = procalloc.arena; p < ep; p++) { + if(p->state == Dead || p->kp) + continue; + if(p != kp && p->seg[BSEG] && p->seg[BSEG] == kp->seg[BSEG]) + p->procctl = Proc_exitbig; + } + kp->procctl = Proc_exitbig; + for(i = 0; i < NSEG; i++) { + s = kp->seg[i]; + if(s != 0 && 
canqlock(&s->lk)) { + mfreeseg(s, s->base, (s->top - s->base)/BY2PG); + qunlock(&s->lk); + } + } +} + +/* + * change ownership to 'new' of all processes owned by 'old'. Used when + * eve changes. + */ +void +renameuser(char *old, char *new) +{ + Proc *p, *ep; + + ep = procalloc.arena+conf.nproc; + for(p = procalloc.arena; p < ep; p++) + if(p->user!=nil && strcmp(old, p->user)==0) + kstrdup(&p->user, new); +} + +/* + * time accounting called by clock() splhi'd + */ +void +accounttime(void) +{ + Proc *p; + ulong n, per; + static ulong nrun; + + p = m->proc; + if(p) { + nrun++; + p->time[p->insyscall]++; + } + + /* calculate decaying duty cycles */ + n = perfticks(); + per = n - m->perf.last; + m->perf.last = n; + per = (m->perf.period*(HZ-1) + per)/HZ; + if(per != 0) + m->perf.period = per; + + m->perf.avg_inidle = (m->perf.avg_inidle*(HZ-1)+m->perf.inidle)/HZ; + m->perf.inidle = 0; + + m->perf.avg_inintr = (m->perf.avg_inintr*(HZ-1)+m->perf.inintr)/HZ; + m->perf.inintr = 0; + + /* only one processor gets to compute system load averages */ + if(m->machno != 0) + return; + + /* + * calculate decaying load average. + * if we decay by (n-1)/n then it takes + * n clock ticks to go from load L to .36 L once + * things quiet down. it takes about 5 n clock + * ticks to go to zero. so using HZ means this is + * approximately the load over the last second, + * with a tail lasting about 5 seconds. 
+ */ + n = nrun; + nrun = 0; + n = (nrdy+n)*1000; + m->load = (m->load*(HZ-1)+n)/HZ; +} + +static void +pidhash(Proc *p) +{ + int h; + + h = p->pid % nelem(procalloc.ht); + lock(&procalloc); + p->pidhash = procalloc.ht[h]; + procalloc.ht[h] = p; + unlock(&procalloc); +} + +static void +pidunhash(Proc *p) +{ + int h; + Proc **l; + + h = p->pid % nelem(procalloc.ht); + lock(&procalloc); + for(l = &procalloc.ht[h]; *l != nil; l = &(*l)->pidhash) + if(*l == p){ + *l = p->pidhash; + break; + } + unlock(&procalloc); +} + +int +procindex(ulong pid) +{ + Proc *p; + int h; + int s; + + s = -1; + h = pid % nelem(procalloc.ht); + lock(&procalloc); + for(p = procalloc.ht[h]; p != nil; p = p->pidhash) + if(p->pid == pid){ + s = p - procalloc.arena; + break; + } + unlock(&procalloc); + return s; +} --- /dev/null +++ /sys/src/9/port64/segment.c @@ -0,0 +1,799 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" + +static void imagereclaim(void); +static void imagechanreclaim(void); + +#include "io.h" + +/* + * Attachable segment types + */ +static Physseg physseg[10] = { + { SG_SHARED, "shared", 0, SEGMAXSIZE, 0, 0 }, + { SG_BSS, "memory", 0, SEGMAXSIZE, 0, 0 }, + { 0, 0, 0, 0, 0, 0 }, +}; + +static Lock physseglock; + +#define NFREECHAN 64 +#define IHASHSIZE 64 +#define ihash(s) imagealloc.hash[s%IHASHSIZE] +static struct Imagealloc +{ + Lock; + Image *free; + Image *hash[IHASHSIZE]; + QLock ireclaim; /* mutex on reclaiming free images */ + + Chan **freechan; /* free image channels */ + int nfreechan; /* number of free channels */ + int szfreechan; /* size of freechan array */ + QLock fcreclaim; /* mutex on reclaiming free channels */ +}imagealloc; + +Segment* (*_globalsegattach)(Proc*, char*); + +void +initseg(void) +{ + Image *i, *ie; + + imagealloc.free = xalloc(conf.nimage*sizeof(Image)); + if (imagealloc.free == nil) + panic("initseg: no memory"); + ie = &imagealloc.free[conf.nimage-1]; + for(i = 
imagealloc.free; i < ie; i++) + i->next = i+1; + i->next = 0; + imagealloc.freechan = malloc(NFREECHAN * sizeof(Chan*)); + imagealloc.szfreechan = NFREECHAN; +} + +Segment * +newseg(int type, uintptr base, ulong size) +{ + Segment *s; + int mapsize; + + if(size > (SEGMAPSIZE*PTEPERTAB)) + error(Enovmem); + + s = smalloc(sizeof(Segment)); + s->ref = 1; + s->type = type; + s->base = base; + s->top = base+(size*BY2PG); + s->size = size; + s->sema.prev = &s->sema; + s->sema.next = &s->sema; + + mapsize = ROUND(size, PTEPERTAB)/PTEPERTAB; + if(mapsize > nelem(s->ssegmap)){ + mapsize *= 2; + if(mapsize > (SEGMAPSIZE*PTEPERTAB)) + mapsize = (SEGMAPSIZE*PTEPERTAB); + s->map = smalloc(mapsize*sizeof(Pte*)); + s->mapsize = mapsize; + } + else{ + s->map = s->ssegmap; + s->mapsize = nelem(s->ssegmap); + } + + return s; +} + +void +putseg(Segment *s) +{ + Pte **pp, **emap; + Image *i; + + if(s == 0) + return; + + i = s->image; + if(i != 0) { + lock(i); + lock(s); + if(i->s == s && s->ref == 1) + i->s = 0; + unlock(i); + } + else + lock(s); + + s->ref--; + if(s->ref != 0) { + unlock(s); + return; + } + unlock(s); + + qlock(&s->lk); + if(i) + putimage(i); + + emap = &s->map[s->mapsize]; + for(pp = s->map; pp < emap; pp++) + if(*pp) + freepte(s, *pp); + + qunlock(&s->lk); + if(s->map != s->ssegmap) + free(s->map); + if(s->profile != 0) + free(s->profile); + free(s); +} + +void +relocateseg(Segment *s, uintptr offset) +{ + Page **pg, *x; + Pte *pte, **p, **endpte; + + endpte = &s->map[s->mapsize]; + for(p = s->map; p < endpte; p++) { + if(*p == 0) + continue; + pte = *p; + for(pg = pte->first; pg <= pte->last; pg++) { + if(x = *pg) + x->va += offset; + } + } +} + +Segment* +dupseg(Segment **seg, int segno, int share) +{ + int i, size; + Pte *pte; + Segment *n, *s; + + SET(n); + s = seg[segno]; + + qlock(&s->lk); + if(waserror()){ + qunlock(&s->lk); + nexterror(); + } + switch(s->type&SG_TYPE) { + case SG_TEXT: /* New segment shares pte set */ + case SG_SHARED: + case SG_PHYSICAL: + 
goto sameseg; + + case SG_STACK: + n = newseg(s->type, s->base, s->size); + break; + + case SG_BSS: /* Just copy on write */ + if(share) + goto sameseg; + n = newseg(s->type, s->base, s->size); + break; + + case SG_DATA: /* Copy on write plus demand load info */ + if(segno == TSEG){ + poperror(); + qunlock(&s->lk); + return data2txt(s); + } + + if(share) + goto sameseg; + n = newseg(s->type, s->base, s->size); + + incref(s->image); + n->image = s->image; + n->fstart = s->fstart; + n->flen = s->flen; + break; + } + size = s->mapsize; + for(i = 0; i < size; i++) + if(pte = s->map[i]) + n->map[i] = ptecpy(pte); + + n->flushme = s->flushme; + if(s->ref > 1) + procflushseg(s); + poperror(); + qunlock(&s->lk); + return n; + +sameseg: + incref(s); + poperror(); + qunlock(&s->lk); + return s; +} + +void +segpage(Segment *s, Page *p) +{ + Pte **pte; + uintptr off; + Page **pg; + + if(p->va < s->base || p->va >= s->top) + panic("segpage"); + + off = p->va - s->base; + pte = &s->map[off/PTEMAPMEM]; + if(*pte == 0) + *pte = ptealloc(); + + pg = &(*pte)->pages[(off&(PTEMAPMEM-1))/BY2PG]; + *pg = p; + if(pg < (*pte)->first) + (*pte)->first = pg; + if(pg > (*pte)->last) + (*pte)->last = pg; +} + +Image* +attachimage(int type, Chan *c, uintptr base, ulong len) +{ + Image *i, **l; + + /* reclaim any free channels from reclaimed segments */ + if(imagealloc.nfreechan) + imagechanreclaim(); + + lock(&imagealloc); + + /* + * Search the image cache for remains of the text from a previous + * or currently running incarnation + */ + for(i = ihash(c->qid.path); i; i = i->hash) { + if(c->qid.path == i->qid.path) { + lock(i); + if(eqqid(c->qid, i->qid) && + eqqid(c->mqid, i->mqid) && + c->mchan == i->mchan && + c->type == i->type) { + goto found; + } + unlock(i); + } + } + + /* + * imagereclaim dumps pages from the free list which are cached by image + * structures. This should free some image structures. 
+ */ + while(!(i = imagealloc.free)) { + unlock(&imagealloc); + imagereclaim(); + sched(); + lock(&imagealloc); + } + + imagealloc.free = i->next; + + lock(i); + incref(c); + i->c = c; + i->type = c->type; + i->qid = c->qid; + i->mqid = c->mqid; + i->mchan = c->mchan; + l = &ihash(c->qid.path); + i->hash = *l; + *l = i; +found: + unlock(&imagealloc); + + if(i->s == 0) { + /* Disaster after commit in exec */ + if(waserror()) { + unlock(i); + pexit(Enovmem, 1); + } + i->s = newseg(type, base, len); + i->s->image = i; + i->ref++; + poperror(); + } + else + incref(i->s); + + return i; +} + +static struct { + int calls; /* times imagereclaim was called */ + int loops; /* times the main loop was run */ + uvlong ticks; /* total time in the main loop */ + uvlong maxt; /* longest time in main loop */ +} irstats; + +static void +imagereclaim(void) +{ + int n; + Page *p; + uvlong ticks; + + irstats.calls++; + /* Somebody is already cleaning the page cache */ + if(!canqlock(&imagealloc.ireclaim)) + return; + + lock(&palloc); + ticks = fastticks(nil); + n = 0; + /* + * All the pages with images backing them are at the + * end of the list (see putpage) so start there and work + * backward. + */ + for(p = palloc.tail; p && p->image && n<1000; p = p->prev) { + if(p->ref == 0 && canlock(p)) { + if(p->ref == 0) { + n++; + uncachepage(p); + } + unlock(p); + } + } + ticks = fastticks(nil) - ticks; + unlock(&palloc); + irstats.loops++; + irstats.ticks += ticks; + if(ticks > irstats.maxt) + irstats.maxt = ticks; + //print("T%llud+", ticks); + qunlock(&imagealloc.ireclaim); +} + +/* + * since close can block, this has to be called outside of + * spin locks. 
+ */ +static void +imagechanreclaim(void) +{ + Chan *c; + + /* Somebody is already cleaning the image chans */ + if(!canqlock(&imagealloc.fcreclaim)) + return; + + /* + * We don't have to recheck that nfreechan > 0 after we + * acquire the lock, because we're the only ones who decrement + * it (the other lock contender increments it), and there's only + * one of us thanks to the qlock above. + */ + while(imagealloc.nfreechan > 0){ + lock(&imagealloc); + imagealloc.nfreechan--; + c = imagealloc.freechan[imagealloc.nfreechan]; + unlock(&imagealloc); + cclose(c); + } + + qunlock(&imagealloc.fcreclaim); +} + +void +putimage(Image *i) +{ + Chan *c, **cp; + Image *f, **l; + + if(i->notext) + return; + + lock(i); + if(--i->ref == 0) { + l = &ihash(i->qid.path); + mkqid(&i->qid, ~0, ~0, QTFILE); + unlock(i); + c = i->c; + + lock(&imagealloc); + for(f = *l; f; f = f->hash) { + if(f == i) { + *l = i->hash; + break; + } + l = &f->hash; + } + + i->next = imagealloc.free; + imagealloc.free = i; + + /* defer freeing channel till we're out of spin lock's */ + if(imagealloc.nfreechan == imagealloc.szfreechan){ + imagealloc.szfreechan += NFREECHAN; + cp = malloc(imagealloc.szfreechan*sizeof(Chan*)); + if(cp == nil) + panic("putimage"); + memmove(cp, imagealloc.freechan, imagealloc.nfreechan*sizeof(Chan*)); + free(imagealloc.freechan); + imagealloc.freechan = cp; + } + imagealloc.freechan[imagealloc.nfreechan++] = c; + unlock(&imagealloc); + + return; + } + unlock(i); +} + +long +ibrk(uintptr addr, int seg) +{ + Segment *s, *ns; + uintptr newtop; + ulong newsize; + int i, mapsize; + Pte **map; + + s = up->seg[seg]; + if(s == 0) + error(Ebadarg); + + if(addr == 0) + return s->base; + + qlock(&s->lk); + + /* We may start with the bss overlapping the data */ + if(addr < s->base) { + if(seg != BSEG || up->seg[DSEG] == 0 || addr < up->seg[DSEG]->base) { + qunlock(&s->lk); + error(Enovmem); + } + addr = s->base; + } + + newtop = PGROUND(addr); + newsize = (newtop-s->base)/BY2PG; + 
if(newtop < s->top) { + /* + * do not shrink a segment shared with other procs, as the + * to-be-freed address space may have been passed to the kernel + * already by another proc and is past the validaddr stage. + */ + if(s->ref > 1){ + qunlock(&s->lk); + error(Einuse); + } + mfreeseg(s, newtop, (s->top-newtop)/BY2PG); + s->top = newtop; + s->size = newsize; + qunlock(&s->lk); + flushmmu(); + return 0; + } + + for(i = 0; i < NSEG; i++) { + ns = up->seg[i]; + if(ns == 0 || ns == s) + continue; + if(newtop >= ns->base && newtop < ns->top) { + qunlock(&s->lk); + error(Esoverlap); + } + } + + if(newsize > (SEGMAPSIZE*PTEPERTAB)) { + qunlock(&s->lk); + error(Enovmem); + } + mapsize = ROUND(newsize, PTEPERTAB)/PTEPERTAB; + if(mapsize > s->mapsize){ + map = smalloc(mapsize*sizeof(Pte*)); + memmove(map, s->map, s->mapsize*sizeof(Pte*)); + if(s->map != s->ssegmap) + free(s->map); + s->map = map; + s->mapsize = mapsize; + } + + s->top = newtop; + s->size = newsize; + qunlock(&s->lk); + return 0; +} + +/* + * called with s->lk locked + */ +void +mfreeseg(Segment *s, uintptr start, int pages) +{ + int i, j, size; + uintptr soff; + Page *pg; + Page *list; + + soff = start-s->base; + j = (soff&(PTEMAPMEM-1))/BY2PG; + + size = s->mapsize; + list = nil; + for(i = soff/PTEMAPMEM; i < size; i++) { + if(pages <= 0) + break; + if(s->map[i] == 0) { + pages -= PTEPERTAB-j; + j = 0; + continue; + } + while(j < PTEPERTAB) { + pg = s->map[i]->pages[j]; + /* + * We want to zero s->map[i]->page[j] and putpage(pg), + * but we have to make sure other processors flush the + * entry from their TLBs before the page is freed. + * We construct a list of the pages to be freed, zero + * the entries, then (below) call procflushseg, and call + * putpage on the whole list. + * + * Swapped-out pages don't appear in TLBs, so it's okay + * to putswap those pages before procflushseg. 
+ */ + if(pg){ + if(onswap(pg)) + putswap(pg); + else{ + pg->next = list; + list = pg; + } + s->map[i]->pages[j] = 0; + } + if(--pages == 0) + goto out; + j++; + } + j = 0; + } +out: + /* flush this seg in all other processes */ + if(s->ref > 1) + procflushseg(s); + + /* free the pages */ + for(pg = list; pg != nil; pg = list){ + list = list->next; + putpage(pg); + } +} + +Segment* +isoverlap(Proc *p, uintptr va, int len) +{ + int i; + Segment *ns; + uintptr newtop; + + newtop = va+len; + for(i = 0; i < NSEG; i++) { + ns = p->seg[i]; + if(ns == 0) + continue; + if((newtop > ns->base && newtop <= ns->top) || + (va >= ns->base && va < ns->top)) + return ns; + } + return nil; +} + +int +addphysseg(Physseg* new) +{ + Physseg *ps; + + /* + * Check not already entered and there is room + * for a new entry and the terminating null entry. + */ + lock(&physseglock); + for(ps = physseg; ps->name; ps++){ + if(strcmp(ps->name, new->name) == 0){ + unlock(&physseglock); + return -1; + } + } + if(ps-physseg >= nelem(physseg)-2){ + unlock(&physseglock); + return -1; + } + + *ps = *new; + unlock(&physseglock); + + return 0; +} + +int +isphysseg(char *name) +{ + Physseg *ps; + int rv = 0; + + lock(&physseglock); + for(ps = physseg; ps->name; ps++){ + if(strcmp(ps->name, name) == 0){ + rv = 1; + break; + } + } + unlock(&physseglock); + return rv; +} + +uintptr +segattach(Proc *p, ulong attr, char *name, uintptr va, uintptr len) +{ + int sno; + Segment *s, *os; + Physseg *ps; + + if(va != 0 && va >= USTKTOP) + error(Ebadarg); + + validaddr((uintptr)name, 1, 0); + vmemchr(name, 0, ~0); + + for(sno = 0; sno < NSEG; sno++) + if(p->seg[sno] == nil && sno != ESEG) + break; + + if(sno == NSEG) + error(Enovmem); + + /* + * first look for a global segment with the + * same name + */ + if(_globalsegattach != nil){ + s = (*_globalsegattach)(p, name); + if(s != nil){ + p->seg[sno] = s; + return s->base; + } + } + + len = PGROUND(len); + if(len == 0) + error(Ebadarg); + + /* + * Find a hole in 
the address space. + * Starting at the lowest possible stack address - len, + * check for an overlapping segment, and repeat at the + * base of that segment - len until either a hole is found + * or the address space is exhausted. Ensure that we don't + * map the zero page. + */ + if(va == 0) { + for (os = p->seg[SSEG]; os != nil; os = isoverlap(p, va, len)) { + va = os->base; + if(len >= va) + error(Enovmem); + va -= len; + } + va &= ~(BY2PG-1); + } else { + va &= ~(BY2PG-1); + if(va == 0 || va >= USTKTOP) + error(Ebadarg); + } + + if(isoverlap(p, va, len) != nil) + error(Esoverlap); + + for(ps = physseg; ps->name; ps++) + if(strcmp(name, ps->name) == 0) + goto found; + + error(Ebadarg); +found: + if(len > ps->size) + error(Enovmem); + + attr &= ~SG_TYPE; /* Turn off what is not allowed */ + attr |= ps->attr; /* Copy in defaults */ + + s = newseg(attr, va, len/BY2PG); + s->pseg = ps; + p->seg[sno] = s; + + return va; +} + +void +pteflush(Pte *pte, int s, int e) +{ + int i; + Page *p; + + for(i = s; i < e; i++) { + p = pte->pages[i]; + if(pagedout(p) == 0) + memset(p->cachectl, PG_TXTFLUSH, sizeof(p->cachectl)); + } +} + +long +syssegflush(uintptr *arg) +{ + Segment *s; + ulong l, len, chunk; + Pte *pte; + uintptr addr, ps, pe; + + addr = arg[0]; + len = arg[1]; + + while(len > 0) { + s = seg(up, addr, 1); + if(s == 0) + error(Ebadarg); + + s->flushme = 1; + more: + l = len; + if(addr+l > s->top) + l = s->top - addr; + + ps = addr-s->base; + pte = s->map[ps/PTEMAPMEM]; + ps &= PTEMAPMEM-1; + pe = PTEMAPMEM; + if(pe-ps > l){ + pe = ps + l; + pe = (pe+BY2PG-1)&~(BY2PG-1); + } + if(pe == ps) { + qunlock(&s->lk); + error(Ebadarg); + } + + if(pte) + pteflush(pte, ps/BY2PG, pe/BY2PG); + + chunk = pe-ps; + len -= chunk; + addr += chunk; + + if(len > 0 && addr < s->top) + goto more; + + qunlock(&s->lk); + } + flushmmu(); + return 0; +} + +void +segclock(uintptr pc) +{ + Segment *s; + + s = up->seg[TSEG]; + if(s == 0 || s->profile == 0) + return; + + s->profile[0] += 
TK2MS(1); + if(pc >= s->base && pc < s->top) { + pc -= s->base; + s->profile[pc>>LRESPROF] += TK2MS(1); + } +} --- /dev/null +++ /sys/src/9/port64/swap.c @@ -0,0 +1,447 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" + +static int canflush(Proc*, Segment*); +static void executeio(void); +static int needpages(void*); +static void pageout(Proc*, Segment*); +static void pagepte(int, Page**); +static void pager(void*); + +Image swapimage; + +static int swopen; +static Page **iolist; +static int ioptr; + +static ulong genage, genclock, gencount; +static uvlong gensum; + +static void +gentick(void) +{ + genclock++; + if(gencount) + genage = gensum / gencount; + else + genage = 0; + gensum = gencount = 0; +} + +void +swapinit(void) +{ + swapalloc.swmap = xalloc(conf.nswap); + swapalloc.top = &swapalloc.swmap[conf.nswap]; + swapalloc.alloc = swapalloc.swmap; + swapalloc.last = swapalloc.swmap; + swapalloc.free = conf.nswap; + iolist = xalloc(conf.nswppo*sizeof(Page*)); + if(swapalloc.swmap == 0 || iolist == 0) + panic("swapinit: not enough memory"); + + swapimage.notext = 1; +} + +uintptr +newswap(void) +{ + uchar *look; + + lock(&swapalloc); + + if(swapalloc.free == 0){ + unlock(&swapalloc); + return ~0; + } + + look = memchr(swapalloc.last, 0, swapalloc.top-swapalloc.last); + if(look == 0) + panic("inconsistent swap"); + + *look = 1; + swapalloc.last = look; + swapalloc.free--; + unlock(&swapalloc); + return (look-swapalloc.swmap) * BY2PG; +} + +void +putswap(Page *p) +{ + uchar *idx; + + lock(&swapalloc); + idx = &swapalloc.swmap[((uintptr)p)/BY2PG]; + if(--(*idx) == 0) { + swapalloc.free++; + if(idx < swapalloc.last) + swapalloc.last = idx; + } + if(*idx >= 254) + panic("putswap %#p == %ud", p, *idx); + unlock(&swapalloc); +} + +void +dupswap(Page *p) +{ + lock(&swapalloc); + if(++swapalloc.swmap[((uintptr)p)/BY2PG] == 0) + panic("dupswap"); + unlock(&swapalloc); +} + +int +swapcount(uintptr 
daddr) +{ + return swapalloc.swmap[daddr/BY2PG]; +} + +void +kickpager(void) +{ + static int started; + + if(started) + wakeup(&swapalloc.r); + else { + kproc("pager", pager, 0); + started = 1; + } +} + +static void +pager(void *junk) +{ + int i; + Segment *s; + Proc *p, *ep; + + if(waserror()) + panic("pager: os error"); + + p = proctab(0); + ep = &p[conf.nproc]; + +loop: + up->psstate = "Idle"; + wakeup(&palloc.r); + sleep(&swapalloc.r, needpages, 0); + + while(needpages(junk)) { + if(swapimage.c) { + p++; + if(p >= ep){ + p = proctab(0); + gentick(); + } + + if(p->state == Dead || p->noswap) + continue; + + if(!canqlock(&p->seglock)) + continue; /* process changing its segments */ + + for(i = 0; i < NSEG; i++) { + if(!needpages(junk)){ + qunlock(&p->seglock); + goto loop; + } + + if(s = p->seg[i]) { + switch(s->type&SG_TYPE) { + default: + break; + case SG_TEXT: + pageout(p, s); + break; + case SG_DATA: + case SG_BSS: + case SG_STACK: + case SG_SHARED: + up->psstate = "Pageout"; + pageout(p, s); + if(ioptr != 0) { + up->psstate = "I/O"; + executeio(); + } + break; + } + } + } + qunlock(&p->seglock); + } else { + print("out of memory\n"); + killbig("out of memory"); + freebroken(); /* can use the memory */ + + /* Emulate the old system if no swap channel */ + if(!swapimage.c) + tsleep(&up->sleep, return0, 0, 5000); + } + } + goto loop; +} + +static void +pageout(Proc *p, Segment *s) +{ + int type, i, size; + ulong age; + Pte *l; + Page **pg, *entry; + + if(!canqlock(&s->lk)) /* We cannot afford to wait, we will surely deadlock */ + return; + + if(s->steal) { /* Protected by /dev/proc */ + qunlock(&s->lk); + return; + } + + if(!canflush(p, s)) { /* Able to invalidate all tlbs with references */ + qunlock(&s->lk); + putseg(s); + return; + } + + if(waserror()) { + qunlock(&s->lk); + putseg(s); + return; + } + + /* Pass through the pte tables looking for memory pages to swap out */ + type = s->type&SG_TYPE; + size = s->mapsize; + for(i = 0; i < size; i++) { + l = 
s->map[i]; + if(l == 0) + continue; + for(pg = l->first; pg < l->last; pg++) { + entry = *pg; + if(pagedout(entry)) + continue; + + if(entry->modref & PG_REF) { + entry->modref &= ~PG_REF; + entry->gen = genclock; + } + + if(genclock < entry->gen) + age = ~(entry->gen - genclock); + else + age = genclock - entry->gen; + gensum += age; + gencount++; + if(age <= genage) + continue; + + pagepte(type, pg); + + if(ioptr >= conf.nswppo) + goto out; + } + } +out: + poperror(); + qunlock(&s->lk); + putseg(s); +} + +static int +canflush(Proc *p, Segment *s) +{ + int i; + Proc *ep; + + lock(s); + if(s->ref == 1) { /* Easy if we are the only user */ + s->ref++; + unlock(s); + return canpage(p); + } + s->ref++; + unlock(s); + + /* Now we must do hardwork to ensure all processes which have tlb + * entries for this segment will be flushed if we succeed in paging it out + */ + p = proctab(0); + ep = &p[conf.nproc]; + while(p < ep) { + if(p->state != Dead) { + for(i = 0; i < NSEG; i++) + if(p->seg[i] == s) + if(!canpage(p)) + return 0; + } + p++; + } + return 1; +} + +static void +pagepte(int type, Page **pg) +{ + uintptr daddr; + Page *outp; + + outp = *pg; + switch(type) { + case SG_TEXT: /* Revert to demand load */ + putpage(outp); + *pg = 0; + break; + + case SG_DATA: + case SG_BSS: + case SG_STACK: + case SG_SHARED: + /* + * get a new swap address and clear any pages + * referring to it from the cache + */ + daddr = newswap(); + if(daddr == ~0) + break; + cachedel(&swapimage, daddr); + + lock(outp); + + /* forget anything that it used to cache */ + uncachepage(outp); + + /* + * incr the reference count to make sure it sticks around while + * being written + */ + outp->ref++; + + /* + * enter it into the cache so that a fault happening + * during the write will grab the page from the cache + * rather than one partially written to the disk + */ + outp->daddr = daddr; + cachepage(outp, &swapimage); + *pg = (Page*)(daddr|PG_ONSWAP); + unlock(outp); + + /* Add page to IO 
transaction list */ + iolist[ioptr++] = outp; + break; + } +} + +void +pagersummary(void) +{ + print("%lud/%lud memory %lud/%lud swap %d iolist\n", + palloc.user-palloc.freecount, + palloc.user, conf.nswap-swapalloc.free, conf.nswap, + ioptr); +} + +static int +pageiocomp(void *a, void *b) +{ + Page *p1, *p2; + + p1 = *(Page **)a; + p2 = *(Page **)b; + if(p1->daddr > p2->daddr) + return 1; + else + return -1; +} + +static void +executeio(void) +{ + Page *out; + int i, n; + Chan *c; + char *kaddr; + KMap *k; + + c = swapimage.c; + qsort(iolist, ioptr, sizeof iolist[0], pageiocomp); + for(i = 0; i < ioptr; i++) { + if(ioptr > conf.nswppo) + panic("executeio: ioptr %d > %d", ioptr, conf.nswppo); + out = iolist[i]; + k = kmap(out); + kaddr = (char*)VA(k); + + if(waserror()) + panic("executeio: page out I/O error"); + + n = devtab[c->type]->write(c, kaddr, BY2PG, out->daddr); + if(n != BY2PG) + nexterror(); + + kunmap(k); + poperror(); + + /* Free up the page after I/O */ + lock(out); + out->ref--; + unlock(out); + putpage(out); + } + ioptr = 0; +} + +static int +needpages(void*) +{ + return palloc.freecount < swapalloc.headroom; +} + +void +setswapchan(Chan *c) +{ + uchar dirbuf[sizeof(Dir)+100]; + Dir d; + int n; + + if(swapimage.c) { + if(swapalloc.free != conf.nswap){ + cclose(c); + error(Einuse); + } + cclose(swapimage.c); + } + + /* + * if this isn't a file, set the swap space + * to be at most the size of the partition + */ + if(devtab[c->type]->dc != L'M'){ + n = devtab[c->type]->stat(c, dirbuf, sizeof dirbuf); + if(n <= 0){ + cclose(c); + error("stat failed in setswapchan"); + } + convM2D(dirbuf, n, &d, nil); + if(d.length < conf.nswap*BY2PG){ + conf.nswap = d.length/BY2PG; + swapalloc.top = &swapalloc.swmap[conf.nswap]; + swapalloc.free = conf.nswap; + } + } + + swapimage.c = c; +} + +int +swapfull(void) +{ + return swapalloc.free < conf.nswap/10; +} --- /dev/null +++ /sys/src/9/port64/syscallfmt.c @@ -0,0 +1,413 @@ +/* + * Print functions for system call 
tracing. + */ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +#include "/sys/src/libc/9syscall/sys.h" + +// WE ARE OVERRUNNING SOMEHOW +static void +fmtrwdata(Fmt* f, char* a, int n, char* suffix) +{ + int i; + char *t; + + if(a == nil){ + fmtprint(f, "0x0%s", suffix); + return; + } + validaddr((uintptr)a, n, 0); + t = smalloc(n+1); + for(i = 0; i < n; i++) + if(a[i] > 0x20 && a[i] < 0x7f) /* printable ascii? */ + t[i] = a[i]; + else + t[i] = '.'; + + fmtprint(f, " %#p/\"%s\"%s", a, t, suffix); + free(t); +} + +static void +fmtuserstring(Fmt* f, char* a, char* suffix) +{ + int n; + char *t; + + if(a == nil){ + fmtprint(f, "0/\"\"%s", suffix); + return; + } + validaddr((uintptr)a, 1, 0); + n = ((char*)vmemchr(a, 0, 0x7fffffff) - a) + 1; + t = smalloc(n+1); + memmove(t, a, n); + t[n] = 0; + fmtprint(f, "%#p/\"%s\"%s", a, t, suffix); + free(t); +} + +void +syscallfmt(int syscallno, uintptr pc, va_list list) +{ + long l; + Fmt fmt; + void *v; + vlong vl; + uintptr p; + int i[2], len; + char *a, **argv; + + fmtstrinit(&fmt); + fmtprint(&fmt, "%uld %s ", up->pid, up->text); + + if(syscallno > nsyscall) + fmtprint(&fmt, " %d ", syscallno); + else + fmtprint(&fmt, "%s ", sysctab[syscallno]? 
+ sysctab[syscallno]: "huh?"); + + fmtprint(&fmt, "%#p ", pc); + if(up->syscalltrace != nil) + free(up->syscalltrace); + + switch(syscallno){ + case SYSR1: + p = va_arg(list, uintptr); + fmtprint(&fmt, "%#p", p); + break; + case _ERRSTR: /* deprecated */ + case CHDIR: + case EXITS: + case REMOVE: + a = va_arg(list, char*); + fmtuserstring(&fmt, a, ""); + break; + case BIND: + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + i[0] = va_arg(list, int); + fmtprint(&fmt, "%#ux", i[0]); + break; + case CLOSE: + case NOTED: + i[0] = va_arg(list, int); + fmtprint(&fmt, "%d", i[0]); + break; + case DUP: + i[0] = va_arg(list, int); + i[1] = va_arg(list, int); + fmtprint(&fmt, "%d %d", i[0], i[1]); + break; + case ALARM: + l = va_arg(list, unsigned long); + fmtprint(&fmt, "%#lud ", l); + break; + case EXEC: + a = va_arg(list, char*); + fmtuserstring(&fmt, a, ""); + argv = va_arg(list, char**); + validalign(PTR2UINT(argv), sizeof(char*)); + for(;;){ + validaddr((uintptr)argv, sizeof(char**), 0); + a = *(char **)argv; + if(a == nil) + break; + fmtprint(&fmt, " "); + fmtuserstring(&fmt, a, ""); + argv++; + } + break; + case _FSESSION: /* deprecated */ + case _FSTAT: /* deprecated */ + case _FWSTAT: /* obsolete */ + i[0] = va_arg(list, int); + a = va_arg(list, char*); + fmtprint(&fmt, "%d %#p", i[0], a); + break; + case FAUTH: + i[0] = va_arg(list, int); + a = va_arg(list, char*); + fmtprint(&fmt, "%d", i[0]); + fmtuserstring(&fmt, a, ""); + break; + case SEGBRK: + case RENDEZVOUS: + v = va_arg(list, void*); + fmtprint(&fmt, "%#p ", v); + v = va_arg(list, void*); + fmtprint(&fmt, "%#p", v); + break; + case _MOUNT: /* deprecated */ + i[0] = va_arg(list, int); + fmtprint(&fmt, "%d ", i[0]); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + i[0] = va_arg(list, int); + fmtprint(&fmt, "%#ux ", i[0]); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, ""); + break; + case OPEN: + a = va_arg(list, char*); + 
fmtuserstring(&fmt, a, " "); + i[0] = va_arg(list, int); + fmtprint(&fmt, "%#ux", i[0]); + break; + case OSEEK: /* deprecated */ + i[0] = va_arg(list, int); + l = va_arg(list, long); + i[1] = va_arg(list, int); + fmtprint(&fmt, "%d %ld %d", i[0], l, i[1]); + break; + case SLEEP: + l = va_arg(list, long); + fmtprint(&fmt, "%ld", l); + break; + case _STAT: /* obsolete */ + case _WSTAT: /* obsolete */ + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + a = va_arg(list, char*); + fmtprint(&fmt, "%#p", a); + break; + case RFORK: + i[0] = va_arg(list, int); + fmtprint(&fmt, "%#ux", i[0]); + break; + case PIPE: + case BRK_: + v = va_arg(list, int*); + fmtprint(&fmt, "%#p", v); + break; + case CREATE: + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + i[0] = va_arg(list, int); + i[1] = va_arg(list, int); + fmtprint(&fmt, "%#ux %#ux", i[0], i[1]); + break; + case FD2PATH: + case FSTAT: + case FWSTAT: + i[0] = va_arg(list, int); + a = va_arg(list, char*); + l = va_arg(list, unsigned long); + fmtprint(&fmt, "%d %#p %lud", i[0], a, l); + break; + case NOTIFY: + case SEGDETACH: + case _WAIT: /* deprecated */ + v = va_arg(list, void*); + fmtprint(&fmt, "%#p", v); + break; + case SEGATTACH: + i[0] = va_arg(list, int); + fmtprint(&fmt, "%d ", i[0]); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + /*FALLTHROUGH*/ + case SEGFREE: + case SEGFLUSH: + v = va_arg(list, void*); + l = va_arg(list, unsigned long); + fmtprint(&fmt, "%#p %lud", v, l); + break; + case UNMOUNT: + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, ""); + break; + case SEMACQUIRE: + case SEMRELEASE: + v = va_arg(list, int*); + i[0] = va_arg(list, int); + fmtprint(&fmt, "%#p %d", v, i[0]); + break; + case TSEMACQUIRE: + v = va_arg(list, long*); + l = va_arg(list, ulong); + fmtprint(&fmt, "%#p %ld", v, l); + break; + case SEEK: + v = va_arg(list, vlong*); + i[0] = va_arg(list, int); + vl = va_arg(list, vlong); + i[1] = 
va_arg(list, int); + fmtprint(&fmt, "%#p %d %#llux %d", v, i[0], vl, i[1]); + break; + case FVERSION: + i[0] = va_arg(list, int); + i[1] = va_arg(list, int); + fmtprint(&fmt, "%d %d ", i[0], i[1]); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + l = va_arg(list, unsigned long); + fmtprint(&fmt, "%lud", l); + break; + case WSTAT: + case STAT: + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + /*FALLTHROUGH*/ + case ERRSTR: + case AWAIT: + a = va_arg(list, char*); + l = va_arg(list, unsigned long); + fmtprint(&fmt, "%#p %lud", a, l); + break; + case MOUNT: + i[0] = va_arg(list, int); + i[1] = va_arg(list, int); + fmtprint(&fmt, "%d %d ", i[0], i[1]); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, " "); + i[0] = va_arg(list, int); + fmtprint(&fmt, "%#ux ", i[0]); + a = va_arg(list, char*); + fmtuserstring(&fmt, a, ""); + break; + case _READ: /* deprecated */ + case PREAD: + i[0] = va_arg(list, int); + v = va_arg(list, void*); + l = va_arg(list, long); + fmtprint(&fmt, "%d %#p %ld", i[0], v, l); + if(syscallno == PREAD){ + vl = va_arg(list, vlong); + fmtprint(&fmt, " %lld", vl); + } + break; + case _WRITE: /* deprecated */ + case PWRITE: + i[0] = va_arg(list, int); + v = va_arg(list, void*); + l = va_arg(list, long); + fmtprint(&fmt, "%d ", i[0]); + len = MIN(l, 64); + fmtrwdata(&fmt, v, len, " "); + fmtprint(&fmt, "%ld", l); + if(syscallno == PWRITE){ + vl = va_arg(list, vlong); + fmtprint(&fmt, " %lld", vl); + } + break; + case NSEC: + v = va_arg(list, vlong*); + fmtprint(&fmt, "%#p", v); + break; + } + + up->syscalltrace = fmtstrflush(&fmt); +} + +void +sysretfmt(int syscallno, va_list list, uintptr ret, uvlong start, uvlong stop) +{ + long l; + void* v; + Fmt fmt; + vlong vl; + int i, len; + char *a, *errstr; + + fmtstrinit(&fmt); + + if(up->syscalltrace) + free(up->syscalltrace); + + errstr = "\"\""; + switch(syscallno){ + default: + case ALARM: + case _WRITE: + case PWRITE: + if((long)ret == -1) + errstr = up->syserrstr; + fmtprint(&fmt, 
" = %ld", (long)ret); + break; + case EXEC: + case SEGBRK: + case SEGATTACH: + case RENDEZVOUS: + if((void *)ret == (void*)-1) + errstr = up->syserrstr; + fmtprint(&fmt, " = %#p", (void *)ret); + break; + case AWAIT: + a = va_arg(list, char*); + l = va_arg(list, unsigned long); + if((long)ret > 0){ + fmtuserstring(&fmt, a, " "); + fmtprint(&fmt, "%lud = %ld", l, (long)ret); + } + else{ + fmtprint(&fmt, "%#p/\"\" %lud = %ld", a, l, (long)ret); + errstr = up->syserrstr; + } + break; + case _ERRSTR: + case ERRSTR: + a = va_arg(list, char*); + if(syscallno == _ERRSTR) + l = 64; + else + l = va_arg(list, unsigned long); + if((long)ret > 0){ + fmtuserstring(&fmt, a, " "); + fmtprint(&fmt, "%lud = %ld", l, (long)ret); + } + else{ + fmtprint(&fmt, "\"\" %lud = %ld", l, (long)ret); + errstr = up->syserrstr; + } + break; + case FD2PATH: + i = va_arg(list, int); + USED(i); + a = va_arg(list, char*); + l = va_arg(list, unsigned long); + if((long)ret > 0){ + fmtuserstring(&fmt, a, " "); + fmtprint(&fmt, "%lud = %ld", l, (long)ret); + } + else{ + fmtprint(&fmt, "\"\" %lud = %ld", l, (long)ret); + errstr = up->syserrstr; + } + break; + case _READ: + case PREAD: + i = va_arg(list, int); + USED(i); + v = va_arg(list, void*); + l = va_arg(list, long); + if((long)ret > 0){ + len = MIN(ret, 64); + fmtrwdata(&fmt, v, len, ""); + } + else{ + fmtprint(&fmt, "/\"\""); + errstr = up->syserrstr; + } + fmtprint(&fmt, " %ld", l); + if(syscallno == PREAD){ + vl = va_arg(list, vlong); + fmtprint(&fmt, " %lld", vl); + } + fmtprint(&fmt, " = %ld", (long)ret); + break; + case NSEC: + fmtprint(&fmt, " = %ld", (long)ret); /* FoV */ + break; + } + fmtprint(&fmt, " %s %#llud %#llud\n", errstr, start, stop); + up->syscalltrace = fmtstrflush(&fmt); +} --- /dev/null +++ /sys/src/9/port64/sysfile.c @@ -0,0 +1,1355 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" + +/* + * The sys*() routines needn't poperror() as they return 
directly to syscall(). + */ + +static void +unlockfgrp(Fgrp *f) +{ + int ex; + + ex = f->exceed; + f->exceed = 0; + unlock(f); + if(ex) + pprint("warning: process exceeds %d file descriptors\n", ex); +} + +int +growfd(Fgrp *f, int fd) /* fd is always >= 0 */ +{ + Chan **newfd, **oldfd; + + if(fd < f->nfd) + return 0; + if(fd >= f->nfd+DELTAFD) + return -1; /* out of range */ + /* + * Unbounded allocation is unwise + */ + if(f->nfd >= 5000){ + Exhausted: + print("no free file descriptors\n"); + return -1; + } + newfd = malloc((f->nfd+DELTAFD)*sizeof(Chan*)); + if(newfd == 0) + goto Exhausted; + oldfd = f->fd; + memmove(newfd, oldfd, f->nfd*sizeof(Chan*)); + f->fd = newfd; + free(oldfd); + f->nfd += DELTAFD; + if(fd > f->maxfd){ + if(fd/100 > f->maxfd/100) + f->exceed = (fd/100)*100; + f->maxfd = fd; + } + return 1; +} + +/* + * this assumes that the fgrp is locked + */ +int +findfreefd(Fgrp *f, int start) +{ + int fd; + + for(fd=start; fdnfd; fd++) + if(f->fd[fd] == 0) + break; + if(fd >= f->nfd && growfd(f, fd) < 0) + return -1; + return fd; +} + +int +newfd(Chan *c) +{ + int fd; + Fgrp *f; + + f = up->fgrp; + lock(f); + fd = findfreefd(f, 0); + if(fd < 0){ + unlockfgrp(f); + return -1; + } + if(fd > f->maxfd) + f->maxfd = fd; + f->fd[fd] = c; + unlockfgrp(f); + return fd; +} + +int +newfd2(int fd[2], Chan *c[2]) +{ + Fgrp *f; + + f = up->fgrp; + lock(f); + fd[0] = findfreefd(f, 0); + if(fd[0] < 0){ + unlockfgrp(f); + return -1; + } + fd[1] = findfreefd(f, fd[0]+1); + if(fd[1] < 0){ + unlockfgrp(f); + return -1; + } + if(fd[1] > f->maxfd) + f->maxfd = fd[1]; + f->fd[fd[0]] = c[0]; + f->fd[fd[1]] = c[1]; + unlockfgrp(f); + + return 0; +} + +Chan* +fdtochan(int fd, int mode, int chkmnt, int iref) +{ + Chan *c; + Fgrp *f; + + c = 0; + f = up->fgrp; + + lock(f); + if(fd<0 || f->nfd<=fd || (c = f->fd[fd])==0) { + unlock(f); + error(Ebadfd); + } + if(iref) + incref(c); + unlock(f); + + if(chkmnt && (c->flag&CMSG)) { + if(iref) + cclose(c); + error(Ebadusefd); + } + + 
if(mode<0 || c->mode==ORDWR) + return c; + + if((mode&OTRUNC) && c->mode==OREAD) { + if(iref) + cclose(c); + error(Ebadusefd); + } + + if((mode&~OTRUNC) != c->mode) { + if(iref) + cclose(c); + error(Ebadusefd); + } + + return c; +} + +int +openmode(ulong o) +{ + o &= ~(OTRUNC|OCEXEC|ORCLOSE); + if(o > OEXEC) + error(Ebadarg); + if(o == OEXEC) + return OREAD; + return o; +} + +uintptr +sysfd2path(uintptr *arg) +{ + Chan *c; + + validaddr(arg[1], arg[2], 1); + + c = fdtochan(arg[0], -1, 0, 1); + snprint((char*)arg[1], arg[2], "%s", chanpath(c)); + cclose(c); + return 0; +} + +uintptr +syspipe(uintptr *arg) +{ + int fd[2]; + Chan *c[2]; + Dev *d; + static char *datastr[] = {"data", "data1"}; + + validaddr(arg[0], sizeof(fd), 1); + validalign(arg[0], sizeof(int)); + d = devtab[devno('|', 0)]; + c[0] = namec("#|", Atodir, 0, 0); + c[1] = 0; + fd[0] = -1; + fd[1] = -1; + + if(waserror()){ + cclose(c[0]); + if(c[1]) + cclose(c[1]); + nexterror(); + } + c[1] = cclone(c[0]); + if(walk(&c[0], datastr+0, 1, 1, nil) < 0) + error(Egreg); + if(walk(&c[1], datastr+1, 1, 1, nil) < 0) + error(Egreg); + c[0] = d->open(c[0], ORDWR); + c[1] = d->open(c[1], ORDWR); + if(newfd2(fd, c) < 0) + error(Enofd); + poperror(); + + ((int*)arg[0])[0] = fd[0]; + ((int*)arg[0])[1] = fd[1]; + return 0; +} + +uintptr +sysdup(uintptr *arg) +{ + int fd; + Chan *c, *oc; + Fgrp *f = up->fgrp; + + /* + * Close after dup'ing, so date > #d/1 works + */ + c = fdtochan(arg[0], -1, 0, 1); + fd = arg[1]; + if(fd != -1){ + lock(f); + if(fd<0 || growfd(f, fd)<0) { + unlockfgrp(f); + cclose(c); + error(Ebadfd); + } + if(fd > f->maxfd) + f->maxfd = fd; + + oc = f->fd[fd]; + f->fd[fd] = c; + unlockfgrp(f); + if(oc) + cclose(oc); + }else{ + if(waserror()) { + cclose(c); + nexterror(); + } + fd = newfd(c); + if(fd < 0) + error(Enofd); + poperror(); + } + + return fd; +} + +uintptr +sysopen(uintptr *arg) +{ + int fd; + Chan *c; + + openmode(arg[1]); /* error check only */ + validaddr(arg[0], 1, 0); + c = 
namec((char*)arg[0], Aopen, arg[1], 0); + if(waserror()){ + cclose(c); + nexterror(); + } + fd = newfd(c); + if(fd < 0) + error(Enofd); + poperror(); + return fd; +} + +void +fdclose(int fd, int flag) +{ + int i; + Chan *c; + Fgrp *f = up->fgrp; + + lock(f); + c = f->fd[fd]; + if(c == 0){ + /* can happen for users with shared fd tables */ + unlock(f); + return; + } + if(flag){ + if(c==0 || !(c->flag&flag)){ + unlock(f); + return; + } + } + f->fd[fd] = 0; + if(fd == f->maxfd) + for(i=fd; --i>=0 && f->fd[i]==0; ) + f->maxfd = i; + + unlock(f); + cclose(c); +} + +uintptr +sysclose(uintptr *arg) +{ + fdtochan(arg[0], -1, 0, 0); + fdclose(arg[0], 0); + + return 0; +} + +long +unionread(Chan *c, void *va, long n) +{ + int i; + long nr; + Mhead *m; + Mount *mount; + + qlock(&c->umqlock); + m = c->umh; + rlock(&m->lock); + mount = m->mount; + /* bring mount in sync with c->uri and c->umc */ + for(i = 0; mount != nil && i < c->uri; i++) + mount = mount->next; + + nr = 0; + while(mount != nil){ + /* Error causes component of union to be skipped */ + if(mount->to && !waserror()){ + if(c->umc == nil){ + c->umc = cclone(mount->to); + c->umc = devtab[c->umc->type]->open(c->umc, OREAD); + } + + nr = devtab[c->umc->type]->read(c->umc, va, n, c->umc->offset); + c->umc->offset += nr; + poperror(); + } + if(nr > 0) + break; + + /* Advance to next element */ + c->uri++; + if(c->umc){ + cclose(c->umc); + c->umc = nil; + } + mount = mount->next; + } + runlock(&m->lock); + qunlock(&c->umqlock); + return nr; +} + +static void +unionrewind(Chan *c) +{ + qlock(&c->umqlock); + c->uri = 0; + if(c->umc){ + cclose(c->umc); + c->umc = nil; + } + qunlock(&c->umqlock); +} + +static int +dirfixed(uchar *p, uchar *e, Dir *d) +{ + int len; + + len = GBIT16(p)+BIT16SZ; + if(p + len > e) + return -1; + + p += BIT16SZ; /* ignore size */ + d->type = devno(GBIT16(p), 1); + p += BIT16SZ; + d->dev = GBIT32(p); + p += BIT32SZ; + d->qid.type = GBIT8(p); + p += BIT8SZ; + d->qid.vers = GBIT32(p); + p += 
BIT32SZ; + d->qid.path = GBIT64(p); + p += BIT64SZ; + d->mode = GBIT32(p); + p += BIT32SZ; + d->atime = GBIT32(p); + p += BIT32SZ; + d->mtime = GBIT32(p); + p += BIT32SZ; + d->length = GBIT64(p); + + return len; +} + +static char* +dirname(uchar *p, int *n) +{ + p += BIT16SZ+BIT16SZ+BIT32SZ+BIT8SZ+BIT32SZ+BIT64SZ + + BIT32SZ+BIT32SZ+BIT32SZ+BIT64SZ; + *n = GBIT16(p); + return (char*)p+BIT16SZ; +} + +static long +dirsetname(char *name, int len, uchar *p, long n, long maxn) +{ + char *oname; + int olen; + long nn; + + if(n == BIT16SZ) + return BIT16SZ; + + oname = dirname(p, &olen); + + nn = n+len-olen; + PBIT16(p, nn-BIT16SZ); + if(nn > maxn) + return BIT16SZ; + + if(len != olen) + memmove(oname+len, oname+olen, p+n-(uchar*)(oname+olen)); + PBIT16((uchar*)(oname-2), len); + memmove(oname, name, len); + return nn; +} + +/* + * Mountfix might have caused the fixed results of the directory read + * to overflow the buffer. Catch the overflow in c->dirrock. + */ +static void +mountrock(Chan *c, uchar *p, uchar **pe) +{ + uchar *e, *r; + int len, n; + + e = *pe; + + /* find last directory entry */ + for(;;){ + len = BIT16SZ+GBIT16(p); + if(p+len >= e) + break; + p += len; + } + + /* save it away */ + qlock(&c->rockqlock); + if(c->nrock+len > c->mrock){ + n = ROUND(c->nrock+len, 1024); + r = smalloc(n); + memmove(r, c->dirrock, c->nrock); + free(c->dirrock); + c->dirrock = r; + c->mrock = n; + } + memmove(c->dirrock+c->nrock, p, len); + c->nrock += len; + qunlock(&c->rockqlock); + + /* drop it */ + *pe = p; +} + +/* + * Satisfy a directory read with the results saved in c->dirrock. 
+ */ +static int +mountrockread(Chan *c, uchar *op, long n, long *nn) +{ + long dirlen; + uchar *rp, *erp, *ep, *p; + + /* common case */ + if(c->nrock == 0) + return 0; + + /* copy out what we can */ + qlock(&c->rockqlock); + rp = c->dirrock; + erp = rp+c->nrock; + p = op; + ep = p+n; + while(rp+BIT16SZ <= erp){ + dirlen = BIT16SZ+GBIT16(rp); + if(p+dirlen > ep) + break; + memmove(p, rp, dirlen); + p += dirlen; + rp += dirlen; + } + + if(p == op){ + qunlock(&c->rockqlock); + return 0; + } + + /* shift the rest */ + if(rp != erp) + memmove(c->dirrock, rp, erp-rp); + c->nrock = erp - rp; + + *nn = p - op; + qunlock(&c->rockqlock); + return 1; +} + +static void +mountrewind(Chan *c) +{ + c->nrock = 0; +} + +/* + * Rewrite the results of a directory read to reflect current + * name space bindings and mounts. Specifically, replace + * directory entries for bind and mount points with the results + * of statting what is mounted there. Except leave the old names. + */ +static long +mountfix(Chan *c, uchar *op, long n, long maxn) +{ + char *name; + int nbuf, nname; + Chan *nc; + Mhead *mh; + Mount *m; + uchar *p; + int dirlen, rest; + long l; + uchar *buf, *e; + Dir d; + + p = op; + buf = nil; + nbuf = 0; + for(e=&p[n]; p+BIT16SZmount; m; m=m->next) + if(eqchantdqid(m->to, d.type, d.dev, d.qid, 1)) + goto Norewrite; + + name = dirname(p, &nname); + /* + * Do the stat but fix the name. If it fails, leave old entry. + * BUG: If it fails because there isn't room for the entry, + * what can we do? Nothing, really. Might as well skip it. + */ + if(buf == nil){ + buf = smalloc(4096); + nbuf = 4096; + } + if(waserror()) + goto Norewrite; + l = devtab[nc->type]->stat(nc, buf, nbuf); + l = dirsetname(name, nname, buf, l, nbuf); + if(l == BIT16SZ) + error("dirsetname"); + poperror(); + + /* + * Shift data in buffer to accomodate new entry, + * possibly overflowing into rock. 
+ */ + rest = e - (p+dirlen); + if(l > dirlen){ + while(p+l+rest > op+maxn){ + mountrock(c, p, &e); + if(e == p){ + dirlen = 0; + goto Norewrite; + } + rest = e - (p+dirlen); + } + } + if(l != dirlen){ + memmove(p+l, p+dirlen, rest); + dirlen = l; + e = p+dirlen+rest; + } + + /* + * Rewrite directory entry. + */ + memmove(p, buf, l); + + Norewrite: + cclose(nc); + putmhead(mh); + } + } + if(buf) + free(buf); + + if(p != e) + error("oops in rockfix"); + + return e-op; +} + +static long +read(uintptr *arg, vlong *offp) +{ + long n, nn, nnn; + uchar *p; + Chan *c; + vlong off; + + n = arg[2]; + validaddr(arg[1], n, 1); + p = (void*)arg[1]; + c = fdtochan(arg[0], OREAD, 1, 1); + + if(waserror()){ + cclose(c); + nexterror(); + } + + /* + * The offset is passed through on directories, normally. + * Sysseek complains, but pread is used by servers like exportfs, + * that shouldn't need to worry about this issue. + * + * Notice that c->devoffset is the offset that c's dev is seeing. + * The number of bytes read on this fd (c->offset) may be different + * due to rewritings in rockfix. 
+ */ + if(offp == nil) /* use and maintain channel's offset */ + off = c->offset; + else + off = *offp; + if(off < 0) + error(Enegoff); + + if(off == 0){ /* rewind to the beginning of the directory */ + if(offp == nil){ + c->offset = 0; + c->devoffset = 0; + } + mountrewind(c); + unionrewind(c); + } + + if(c->qid.type & QTDIR){ + if(mountrockread(c, p, n, &nn)){ + /* do nothing: mountrockread filled buffer */ + }else if(c->umh) + nn = unionread(c, p, n); + else{ + if(off != c->offset) + error(Edirseek); + nn = devtab[c->type]->read(c, p, n, c->devoffset); + } + nnn = mountfix(c, p, nn, n); + }else + nnn = nn = devtab[c->type]->read(c, p, n, off); + + lock(c); + c->devoffset += nn; + c->offset += nnn; + unlock(c); + + poperror(); + cclose(c); + + return nnn; +} + +uintptr +sys_read(uintptr *arg) +{ + return read(arg, nil); +} + +uintptr +syspread(uintptr *arg) +{ + vlong v; + va_list list; + + /* use varargs to guarantee alignment of vlong */ + va_start(list, arg[2]); + v = va_arg(list, vlong); + va_end(list); + + if(v == ~0ULL) + return read(arg, nil); + + return read(arg, &v); +} + +static long +write(uintptr *arg, vlong *offp) +{ + Chan *c; + long m, n; + vlong off; + + validaddr(arg[1], arg[2], 0); + n = 0; + c = fdtochan(arg[0], OWRITE, 1, 1); + if(waserror()) { + if(offp == nil){ + lock(c); + c->offset -= n; + unlock(c); + } + cclose(c); + nexterror(); + } + + if(c->qid.type & QTDIR) + error(Eisdir); + + n = arg[2]; + + if(offp == nil){ /* use and maintain channel's offset */ + lock(c); + off = c->offset; + c->offset += n; + unlock(c); + }else + off = *offp; + + if(off < 0) + error(Enegoff); + + m = devtab[c->type]->write(c, (void*)arg[1], n, off); + + if(offp == nil && m < n){ + lock(c); + c->offset -= n - m; + unlock(c); + } + + poperror(); + cclose(c); + + return m; +} + +uintptr +sys_write(uintptr *arg) +{ + return write(arg, nil); +} + +uintptr +syspwrite(uintptr *arg) +{ + vlong v; + va_list list; + + /* use varargs to guarantee alignment of vlong */ + 
va_start(list, arg[2]); + v = va_arg(list, vlong); + va_end(list); + + if(v == ~0ULL) + return write(arg, nil); + + return write(arg, &v); +} + +static void +sseek(uintptr *arg) +{ + Chan *c; + uchar buf[sizeof(Dir)+100]; + Dir dir; + int n, t; + vlong off; + union { + vlong v; + ulong u[2]; + } o; + + c = fdtochan(arg[1], -1, 1, 1); + if(waserror()){ + cclose(c); + nexterror(); + } + if(devtab[c->type]->dc == '|') + error(Eisstream); + + off = 0; + SET(t); + switch(sizeof arg[2]){ + case sizeof o.u[0]: + o.u[0] = arg[2]; + o.u[1] = arg[3]; + t = arg[4]; + break; + + case sizeof o.v: + o.v = arg[2]; + t = arg[3]; + break; + } + switch(t){ + case 0: + off = o.v; + if((c->qid.type & QTDIR) && off != 0) + error(Eisdir); + if(off < 0) + error(Enegoff); + c->offset = off; + break; + + case 1: + if(c->qid.type & QTDIR) + error(Eisdir); + lock(c); /* lock for read/write update */ + off = o.v + c->offset; + if(off < 0){ + unlock(c); + error(Enegoff); + } + c->offset = off; + unlock(c); + break; + + case 2: + if(c->qid.type & QTDIR) + error(Eisdir); + n = devtab[c->type]->stat(c, buf, sizeof buf); + if(convM2D(buf, n, &dir, nil) == 0) + error("internal error: stat error in seek"); + off = dir.length + o.v; + if(off < 0) + error(Enegoff); + c->offset = off; + break; + + default: + error(Ebadarg); + } + *(vlong*)arg[0] = off; + c->uri = 0; + c->dri = 0; + cclose(c); + poperror(); +} + +uintptr +sysseek(uintptr *arg) +{ + validaddr(arg[0], sizeof(vlong), 1); + validalign(arg[0], sizeof(vlong)); + sseek(arg); + return 0; +} + +uintptr +sysoseek(uintptr *arg) +{ + union { + vlong v; + ulong u[2]; + } o; + uintptr a[5]; + + o.v = (long)arg[1]; + a[0] = (uintptr)&o.v; + a[1] = arg[0]; + switch(sizeof arg[2]){ + case sizeof o.u[0]: + a[2] = o.u[0]; + a[3] = o.u[1]; + a[4] = arg[2]; + break; + + case sizeof o.v: + a[2] = o.v; + a[3] = arg[2]; + break; + } + sseek(a); + return o.v; +} + +void +validstat(uchar *s, int n) +{ + int m; + char buf[64]; + + if(statcheck(s, n) < 0) + 
error(Ebadstat); + /* verify that name entry is acceptable */ + s += STATFIXLEN - 4*BIT16SZ; /* location of first string */ + /* + * s now points at count for first string. + * if it's too long, let the server decide; this is + * only for his protection anyway. otherwise + * we'd have to allocate and waserror. + */ + m = GBIT16(s); + s += BIT16SZ; + if(m+1 > sizeof buf) + return; + memmove(buf, s, m); + buf[m] = '\0'; + /* name could be '/' */ + if(strcmp(buf, "/") != 0) + validname(buf, 0); +} + +static char* +pathlast(Path *p) +{ + char *s; + + if(p == nil) + return nil; + if(p->len == 0) + return nil; + s = strrchr(p->s, '/'); + if(s) + return s+1; + return p->s; +} + +uintptr +sysfstat(uintptr *arg) +{ + Chan *c; + uint l; + + l = arg[2]; + validaddr(arg[1], l, 1); + c = fdtochan(arg[0], -1, 0, 1); + if(waserror()) { + cclose(c); + nexterror(); + } + l = devtab[c->type]->stat(c, (uchar*)arg[1], l); + poperror(); + cclose(c); + return l; +} + +uintptr +sysstat(uintptr *arg) +{ + char *name; + Chan *c; + uint l; + + l = arg[2]; + validaddr(arg[1], l, 1); + validaddr(arg[0], 1, 0); + c = namec((char*)arg[0], Aaccess, 0, 0); + if(waserror()){ + cclose(c); + nexterror(); + } + l = devtab[c->type]->stat(c, (uchar*)arg[1], l); + name = pathlast(c->path); + if(name) + l = dirsetname(name, strlen(name), (uchar*)arg[1], l, arg[2]); + + poperror(); + cclose(c); + return l; +} + +uintptr +syschdir(uintptr *arg) +{ + Chan *c; + + validaddr(arg[0], 1, 0); + + c = namec((char*)arg[0], Atodir, 0, 0); + cclose(up->dot); + up->dot = c; + return 0; +} + +long +bindmount(int ismount, int fd, int afd, char* arg0, char* arg1, ulong flag, char* spec) +{ + int ret; + Chan *c0, *c1, *ac, *bc; + struct{ + Chan *chan; + Chan *authchan; + char *spec; + int flags; + }bogus; + + if((flag&~MMASK) || (flag&MORDER)==(MBEFORE|MAFTER)) + error(Ebadarg); + + if(ismount){ + validaddr((uintptr)spec, 1, 0); + spec = validnamedup(spec, 1); + if(waserror()){ + free(spec); + nexterror(); + } + + 
if(up->pgrp->noattach) + error(Enoattach); + + ac = nil; + bc = fdtochan(fd, ORDWR, 0, 1); + if(waserror()) { + if(ac) + cclose(ac); + cclose(bc); + nexterror(); + } + + if(afd >= 0) + ac = fdtochan(afd, ORDWR, 0, 1); + + bogus.flags = flag & MCACHE; + bogus.chan = bc; + bogus.authchan = ac; + bogus.spec = spec; + ret = devno('M', 0); + c0 = devtab[ret]->attach((char*)&bogus); + poperror(); /* ac bc */ + if(ac) + cclose(ac); + cclose(bc); + }else{ + spec = 0; + validaddr((uintptr)arg0, 1, 0); + c0 = namec(arg0, Abind, 0, 0); + } + + if(waserror()){ + cclose(c0); + nexterror(); + } + + validaddr((uintptr)arg1, 1, 0); + c1 = namec(arg1, Amount, 0, 0); + if(waserror()){ + cclose(c1); + nexterror(); + } + + ret = cmount(&c0, c1, flag, spec); + + poperror(); + cclose(c1); + poperror(); + cclose(c0); + if(ismount){ + fdclose(fd, 0); + poperror(); + free(spec); + } + return ret; +} + +uintptr +sysbind(uintptr *arg) +{ + return bindmount(0, -1, -1, (char*)arg[0], (char*)arg[1], arg[2], nil); +} + +uintptr +sysmount(uintptr *arg) +{ + return bindmount(1, arg[0], arg[1], nil, (char*)arg[2], arg[3], (char*)arg[4]); +} + +uintptr +sys_mount(uintptr *arg) +{ + return bindmount(1, arg[0], -1, nil, (char*)arg[1], arg[2], (char*)arg[3]); +} + +uintptr +sysunmount(uintptr *arg) +{ + Chan *cmount, *cmounted; + + cmounted = 0; + + validaddr(arg[1], 1, 0); + cmount = namec((char *)arg[1], Amount, 0, 0); + if(waserror()) { + cclose(cmount); + if(cmounted) + cclose(cmounted); + nexterror(); + } + + if(arg[0]) { + /* + * This has to be namec(..., Aopen, ...) because + * if arg[0] is something like /srv/cs or /fd/0, + * opening it is the only way to get at the real + * Chan underneath. 
+ */ + validaddr(arg[0], 1, 0); + cmounted = namec((char*)arg[0], Aopen, OREAD, 0); + } + cunmount(cmount, cmounted); + poperror(); + cclose(cmount); + if(cmounted) + cclose(cmounted); + return 0; +} + +uintptr +syscreate(uintptr *arg) +{ + int fd; + Chan *c; + + openmode(arg[1]&~OEXCL); /* error check only; OEXCL okay here */ + validaddr(arg[0], 1, 0); + c = namec((char*)arg[0], Acreate, arg[1], arg[2]); + if(waserror()) { + cclose(c); + nexterror(); + } + fd = newfd(c); + if(fd < 0) + error(Enofd); + poperror(); + return fd; +} + +uintptr +sysremove(uintptr *arg) +{ + Chan *c; + + validaddr(arg[0], 1, 0); + c = namec((char*)arg[0], Aremove, 0, 0); + /* + * Removing mount points is disallowed to avoid surprises + * (which should be removed: the mount point or the mounted Chan?). + */ + if(c->ismtpt){ + cclose(c); + error(Eismtpt); + } + if(waserror()){ + c->type = 0; /* see below */ + cclose(c); + nexterror(); + } + devtab[c->type]->remove(c); + /* + * Remove clunks the fid, but we need to recover the Chan + * so fake it up. rootclose() is known to be a nop. + */ + c->type = 0; + poperror(); + cclose(c); + return 0; +} + +static long +wstat(Chan *c, uchar *d, int nd) +{ + long l; + int namelen; + + if(waserror()){ + cclose(c); + nexterror(); + } + if(c->ismtpt){ + /* + * Renaming mount points is disallowed to avoid surprises + * (which should be renamed? the mount point or the mounted Chan?). 
+ */ + dirname(d, &namelen); + if(namelen) + nameerror(chanpath(c), Eismtpt); + } + l = devtab[c->type]->wstat(c, d, nd); + poperror(); + cclose(c); + return l; +} + +uintptr +syswstat(uintptr *arg) +{ + Chan *c; + uint l; + + l = arg[2]; + validaddr(arg[1], l, 0); + validstat((uchar*)arg[1], l); + validaddr(arg[0], 1, 0); + c = namec((char*)arg[0], Aaccess, 0, 0); + return wstat(c, (uchar*)arg[1], l); +} + +uintptr +sysfwstat(uintptr *arg) +{ + Chan *c; + uint l; + + l = arg[2]; + validaddr(arg[1], l, 0); + validstat((uchar*)arg[1], l); + c = fdtochan(arg[0], -1, 1, 1); + return wstat(c, (uchar*)arg[1], l); +} + +static void +packoldstat(uchar *buf, Dir *d) +{ + uchar *p; + ulong q; + + /* lay down old stat buffer - grotty code but it's temporary */ + p = buf; + strncpy((char*)p, d->name, 28); + p += 28; + strncpy((char*)p, d->uid, 28); + p += 28; + strncpy((char*)p, d->gid, 28); + p += 28; + q = d->qid.path & ~DMDIR; /* make sure doesn't accidentally look like directory */ + if(d->qid.type & QTDIR) /* this is the real test of a new directory */ + q |= DMDIR; + PBIT32(p, q); + p += BIT32SZ; + PBIT32(p, d->qid.vers); + p += BIT32SZ; + PBIT32(p, d->mode); + p += BIT32SZ; + PBIT32(p, d->atime); + p += BIT32SZ; + PBIT32(p, d->mtime); + p += BIT32SZ; + PBIT64(p, d->length); + p += BIT64SZ; + PBIT16(p, d->type); + p += BIT16SZ; + PBIT16(p, d->dev); +} + +uintptr +sys_stat(uintptr *arg) +{ + Chan *c; + uint l; + uchar buf[128]; /* old DIRLEN plus a little should be plenty */ + char strs[128], *name; + Dir d; + char old[] = "old stat system call - recompile"; + + validaddr(arg[1], 116, 1); + validaddr(arg[0], 1, 0); + c = namec((char*)arg[0], Aaccess, 0, 0); + if(waserror()){ + cclose(c); + nexterror(); + } + l = devtab[c->type]->stat(c, buf, sizeof buf); + /* buf contains a new stat buf; convert to old. yuck. 
*/ + if(l <= BIT16SZ) /* buffer too small; time to face reality */ + error(old); + name = pathlast(c->path); + if(name) + l = dirsetname(name, strlen(name), buf, l, sizeof buf); + l = convM2D(buf, l, &d, strs); + if(l == 0) + error(old); + packoldstat((uchar*)arg[1], &d); + + poperror(); + cclose(c); + return 0; +} + +uintptr +sys_fstat(uintptr *arg) +{ + Chan *c; + char *name; + uint l; + uchar buf[128]; /* old DIRLEN plus a little should be plenty */ + char strs[128]; + Dir d; + char old[] = "old fstat system call - recompile"; + + validaddr(arg[1], 116, 1); + c = fdtochan(arg[0], -1, 0, 1); + if(waserror()){ + cclose(c); + nexterror(); + } + l = devtab[c->type]->stat(c, buf, sizeof buf); + /* buf contains a new stat buf; convert to old. yuck. */ + if(l <= BIT16SZ) /* buffer too small; time to face reality */ + error(old); + name = pathlast(c->path); + if(name) + l = dirsetname(name, strlen(name), buf, l, sizeof buf); + l = convM2D(buf, l, &d, strs); + if(l == 0) + error(old); + packoldstat((uchar*)arg[1], &d); + + poperror(); + cclose(c); + return 0; +} + +uintptr +sys_wstat(uintptr *) +{ + error("old wstat system call - recompile"); + return -1; +} + +uintptr +sys_fwstat(uintptr *) +{ + error("old fwstat system call - recompile"); + return -1; +} --- /dev/null +++ /sys/src/9/port64/sysproc.c @@ -0,0 +1,1144 @@ +#include "u.h" +#include "tos.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" +#include "../port/edf.h" + +#include + +int shargs(char*, int, char**); + +extern void checkpages(void); +extern void checkpagerefs(void); + +uintptr +sysr1(uintptr*) +{ + checkpagerefs(); + return 0; +} + +uintptr +sysrfork(uintptr *arg) +{ + Proc *p; + int n, i; + Fgrp *ofg; + Pgrp *opg; + Rgrp *org; + Egrp *oeg; + ulong pid, flag; + Mach *wm; + + flag = arg[0]; + /* Check flags before we commit */ + if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + error(Ebadarg); + if((flag & (RFNAMEG|RFCNAMEG)) == 
(RFNAMEG|RFCNAMEG)) + error(Ebadarg); + if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG)) + error(Ebadarg); + + if((flag&RFPROC) == 0) { + if(flag & (RFMEM|RFNOWAIT)) + error(Ebadarg); + if(flag & (RFFDG|RFCFDG)) { + ofg = up->fgrp; + if(flag & RFFDG) + up->fgrp = dupfgrp(ofg); + else + up->fgrp = dupfgrp(nil); + closefgrp(ofg); + } + if(flag & (RFNAMEG|RFCNAMEG)) { + opg = up->pgrp; + up->pgrp = newpgrp(); + if(flag & RFNAMEG) + pgrpcpy(up->pgrp, opg); + /* inherit noattach */ + up->pgrp->noattach = opg->noattach; + closepgrp(opg); + } + if(flag & RFNOMNT) + up->pgrp->noattach = 1; + if(flag & RFREND) { + org = up->rgrp; + up->rgrp = newrgrp(); + closergrp(org); + } + if(flag & (RFENVG|RFCENVG)) { + oeg = up->egrp; + up->egrp = smalloc(sizeof(Egrp)); + up->egrp->ref = 1; + if(flag & RFENVG) + envcpy(up->egrp, oeg); + closeegrp(oeg); + } + if(flag & RFNOTEG) + up->noteid = incref(¬eidalloc); + return 0; + } + + p = newproc(); + + p->fpsave = up->fpsave; + p->scallnr = up->scallnr; + p->s = up->s; + p->nerrlab = 0; + p->slash = up->slash; + p->dot = up->dot; + incref(p->dot); + + memmove(p->note, up->note, sizeof(p->note)); + p->privatemem = up->privatemem; + p->noswap = up->noswap; + p->nnote = up->nnote; + p->notified = 0; + p->lastnote = up->lastnote; + p->notify = up->notify; + p->ureg = up->ureg; + p->dbgreg = 0; + + /* Make a new set of memory segments */ + n = flag & RFMEM; + qlock(&p->seglock); + if(waserror()){ + qunlock(&p->seglock); + nexterror(); + } + for(i = 0; i < NSEG; i++) + if(up->seg[i]) + p->seg[i] = dupseg(up->seg, i, n); + qunlock(&p->seglock); + poperror(); + + /* File descriptors */ + if(flag & (RFFDG|RFCFDG)) { + if(flag & RFFDG) + p->fgrp = dupfgrp(up->fgrp); + else + p->fgrp = dupfgrp(nil); + } + else { + p->fgrp = up->fgrp; + incref(p->fgrp); + } + + /* Process groups */ + if(flag & (RFNAMEG|RFCNAMEG)) { + p->pgrp = newpgrp(); + if(flag & RFNAMEG) + pgrpcpy(p->pgrp, up->pgrp); + /* inherit noattach */ + p->pgrp->noattach = 
up->pgrp->noattach; + } + else { + p->pgrp = up->pgrp; + incref(p->pgrp); + } + if(flag & RFNOMNT) + p->pgrp->noattach = 1; + + if(flag & RFREND) + p->rgrp = newrgrp(); + else { + incref(up->rgrp); + p->rgrp = up->rgrp; + } + + /* Environment group */ + if(flag & (RFENVG|RFCENVG)) { + p->egrp = smalloc(sizeof(Egrp)); + p->egrp->ref = 1; + if(flag & RFENVG) + envcpy(p->egrp, up->egrp); + } + else { + p->egrp = up->egrp; + incref(p->egrp); + } + p->hang = up->hang; + p->procmode = up->procmode; + + /* Craft a return frame which will cause the child to pop out of + * the scheduler in user mode with the return register zero + */ + forkchild(p, up->dbgreg); + + p->parent = up; + p->parentpid = up->pid; + if(flag&RFNOWAIT) + p->parentpid = 0; + else { + lock(&up->exl); + up->nchild++; + unlock(&up->exl); + } + if((flag&RFNOTEG) == 0) + p->noteid = up->noteid; + + /* don't penalize the child, it hasn't done FP in a note handler. */ + p->fpstate = up->fpstate & ~FPillegal; + pid = p->pid; + memset(p->time, 0, sizeof(p->time)); + p->time[TReal] = MACHP(0)->ticks; + + kstrdup(&p->text, up->text); + kstrdup(&p->user, up->user); + /* + * since the bss/data segments are now shareable, + * any mmu info about this process is now stale + * (i.e. has bad properties) and has to be discarded. 
+ */ + flushmmu(); + p->basepri = up->basepri; + p->priority = up->basepri; + p->fixedpri = up->fixedpri; + p->mp = up->mp; + wm = up->wired; + if(wm) + procwired(p, wm->machno); + ready(p); + sched(); + return pid; +} + +ulong +l2be(long l) +{ + uchar *cp; + + cp = (uchar*)&l; + return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3]; +} + +uintptr +sysexec(uintptr *arg) +{ + Segment *s, *ts; + int i; + Chan *tc; + char **argv, **argp; + char *a, *charp, *args, *file, *file0; + char *progarg[sizeof(Exec)/2+1], *elem, progelem[64]; + ulong ssize, spage, nargs, nbytes, n; + int indir; + Exec exec; + char line[sizeof(Exec)]; + Fgrp *f; + Image *img; + ulong magic; + uintptr t, d, b, entry, bssend, text, data, bss; + Tos *tos; + + indir = 0; + elem = nil; + validaddr(arg[0], 1, 0); + file0 = validnamedup((char*)arg[0], 1); + if(waserror()){ + free(file0); + free(elem); + nexterror(); + } + file = file0; + for(;;){ + tc = namec(file, Aopen, OEXEC, 0); + if(waserror()){ + cclose(tc); + nexterror(); + } + if(!indir) + kstrdup(&elem, up->genbuf); + + n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0); + if(n < 2) + error(Ebadexec); + magic = l2be(exec.magic); + text = l2be(exec.text); + entry = l2be(exec.entry); + if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){ + if(text >= USTKTOP-UTZERO + || entry < UTZERO+sizeof(Exec) + || entry >= UTZERO+sizeof(Exec)+text) + error(Ebadexec); + break; /* for binary */ + } + + /* + * Process #! /bin/sh args ... 
+ */ + memmove(line, &exec, sizeof(Exec)); + if(indir || line[0]!='#' || line[1]!='!') + error(Ebadexec); + n = shargs(line, n, progarg); + if(n == 0) + error(Ebadexec); + indir = 1; + /* + * First arg becomes complete file name + */ + progarg[n++] = file; + progarg[n] = 0; + validaddr(arg[1], BY2WD, 1); + arg[1] += BY2WD; + file = progarg[0]; + if(strlen(elem) >= sizeof progelem) + error(Ebadexec); + strcpy(progelem, elem); + progarg[0] = progelem; + poperror(); + cclose(tc); + } + + data = l2be(exec.data); + bss = l2be(exec.bss); + t = UTROUND(UTZERO+sizeof(Exec)+text); + d = (t + data + (BY2PG-1)) & ~(BY2PG-1); + bssend = t + data + bss; + b = (bssend + (BY2PG-1)) & ~(BY2PG-1); + if(t >= KZERO || d >= KZERO || b >= KZERO) + error(Ebadexec); + + /* + * Args: pass 1: count + */ + nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */ + nargs = 0; + if(indir){ + argp = progarg; + while(*argp){ + a = *argp++; + nbytes += strlen(a) + 1; + nargs++; + } + } + validalign(arg[1], sizeof(char**)); + argp = (char**)arg[1]; + validaddr((uintptr)argp, BY2WD, 0); + while(*argp){ + a = *argp++; + if(((uintptr)argp&(BY2PG-1)) < BY2WD) + validaddr((uintptr)argp, BY2WD, 0); + validaddr((uintptr)a, 1, 0); + nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1; + nargs++; + } + ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1)); + + /* + * 8-byte align SP for those (e.g. sparc) that need it. + * execregs() will subtract another 4 bytes for argc. 
+ */ + if(BY2WD == 4 && ((ssize+4) & 7)) + ssize += 4; + spage = (ssize+(BY2PG-1)) >> PGSHIFT; + + /* + * Build the stack segment, putting it in kernel virtual for the moment + */ + if(spage > TSTKSIZ) + error(Enovmem); + + qlock(&up->seglock); + if(waserror()){ + qunlock(&up->seglock); + nexterror(); + } + up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG); + + /* + * Args: pass 2: assemble; the pages will be faulted in + */ + tos = (Tos*)(TSTKTOP - sizeof(Tos)); + tos->cyclefreq = m->cyclefreq; + cycles((uvlong*)&tos->pcycles); + tos->pcycles = -tos->pcycles; + tos->kcycles = tos->pcycles; + tos->clock = 0; + argv = (char**)(TSTKTOP - ssize); + charp = (char*)(TSTKTOP - nbytes); + args = charp; + if(indir) + argp = progarg; + else + argp = (char**)arg[1]; + + for(i=0; itext); + up->text = elem; + elem = nil; /* so waserror() won't free elem */ + USED(elem); + + /* copy args; easiest from new process's stack */ + n = charp - args; + if(n > 128) /* don't waste too much space on huge arg lists */ + n = 128; + a = up->args; + up->args = nil; + free(a); + up->args = smalloc(n); + memmove(up->args, args, n); + if(n>0 && up->args[n-1]!='\0'){ + /* make sure last arg is NUL-terminated */ + /* put NUL at UTF-8 character boundary */ + for(i=n-1; i>0; --i) + if(fullrune(up->args+i, n-i)) + break; + up->args[i] = 0; + n = i+1; + } + up->nargs = n; + + /* + * Committed. + * Free old memory. + * Special segments are maintained across exec + */ + for(i = SSEG; i <= BSEG; i++) { + putseg(up->seg[i]); + /* prevent a second free if we have an error */ + up->seg[i] = 0; + } + for(i = BSEG+1; i < NSEG; i++) { + s = up->seg[i]; + if(s != 0 && (s->type&SG_CEXEC)) { + putseg(s); + up->seg[i] = 0; + } + } + + /* + * Close on exec + */ + f = up->fgrp; + for(i=0; i<=f->maxfd; i++) + fdclose(i, CCEXEC); + + /* Text. Shared. 
Attaches to cache image if possible */ + /* attachimage returns a locked cache image */ + img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT); + ts = img->s; + up->seg[TSEG] = ts; + ts->flushme = 1; + ts->fstart = 0; + ts->flen = sizeof(Exec)+text; + unlock(img); + + /* Data. Shared. */ + s = newseg(SG_DATA, t, (d-t)>>PGSHIFT); + up->seg[DSEG] = s; + + /* Attached by hand */ + incref(img); + s->image = img; + s->fstart = ts->fstart+ts->flen; + s->flen = data; + + /* BSS. Zero fill on demand */ + up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT); + + /* + * Move the stack + */ + s = up->seg[ESEG]; + up->seg[ESEG] = 0; + up->seg[SSEG] = s; + qunlock(&up->seglock); + poperror(); /* seglock */ + poperror(); /* elem */ + s->base = USTKTOP-USTKSIZE; + s->top = USTKTOP; + relocateseg(s, USTKTOP-TSTKTOP); + + /* + * '/' processes are higher priority (hack to make /ip more responsive). + */ + if(devtab[tc->type]->dc == L'/') + up->basepri = PriRoot; + up->priority = up->basepri; + poperror(); + cclose(tc); + + /* + * At this point, the mmu contains info about the old address + * space and needs to be flushed + */ + flushmmu(); + qlock(&up->debug); + up->nnote = 0; + up->notify = 0; + up->notified = 0; + up->privatemem = 0; + procsetup(up); + qunlock(&up->debug); + if(up->hang) + up->procctl = Proc_stopme; + + return execregs(entry, ssize, nargs); +} + +int +shargs(char *s, int n, char **ap) +{ + int i; + + s += 2; + n -= 2; /* skip #! 
*/ + for(i=0; s[i]!='\n'; i++) + if(i == n-1) + return 0; + s[i] = 0; + *ap = 0; + i = 0; + for(;;) { + while(*s==' ' || *s=='\t') + s++; + if(*s == 0) + break; + i++; + *ap++ = s; + *ap = 0; + while(*s && *s!=' ' && *s!='\t') + s++; + if(*s == 0) + break; + else + *s++ = 0; + } + return i; +} + +int +return0(void*) +{ + return 0; +} + +uintptr +syssleep(uintptr *arg) +{ + + int n; + + n = arg[0]; + if(n <= 0) { + if (up->edf && (up->edf->flags & Admitted)) + edfyield(); + else + yield(); + return 0; + } + if(n < TK2MS(1)) + n = TK2MS(1); + tsleep(&up->sleep, return0, 0, n); + return 0; +} + +uintptr +sysalarm(uintptr *arg) +{ + return procalarm(arg[0]); +} + +uintptr +sysexits(uintptr *arg) +{ + char *status; + char *inval = "invalid exit string"; + char buf[ERRMAX]; + + status = (char*)arg[0]; + if(status){ + if(waserror()) + status = inval; + else{ + validaddr((uintptr)status, 1, 0); + if(vmemchr(status, 0, ERRMAX) == 0){ + memmove(buf, status, ERRMAX); + buf[ERRMAX-1] = 0; + status = buf; + } + poperror(); + } + + } + pexit(status, 1); + return 0; /* not reached */ +} + +uintptr +sys_wait(uintptr *arg) +{ + int pid; + Waitmsg w; + OWaitmsg *ow; + + if(arg[0] == 0) + return pwait(nil); + + validaddr(arg[0], sizeof(OWaitmsg), 1); + validalign(arg[0], BY2WD); /* who cares? 
*/ + pid = pwait(&w); + if(pid >= 0){ + ow = (OWaitmsg*)arg[0]; + readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE); + readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE); + readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE); + readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE); + strncpy(ow->msg, w.msg, sizeof(ow->msg)); + ow->msg[sizeof(ow->msg)-1] = '\0'; + } + return pid; +} + +uintptr +sysawait(uintptr *arg) +{ + int i; + int pid; + Waitmsg w; + ulong n; + + n = arg[1]; + validaddr(arg[0], n, 1); + pid = pwait(&w); + if(pid < 0) + return -1; + i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q", + w.pid, + w.time[TUser], w.time[TSys], w.time[TReal], + w.msg); + + return i; +} + +void +werrstr(char *fmt, ...) +{ + va_list va; + + if(up == nil) + return; + + va_start(va, fmt); + vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va); + va_end(va); +} + +static long +generrstr(char *buf, uint nbuf) +{ + char tmp[ERRMAX]; + + if(nbuf == 0) + error(Ebadarg); + validaddr((uintptr)buf, nbuf, 1); + if(nbuf > sizeof tmp) + nbuf = sizeof tmp; + memmove(tmp, buf, nbuf); + + /* make sure it's NUL-terminated */ + tmp[nbuf-1] = '\0'; + memmove(buf, up->syserrstr, nbuf); + buf[nbuf-1] = '\0'; + memmove(up->syserrstr, tmp, nbuf); + return 0; +} + +uintptr +syserrstr(uintptr *arg) +{ + return generrstr((char*)arg[0], arg[1]); +} + +/* compatibility for old binaries */ +uintptr +sys_errstr(uintptr *arg) +{ + return generrstr((char*)arg[0], 64); +} + +uintptr +sysnotify(uintptr *arg) +{ + if(arg[0] != 0) + validaddr(arg[0], sizeof(uintptr), 0); + up->notify = (int(*)(void*, char*))(arg[0]); + return 0; +} + +uintptr +sysnoted(ulong *arg) +{ + if(arg[0]!=NRSTR && !up->notified) + error(Egreg); + return 0; +} + +uintptr +syssegbrk(uintptr *arg) +{ + int i; + uintptr addr; + Segment *s; + + addr = arg[0]; + for(i = 0; i < NSEG; i++) { + s = up->seg[i]; + if(s == 0 || addr < s->base || addr >= s->top) + continue; + switch(s->type&SG_TYPE) 
{ + case SG_TEXT: + case SG_DATA: + case SG_STACK: + error(Ebadarg); + default: + return ibrk(arg[1], i); + } + } + + error(Ebadarg); + return 0; /* not reached */ +} + +uintptr +syssegattach(uintptr *arg) +{ + return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]); +} + +uintptr +syssegdetach(uintptr *arg) +{ + int i; + uintptr addr; + Segment *s; + + qlock(&up->seglock); + if(waserror()){ + qunlock(&up->seglock); + nexterror(); + } + + s = 0; + addr = arg[0]; + for(i = 0; i < NSEG; i++) + if(s = up->seg[i]) { + qlock(&s->lk); + if((addr >= s->base && addr < s->top) || + (s->top == s->base && addr == s->base)) + goto found; + qunlock(&s->lk); + } + + error(Ebadarg); + +found: + /* + * Check we are not detaching the initial stack segment. + */ + if(s == up->seg[SSEG]){ + qunlock(&s->lk); + error(Ebadarg); + } + up->seg[i] = 0; + qunlock(&s->lk); + putseg(s); + qunlock(&up->seglock); + poperror(); + + /* Ensure we flush any entries from the lost segment */ + flushmmu(); + return 0; +} + +uintptr +syssegfree(uintptr *arg) +{ + Segment *s; + uintptr from, to; + + from = arg[0]; + s = seg(up, from, 1); + if(s == nil) + error(Ebadarg); + to = (from + arg[1]) & ~(BY2PG-1); + from = PGROUND(from); + + if(to > s->top) { + qunlock(&s->lk); + error(Ebadarg); + } + + mfreeseg(s, from, (to - from) / BY2PG); + qunlock(&s->lk); + flushmmu(); + + return 0; +} + +/* For binary compatibility */ +uintptr +sysbrk_(uintptr *arg) +{ + return ibrk(arg[0], BSEG); +} + +uintptr +sysrendezvous(uintptr *arg) +{ + uintptr tag, val; + Proc *p, **l; + + tag = arg[0]; + l = &REND(up->rgrp, tag); + up->rendval = ~(uintptr)0; + + lock(up->rgrp); + for(p = *l; p; p = p->rendhash) { + if(p->rendtag == tag) { + *l = p->rendhash; + val = p->rendval; + p->rendval = arg[1]; + + while(p->mach != 0) + ; + ready(p); + unlock(up->rgrp); + return val; + } + l = &p->rendhash; + } + + /* Going to sleep here */ + up->rendtag = tag; + up->rendval = arg[1]; + up->rendhash = *l; + *l = up; + up->state = 
Rendezvous; + unlock(up->rgrp); + + sched(); + + return up->rendval; +} + +/* + * The implementation of semaphores is complicated by needing + * to avoid rescheduling in syssemrelease, so that it is safe + * to call from real-time processes. This means syssemrelease + * cannot acquire any qlocks, only spin locks. + * + * Semacquire and semrelease must both manipulate the semaphore + * wait list. Lock-free linked lists only exist in theory, not + * in practice, so the wait list is protected by a spin lock. + * + * The semaphore value *addr is stored in user memory, so it + * cannot be read or written while holding spin locks. + * + * Thus, we can access the list only when holding the lock, and + * we can access the semaphore only when not holding the lock. + * This makes things interesting. Note that sleep's condition function + * is called while holding two locks - r and up->rlock - so it cannot + * access the semaphore value either. + * + * An acquirer announces its intention to try for the semaphore + * by putting a Sema structure onto the wait list and then + * setting Sema.waiting. After one last check of semaphore, + * the acquirer sleeps until Sema.waiting==0. A releaser of n + * must wake up n acquirers who have Sema.waiting set. It does + * this by clearing Sema.waiting and then calling wakeup. + * + * There are three interesting races here. + + * The first is that in this particular sleep/wakeup usage, a single + * wakeup can rouse a process from two consecutive sleeps! + * The ordering is: + * + * (a) set Sema.waiting = 1 + * (a) call sleep + * (b) set Sema.waiting = 0 + * (a) check Sema.waiting inside sleep, return w/o sleeping + * (a) try for semaphore, fail + * (a) set Sema.waiting = 1 + * (a) call sleep + * (b) call wakeup(a) + * (a) wake up again + * + * This is okay - semacquire will just go around the loop + * again. It does mean that at the top of the for(;;) loop in + * semacquire, phore.waiting might already be set to 1. 
+ * + * The second is that a releaser might wake an acquirer who is + * interrupted before he can acquire the lock. Since + * release(n) issues only n wakeup calls -- only n can be used + * anyway -- if the interrupted process is not going to use his + * wakeup call he must pass it on to another acquirer. + * + * The third race is similar to the second but more subtle. An + * acquirer sets waiting=1 and then does a final canacquire() + * before going to sleep. The opposite order would result in + * missing wakeups that happen between canacquire and + * waiting=1. (In fact, the whole point of Sema.waiting is to + * avoid missing wakeups between canacquire() and sleep().) But + * there can be spurious wakeups between a successful + * canacquire() and the following semdequeue(). This wakeup is + * not useful to the acquirer, since he has already acquired + * the semaphore. Like in the previous case, though, the + * acquirer must pass the wakeup call along. + * + * This is all rather subtle. The code below has been verified + * with the spin model /sys/src/9/port/semaphore.p. The + * original code anticipated the second race but not the first + * or third, which were caught only with spin. The first race + * is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it. + * It was lucky that my abstract model of sleep/wakeup still managed + * to preserve that behavior. + * + * I remain slightly concerned about memory coherence + * outside of locks. The spin model does not take + * queued processor writes into account so we have to + * think hard. The only variables accessed outside locks + * are the semaphore value itself and the boolean flag + * Sema.waiting. The value is only accessed with cmpswap, + * whose job description includes doing the right thing as + * far as memory coherence across processors. That leaves + * Sema.waiting. To handle it, we call coherence() before each + * read and after each write. 
- rsc + */ + +/* Add semaphore p with addr a to list in seg. */ +static void +semqueue(Segment *s, long *a, Sema *p) +{ + memset(p, 0, sizeof *p); + p->addr = a; + lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */ + p->next = &s->sema; + p->prev = s->sema.prev; + p->next->prev = p; + p->prev->next = p; + unlock(&s->sema); +} + +/* Remove semaphore p from list in seg. */ +static void +semdequeue(Segment *s, Sema *p) +{ + lock(&s->sema); + p->next->prev = p->prev; + p->prev->next = p->next; + unlock(&s->sema); +} + +/* Wake up n waiters with addr a on list in seg. */ +static void +semwakeup(Segment *s, long *a, long n) +{ + Sema *p; + + lock(&s->sema); + for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){ + if(p->addr == a && p->waiting){ + p->waiting = 0; + coherence(); + wakeup(p); + n--; + } + } + unlock(&s->sema); +} + +/* Add delta to semaphore and wake up waiters as appropriate. */ +static long +semrelease(Segment *s, long *addr, long delta) +{ + long value; + + do + value = *addr; + while(!cmpswap(addr, value, value+delta)); + semwakeup(s, addr, delta); + return value+delta; +} + +/* Try to acquire semaphore using compare-and-swap */ +static int +canacquire(long *addr) +{ + long value; + + while((value=*addr) > 0) + if(cmpswap(addr, value, value-1)) + return 1; + return 0; +} + +/* Should we wake up? */ +static int +semawoke(void *p) +{ + coherence(); + return !((Sema*)p)->waiting; +} + +/* Acquire semaphore (subtract 1). 
*/ +static int +semacquire(Segment *s, long *addr, int block) +{ + int acquired; + Sema phore; + + if(canacquire(addr)) + return 1; + if(!block) + return 0; + + acquired = 0; + semqueue(s, addr, &phore); + for(;;){ + phore.waiting = 1; + coherence(); + if(canacquire(addr)){ + acquired = 1; + break; + } + if(waserror()) + break; + sleep(&phore, semawoke, &phore); + poperror(); + } + semdequeue(s, &phore); + coherence(); /* not strictly necessary due to lock in semdequeue */ + if(!phore.waiting) + semwakeup(s, addr, 1); + if(!acquired) + nexterror(); + return 1; +} + +/* Acquire semaphore or time-out */ +static int +tsemacquire(Segment *s, long *addr, ulong ms) +{ + int acquired, timedout; + ulong t, elms; + Sema phore; + + if(canacquire(addr)) + return 1; + if(ms == 0) + return 0; + acquired = timedout = 0; + semqueue(s, addr, &phore); + for(;;){ + phore.waiting = 1; + coherence(); + if(canacquire(addr)){ + acquired = 1; + break; + } + if(waserror()) + break; + t = m->ticks; + tsleep(&phore, semawoke, &phore, ms); + elms = TK2MS(m->ticks - t); + poperror(); + if(elms >= ms){ + timedout = 1; + break; + } + ms -= elms; + } + semdequeue(s, &phore); + coherence(); /* not strictly necessary due to lock in semdequeue */ + if(!phore.waiting) + semwakeup(s, addr, 1); + if(timedout) + return 0; + if(!acquired) + nexterror(); + return 1; +} + +uintptr +syssemacquire(uintptr *arg) +{ + int block; + long *addr; + Segment *s; + + validaddr(arg[0], sizeof(long), 1); + validalign(arg[0], sizeof(long)); + addr = (long*)arg[0]; + block = arg[1]; + + if((s = seg(up, (uintptr)addr, 0)) == nil) + error(Ebadarg); + if(*addr < 0) + error(Ebadarg); + return semacquire(s, addr, block); +} + +uintptr +systsemacquire(uintptr *arg) +{ + long *addr; + ulong ms; + Segment *s; + + validaddr(arg[0], sizeof(long), 1); + validalign(arg[0], sizeof(long)); + addr = (long*)arg[0]; + ms = arg[1]; + + if((s = seg(up, (uintptr)addr, 0)) == nil) + error(Ebadarg); + if(*addr < 0) + error(Ebadarg); + return 
tsemacquire(s, addr, ms); +} + +uintptr +syssemrelease(uintptr *arg) +{ + long *addr, delta; + Segment *s; + + validaddr(arg[0], sizeof(long), 1); + validalign(arg[0], sizeof(long)); + addr = (long*)arg[0]; + delta = arg[1]; + + if((s = seg(up, (uintptr)addr, 0)) == nil) + error(Ebadarg); + /* delta == 0 is a no-op, not a release */ + if(delta < 0 || *addr < 0) + error(Ebadarg); + return semrelease(s, addr, delta); +} + +uintptr +sysnsec(uintptr *arg) +{ + validaddr(arg[0], sizeof(vlong), 1); + validalign(arg[0], sizeof(vlong)); + + *(vlong*)arg[0] = todget(nil); + + return 0; +} --- /dev/null +++ /sys/src/9/port64/taslock.c @@ -0,0 +1,255 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" +#include "../port/error.h" +#include "../port/edf.h" + +long maxlockcycles; +long maxilockcycles; +long cumlockcycles; +long cumilockcycles; +uintptr maxlockpc; +uintptr maxilockpc; + +struct +{ + ulong locks; + ulong glare; + ulong inglare; +} lockstats; + +static void +inccnt(Ref *r) +{ + _xinc(&r->ref); +} + +static int +deccnt(Ref *r) +{ + int x; + + x = _xdec(&r->ref); + if(x < 0) + panic("deccnt pc=%#p", getcallerpc(&r)); + return x; +} + +static void +dumplockmem(char *tag, Lock *l) +{ + uchar *cp; + int i; + + iprint("%s: ", tag); + cp = (uchar*)l; + for(i = 0; i < 64; i++) + iprint("%2.2ux ", cp[i]); + iprint("\n"); +} + +void +lockloop(Lock *l, uintptr pc) +{ + Proc *p; + + p = l->p; + print("lock %#p loop key %#lux pc %#p held by pc %#p proc %lud\n", + l, l->key, pc, l->pc, p ? 
p->pid : 0); + dumpaproc(up); + if(p != nil) + dumpaproc(p); +} + +int +lock(Lock *l) +{ + int i; + uintptr pc; + + pc = getcallerpc(&l); + + lockstats.locks++; + if(up) + inccnt(&up->nlocks); /* prevent being scheded */ + if(tas(&l->key) == 0){ + if(up) + up->lastlock = l; + l->pc = pc; + l->p = up; + l->isilock = 0; +#ifdef LOCKCYCLES + l->lockcycles = -lcycles(); +#endif + return 0; + } + if(up) + deccnt(&up->nlocks); + + lockstats.glare++; + for(;;){ + lockstats.inglare++; + i = 0; + while(l->key){ + if(conf.nmach < 2 && up && up->edf && (up->edf->flags & Admitted)){ + /* + * Priority inversion, yield on a uniprocessor; on a + * multiprocessor, the other processor will unlock + */ + print("inversion %#p pc %#p proc %lud held by pc %#p proc %lud\n", + l, pc, up ? up->pid : 0, l->pc, l->p ? l->p->pid : 0); + up->edf->d = todget(nil); /* yield to process with lock */ + } + if(i++ > 100000000){ + i = 0; + lockloop(l, pc); + } + } + if(up) + inccnt(&up->nlocks); + if(tas(&l->key) == 0){ + if(up) + up->lastlock = l; + l->pc = pc; + l->p = up; + l->isilock = 0; +#ifdef LOCKCYCLES + l->lockcycles = -lcycles(); +#endif + return 1; + } + if(up) + deccnt(&up->nlocks); + } +} + +void +ilock(Lock *l) +{ + ulong x; + uintptr pc; + + pc = getcallerpc(&l); + lockstats.locks++; + + x = splhi(); + if(tas(&l->key) != 0){ + lockstats.glare++; + /* + * Cannot also check l->pc, l->m, or l->isilock here + * because they might just not be set yet, or + * (for pc and m) the lock might have just been unlocked. 
+ */ + for(;;){ + lockstats.inglare++; + splx(x); + while(l->key) + ; + x = splhi(); + if(tas(&l->key) == 0) + goto acquire; + } + } +acquire: + m->ilockdepth++; + if(up) + up->lastilock = l; + l->sr = x; + l->pc = pc; + l->p = up; + l->isilock = 1; + l->m = MACHP(m->machno); +#ifdef LOCKCYCLES + l->lockcycles = -lcycles(); +#endif +} + +int +canlock(Lock *l) +{ + if(up) + inccnt(&up->nlocks); + if(tas(&l->key)){ + if(up) + deccnt(&up->nlocks); + return 0; + } + + if(up) + up->lastlock = l; + l->pc = getcallerpc(&l); + l->p = up; + l->m = MACHP(m->machno); + l->isilock = 0; +#ifdef LOCKCYCLES + l->lockcycles = -lcycles(); +#endif + return 1; +} + +void +unlock(Lock *l) +{ +#ifdef LOCKCYCLES + l->lockcycles += lcycles(); + cumlockcycles += l->lockcycles; + if(l->lockcycles > maxlockcycles){ + maxlockcycles = l->lockcycles; + maxlockpc = l->pc; + } +#endif + if(l->key == 0) + print("unlock: not locked: pc %#p\n", getcallerpc(&l)); + if(l->isilock) + print("unlock of ilock: pc %#p, held by %#p\n", getcallerpc(&l), l->pc); + if(l->p != up) + print("unlock: up changed: pc %#p, acquired at pc %#p, lock p %#p, unlock up %#p\n", getcallerpc(&l), l->pc, l->p, up); + l->m = nil; + l->key = 0; + coherence(); + + if(up && deccnt(&up->nlocks) == 0 && up->delaysched && islo()){ + /* + * Call sched if the need arose while locks were held + * But, don't do it from interrupt routines, hence the islo() test + */ + sched(); + } +} + +uintptr ilockpcs[0x100] = { [0xff] = 1 }; +static int n; + +void +iunlock(Lock *l) +{ + ulong sr; + +#ifdef LOCKCYCLES + l->lockcycles += lcycles(); + cumilockcycles += l->lockcycles; + if(l->lockcycles > maxilockcycles){ + maxilockcycles = l->lockcycles; + maxilockpc = l->pc; + } + if(l->lockcycles > 2400) + ilockpcs[n++ & 0xff] = l->pc; +#endif + if(l->key == 0) + print("iunlock: not locked: pc %#p\n", getcallerpc(&l)); + if(!l->isilock) + print("iunlock of lock: pc %#p, held by %#p\n", getcallerpc(&l), l->pc); + if(islo()) + print("iunlock while lo: 
pc %#p, held by %#p\n", getcallerpc(&l), l->pc); + + sr = l->sr; + l->m = nil; + l->key = 0; + coherence(); + m->ilockdepth--; + if(up) + up->lastilock = nil; + splx(sr); +} --- /dev/null +++ /sys/src/9/port64/xalloc.c @@ -0,0 +1,282 @@ +#include "u.h" +#include "../port/lib.h" +#include "mem.h" +#include "dat.h" +#include "fns.h" + +enum +{ + Nhole = 128, + Magichole = 0x484F4C45, /* HOLE */ +}; + +typedef struct Hole Hole; +typedef struct Xalloc Xalloc; +typedef struct Xhdr Xhdr; + +struct Hole +{ + uintptr addr; + uintptr size; + uintptr top; + Hole* link; +}; + +struct Xhdr +{ + ulong size; + ulong magix; + char data[]; +}; + +struct Xalloc +{ + Lock; + Hole hole[Nhole]; + Hole* flist; + Hole* table; +}; + +static Xalloc xlists; + +void +xinit(void) +{ + int i, n, upages, kpages; + ulong maxpages; + Confmem *m; + Pallocmem *pm; + Hole *h, *eh; + + eh = &xlists.hole[Nhole-1]; + for(h = xlists.hole; h < eh; h++) + h->link = h+1; + + xlists.flist = xlists.hole; + + upages = conf.upages; + kpages = conf.npage - upages; + pm = palloc.mem; + for(i=0; inpage; + if(n > kpages) + n = kpages; + /* don't try to use non-KADDR-able memory for kernel */ + maxpages = cankaddr(m->base)/BY2PG; + if(n > maxpages) + n = maxpages; + /* first give to kernel */ + if(n > 0){ + m->kbase = (uintptr)KADDR(m->base); + m->klimit = (uintptr)KADDR(m->base+n*BY2PG); + xhole(m->base, n*BY2PG); + kpages -= n; + } + /* if anything left over, give to user */ + if(n < m->npage){ + if(pm >= palloc.mem+nelem(palloc.mem)){ + print("xinit: losing %lud pages\n", m->npage-n); + continue; + } + pm->base = m->base+n*BY2PG; + pm->npage = m->npage - n; + pm++; + } + } +// xsummary(); /* call it from main if desired */ +} + +void* +xspanalloc(ulong size, int align, ulong span) +{ + uintptr a, v, t; + a = (uintptr)xalloc(size+align+span); + if(a == 0) + panic("xspanalloc: %lud %d %lux", size, align, span); + + if(span > 2) { + v = (a + span) & ~((uintptr)span-1); + t = v - a; + if(t > 0) + xhole(PADDR(a), 
t); + t = a + span - v; + if(t > 0) + xhole(PADDR(v+size+align), t); + } + else + v = a; + + if(align > 1) + v = (v + align) & ~((uintptr)align-1); + + return (void*)v; +} + +void* +xallocz(ulong size, int zero) +{ + Xhdr *p; + Hole *h, **l; + + /* add room for magix & size overhead, round up to nearest vlong */ + size += BY2V + offsetof(Xhdr, data[0]); + size &= ~(BY2V-1); + + ilock(&xlists); + l = &xlists.table; + for(h = *l; h; h = h->link) { + if(h->size >= size) { + p = (Xhdr*)KADDR(h->addr); + h->addr += size; + h->size -= size; + if(h->size == 0) { + *l = h->link; + h->link = xlists.flist; + xlists.flist = h; + } + iunlock(&xlists); + if(zero) + memset(p, 0, size); + p->magix = Magichole; + p->size = size; + return p->data; + } + l = &h->link; + } + iunlock(&xlists); + return nil; +} + +void* +xalloc(ulong size) +{ + return xallocz(size, 1); +} + +void +xfree(void *p) +{ + Xhdr *x; + + x = (Xhdr*)((uintptr)p - offsetof(Xhdr, data[0])); + if(x->magix != Magichole) { + xsummary(); + panic("xfree(%#p) %#ux != %#lux", p, Magichole, x->magix); + } + xhole(PADDR((uintptr)x), x->size); +} + +int +xmerge(void *vp, void *vq) +{ + Xhdr *p, *q; + + p = (Xhdr*)(((uintptr)vp - offsetof(Xhdr, data[0]))); + q = (Xhdr*)(((uintptr)vq - offsetof(Xhdr, data[0]))); + if(p->magix != Magichole || q->magix != Magichole) { + int i; + ulong *wd; + void *badp; + + xsummary(); + badp = (p->magix != Magichole? 
p: q); + wd = (ulong *)badp - 12; + for (i = 24; i-- > 0; ) { + print("%#p: %lux", wd, *wd); + if (wd == badp) + print(" <-"); + print("\n"); + wd++; + } + panic("xmerge(%#p, %#p) bad magic %#lux, %#lux", + vp, vq, p->magix, q->magix); + } + if((uchar*)p+p->size == (uchar*)q) { + p->size += q->size; + return 1; + } + return 0; +} + +void +xhole(uintptr addr, uintptr size) +{ + uintptr top; + Hole *h, *c, **l; + + if(size == 0) + return; + + top = addr + size; + ilock(&xlists); + l = &xlists.table; + for(h = *l; h; h = h->link) { + if(h->top == addr) { + h->size += size; + h->top = h->addr+h->size; + c = h->link; + if(c && h->top == c->addr) { + h->top += c->size; + h->size += c->size; + h->link = c->link; + c->link = xlists.flist; + xlists.flist = c; + } + iunlock(&xlists); + return; + } + if(h->addr > addr) + break; + l = &h->link; + } + if(h && top == h->addr) { + h->addr -= size; + h->size += size; + iunlock(&xlists); + return; + } + + if(xlists.flist == nil) { + iunlock(&xlists); + print("xfree: no free holes, leaked %llud bytes\n", (uvlong)size); + return; + } + + h = xlists.flist; + xlists.flist = h->link; + h->addr = addr; + h->top = top; + h->size = size; + h->link = *l; + *l = h; + iunlock(&xlists); +} + +void +xsummary(void) +{ + int i; + Hole *h; + uintptr s; + + i = 0; + for(h = xlists.flist; h; h = h->link) + i++; + + print("%d holes free", i); + s = 0; + for(h = xlists.table; h; h = h->link) { + if (1) { + print("addr %#p top %#p size %llud\n", + h->addr, h->top, (uvlong)h->size); + delay(10); + } + s += h->size; + if (h == h->link) { + print("xsummary: infinite loop broken\n"); + break; + } + } + print(" %lld bytes free\n", (vlong)s); +}