src/afs/HPUX/osi_vnodeops.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 /* This is a placeholder for routines unique to the port of AFS to hp-ux*/
  11
  12 #include "../afs/param.h"
  13 #include "../afs/sysincludes.h" /* Standard vendor system headers */
  14 #include "../afs/afsincludes.h" /* Afs-based standard headers */
  15 #include "../afs/afs_stats.h" /* statistics stuff */
  16
  17 #include <sys/uio.h>
  18 #include <sys/vfs.h>
  19 #include <sys/mount.h>
  20 #include <sys/vnode.h>
  21 #include <sys/pathname.h>
  22
  23 extern struct vfsops Afs_vfsops;
  24 extern int afs_hp_strategy();
  25 extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
  26 extern int afs_pagein();
  27 extern int afs_pageout();
  28 extern int afs_ioctl();
  29 extern int afs_prealloc();
  30 extern int afs_mapdbd();
  31 extern int afs_mmap();
  32 extern int afs_cachelimit();
  33 extern int afs_vm_checkpage();
  34 extern int afs_vm_fscontiguous();
  35 extern int afs_vm_stopio();
  36 extern int afs_read_ahead();
  37 extern int afs_unmap();
  38 extern int afs_release();
  39 extern int afs_swapfs_len();
  40 extern int afs_readdir2();
  41 extern int afs_readdir();
  42 extern int afs_readdir3();
  43 extern int afs_pathconf();
  44 extern int afs_close();
  45
  46 #define vtoblksz(vp)    ((vp)->v_vfsp->vfs_bsize)
  47
  48
  49 /*
  50  * Copy an mbuf to the contiguous area pointed to by cp.
  51  * Skip <off> bytes and copy <len> bytes.
  52  * Returns the number of bytes not transferred.
  53  * The mbuf is NOT changed.
  54  */
  55 int
  56 m_cpytoc(m, off, len, cp)
  57         register struct mbuf *m;
  58         register int off, len;
  59         register caddr_t cp;
  60 {
  61         register int ml;
  62
  63         if (m == NULL || off < 0 || len < 0 || cp == NULL)
  64                 osi_Panic("m_cpytoc");
  65         while (off && m)
  66                 if (m->m_len <= off) {
  67                         off -= m->m_len;
  68                         m = m->m_next;
  69                         continue;
  70                 } else
  71                         break;
  72         if (m == NULL)
  73                 return (len);
  74
  75         ml = MIN(len, m->m_len - off);
  76         bcopy(mtod(m, caddr_t)+off, cp, (u_int)ml);
  77         cp += ml;
  78         len -= ml;
  79         m = m->m_next;
  80
  81         while (len && m) {
  82                 ml = m->m_len;
  83                 bcopy(mtod(m, caddr_t), cp, (u_int)ml);
  84                 cp += ml;
  85                 len -= ml;
  86                 m = m->m_next;
  87         }
  88
  89         return (len);
  90 }
  91
  92 /*
  93  *  Note that the standard Sun vnode interface doesn't haven't an vop_lockf(), so this code is
  94  * totally new.  This came about because HP-UX has lockf() implemented as
  95  * a system call while Sun has it implemented as a library (apparently).
  96  * To handle this, we have to translate the lockf() request into an
  97  * fcntl() looking request, and then translate the results back if necessary.
  98  * we call afs_lockctl() directly .
  99  */
 100 afs_lockf( vp, flag, len, cred, fp, LB, UB )
 101     struct vnode *vp;
 102     int flag;
 103     struct AFS_UCRED *cred;
 104     struct file *fp;
 105     k_off_t len, LB, UB;
 106 {
 107     /*for now, just pretend it works*/
 108     struct k_flock flock;
 109     int cmd, code;
 110
 111     /*
 112      * Create a flock structure and translate the lockf request
 113      * into an appropriate looking fcntl() type request for afs_lockctl()
 114      */
 115     flock.l_whence = 0;
 116     flock.l_len = len;
 117     flock.l_start = fp->f_offset;
 118     /* convert negative lengths to positive */
 119     if (flock.l_len < 0 ) {
 120         flock.l_start += flock.l_len;
 121         flock.l_len = -(flock.l_len);
 122     }
 123     /*
 124      * Adjust values to look like fcntl() requests.
 125      * All locks are write locks, only F_LOCK requests
 126      * are blocking.  F_TEST has to be translated into
 127      * a get lock and then back again.
 128      */
 129     flock.l_type = F_WRLCK;
 130     cmd = F_SETLK;
 131     switch (flag) {
 132       case F_ULOCK:
 133         flock.l_type = F_UNLCK;
 134         break;
 135       case F_LOCK:
 136         cmd = F_SETLKW;
 137         break;
 138       case F_TEST:
 139         cmd = F_GETLK;
 140         break;
 141     }
 142     u.u_error = mp_afs_lockctl(vp,  &flock, cmd, fp->f_cred);
 143     if (u.u_error) {
 144         return(u.u_error);              /* some other error code */
 145     }
 146     /*
 147      * if request is F_TEST, and GETLK changed
 148      * the lock type to ULOCK, then return 0, else
 149      * set errno to EACCESS and return.
 150      */
 151     if (flag == F_TEST && flock.l_type != F_UNLCK) {
 152         u.u_error = EACCES;
 153         return (u.u_error);
 154     }
 155     return (0);
 156 }
 157
 158
 159 #include "../machine/vmparam.h" /* For KERNELSPACE */
 160 #include "../h/debug.h"
 161 #include "../h/types.h"
 162 #include "../h/param.h"
 163 #include "../h/vmmac.h"
 164 #include "../h/time.h"
 165 #include "../ufs/inode.h"
 166 #include "../ufs/fs.h"
 167 #include "../h/dbd.h"
 168 #include "../h/vfd.h"
 169 #include "../h/region.h"
 170 #include "../h/pregion.h"
 171 #include "../h/vmmeter.h"
 172 #include "../h/user.h"
 173 #include "../h/sysinfo.h"
 174 #include "../h/pfdat.h"
 175 #include "../h/tuneable.h"
 176 #include "../h/buf.h"
 177 #include "../netinet/in.h"
 178 #include "../rpc/types.h"
 179 #include "../rpc/auth.h"
 180 #include "../rpc/clnt.h"
 181 #include "../rpc/xdr.h"
 182
 183 /* a freelist of one */
 184 struct buf *afs_bread_freebp = 0;
 185
 186 /*
 187  *  Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
 188  *  Thus we can use fake bufs (ie not from the real buffer pool).
 189  */
 190 afs_bread(vp, lbn, bpp)
 191         struct vnode *vp;
 192         daddr_t lbn;
 193         struct buf **bpp;
 194 {
 195         int offset, fsbsize, error;
 196         struct buf *bp;
 197         struct iovec iov;
 198         struct uio uio;
 199
 200         AFS_STATCNT(afs_bread);
 201         fsbsize = vp->v_vfsp->vfs_bsize;
 202         offset = lbn * fsbsize;
 203         if (afs_bread_freebp) {
 204                 bp = afs_bread_freebp;
 205                 afs_bread_freebp = 0;
 206         } else {
 207                 bp = (struct buf *) AFS_KALLOC(sizeof(*bp));
 208                 bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
 209         }
 210
 211         iov.iov_base = bp->b_un.b_addr;
 212         iov.iov_len = fsbsize;
 213         uio.afsio_iov = &iov;
 214         uio.afsio_iovcnt = 1;
 215         uio.afsio_seg = AFS_UIOSYS;
 216         uio.afsio_offset = offset;
 217         uio.afsio_resid = fsbsize;
 218         uio.uio_fpflags = 0;
 219         *bpp = 0;
 220
 221         error = afs_read((struct vcache *)vp, &uio, p_cred(u.u_procp),
 222                          lbn, bpp, 0);
 223         if (error) {
 224                 afs_bread_freebp = bp;
 225                 return error;
 226         }
 227         if (*bpp) {
 228                 afs_bread_freebp = bp;
 229         } else {
 230                 *(struct buf **)&bp->b_vp = bp; /* mark as fake */
 231                 *bpp = bp;
 232         }
 233         return 0;
 234 }
 235
 236 afs_brelse(vp, bp)
 237 struct vnode *vp;
 238 struct buf *bp;
 239 {
 240     AFS_STATCNT(afs_brelse);
 241
 242     if ((struct buf *)bp->b_vp != bp) { /* not fake */
 243         ufs_brelse(bp->b_vp, bp);
 244     } else if (afs_bread_freebp) {
 245         AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
 246         AFS_KFREE(bp, sizeof(*bp));
 247     } else {
 248         afs_bread_freebp = bp;
 249     }
 250 }
 251
 252
 253 afs_bmap(avc, abn, anvp, anbn)
 254     register struct vcache *avc;
 255     afs_int32 abn, *anbn;
 256     struct vcache **anvp; {
 257     AFS_STATCNT(afs_bmap);
 258     if (anvp)
 259         *anvp = avc;
 260     if (anbn)
 261         *anbn = abn * (8192 / DEV_BSIZE);   /* in 512 byte units */
 262     return 0;
 263 }
 264
 265 afs_inactive(avc, acred)
 266     register struct vcache *avc;
 267     struct AFS_UCRED *acred;
 268 {
 269     struct vnode *vp = (struct vnode *)avc;
 270     ulong_t context;
 271     lock_t *sv_lock;
 272     if (afs_shuttingdown) return ;
 273
 274     /*
 275      * In Solaris and HPUX s800 and HP-UX10.0 they actually call us with
 276      * v_count 1 on last reference!
 277      */
 278     MP_H_SPINLOCK_USAV(vn_h_sl_pool,vp,&sv_lock,&context);
 279     if (avc->vrefCount < 1) osi_Panic("afs_inactive : v_count < 1\n");
 280
 281     /*
 282      * If more than 1 don't unmap the vnode but do decrement the ref count
 283      */
 284     vp->v_count--;
 285     if (vp->v_count > 0) {
 286         MP_SPINUNLOCK_USAV(sv_lock,context);
 287         return 0;
 288     }
 289     MP_SPINUNLOCK_USAV(sv_lock,context);
 290     afs_InactiveVCache(avc, acred);
 291     return 0;
 292 }
 293
 294
 295 int
 296 mp_afs_open(register struct vnode **avcp, int aflags, struct AFS_UCRED *acred)
 297 {
 298     register int code;
 299
 300     AFS_GLOCK();
 301     code = afs_open(avcp, aflags, acred);
 302     AFS_GUNLOCK();
 303     return (code);
 304 }
 305
 306 int
 307 mp_afs_close(register struct vnode *avcp, int aflags, struct AFS_UCRED *acred)
 308 {
 309     register int code;
 310
 311     AFS_GLOCK();
 312     code = afs_close(avcp, aflags, acred);
 313     AFS_GUNLOCK();
 314     return (code);
 315 }
 316
 317 int
 318 mp_afs_rdwr(register struct vnode *avcp, struct uio *uio, enum uio_rw arw, int aio, struct AFS_UCRED *acred)
 319 {
 320     register int code;
 321     long save_resid;
 322
 323     AFS_GLOCK();
 324     save_resid = uio->uio_resid;
 325     code = afs_rdwr(avcp, uio, arw, aio, acred);
 326     if (arw == UIO_WRITE && code == ENOSPC) {
 327         /* HP clears code if any data written. */
 328         uio->uio_resid = save_resid;
 329     }
 330     AFS_GUNLOCK();
 331     return (code);
 332 }
 333
 334 int
 335 mp_afs_getattr(register struct vnode *avcp, struct vattr *attrs, struct AFS_UCRED *acred, enum vsync unused1)
 336 {
 337     register int code;
 338
 339     AFS_GLOCK();
 340     code = afs_getattr(avcp, attrs, acred);
 341     AFS_GUNLOCK();
 342     return (code);
 343 }
 344
 345 int
 346 mp_afs_setattr(register struct vnode *avcp, register struct vattr *attrs, struct AFS_UCRED *acred, int unused1)
 347 {
 348     register int code;
 349
 350     AFS_GLOCK();
 351     code = afs_setattr(avcp, attrs, acred);
 352     AFS_GUNLOCK();
 353     return (code);
 354 }
 355
 356 int
 357 mp_afs_access(register struct vnode *avcp, int mode, struct AFS_UCRED *acred)
 358 {
 359     register int code;
 360
 361     AFS_GLOCK();
 362     code = afs_access(avcp, mode, acred);
 363     AFS_GUNLOCK();
 364     return (code);
 365 }
 366
 367 int
 368 mp_afs_lookup(register struct vnode *adp, char *aname, register struct vnode **avcp, struct AFS_UCRED *acred, struct vnode *unused1)
 369 {
 370     register int code;
 371
 372     AFS_GLOCK();
 373     code = afs_lookup(adp, aname, avcp, acred);
 374     AFS_GUNLOCK();
 375     return (code);
 376 }
 377
 378 int
 379 mp_afs_create(register struct vnode *adp, char *aname, struct vattr *attrs, enum vcexcl aexcl, int amode, struct vnode **avcp, struct AFS_UCRED *acred)
 380 {
 381     register int code;
 382
 383     AFS_GLOCK();
 384     code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);
 385     AFS_GUNLOCK();
 386     return (code);
 387 }
 388
 389
 390 int
 391 mp_afs_remove(register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
 392 {
 393     register int code;
 394
 395     AFS_GLOCK();
 396     code = afs_remove(adp, aname, acred);
 397     AFS_GUNLOCK();
 398     return (code);
 399 }
 400
 401 int
 402 mp_afs_link(register struct vnode *avc, register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
 403 {
 404     register int code;
 405
 406     AFS_GLOCK();
 407     code = afs_link(avc, adp, aname, acred);
 408     AFS_GUNLOCK();
 409     return (code);
 410 }
 411
 412 int
 413 mp_afs_rename(register struct vnode *aodp, char *aname1, register struct vnode *andp, char *aname2, struct AFS_UCRED *acred)
 414 {
 415     register int code;
 416
 417     AFS_GLOCK();
 418     code = afs_rename(aodp, aname1, andp, aname2, acred);
 419     AFS_GUNLOCK();
 420     return (code);
 421 }
 422
 423 int
 424 mp_afs_mkdir(register struct vnode *adp, char *aname, struct vattr *attrs, register struct vnode **avcp, struct AFS_UCRED *acred)
 425 {
 426     register int code;
 427
 428     AFS_GLOCK();
 429     code = afs_mkdir(adp, aname, attrs, avcp, acred);
 430     AFS_GUNLOCK();
 431     return (code);
 432 }
 433
 434
 435 int
 436 mp_afs_rmdir(register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
 437 {
 438     register int code;
 439
 440     AFS_GLOCK();
 441     code = afs_rmdir(adp, aname, acred);
 442     AFS_GUNLOCK();
 443     return (code);
 444 }
 445
 446
 447 int
 448 mp_afs_readdir(register struct vnode *avc, struct uio *auio, struct AFS_UCRED *acred)
 449 {
 450     register int code;
 451
 452     AFS_GLOCK();
 453     code = afs_readdir(avc, auio, acred);
 454     AFS_GUNLOCK();
 455     return (code);
 456 }
 457
 458 int
 459 mp_afs_symlink(register struct vnode *adp, char *aname, struct vattr *attrs, char *atargetName, struct AFS_UCRED *acred)
 460 {
 461     register int code;
 462
 463     AFS_GLOCK();
 464     code = afs_symlink(adp, aname, attrs, atargetName, acred);
 465     AFS_GUNLOCK();
 466     return (code);
 467 }
 468
 469
 470 int
 471 mp_afs_readlink(register struct vnode *avc, struct uio *auio, struct AFS_UCRED *acred)
 472 {
 473     register int code;
 474
 475     AFS_GLOCK();
 476     code = afs_readlink(avc, auio, acred);
 477     AFS_GUNLOCK();
 478     return (code);
 479 }
 480
 481 int
 482 mp_afs_fsync(register struct vnode *avc, struct AFS_UCRED *acred, int unused1)
 483 {
 484     register int code;
 485
 486     AFS_GLOCK();
 487     code = afs_fsync(avc, acred);
 488     AFS_GUNLOCK();
 489     return (code);
 490 }
 491
 492 int
 493 mp_afs_bread(register struct vnode *avc, daddr_t lbn, struct buf **bpp, struct vattr *unused1, struct ucred *unused2)
 494 {
 495     register int code;
 496
 497     AFS_GLOCK();
 498     code = afs_bread(avc, lbn, bpp);
 499     AFS_GUNLOCK();
 500     return (code);
 501 }
 502
 503 int
 504 mp_afs_brelse(register struct vnode *avc, struct buf *bp)
 505 {
 506     register int code;
 507
 508     AFS_GLOCK();
 509     code = afs_brelse(avc, bp);
 510     AFS_GUNLOCK();
 511     return (code);
 512 }
 513
 514
 515 int
 516 mp_afs_inactive(register struct vnode *avc, struct AFS_UCRED *acred)
 517 {
 518     register int code;
 519
 520     AFS_GLOCK();
 521     code = afs_inactive(avc, acred);
 522     AFS_GUNLOCK();
 523     return (code);
 524 }
 525
 526 int
 527 mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd, struct AFS_UCRED *acred, struct file *unused1, off_t unused2, off_t unused3)
 528 {
 529     register int code;
 530
 531     AFS_GLOCK();
 532     code = afs_lockctl(avc, af, cmd, acred);
 533     AFS_GUNLOCK();
 534     return (code);
 535 }
 536
 537 int
 538 mp_afs_fid(struct vnode *avc, struct fid **fidpp)
 539 {
 540     register int code;
 541
 542     AFS_GLOCK();
 543     code = afs_fid(avc, fidpp);
 544     AFS_GUNLOCK();
 545     return (code);
 546 }
 547 int
 548 mp_afs_readdir2(register struct vnode *avc, struct uio *auio, struct AFS_UCRED *acred)
 549 {
 550     register int code;
 551
 552     AFS_GLOCK();
 553     code = afs_readdir2(avc, auio, acred);
 554     AFS_GUNLOCK();
 555     return (code);
 556 }
 557
 558
 559 struct vnodeops Afs_vnodeops = {
        /*
         * Vnode operation table for AFS.  This is a positional
         * initializer: every entry must stay in the order of the members
         * of struct vnodeops, which is declared outside this file.
         * Entries named mp_afs_* are the wrappers defined above that
         * bracket the underlying afs_* routine with
         * AFS_GLOCK()/AFS_GUNLOCK(); the other entries are installed
         * directly.
         */
 560         mp_afs_open,
 561         mp_afs_close,
 562         mp_afs_rdwr,
 563         afs_ioctl,
 564         afs_noop,
 565         mp_afs_getattr,
 566         mp_afs_setattr,
 567         mp_afs_access,
 568         mp_afs_lookup,
 569         mp_afs_create,
 570         mp_afs_remove,
 571         mp_afs_link,
 572         mp_afs_rename,
 573         mp_afs_mkdir,
 574         mp_afs_rmdir,
        /*
         * NOTE(review): afs_readdir is installed directly here although
         * an mp_afs_readdir wrapper exists and mp_afs_readdir2 is used
         * further down -- confirm that this is intentional.
         */
 575         afs_readdir,
 576         mp_afs_symlink,
 577         mp_afs_readlink,
 578         mp_afs_fsync,
 579         mp_afs_inactive,
 580         afs_bmap,
 581         afs_hp_strategy,
        /*
         * The next two slots are afs_noop when AFS_NONFSTRANS is NOT
         * defined, and mp_afs_bread / mp_afs_brelse when it is.
         */
 582 #if     !defined(AFS_NONFSTRANS)
 583                 /* on HPUX102 the nfs translator calls afs_bread but does
 584                 * not call afs_brelse. Hence we see a memory leak. If the
 585                 * VOP_BREAD() call fails, then nfs does VOP_RDWR() to get
 586                 * the same data : this is the path we follow now. */
 587         afs_noop,
 588         afs_noop,
 589 #else
 590         mp_afs_bread,
 591         mp_afs_brelse,
 592 #endif
 593         afs_badop,      /* pathsend */
 594         afs_noop,       /* setacl */
 595         afs_noop,       /* getacl */
        /*
         * NOTE(review): afs_pathconf fills two consecutive slots --
         * presumably pathconf and fpathconf; confirm against the
         * struct vnodeops declaration.
         */
 596         afs_pathconf,
 597         afs_pathconf,
 598         mp_afs_lockctl,
 599         afs_lockf,      /* lockf */
 600         mp_afs_fid,
 601         afs_noop,       /*fsctl */
 602         afs_badop,
 603         afs_pagein,
 604         afs_pageout,
 605         NULL,
 606         NULL,
 607         afs_prealloc,
 608         afs_mapdbd,
 609         afs_mmap,
 610         afs_cachelimit,
 611         afs_vm_checkpage,
 612         afs_vm_fscontiguous,
 613         afs_vm_stopio,
 614         afs_read_ahead ,
 615         afs_release,
 616         afs_unmap,
 617         afs_swapfs_len,
 618         mp_afs_readdir2,
 619         afs_readdir3,
 620 };
 621
/* Exported pointer to the operation table defined above. */
 622 struct vnodeops *afs_ops = &Afs_vnodeops;
 623
 624 /* vnode file operations, and our own */
/*
 * afs_fileops is a positional initializer with four entries: vno_rw,
 * vno_ioctl, vno_select and afs_close.
 * NOTE(review): afs_closex and vno_close are declared below but are not
 * referenced by the table -- confirm whether they are still needed.
 */
 625 extern int vno_rw();
 626 extern int vno_ioctl();
 627 extern int vno_select();
 628 extern int afs_closex();
 629 extern int vno_close();
 630 struct fileops afs_fileops = {
 631     vno_rw,
 632     vno_ioctl,
 633     vno_select,
 634     afs_close,
 635 };
 636
 637 #define vtoblksz(vp)    ((vp)->v_vfsp->vfs_bsize)
 638
 639 /*
 640  ********************************************************************
 641  ****
 642  ****                   afspgin_setup_io_ranges ()
 643  ****    similar to:    nfspgin_setup_io_ranges ()
 644  ********************************************************************
 645  */
/*
 * Decide which pages to bring in around the faulting page <startindex>
 * and record the plan in vm_info through VM_SET_IO_STARTINDX,
 * VM_SET_IO_STARTBLK, VM_SET_IO_COUNT and VM_SET_NUM_IO, then publish
 * the first I/O's start index with VM_SET_STARTINDX.
 *
 *   vm_info    - fault description; read and updated through the VM_*
 *                macros (its dbd and vfd members are cleared here)
 *   bpages     - used as the modulus for block alignment of page
 *                indexes; presumably pages per file system block --
 *                TODO confirm with the caller
 *   isize      - limits maxpage through btorp(isize); presumably the
 *                file size in bytes -- TODO confirm
 *   startindex - page index of the fault
 *
 * Returns the local <count>: the page count produced by
 * expand_faultin_up()/expand_faultin_down() in the single-I/O case, or
 * maxpagein in the multiple-I/O case.  Returns 0 when maxpage works out
 * to 0, or when the multiple-I/O setup stops short of multio_maxpage
 * (the "retry" case described in the comment inside the function).
 */
 646 pgcnt_t
 647 afspgin_setup_io_ranges(
 648         vfspage_t       *vm_info,
 649         pgcnt_t         bpages,
 650         k_off_t         isize,
 651         pgcnt_t         startindex)
 652 {
 653         pgcnt_t         file_offset = VM_FILE_OFFSET(vm_info);
 654         pgcnt_t         minpage;        /* first page to bring in */
 655         pgcnt_t         maxpage;        /* one past last page to bring in */
 656         pgcnt_t         maxpagein;
 657         pgcnt_t         multio_maxpage;
 658         daddr_t         start_blk;
 659         dbd_t           *dbd;
 660         expnd_flags_t   up_reason, down_reason;
 661         int             count = 1;
 662         int             indx = 0;
 663         int             max_num_io;
 664         int             dbdtype;
 665         preg_t          *prp;
 666
 667         VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);
 668
 669         /*
 670          * We do not go past the end of the current pregion nor past the end
 671          * of the current file.
 672          */
 673
 674         maxpage = startindex + (bpages - (startindex+file_offset) % bpages);
 675         maxpage = vm_reset_maxpage(vm_info, maxpage);
 676         maxpage = MIN(maxpage, (pgcnt_t)btorp(isize) - file_offset);
 677         maxpage = MIN(maxpage, startindex + maxpagein);
 678         multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);
 679
 680         if (!maxpage)
 681                 return (0);
 682
 683         VASSERT(maxpage >= startindex);
 684
 685         /*
 686          * Expanding the fault will create calls to FINDENTRY() for new
 687          * pages, which will obsolete "dbd", so copy what it points to
 688          * and clear it to prevent using stale data.
 689          */
 690
 691         prp = VM_PRP(vm_info);
 692         dbdtype = DBD_TYPE(vm_info);
 693         start_blk = DBD_DATA(vm_info);
 694         vm_info->dbd = NULL;
 695         vm_info->vfd = NULL;
 696         VASSERT(dbdtype != DBD_NONE);
 697
 698         if (max_num_io == 1) {
 699                 /*
 700                  * We need to set up one I/O: First we attempt to expand the
 701                  * I/O forward. Then we expand the I/O backwards.
 702                  */
 703                 count = expand_faultin_up(vm_info, dbdtype, (int)bpages,
 704                                           maxpage, count, startindex,
 705                                           start_blk, &up_reason);
 706                 maxpage = startindex + count;
 707                 VASSERT(maxpage <= startindex + maxpagein);
 708                 minpage = startindex - (startindex+file_offset) % bpages;
 709                 minpage = MAX(minpage, maxpage - maxpagein);
 710                 VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
 711                 minpage = vm_minpage(vm_info, minpage);
 712                 VASSERT(minpage <= startindex);
 713                 count = expand_faultin_down(vm_info, dbdtype, (int)bpages,
 714                                             minpage, count, &startindex,
 715                                             &start_blk, &down_reason);
 716                 VM_SET_IO_STARTINDX(vm_info, 0, startindex);
 717                 VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
 718                 VM_SET_IO_COUNT(vm_info, 0, count);
 719                 VM_SET_NUM_IO(vm_info, 1);
 720         }
 721
 722         if (max_num_io > 1) {
 723                 /*
 724                  * We need to set up multiple I/O information; beginning
 725                  * with the startindex, we will expand upwards. The expansion
 726                  * could stop for one of 2 reasons; we take the appropriate
 727                  * action in each of these cases:
 728                  *      o VM reasons: abort setting up the multiple I/O
 729                  *        information and return to our caller indicating
 730                  *        that "retry" is required.
 731                  *      o pagelimit: set up the next I/O info [we may have
 732                  *        reached multio_maxpage at this point].
 733                  * Note that expansion involves no more than a block at a time;
 734                  * hence it could never stop due to "discontiguous block"
 735                  * reason.
 736                  */
                /* NOTE(review): minpage is assigned here but is not read
                 * again in this branch. */
 737                 startindex = minpage = vm_minpage(vm_info, 0);
 738                 for (indx = 0;
 739                      (indx < max_num_io) && (startindex < multio_maxpage);
 740                      indx++, startindex +=count) {
 741                         dbd = FINDDBD(prp->p_reg, startindex);
 742                         start_blk = dbd->dbd_data;
 743                         maxpage = startindex +
 744                                   (bpages - (startindex+file_offset) % bpages);
                        /* NOTE(review): lower-case min() here, MIN()
                         * everywhere else in this function -- confirm
                         * both are available and equivalent. */
 745                         maxpage = min(maxpage, multio_maxpage);
 746                         count = expand_faultin_up(vm_info, dbdtype,
 747                                         bpages, maxpage, 1 /* count */,
 748                                         startindex, start_blk, &up_reason);
 749                         VM_SET_IO_STARTINDX(vm_info, indx, startindex);
 750                         VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
 751                         VM_SET_IO_COUNT(vm_info, indx, count);
 752                         if (up_reason & VM_REASONS)
 753                                 break;
 754                         VASSERT(!(up_reason&NONCONTIGUOUS_BLOCK));
 755                         VASSERT(up_reason & PAGELIMIT);
 756                 }
                /*
                 * A break out of the loop skips the "startindex += count"
                 * step, so startindex is still below multio_maxpage and
                 * the test below takes the retry path.  The same path is
                 * taken when max_num_io descriptors were used up before
                 * multio_maxpage was reached.
                 */
 757                 if (startindex < multio_maxpage) {
 758                         VM_MULT_IO_FAILURE(vm_info);
 759                         VM_REINIT_FAULT_DBDVFD(vm_info);
 760                         return (0);                     /* retry */
 761                 }
 762                 count = maxpagein;
 763                 VM_SET_NUM_IO(vm_info, indx);
 764         }
 765
 766         /*
 767          * Tell VM where the I/O intends to start.  This may be different
 768          * from the faulting point.
 769          */
        /*
         * NOTE(review): if max_num_io is less than 1 neither branch above
         * runs; count is still 1 and no I/O descriptor has been set when
         * the lines below execute.  Presumably VM_GET_IO_INFO never
         * yields such a value -- confirm.
         */
 770
 771         VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));
 772
 773         return(count);
 774
 775 }
 776
 777 /*
 778  ********************************************************************
 779  ****
 780  ****                   afspgin_blkflsh ()
 781  ****   similar to:     nfspgin_blkflsh ()
 782  ********************************************************************
 783  */
 784 retval_t
 785 afspgin_blkflsh (
 786         vfspage_t       *vm_info,
 787         struct vnode    *devvp,
 788         pgcnt_t         *num_4k)
 789 {
 790         int             flush_reslt = 0;
 791         pgcnt_t         count = *num_4k;
 792         pgcnt_t         page_count;
 793         int             indx = 0;
 794         int             num_io = VM_GET_NUM_IO(vm_info);
 795
 796         /*
 797          * On this blkflush() we don't want to purge the buffer cache and we do
 798          * want to wait, so the flags are '0'.
 799          */
 800
 801         for (indx = 0; indx < num_io; indx++) {
 802                 flush_reslt = blkflush(devvp,
 803                                     (daddr_t)VM_GET_IO_STARTBLK(vm_info, indx),
 804                                        ptob(VM_GET_IO_COUNT(vm_info, indx)),
 805                                        0, VM_REGION(vm_info));
 806                 if (flush_reslt) {
 807                         vm_lock(vm_info);
 808                         if (vm_page_now_valid(vm_info, &page_count)) {
 809                                 vm_release_memory(vm_info);
 810                                 vm_release_structs(vm_info);
 811                                 *num_4k = page_count;
 812                                 return(VM_PAGE_PRESENT);
 813                         }
 814                         return (VM_RETRY);
 815                 }
 816         }
 817         return (VM_DONE);
 818 }
 819
 820 /*
 821  ********************************************************************
 822  ****
 823  ****                   afspgin_io ()
 824  ****    similar to:    nfspgin_io ()
 825  ********************************************************************
 826  */
/*
 * Perform the page-in reads described by vm_info.
 *
 * In the code that is actually compiled (everything outside the
 * "#ifdef notdef" section) vm_unlock(vm_info) is called first, then one
 * syncpageio() read is issued per recorded I/O (VM_GET_NUM_IO entries).
 * The first read targets VM_MAPPED_ADDR(vm_info) and each following read
 * targets the address just past the previous one.  Finally the
 * pregion's p_nextfault is set to vm_info->startindex + count.
 *
 * Each read's result is stored in io[i].error and OR-ed into the return
 * value, so the function returns 0 only when every read returned 0.
 * NOTE(review): the OR of two different non-zero codes is not itself a
 * meaningful error number; callers should presumably only test the
 * result for non-zero -- confirm.
 *
 * The parameters bpages and maxpagein and the locals vaddr, prp, wrt and
 * space are referenced only inside the "#ifdef notdef" section.
 */
 827 int
 828 afspgin_io(
 829         vfspage_t       *vm_info,
 830         struct vnode    *devvp,
 831         pgcnt_t         bpages,
 832         pgcnt_t         maxpagein,
 833         pgcnt_t         count)
 834 {
 835         int             i;
 836         int             error = 0;
 837         caddr_t         vaddr = VM_ADDR(vm_info);
 838         caddr_t         virt_addr = VM_MAPPED_ADDR(vm_info);
 839         pagein_info_t   *io = VM_PAGEIN_INFO(vm_info);
 840         preg_t          *prp = VM_PRP(vm_info);
 841         int             wrt = VM_WRT(vm_info);
 842         space_t         space = VM_SPACE(vm_info);
 843         int             num_io = VM_GET_NUM_IO(vm_info);
 844
 845 #ifdef notdef /* Not used in AFS */
 846         /*
 847          * With VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
 848          * be used in this case.
 849          *
 850          * Unlike UFS, NFS does not start the faulting page I/O
 851          * asynchronously. Why?  Asynchronous requests are handled by the
 852          * biod's.  It doesn't make sense to queue up the faulting request
 853          * behind other asynchrnous requests.  This is not true for UFS
 854          * where the asynchrnous request is immediately handled.
 855          */
 856
 857         if ((VM_READ_AHEAD_ALLOWED(vm_info)) &&
 858             (nfs_read_ahead_on) &&
 859             (NFS_DO_READ_AHEAD) &&
 860             (should_do_read_ahead(prp, vaddr))) {
 861
 862                 pgcnt_t max_rhead_io;
 863                 caddr_t rhead_vaddr;
 864                 pgcnt_t total_rheads_allowed;
 865
 866                 /*
 867                  * Determine the maximum amount of read-ahead I/O.
 868                  */
 869                 total_rheads_allowed = maxpagein - count ;
 870
 871                 /*
 872                  * If the count is less than a block, raise it to one.
 873                  */
 874                 if (total_rheads_allowed < bpages)
 875                         total_rheads_allowed = bpages;
 876
 877                 max_rhead_io = total_rheads_allowed;
 878                 rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count*NBPG);
 879                 error = nfs_read_ahead(vm_info->vp, prp, wrt, space,
 880                                        rhead_vaddr, &max_rhead_io);
 881
 882                 /*
 883                  * Set the next fault location.  If read_ahead launches any
 884                  * I/O it will adjust it accordingly.
 885                  */
 886                 vm_info->prp->p_nextfault = vm_info->startindex + count;
 887
 888                 /*
 889                  * Now perform the faulting I/O synchronously.
 890                  */
 891                 vm_unlock(vm_info);
 892
 893                 error = syncpageio((swblk_t)VM_GET_IO_STARTBLK(vm_info, 0),
 894                                    VM_MAPPED_SPACE(vm_info),
 895                                    VM_MAPPED_ADDR(vm_info),
 896                                    (int)ptob(count), B_READ, devvp,
 897                                    B_vfs_pagein|B_pagebf, VM_REGION(vm_info));
 898         } else
 899 #endif
        /*
         * With "notdef" undefined the dangling "else" above disappears and
         * the block below is executed unconditionally.
         */
 900         {
                /* virt_addr was already set to this value at its declaration */
 901                 virt_addr = VM_MAPPED_ADDR(vm_info);
 902                 vm_unlock(vm_info);
 903                 for (i = 0; i < num_io; i++) {
 904                         /*
 905                          * REVISIT -- investigate doing asyncpageio().
 906                          */
 907                         error |= (io[i].error =
 908                                 syncpageio(
 909                                       (swblk_t)VM_GET_IO_STARTBLK(vm_info, i),
 910                                       VM_MAPPED_SPACE(vm_info),
 911                                       virt_addr,
 912                                       (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
 913                                       B_READ, devvp,
 914                                       B_vfs_pagein|B_pagebf,
 915                                       VM_REGION(vm_info)));
                        /* advance the target address past the pages just read */
 916                         virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));
 917                 }
 918                 /*
 919                  * Set the next fault location.  If read_ahead launches any
 920                  * I/O it will adjust it accordingly.
 921                  */
 922                 vm_info->prp->p_nextfault = vm_info->startindex + count;
 923         }
 924
 925         return (error);
 926 }
 927
 928 /*
 929  ********************************************************************
 930  ****
 931  ****                   afspgin_update_dbd ()
 932  ****    similar to:    nfspgin_update_dbd ()
 933  ********************************************************************
 934  */
 935 void
 936 afspgin_update_dbd(
 937         vfspage_t       *vm_info,
 938         int             bsize)
 939 {
 940         k_off_t         off;
 941         pgcnt_t         count = bsize / NBPG;
 942         k_off_t         rem;
 943         pgcnt_t         m;
 944         pgcnt_t         pgindx;
 945         daddr_t         blkno;
 946         int             num_io = VM_GET_NUM_IO(vm_info);
 947         int             i;
 948
 949         for (i = 0; i < num_io; i++) {
 950
 951                 pgindx = VM_GET_IO_STARTINDX(vm_info, i);
 952                 off = vnodindx(VM_REGION(vm_info), pgindx);
 953                 rem = off % bsize;
 954                 blkno = VM_GET_IO_STARTBLK(vm_info, i);
 955
 956                 VASSERT(bsize % NBPG == 0);
 957                 VASSERT(rem % NBPG == 0);
 958
 959                 pgindx -= (pgcnt_t)btop(rem);
 960                 blkno -= (daddr_t)btodb(rem);
 961
 962                 /*
 963                  * This region could start in mid-block.  If so, pgindx
 964                  * could be less than 0, so we adjust pgindx and blkno back
 965                  * up so that pgindx is 0.
 966                  */
 967
 968                 if (pgindx < 0) {
 969                         pgcnt_t prem;
 970                         prem = 0 - pgindx;
 971                         pgindx = 0;
 972                         count -= prem;
 973                         blkno += btodb(ptob(prem));
 974                 }
 975
 976                 for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
 977                      m++, pgindx++, blkno += btodb(NBPG)) {
 978                         /*
 979                         * Note:  since this only changes one block, it
 980                         * assumes only one block was faulted in.  Currently
 981                         * this is always true for remote files, and we only
 982                         * get here for remote files, so everything is ok.
 983                         */
 984                         vm_mark_dbd(vm_info, pgindx, blkno);
 985                 }
 986         }
 987 }
 988
 989 int afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
 990     struct vnode *vp;
 991     preg_t       *prp;
 992     int          wrt;
 993     space_t      space;
 994     caddr_t      vaddr;
 995     pgcnt_t      *ret_startindex;
 996 {
 997     pgcnt_t      startindex;
 998     pgcnt_t      pgindx = *ret_startindex;
 999     pgcnt_t      maxpagein;
1000     struct       vnode *devvp;
1001     pgcnt_t      count;
1002     daddr_t      start_blk=0;
1003     int          bsize;
1004     int          error;
1005     k_off_t      isize;
1006     int          shared;             /* writable memory mapped file */
1007     retval_t     retval = 0;
1008     pgcnt_t      ok_dbd_limit = 0;   /* last dbd that we can trust */
1009     pgcnt_t      bpages;             /* number of pages per block */
1010     pgcnt_t      page_count;
1011     vfspage_t*   vm_info=NULL;
1012     int          done;
1013
1014     struct vattr va;
1015
1016     caddr_t      nvaddr;
1017     space_t      nspace;
1018     int          change_to_fstore = 0;  /* need to change dbds to DBD_FSTORE */
1019     int          flush_start_blk = 0;
1020     int          flush_end_blk = 0;
1021
1022     int i, j;
1023
1024     AFS_STATCNT(afs_pagein);
1025     vmemp_lockx();              /* lock down VM empire */
1026
1027     /* Initialize the VM info structure */
1028     done = vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,
1029                           LGPG_ENABLE);
1030
1031     /* Check to see if we slept and the page was falted in. */
1032     if (done) {
1033         vm_release_structs(vm_info);
1034         vmemp_returnx(1);
1035     }
1036
1037     vp = VM_GET_PAGEIN_VNODE(vm_info);
1038     VASSERT(vp != NULL);
1039     shared = VM_SHARED_OBJECT(vm_info);
1040     VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1041
1042     /*
1043      * Get the devvp and block size for this vnode type
1044      */
1045     devvp = vp;
1046     bsize = vp->v_vfsp->vfs_bsize;
1047     if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
1048         osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");
1049
1050     bpages = (pgcnt_t)btop(bsize);
1051     VASSERT(bpages > 0);
1052     VM_SET_FS_MAX_PAGES(vm_info, bpages);
1053
1054     /* this trace cannot be here because the afs_global lock might not be
1055         held at this point. We hold the vm global lock throughout
1056         this procedure ( and not the AFS global lock )
1057     afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
1058                ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
1059                 ICL_TYPE_LONG, shared);
1060     */
1061     /* Come here if we have to release the region lock before
1062      * locking pages.  This can happen in memreserve() and
1063      * blkflush().
1064      */
1065 retry:
1066     /*
1067      * For remote files like ours, we want to check to see if the file has shrunk.
1068      * If so, we should invalidate any pages past the end.  In the name
1069      * of efficiency, we only do this if the page we want to fault is
1070      * past the end of the file.
1071      */
1072     {
1073        if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1074           VM_ZOMBIE_OBJECT(vm_info);
1075           vm_release_memory(vm_info);
1076           vm_release_structs(vm_info);
1077           vmemp_returnx(0);
1078        }
1079        isize = va.va_size;
1080        if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
1081           /*
1082            * The file has shrunk and someone is trying to access a
1083            * page past the end of the object.  Shrink the object back
1084            * to its currrent size, send a SIGBUS to the faulting
1085            * process and return.
1086            *
1087            * We must release the region lock before calling mtrunc(),
1088            * since mtrunc() locks all the regions that are using this
1089            * file.
1090            */
1091           vm_release_memory(vm_info);
1092           vm_truncate_region(vm_info, isize);
1093           vm_release_structs(vm_info);
1094           vmemp_returnx(-SIGBUS);
1095        }
1096     }
1097
1098     maxpagein = vm_pick_maxpagein(vm_info);
1099     if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
1100         /* Check to see if we should continue faulting.  */
1101         if (vm_page_now_valid(vm_info, &page_count)) {
1102                 vm_release_memory(vm_info);
1103                 vm_release_structs(vm_info);
1104                 vmemp_returnx(page_count);
1105         }
1106     }
1107     if (count = vm_no_io_required(vm_info)) {
1108             /* Release any excess memory.  */
1109             vm_release_memory(vm_info);
1110             vm_release_structs(vm_info);
1111             vmemp_returnx(count);
1112     }
1113
1114 #ifdef OSDEBUG
1115     /*
1116      * We should never have DBD_HOLE pages in a non-MMF region.
1117      */
1118     if (!shared)
1119         VASSERT(dbd->dbd_type != DBD_HOLE);
1120 #endif
1121     VASSERT( DBD_TYPE(vm_info) != DBD_NONE);
1122
1123     startindex = *ret_startindex;
1124
1125     /*
1126      * If the page we want is in memory already, take it
1127      */
1128     if (VM_MEMORY_RESERVED(vm_info) < maxpagein)
1129     {
1130         /* pick up the rest of memory now.  */
1131         if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
1132                 if (vm_page_now_valid(vm_info, &page_count)) {
1133                         vm_release_memory(vm_info);
1134                         vm_release_structs(vm_info);
1135                         vmemp_returnx(page_count);
1136                 }
1137                 goto retry;
1138         }
1139     }
1140
1141     if (!(count = afspgin_setup_io_ranges(vm_info, bpages, isize,
1142                                           startindex))) {
1143        goto retry;
1144     }
1145
1146     startindex = VM_GET_STARTINDX(vm_info);
1147
1148     VASSERT(maxpagein >= count);
1149
1150     /*
1151      * Release the memory we won't need.
1152      */
1153     if (count < maxpagein) {
1154        vm_release_excess_memory(vm_info,
1155                 (VM_MEMORY_RESERVED(vm_info) - count));
1156      }
1157
1158     retval = afspgin_blkflsh(vm_info, devvp, &count);
1159
1160     if (retval == VM_RETRY) {
1161        goto retry;
1162     }
1163
1164     if (retval == VM_PAGE_PRESENT)
1165        return (count);
1166
1167 #if 0
1168     /*
1169      * The definition of krusage_cntr_t is in h/kmetric.h, which
1170      * is not shipped.  Since it's just statistics, we punt and do
1171      * not update it.  If it's a problem we'll need to get HP to export
1172      * an interface that we can use to increment the counter.
1173      */
1174
1175     /* It's a real fault, not a reclaim */
1176     {
1177        krusage_cntr_t *temp;
1178        temp = kt_cntrp(u.u_kthreadp);
1179        temp->krc_majflt++;
1180     }
1181 #endif
1182
1183     /*
1184      * Tell VM where the I/O intends to start.  This may be different
1185      * from the faulting point.
1186      */
1187
1188     /*
1189      * vm_prepare_io will fill the region with pages and release the
1190      * region lock.
1191      */
1192     vm_prepare_io(vm_info, &count);
1193
1194     /*
1195      * Count may have been adjusted, check to make sure it's non-zero.
1196      */
1197     if (count == 0) {
1198        if (vm_retry(vm_info)) {
1199           goto retry;
1200        }
1201
1202        /*
1203         * Release resources and retry the fault.  Release any excess
1204         * memory.
1205         */
1206
1207        vm_release_memory(vm_info);
1208        vm_release_structs(vm_info);
1209        vmemp_returnx(0);
1210     }
1211
1212     error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);
1213
1214     if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1215        retval = -SIGBUS;
1216        VM_ZOMBIE_OBJECT(vm_info);
1217        goto backout;
1218     }
1219     /*
1220      * For a writable memory mapped file that is remote we must
1221      * detect potential holes in the file and force allocation of
1222      * disk space on the remote system.  Unfortunately, there is
1223      * no easy way to do this, so this gets a little ugly.
1224      */
1225     if (shared && wrt) {
1226        /*
1227         * See if The user wants to write to this page.  Write some
1228         * minimal amount of data back to the remote file to
1229         * force allocation of file space.  We only need to
1230         * write a small amount, since holes are always at
1231         * least one filesystem block in size.
1232         */
1233        error = vm_alloc_hole(vm_info);
1234
1235        /*
1236         * If some sort of I/O error occurred we generate a
1237         * SIGBUS for the process that caused the write,
1238         * undo our page locks, etc and return.
1239         */
1240        if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1241           VM_ZOMBIE_OBJECT(vm_info);
1242           retval = -SIGBUS;
1243           goto backout;
1244        }
1245
1246        /*
1247         * Change these dbds to DBD_FSTORE.  We cannot do it here,
1248         * since the region must be locked, and it is not locked
1249         * at the moment.  We cannot lock the region yet, as we
1250         * first have to release the page locks.
1251         */
1252        change_to_fstore = 1;
1253     }
1254
1255     vm_finish_io(vm_info, count);
1256
1257     /*
1258      * Acquire the lock before we play around with changing the vfd's.
1259      */
1260     vm_lock(vm_info);
1261
1262     if (change_to_fstore)
1263        afspgin_update_dbd(vm_info, bsize);
1264
1265     mpproc_info[getprocindex()].cnt.v_exfod += count;
1266     vmemp_unlockx();      /* free up VM empire */
1267     *ret_startindex = startindex;
1268
1269     /*
1270      * In case we have any excess memory...
1271      */
1272     if (VM_MEMORY_RESERVED(vm_info))
1273       vm_release_memory(vm_info);
1274     vm_release_structs(vm_info);
1275
1276     return count;
1277
1278 backout:
1279
1280     vm_finish_io_failed(vm_info, count);
1281
1282     vm_lock(vm_info);
1283
1284     vm_undo_validation(vm_info, count);
1285
1286     /*
1287      * In case we have any excess memory...
1288      */
1289     if (VM_MEMORY_RESERVED(vm_info))
1290        vm_release_memory(vm_info);
1291     vm_release_structs(vm_info);
1292
1293     vmemp_unlockx();     /* free up VM empire */
1294     return retval;
1295 }
1296
1297 int
1298 afs_pageout(vp,prp, start, end, flags)
1299     struct vnode *vp;   /* not used */
1300     preg_t       *prp;
1301     pgcnt_t      start;
1302     pgcnt_t      end;
1303     int          flags;
1304 {
1305     struct vnode *filevp;
1306     struct vnode *devvp;
1307     pgcnt_t i;
1308     int steal;
1309     int vhand;
1310     int hard;
1311     int *piocnt;        /* wakeup counter used if PAGEOUT_WAIT */
1312     struct ucred *old_cred;
1313     vfspage_t vm_info;
1314     fsdata_t args;
1315
1316     int inode_changed = 0;
1317     int file_is_remote;
1318     struct inode *ip;
1319
1320     AFS_STATCNT(afs_pageout);
1321
1322     steal = (flags & PAGEOUT_FREE);
1323     vhand = (flags & PAGEOUT_VHAND);
1324     hard  = (flags & PAGEOUT_HARD);
1325
1326     vmemp_lockx();
1327
1328     /*  Initialize the VM info structure.  */
1329     vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);
1330
1331     /*
1332      * If the region is marked "don't swap", then don't steal any pages
1333      * from it.  We can, however, write dirty pages out to disk (only if
1334      * PAGEOUT_FREE is not set).
1335      */
1336     if (vm_no_pageout(&vm_info)) {
1337         vmemp_unlockx();
1338         return(0);
1339     }
1340
1341     /*
1342      * If caller wants to wait until the I/O is complete.
1343      */
1344     vm_setup_wait_for_io(&vm_info);
1345
1346     filevp = VM_GET_PAGEOUT_VNODE(&vm_info); /* always page out to back store */
1347     VASSERT(filevp != NULL);
1348
1349     bzero((caddr_t)&args, sizeof(fsdata_t));
1350     args.remote_down = 0;       /* assume remote file servers are up */
1351     args.remote = 1;            /* we are remote */
1352     args.bsize = 0;             /* filled up later by afs_vm_checkpage() */
1353
1354     if (filevp->v_fstype == VUFS) {
1355         ip = VTOI(filevp);
1356         devvp = ip->i_devvp;
1357         file_is_remote = 0;
1358     }
1359     else {
1360         file_is_remote = 1;
1361         devvp = filevp;
1362
1363         /*
1364          * If we are vhand(), and this is an NFS file, we need to
1365          * see if the NFS server is "down".  If so, we decide
1366          * if we will try to talk to it again, or defer pageouts
1367          * of dirty NFS pages until a future time.
1368          */
1369 #ifdef  notdef
1370         if (vhand && filevp->v_fstype == VNFS &&
1371                 vtomi(filevp)->mi_down && vtomi(filevp)->mi_hard) {
1372             extern afs_int32 vhand_nfs_retry;
1373             /*
1374              * If there is still time left on our timer, we will
1375              * not talk to this server right now.
1376              */
1377             if (vhand_nfs_retry > 0)
1378                 args.remote_down = 1;
1379         }
1380 #endif
1381     }
1382
1383     /*
1384      * Initialize args.  We set bsize to 0 to tell vfs_vfdcheck() that
1385      * it must get the file size and other attributes if it comes across
1386      * a dirty page.
1387      */
1388     vm_info.fs_data = (caddr_t)&args;
1389
1390     /* this trace cannot be here because the afs_global lock might not be
1391         held at this point. We hold the vm global lock throughout
1392         this procedure ( and not the AFS global lock )
1393     afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
1394                ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
1395     */
1396
1397     i = start;
1398
1399     while (i <= end) {
1400         struct buf *bp;
1401         k_off_t start;
1402         pgcnt_t npages;
1403         k_off_t nbytes;
1404         int error;
1405
1406         extern int pageiodone();
1407         space_t nspace;
1408         caddr_t nvaddr;
1409
1410         /*
1411          * Ask the VM system to find the next run of pages.
1412          */
1413         vm_find_next_range(&vm_info, i, end);
1414
1415         /*
1416          * It's possible that the remote file shrunk in size.  Check the flags
1417          * to see if the request was beyond the end of the file.  If it was,
1418          * truncate the region to the file size and continue.  We could be on a
1419          * run so after trunction continue, there may be some I/O to write
1420          * out.
1421          */
1422         if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
1423                 pgcnt_t pglen = (pgcnt_t)btorp(args.isize);
1424
1425                 /*
1426                  * This page is past the end of the file.  Unlock this page
1427                  * (region_trunc will throw it away) and then call
1428                  * region_trunc() to invalidate all pages past the new end of
1429                  * the file.
1430                  */
1431                 region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);
1432
1433                 /*
1434                  * remove the truncation flag.
1435                  */
1436                 VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);
1437         }
1438
1439         if (VM_NO_PAGEOUT_RUN(&vm_info))
1440             break;
1441
1442         /*
1443          * We have a run of dirty pages [args.start...args.end].
1444          */
1445         VASSERT(filevp->v_fstype != VCDFS);
1446         VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
1447         VASSERT(VM_GET_NUM_IO(&vm_info) == 1);
1448
1449        /*
1450         * We will be doing an I/O on the region, let the VM system know.
1451         */
1452        (void)vm_up_physio_count(&vm_info);
1453
1454         /*
1455          * Okay, get set to perform the I/O.
1456          */
1457         inode_changed = 1;
1458         npages = (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
1459                         VM_START_PAGEOUT_INDX(&vm_info);
1460
1461         /*
1462          * Allocate and initialize an I/O buffer.
1463          */
1464         bp = bswalloc();
1465         vm_init_bp(&vm_info, bp); /* Let the VM system initialize */
1466
1467         /* Identify this buffer for KI */
1468         bp->b_bptype = B_vfs_pageout|B_pagebf;
1469
1470         if (steal)
1471             bp->b_flags = B_CALL|B_BUSY|B_PAGEOUT;    /* steal pages */
1472         else
1473             bp->b_flags = B_CALL|B_BUSY;              /* keep pages */
1474
1475         /*
1476          * If we are vhand paging over NFS, we will wait for the I/O
1477          * to complete.
1478          */
1479         if (vhand && filevp->v_fstype == VNFS) {
1480             bp->b_flags &= ~B_CALL;
1481         } else {
1482             bp->b_iodone = (int (*)())pageiodone;
1483         }
1484
1485         /*
1486          * Make sure we do not write past the end of the file.
1487          */
1488         nbytes = ptob(npages);
1489         start = vnodindx(VM_REGION(&vm_info), vm_info.start);
1490         if (start + nbytes > args.isize) {
1491 #ifdef OSDEBUG
1492             /*
1493              * The amount we are off better not be bigger than a
1494              * filesystem block.
1495              */
1496             if (start + nbytes - args.isize >= args.bsize) {
1497                 osi_Panic("afs_pageout: remainder too large");
1498             }
1499 #endif
1500             /*
1501              * Reset the size of the I/O as necessary.  For remote
1502              * files, we set the size to the exact number of bytes to
1503              * the end of the file.  For local files, we round this up
1504              * to the nearest DEV_BSIZE chunk since disk I/O must always
1505              * be in multiples of DEV_BSIZE.  In this case, we do not
1506              * bother to zero out the data past the "real" end of the
1507              * file, this is done when the data is read (either through
1508              * mmap() or by normal file system access).
1509              */
1510             if (file_is_remote)
1511                 nbytes = args.isize - start;
1512             else
1513                 nbytes = roundup(args.isize - start, DEV_BSIZE);
1514         }
1515
1516         /*
1517          * Now get ready to perform the I/O
1518          */
1519         if (!vm_protect_pageout(&vm_info, npages))
1520         {
1521                 VASSERT(vhand);
1522                 vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1523                 vm_finish_io_failed(&vm_info, npages);
1524                 bswfree(bp);
1525                 break;
1526         }
1527         /*
1528          * If this is an NFS write by vhand(), we will not be calling
1529          * pageiodone().  asyncpageio() increments parolemem for us
1530          * if bp->b_iodone is pageiodone, so we must do it manually
1531          * if pageiodone() will not be called automatically.
1532          */
1533         if (!(bp->b_flags & B_CALL) && steal) {
1534             register ulong_t context;
1535
1536             SPINLOCK_USAV(pfdat_lock, context);
1537             parolemem += btorp(nbytes);
1538             SPINUNLOCK_USAV(pfdat_lock, context);
1539         }
1540         blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
1541                         (BX_NOBUFWAIT|BX_PURGE), VM_REGION(&vm_info));
1542
1543         /*
1544          * If vhand is the one paging things out, and this is an NFS
1545          * file, we need to temporarily become a different user so
1546          * that we are not trying to page over NFS as root.  We use
1547          * the user credentials associated with the writable file
1548          * pointer that is in the psuedo-vas for this MMF.
1549          *
1550          * NOTE: we are currently using "va_rss" to store the ucred
1551          *       value in the vas (this should be fixed in 10.0).
1552          */
1553         old_cred = kt_cred(u.u_kthreadp);
1554         if (vhand) {
1555             set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);
1556
1557             /*
1558              * If root was the one who opened the mmf for write,
1559              * va_cred will be NULL.  So reset kt_cred(u.u_kthreadp) to what it
1560              * was.  We will page out as root, but that is the
1561              * correct thing to do in this case anyway.
1562              */
1563             if (kt_cred(u.u_kthreadp) == NULL)
1564                 set_kt_cred(u.u_kthreadp, old_cred);
1565         }
1566
1567         /*
1568          * Really do the I/O.
1569          */
1570         error = asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
1571                         VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
1572                         (int)nbytes, B_WRITE, devvp);
1573
1574         VASSERT(error == 0);
1575
1576 #ifdef  notdef
1577         /*
1578          * If we are vhand paging over NFS we want to wait for the
1579          * I/O to complete and take the appropriate actions if an
1580          * error is encountered.
1581          */
1582         if (vhand) {
1583             if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
1584                 /*
1585                  * The server is down, ignore this failure, and
1586                  * try again later. (rfscall() has set our retry
1587                  * timer).
1588                  */
1589                 fsdata.remote_down = 1;
1590                 pageiocleanup(bp, 0);
1591
1592                 /*
1593                  * vm_vfdcheck() has cleared the valid bit on the
1594                  * vfds for these pages.  We must go back and set the
1595                  * valid bit, as the pages are really not gone.
1596                  *
1597                  * NOTE: we can do this because we still hold (and have
1598                  * not released) the region lock.
1599                  */
1600                 if (steal)
1601                     vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1602             }
1603             else {
1604                 /*
1605                  * The I/O succeeded, or we had an error that we do
1606                  * not want to defer until later.  Call pageidone()
1607                  * to handle things.
1608                  */
1609                 pageiodone(bp);
1610             }
1611         }
1612 #endif
1613
1614         /*
1615          * And restore our credentials to what they were.
1616          */
1617         set_kt_cred(u.u_kthreadp, old_cred);
1618
1619         /*
1620          * If we reserved memory in vfs_vfdcheck(), (only for NFS) we
1621          * can now unreserve it.
1622          */
1623         if (vm_info.vm_flags & PAGEOUT_RESERVED) {
1624             vm_info.vm_flags &= ~PAGEOUT_RESERVED;
1625             vm_release_malloc_memory();
1626         }
1627
1628         /*
1629          * Update statistics
1630          */
1631         if (steal) {
1632             if (flags & PF_DEACT) {
1633                 mpproc_info[getprocindex()].cnt.v_pswpout += npages;
1634 /*              sar_bswapout += ptod(npages);*/
1635             }
1636             else if (vhand) {
1637                 mpproc_info[getprocindex()].cnt.v_pgout++;
1638                 mpproc_info[getprocindex()].cnt.v_pgpgout += npages;
1639             }
1640         }
1641
1642         /*
1643          * If time and patience have delivered enough
1644          * pages, then quit now while we are ahead.
1645          */
1646         if (VM_STOP_PAGING(&vm_info))
1647             break;
1648
1649         i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;
1650     }
1651
1652     vm_finish_pageout(&vm_info); /* update vhand's stealscan */
1653
1654     vmemp_unlockx();
1655
1656     /*
1657      * If we wanted to wait for the I/O to complete, sleep on piocnt.
1658      * We must decrement it by one first, and then make sure that it
1659      * is non-zero before going to sleep.
1660      */
1661     vm_wait_for_io(&vm_info);
1662
1663     if (inode_changed && !file_is_remote) {
1664         imark(ip, IUPD|ICHG);
1665         iupdat(ip, 0, 0);
1666     }
1667     return 0;
1668 }
1669
1670 int
1671 afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
1672      struct vnode *filevp;
1673      off_t        offset;
1674      daddr_t      *bn;        /* Block number. */
1675      int          flags;      /* B_READ or B_WRITE */
1676      int          *hole;      /* To be used for read-ahead. */
1677      pgcnt_t      *startidx;  /* To be used for read-ahead. */
1678      pgcnt_t      *endidx;    /* To be used for read-ahead. */
1679 {
1680         daddr_t lbn, local_bn;
1681         int on;
1682         int err;
1683         long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);
1684
1685         if (startidx)
1686                 *startidx = (pgcnt_t)(offset/NBPG);
1687         if (endidx)
1688                 *endidx = (pgcnt_t)(offset/NBPG);
1689         if (hole)
1690                 *hole = 0;      /* Can't have holes. */
1691         if (bsize <= 0 )
1692                 osi_Panic("afs_mapdbd: zero size");
1693
1694         lbn = (daddr_t)(offset / bsize);
1695         on = offset % bsize;
1696
1697         err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);
1698         VASSERT(err == 0);
1699
1700         /*
1701          * We can never get a bn less than zero on remote files.
1702          */
1703         VASSERT(local_bn >= 0);
1704
1705         local_bn = local_bn + btodb(on);
1706         *bn = local_bn;
1707
1708         return(0);
1709 }
1710
1711 /*
1712  * Return values:
1713  *      1: The blocks are contiguous.
1714  *      0: The blocks are not contiguous.
1715  */
1716 int
1717 afs_vm_fscontiguous(vp, args, cur_data)
1718      struct vnode *vp;
1719      vfspage_t    *args;
1720      u_int        cur_data;
1721 {
1722         if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {
1723                 return(1);
1724         } else {
1725                 return(0);
1726         }
1727 }
1728
1729 /*
1730  * Return values:
1731  *      1: Stop, this page is the last in the block.
1732  *      0: Continue on
1733  * Terminate requests at filesystem block boundaries
1734  */
1735 afs_vm_stopio(vp, args)
1736      struct vnode *vp;
1737      vfspage_t    *args;
1738 {
1739         fsdata_t *fsdata = (fsdata_t *)args->fs_data;
1740
1741         if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0) {
1742                 return(1);
1743         } else {
1744                 return(0);
1745         }
1746 }
1747
1748 /*
1749  *      afs_vm_checkpage is called by the VM while collecting a run of
1750  *      pages on a pageout.  afs_vm_checkpage() is called for each page
1751  *      VM wants to write to disk.
1752  */
1753 afs_vm_checkpage(vp, args, pgindx, cur_data)
1754      struct vnode *vp;
1755      vfspage_t    *args;
1756      pgcnt_t      pgindx;
1757      int          cur_data;
1758 {
1759     fsdata_t *fsdata = (fsdata_t *)args->fs_data;
1760
1761     if (fsdata->remote_down) { /* never happens for AFS */
1762         /*
1763          * The remote system is down.
1764          */
1765         VASSERT(args->run == 0);
1766         return 1;
1767     }
1768     /*
1769      * A dirty page.  If we have not yet determined the file size and
1770      * other attributes that we need to write out pages (the block
1771      * size and ok_dbd_limit), get that information now.
1772      */
1773     if (fsdata->bsize == 0) {
1774         k_off_t isize;
1775         long bsize;
1776         struct vattr va;
1777         struct vnode *filevp;
1778         /*
1779          * Get the various attributes about the file.  Store them
1780          * in args for the next time around.
1781          */
1782         filevp = args->vp;
1783
1784         bsize = vtoblksz(filevp);
1785         args->maxpgs = (pgcnt_t)btop(bsize);
1786
1787         if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1788                 /*
1789                  * The VOP_GETATTR() failed.
1790                  * we are vhand, and this is a hard mount, we will
1791                  * skip dirty pages for a while and try again later.
1792                  */
1793                 if (args->vm_flags & PAGEOUT_VHAND)
1794                 {
1795                         VASSERT(args->run == 0);
1796                         return 1;
1797                 }
1798                 /*
1799                  * This is a "soft" mount, or some other error was
1800                  * returned from the server.  Mark this region
1801                  * as a zombie, and free this dirty page.
1802                  */
1803                 VM_ZOMBIE_OBJECT(args);
1804
1805                 /*
1806                  * The caller will see r_zomb and remove the page
1807                  * appropriately.
1808                  */
1809                 return(1);
1810         }
1811         isize = va.va_size;
1812         fsdata->isize = isize;
1813         fsdata->bsize = bsize;
1814         fsdata->remote = 1;
1815     }
1816     /*
1817     * See if the file has shrunk (this could have happened
1818     * asynchronously because of NFS or DUX).  If so, invalidate
1819     * all of the pages past the end of the file. This is only
1820     * needed for remote files, as local files are truncated
1821     * synchronously.
1822     */
1823
1824     if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
1825         /*
1826          * This page is past the end of the file.  Unlock this page
1827          * (region_trunc will throw it away) and then call region_trunc()
1828          * to invalidate all pages past the new end of the file.
1829          */
1830                 VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);
1831                 return(1);
1832     }
1833 #ifdef notdef
1834     if ((args->vm_flags & PAGEOUT_VHAND) &&
1835         (!(args->vm_flags & PAGEOUT_RESERVED)) &&
1836         (!(VM_IS_ZOMBIE(args)))) {
1837         VASSERT(args->run == 0);
1838         if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
1839             /*
1840              * Got enough memory to pageout.  Mark the fact that we did
1841              * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
1842              * later (in remote_pageout()).
1843              */
1844             args->vm_flags |= PAGEOUT_RESERVED;
1845         } else {
1846             /*
1847              * We do not have enough memory to do this pageout.  By
1848              * definition, we do not yet have a run, so we just unlock
1849              * this page and tell foreach_valid() to continue scanning.
1850              * If we come across another dirty page, we will try to
1851              * reserve memory again.  That is okay, in fact some memory
1852              * may have freed up (as earlier pageouts complete under
1853              * interrupt).
1854              */
1855             return 1;
1856         }
1857     }
1858 #endif
1859     return(0);
1860 }
1861
1862 afs_swapfs_len(bp)
1863      struct buf *bp;
1864 {
1865         long fs_bsize;
1866         long max_size;
1867         long bnrem;
1868
1869         fs_bsize = vtoblksz(bp->b_vp);
1870         /*
1871          * Check to see if we are starting mid block.  If so, then
1872          * we must return the remainder of the block or less depending
1873          * on the length.
1874          */
1875         bnrem = bp->b_offset % fs_bsize;
1876         if (bnrem) {
1877                 max_size = fs_bsize - bnrem;
1878         } else {
1879                 max_size = fs_bsize;
1880         }
1881
1882         if (bp->b_bcount > max_size) {
1883                 return(max_size);
1884         } else {
1885                 return(bp->b_bcount);
1886         }
1887 }
1888
1889 afs_mmap(vp, off, size_bytes, access)
1890      struct vnode *vp;
1891      u_int off;
1892      u_int size_bytes;
1893      int access;
1894 {
1895         long bsize = vtoblksz(vp);
1896
1897         if (bsize % NBPG != 0) {
1898                 return(EINVAL);
1899         }
1900
1901         return(0);
1902 }
1903
1904 afs_cachelimit(vp, len, location)
1905      struct vnode *vp;
1906      k_off_t len;
1907      int *location;
1908 {
1909         /*
1910          * Disk addresses are logical, not physical, so fragments are
1911          * transparent.
1912          */
1913         *location = btorp(len) + 1;
1914 }
1915
/*
 * Release hook for a vnode.  Nothing is done here; vp is unused and the
 * function always returns 0.
 */
int
afs_release(vp)
     struct vnode *vp;
{
    return 0;
}
1921
1922 int
1923 afs_unmap(vp,off, size_bytes,access)
1924      struct vnode *vp;
1925      u_int off;
1926      u_int size_bytes;
1927      int access;
1928 {
1929         return 0;
1930 }
1931
1932 int
1933 afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
1934      struct vnode *vp;
1935      preg_t *prp;
1936      int wrt;
1937      space_t space;
1938      caddr_t vaddr;
1939      pgcnt_t *rhead_cnt;
1940 {
1941         printf("afs_read_ahead returning 0 \n");
1942         return 0;
1943 }
1944
/*
 * Preallocation hook.  Nothing is preallocated: the function logs a
 * message, ignores all of its arguments and always returns ENOSPC.
 */
int
afs_prealloc(vp, size, ignore_minfree, reserved)
     struct vnode *vp;
     size_t size;
     int ignore_minfree;
     int reserved;
{
    printf("afs_prealloc returning ENOSPC\n");
    return (ENOSPC);
}
1955
1956 int
1957 afs_ioctl(vp, com, data, flag, cred)
1958         struct vnode *vp;
1959         int com;
1960         caddr_t data;
1961         int flag;
1962         struct ucred *cred;
1963 {
1964         int error;
1965         struct afs_ioctl afsioctl, *ai;
1966
1967         AFS_STATCNT(afs_ioctl);
1968
1969         /* The call must be a VICEIOCTL call */
1970         if (((com >> 8) & 0xff) == 'V') {
1971 #ifdef notdef
1972            /* AFS_COPYIN returns error 14. Copy data in instead */
1973            AFS_COPYIN(data, (caddr_t) &afsioctl, sizeof(afsioctl), error);
1974            if (error) return(error);
1975 #endif
1976            ai = (struct afs_ioctl *) data;
1977            afsioctl.in       = ai->in;
1978            afsioctl.out      = ai->out;
1979            afsioctl.in_size  = ai->in_size;
1980            afsioctl.out_size = ai->out_size;
1981            error = HandleIoctl((struct vcache *)vp, com, &afsioctl);
1982            return(error);
1983         }
1984         return(ENOTTY);
1985 }
1986
1987 #define roundtoint(x)   (((x) + (sizeof(int) - 1)) & ~(sizeof(int) - 1))
1988 #define reclen(dp)      roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
1989                                 2 * sizeof(u_short)))
1990
1991 int
1992 afs_readdir(vp, uiop, cred)
1993      struct vnode *vp;
1994      struct uio *uiop;
1995      struct ucred *cred;
1996 {
1997         struct uio auio;
1998         struct iovec aiov;
1999         caddr_t ibuf, obuf, ibufend, obufend;
2000         struct __dirent32 *idp;
2001         struct dirent *odp;
2002         int count, outcount;
2003         dir_off_t offset;
2004         uint64_t tmp_offset;
2005
2006         count = uiop->uio_resid;
2007         /* Allocate temporary space for format conversion */
2008         ibuf = kmem_alloc(2*count);     /* overkill - fix later */
2009         obuf = kmem_alloc(count + sizeof (struct dirent));
2010         aiov.iov_base = ibuf;
2011         aiov.iov_len = count;
2012         auio.uio_iov = &aiov;
2013         auio.uio_iovcnt = 1;
2014         offset = auio.uio_offset = uiop->uio_offset;
2015         auio.uio_seg = UIOSEG_KERNEL;
2016         auio.uio_resid = count;
2017         auio.uio_fpflags = 0;
2018
2019         u.u_error = mp_afs_readdir2(vp, &auio, cred);
2020         if (u.u_error)
2021                 goto out;
2022
2023         /* Convert entries from __dirent32 to dirent format */
2024
2025         for (idp = (struct __dirent32 *) ibuf, odp = (struct dirent *) obuf,
2026              ibufend = ibuf + (count - auio.uio_resid),
2027              obufend = obuf + count;
2028              (caddr_t)idp < ibufend;
2029              idp = (struct __dirent32 *) ((caddr_t) idp  + idp->__d_reclen),
2030              odp = (struct dirent *) ((caddr_t) odp  + odp->d_reclen)) {
2031                 odp->d_ino = idp->__d_ino;
2032                 odp->d_namlen = idp->__d_namlen;
2033                 (void) strcpy(odp->d_name, idp->__d_name);
2034                 odp->d_reclen = reclen(odp);
2035                 if ((caddr_t) odp + odp->d_reclen > obufend)
2036                         break;
2037                 /* record offset *after* we're sure to use this entry */
2038                 bcopy((char *)&idp->__d_off, (char *)&tmp_offset, sizeof tmp_offset);
2039                 offset = tmp_offset;
2040         }
2041
2042         outcount = (caddr_t) odp - obuf;
2043         AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2044         if (u.u_error)
2045                 goto out;
2046         uiop->uio_offset = offset;
2047 out:
2048         kmem_free(ibuf, count);
2049         kmem_free(obuf, count + sizeof (struct dirent));
2050         return u.u_error;
2051 }
2052
2053
2054 #define roundtolong(x)   (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
2055 #define reclen_dirent64(dp)      roundtolong(((dp)->__d_namlen + 1 + (2*sizeof(u_long)) +\
2056                                 2 * sizeof(u_short)))
2057
2058 int
2059 afs_readdir3(vp, uiop, cred)
2060      struct vnode *vp;
2061      struct uio *uiop;
2062      struct ucred *cred;
2063 {
2064         struct uio auio;
2065         struct iovec aiov;
2066         caddr_t ibuf, obuf, ibufend, obufend;
2067         struct __dirent32 *idp;
2068         struct __dirent64 *odp;
2069         int count, outcount;
2070         dir_off_t offset;
2071
2072         count = uiop->uio_resid;
2073         /* Allocate temporary space for format conversion */
2074         ibuf = kmem_alloc(2*count);     /* overkill - fix later */
2075         obuf = kmem_alloc(count + sizeof (struct __dirent64));
2076         aiov.iov_base = ibuf;
2077         aiov.iov_len = count;
2078         auio.uio_iov = &aiov;
2079         auio.uio_iovcnt = 1;
2080         offset = auio.uio_offset = uiop->uio_offset;
2081         auio.uio_seg = UIOSEG_KERNEL;
2082         auio.uio_resid = count;
2083         auio.uio_fpflags = 0;
2084
2085         u.u_error = mp_afs_readdir2(vp, &auio, cred);
2086         if (u.u_error)
2087                 goto out;
2088
2089         /* Convert entries from __dirent32 to __dirent64 format */
2090
2091         for (idp = (struct __dirent32 *) ibuf, odp = (struct __dirent64 *) obuf,
2092              ibufend = ibuf + (count - auio.uio_resid),
2093              obufend = obuf + count;
2094              (caddr_t)idp < ibufend;
2095              idp = (struct __dirent32 *) ((caddr_t) idp  + idp->__d_reclen),
2096              odp = (struct __dirent64 *) ((caddr_t) odp  + odp->__d_reclen)) {
2097                 bcopy((char *)&idp->__d_off, (char *)&odp->__d_off, sizeof odp->__d_off);
2098                 odp->__d_ino = idp->__d_ino;
2099                 odp->__d_namlen = idp->__d_namlen;
2100                 (void) strcpy(odp->__d_name, idp->__d_name);
2101                 odp->__d_reclen = reclen_dirent64(odp);
2102                 if ((caddr_t) odp + odp->__d_reclen > obufend)
2103                         break;
2104                 /* record offset *after* we're sure to use this entry */
2105                 offset = odp->__d_off;
2106         }
2107
2108         outcount = (caddr_t) odp - obuf;
2109         AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2110         if (u.u_error)
2111                 goto out;
2112         uiop->uio_offset = offset;
2113 out:
2114         kmem_free(ibuf, count);
2115         kmem_free(obuf, count + sizeof (struct __dirent64));
2116         return u.u_error;
2117 }
2118
2119 #define AFS_SV_SEMA_HASH 1
2120 #define AFS_SV_SEMA_HASH_DEBUG 0
2121
2122 #if AFS_SV_SEMA_HASH
/* This portion of the code was originally used to implement
 * thread specific storage for the semaphore save area. Later,
 * some spare fields in the proc structure were used for saving
 * the semaphores instead, and for a time this portion of the code
 * was not used.  It has since been revived; see the note on
 * multithreaded processes below.
 */
2129
2130 /* This portion of the code implements thread specific information.
2131  * The thread id is passed in as the key. The semaphore saved area
2132  * is hashed on this key.
2133  */
2134
/* Why is this hash table required?
 * The AFS code is written in such a way that a GLOCK() is done in
 * one function and the GUNLOCK() is done in another function further
 * down the call chain. The GLOCK() call has to save the current
 * semaphore status before acquiring afs_global_sema. The GUNLOCK()
 * has to release afs_global_sema and reacquire the semaphore status
 * that existed before the corresponding GLOCK(). If GLOCK() and
 * GUNLOCK() were called in the same function, the GLOCK() call could
 * have stored the saved semaphore status in a local variable and the
 * corresponding GUNLOCK() call could have restored the original
 * status from this local variable. But this is not the case with
 * AFS code. Hence, we have to implement a thread specific semaphore
 * save area. This is implemented as a hash table. The key is the
 * thread id.
 */
2150
2151 /* In order for multithreaded processes to work, the sv_sema structures
2152  * must be saved on a per-thread basis, not a per-process basis.  There
2153  * is no per-thread storage available to hijack in the OS per-thread
2154  * data structures (e.g. struct user) so we revive this code.
2155  * I removed the upper limit on the memory consumption since we don't
2156  * know how many threads there will be.  Now the code first checks the
2157  * freeList.  If that fails it then tries garbage collecting.  If that
2158  * doesn't free up anything then it allocs what it needs.
2159  */
2160
2161 #define ELEMENT         sv_sema_t
2162 #define KEY             tid_t
2163 #define Hash(xx)        (  (xx) % sizeOfHashTable )
2164 #define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
2165 #define hashLock(xx)    MP_PSEMA(&xx)
2166 #define hashUnlock(xx)  MP_VSEMA(&xx)
2167
2168 typedef struct elem
2169 {
2170         struct elem*    next;
2171         ELEMENT         element;
2172         KEY             key;
2173         int             refCnt;
2174 } Element;
2175
2176 typedef struct bucket
2177 {
2178         sema_t          lock;
2179         Element*        element;
2180 } Bucket;
2181
2182 static int      sizeOfHashTable;
2183 static Bucket*  hashTable;
2184
2185 static int      currentSize=0;
2186 static Element* freeList;               /* free list */
2187
2188 #pragma align 64
2189 static sema_t  afsHashLock = { 0 };     /* global lock for hash table */
2190
2191 static void afsHashGarbageCollect();
2192
2193 /*
2194 ** The global lock protects the global data structures,
2195 ** e.g. freeList and currentSize.
2196 ** The bucket lock protects the link list hanging off that bucket.
2197 ** The lock hierarchy : one can obtain the bucket lock while holding
2198 ** the global lock, but not vice versa.
2199 */
2200
2201
2202 void
2203 afsHash(int nbuckets)           /* allocate the hash table */
2204 {
2205         int i;
2206
2207 #if AFS_SV_SEMA_HASH_DEBUG
2208 printf("afsHash: enter\n");
2209 #endif
2210
2211         sizeOfHashTable = nbuckets;
2212         currentSize     = nbuckets * sizeof(Bucket);
2213
2214         if ( hashTable )
2215                 osi_Panic("afs: SEMA Hashtable already created\n");
2216
2217         hashTable       = (Bucket *)AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
2218         if ( ! hashTable )
2219                 osi_Panic("afs: cannot create SEMA Hashtable\n");
2220
2221         /* initialize the hash table and associated locks */
2222         bzero((char *)hashTable, sizeOfHashTable * sizeof(Bucket ));
2223         for ( i=0;i < sizeOfHashTable; i ++)
2224                 hashLockInit( hashTable[i].lock);
2225         hashLockInit(afsHashLock);
2226
2227 #if AFS_SV_SEMA_HASH_DEBUG
2228 printf("afsHash: exit\n");
2229 #endif
2230 }
2231
2232 ELEMENT*
2233 afsHashInsertFind(KEY key)
2234 {
2235         int             index;
2236         Element*        ptr;
2237
2238 #if AFS_SV_SEMA_HASH_DEBUG
2239 printf("afsHashInsertFind: %d\n", key);
2240 #endif
2241         if ( ! hashTable )
2242                 osi_Panic("afs: afsHashInsertFind: no hashTable\n");
2243
2244         index   = Hash(key);            /* get bucket number */
2245         hashLock(hashTable[index].lock); /* lock this bucket */
2246         ptr     = hashTable[index].element;
2247
2248         /* if it is already there */
2249         while ( ptr ) {
2250                 if ( ptr->key == key ) {
2251                         ptr->refCnt++;  /* hold it */
2252                         hashUnlock(hashTable[index].lock);
2253 #if AFS_SV_SEMA_HASH_DEBUG
2254 printf("afsHashInsertFind: %d FOUND\n", key);
2255 #endif
2256                         return &(ptr->element);
2257                 } else {
2258                         ptr = ptr->next;
2259                 }
2260         }
2261
2262         hashUnlock(hashTable[index].lock);
2263
2264         /*  if something exists in the freeList, take it from there */
2265         ptr = NULL;
2266         hashLock(afsHashLock);
2267
2268         if ( freeList ) {
2269                 ptr = freeList;                 /* reuse entry */
2270                 freeList = freeList->next;
2271         } else {
2272                 afsHashGarbageCollect();        /* afsHashLock locked */
2273                 if ( freeList ) {
2274                         ptr = freeList;                 /* reuse entry */
2275                         freeList = freeList->next;
2276                 } else {
2277                         ptr = (Element *)AFS_KALLOC(sizeof(Element));
2278                 }
2279         }
2280
2281         currentSize += sizeof(Element); /* update memory used */
2282         hashUnlock(afsHashLock);
2283
2284         if ( ! ptr )
2285                 osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
2286                                         /* create new entry */
2287         ptr->key     = key;
2288         bzero((char *)&ptr->element, sizeof(ptr->element));
2289         ptr->refCnt  = 1;               /* this guy */
2290
2291                                         /* insert new entry in bucket */
2292         hashLock(hashTable[index].lock); /* lock this bucket */
2293         ptr->next    =  hashTable[index].element;
2294         hashTable[index].element = ptr;
2295         hashUnlock(hashTable[index].lock);
2296
2297 #if AFS_SV_SEMA_HASH_DEBUG
2298 printf("afsHashInsertFind: %d MADE\n", key);
2299 #endif
2300
2301         return &(ptr->element);
2302 }
2303
2304 ELEMENT*
2305 afsHashFind(KEY key)
2306 {
2307         int             index;
2308         Element*        ptr;
2309
2310 #if AFS_SV_SEMA_HASH_DEBUG
2311 printf("afsHashFind: %d\n", key);
2312 #endif
2313         if ( ! hashTable )
2314                 osi_Panic("afs: afsHashFind: no hashTable\n");
2315
2316         index   = Hash(key);            /* get bucket number */
2317         hashLock(hashTable[index].lock); /* lock this bucket */
2318         ptr     = hashTable[index].element;
2319
2320         /* it should be in the hash table */
2321         while ( ptr ) {
2322                 if ( ptr->key == key )
2323                 {
2324                         if(ptr->refCnt <= 0 )
2325                          osi_Panic("afs: SEMA HashTable entry already released\n");
2326                         hashUnlock(hashTable[index].lock);
2327 #if AFS_SV_SEMA_HASH_DEBUG
2328 printf("afsHashFind: %d FOUND\n", key);
2329 #endif
2330                         return &(ptr->element);
2331                 } else {
2332                         ptr = ptr->next;
2333                 }
2334         }
2335
2336         hashUnlock(hashTable[index].lock);
2337         /* it better be in the hash table */
2338         osi_Panic("afs: SEMA HashTable wants non-existent entry \n");
2339         return 0;
2340 }
2341
2342 void
2343 afsHashRelease(KEY key)
2344 {
2345         int             index;
2346         Element*        ptr;
2347
2348 #if AFS_SV_SEMA_HASH_DEBUG
2349 printf("afsHashRelease: %d\n", key);
2350 #endif
2351         if ( ! hashTable )
2352                 osi_Panic("afs: afsHashRelease: no hashTable\n");
2353
2354         index   = Hash(key);            /* get bucket number */
2355         hashLock(hashTable[index].lock); /* lock this bucket */
2356         ptr     = hashTable[index].element;
2357
2358         /* it should be in the hash table */
2359         while ( ptr ) {
2360                 if ( ptr->key == key ) {
2361                         if(ptr->refCnt <= 0 )
2362                          osi_Panic("afs: SEMA HashTable entry already released\n");
2363                         ptr->refCnt--;  /* release this guy */
2364                         hashUnlock(hashTable[index].lock);
2365 #if AFS_SV_SEMA_HASH_DEBUG
2366 printf("afsHashRelease: %d FOUND\n", key);
2367 #endif
2368                         return;
2369                 } else {
2370                         ptr = ptr->next;
2371                 }
2372         }
2373
2374         hashUnlock(hashTable[index].lock);
2375         /* it better be in the hash table */
2376         osi_Panic("afs: SEMA HashTable deleting non-existent entry \n");
2377 }
2378
2379 /* this should be called with afsHashLock WRITE locked */
2380 static void
2381 afsHashGarbageCollect()
2382 {
2383         int             index;
2384         Element*        ptr;
2385         int             foundFlag=0;
2386
2387         if ( ! hashTable )
2388                 osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");
2389
2390         for ( index = 0; index < sizeOfHashTable; index++) {
2391                 hashLock(hashTable[index].lock);
2392                 ptr = hashTable[index].element; /* pick up bucket */
2393
2394                 while ( ptr && !ptr->refCnt ) {
2395                         /* insert this element into free list */
2396                         Element*        temp;
2397                         temp            = ptr->next;
2398                         ptr->next       = freeList;
2399                         freeList        = ptr;
2400
2401                         foundFlag       = 1;    /* found at least one */
2402                         currentSize     -= sizeof(Element);
2403                         ptr             = temp;
2404                 }
2405                 hashTable[index].element = ptr;
2406
2407                 /* scan thru the remaining list */
2408                 if ( ptr ) {
2409                         while ( ptr->next ) {
2410                                 if ( ptr->next->refCnt == 0 ) {
2411                                         /* collect this element */
2412                                         Element*        temp;
2413                                         temp            = ptr->next;
2414                                         ptr->next       = ptr->next->next;
2415                                         temp->next      = freeList;
2416                                         freeList        = temp;
2417                                         foundFlag       = 1;
2418                                         currentSize     -= sizeof(Element);
2419                                 } else {
2420                                         ptr = ptr->next;
2421                                 }
2422                         }
2423                 }
2424                 hashUnlock(hashTable[index].lock);
2425         }
2426 #if 0
2427         if(!foundFlag)
2428                 osi_Panic("afs: SEMA HashTable full\n");
2429 #endif
2430 }
2431
2432 #endif /* AFS_SV_SEMA_HASH */
2433
2434
2435 afs_hp_strategy(bp)
2436     register struct buf *bp;
2437 {
2438     register afs_int32 code;
2439     struct uio tuio;
2440     struct iovec tiovec[1];
2441     extern caddr_t hdl_kmap_bp();
2442     register struct kthread *t = u.u_kthreadp;
2443
2444     AFS_STATCNT(afs_hp_strategy);
2445     /*
2446      * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
2447      * the I/O.  We must save and restore the count because pageiodone()
2448      * uses b_bcount to determine how many pages to unlock.
2449      *
2450      * Remap the entire range.
2451      */
2452     hdl_kmap_bp(bp);
2453
2454     AFS_GLOCK();
2455     afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER,
2456                 bp->b_vp, ICL_TYPE_LONG,
2457                 (int)bp->b_blkno*DEV_BSIZE, ICL_TYPE_LONG, bp->b_bcount,
2458                 ICL_TYPE_LONG,0);
2459
2460         /* Set up the uio structure */
2461         tuio.afsio_iov = tiovec;
2462         tuio.afsio_iovcnt = 1;
2463         tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
2464         tuio.afsio_seg = AFS_UIOSYS;
2465         tuio.afsio_resid = bp->b_bcount;
2466         tuio.uio_fpflags = 0;
2467         tiovec[0].iov_base = bp->b_un.b_addr;
2468         tiovec[0].iov_len = bp->b_bcount;
2469
2470         /* Do the I/O */
2471         if ((bp->b_flags & B_READ) == B_READ)
2472         {
2473             /* read b_bcount bytes into kernel address b_un.b_addr
2474                starting at byte DEV_BSIZE * b_blkno. Bzero anything
2475                we can't read, and finally call iodone(bp).  File is
2476                in bp->b_vp. Credentials are from u area??
2477             */
2478            code = afs_rdwr((struct vcache *)bp->b_vp,&tuio,UIO_READ,0,kt_cred(t));
2479            if (code == 0)
2480               if (tuio.afsio_resid > 0)
2481               {
2482                  privlbzero(bvtospace(bp, bp->b_un.b_addr),
2483                         bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
2484                         (size_t) tuio.afsio_resid);
2485
2486               }
2487         } else
2488            code = afs_rdwr((struct vcache *)bp->b_vp,&tuio,UIO_WRITE,0,kt_cred(t));
2489
2490     /* Remap back to the user's space */
2491     hdl_remap_bp(bp);
2492
2493     AFS_GUNLOCK();
2494
2495     iodone(bp);
2496     return code;
2497 }
2498
2499 afs_pathconf(vp, name, resultp, cred)
2500 struct vnode *vp;
2501 int     name;
2502 int     *resultp;
2503 struct ucred *cred;     /* unused */
2504 {
2505         switch(name)
2506         {
2507         case _PC_LINK_MAX:      /* Maximum number of links to a file */
2508                 *resultp = 255; /* an unsigned short on the fileserver*/
2509                 break;          /* a unsigned char in the client.... */
2510
2511         case _PC_NAME_MAX:      /* Max length of file name */
2512                 *resultp = 255;
2513                 break;
2514
2515         case _PC_PATH_MAX:      /* Maximum length of Path Name */
2516                 *resultp = 1024;
2517                 break;
2518
2519         case _PC_PIPE_BUF:      /* Max atomic write to pipe.  See fifo_vnops */
2520         case _PC_CHOWN_RESTRICTED:      /* Anybody can chown? */
2521         case _PC_NO_TRUNC:      /* No file name truncation on overflow? */
2522                 u.u_error = EOPNOTSUPP;
2523                 return(EOPNOTSUPP);
2524                 break;
2525
2526         case _PC_MAX_CANON:     /* TTY buffer size for canonical input */
2527                 /* need more work here for pty, ite buffer size, if differ */
2528                 if (vp->v_type != VCHR) {
2529                         u.u_error = EINVAL;
2530                         return(EINVAL);
2531                 }
2532                 *resultp = CANBSIZ;     /*for tty*/
2533                 break;
2534
2535         case _PC_MAX_INPUT:
2536                 /* need more work here for pty, ite buffer size, if differ */
2537                 if (vp->v_type != VCHR) {       /* TTY buffer size */
2538                         u.u_error = EINVAL;
2539                         return(EINVAL);
2540                 }
2541                 *resultp = TTYHOG;      /*for tty*/
2542                 break;
2543
2544         case _PC_VDISABLE:
2545                 /* Terminal special characters can be disabled? */
2546                 if (vp->v_type != VCHR) {
2547                         u.u_error = EINVAL;
2548                         return(EINVAL);
2549                 }
2550                 *resultp = 1;
2551                 break;
2552
2553         case _PC_SYNC_IO:
2554                 if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2555                         *resultp = -1;
2556                         return EINVAL;
2557                 }
2558                 *resultp = 1; /* Synchronized IO supported for this file */
2559                 break;
2560
2561         case _PC_FILESIZEBITS:
2562                 if (vp->v_type != VDIR)
2563                         return(EINVAL);
2564                 *resultp = MAX_SMALL_FILE_BITS;
2565                 break;
2566
2567         default:
2568                 return(EINVAL);
2569         }
2570
2571         return(0);
2572 }