src/afs/HPUX/osi_vnodeops.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 /* This is a placeholder for routines unique to the port of AFS to hp-ux*/
  11
  12 #include <afsconfig.h>
  13 #include "afs/param.h"
  14
  15 RCSID
  16     ("$Header$");
  17
  18 #include "afs/sysincludes.h"    /* Standard vendor system headers */
  19 #include "afsincludes.h"        /* Afs-based standard headers */
  20 #include "afs/afs_stats.h"      /* statistics stuff */
  21
  22 #include <sys/uio.h>
  23 #include <sys/vfs.h>
  24 #include <sys/mount.h>
  25 #include <sys/vnode.h>
  26 #include <sys/pathname.h>
  27
  28 extern struct vfsops Afs_vfsops;
  29 extern int afs_hp_strategy();
  30 extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
  31 extern int afs_pagein();
  32 extern int afs_pageout();
  33 extern int afs_ioctl();
  34 extern int afs_prealloc();
  35 extern int afs_mapdbd();
  36 extern int afs_mmap();
  37 extern int afs_cachelimit();
  38 extern int afs_vm_checkpage();
  39 extern int afs_vm_fscontiguous();
  40 extern int afs_vm_stopio();
  41 extern int afs_read_ahead();
  42 extern int afs_unmap();
  43 extern int afs_release();
  44 extern int afs_swapfs_len();
  45 extern int afs_readdir2();
  46 extern int afs_readdir();
  47 extern int afs_readdir3();
  48 extern int afs_pathconf();
  49 extern int afs_close();
  50
/* Expands to the vfs_bsize field of the vfs that vnode vp belongs to. */
#define vtoblksz(vp)    ((vp)->v_vfsp->vfs_bsize)
  52
#if defined(AFS_HPUX110_ENV)
/*
 * When AFS_HPUX110_ENV is defined we no longer need to lock the VM empire,
 * or at least that is what is claimed, so the vmemp_ routines are turned
 * into no-ops: vmemp_lockx() and vmemp_unlockx() expand to nothing and
 * vmemp_returnx(a) becomes a plain return(a).
 * This needs to be looked at more closely.
 */
#define vmemp_lockx()
#undef  vmemp_returnx
#define vmemp_returnx(a) return(a)
#define vmemp_unlockx()
#endif
  64
  65 #if !defined(AFS_HPUX110_ENV)
  66 /*
  67  * Copy an mbuf to the contiguous area pointed to by cp.
  68  * Skip <off> bytes and copy <len> bytes.
  69  * Returns the number of bytes not transferred.
  70  * The mbuf is NOT changed.
  71  */
  72 int
  73 m_cpytoc(m, off, len, cp)
  74      register struct mbuf *m;
  75      register int off, len;
  76      register caddr_t cp;
  77 {
  78     register int ml;
  79
  80     if (m == NULL || off < 0 || len < 0 || cp == NULL)
  81         osi_Panic("m_cpytoc");
  82     while (off && m)
  83         if (m->m_len <= off) {
  84             off -= m->m_len;
  85             m = m->m_next;
  86             continue;
  87         } else
  88             break;
  89     if (m == NULL)
  90         return (len);
  91
  92     ml = MIN(len, m->m_len - off);
  93     memcpy(cp, mtod(m, caddr_t) + off, (u_int) ml);
  94     cp += ml;
  95     len -= ml;
  96     m = m->m_next;
  97
  98     while (len && m) {
  99         ml = m->m_len;
 100         memcpy(cp, mtod(m, caddr_t), (u_int) ml);
 101         cp += ml;
 102         len -= ml;
 103         m = m->m_next;
 104     }
 105
 106     return (len);
 107 }
 108 #endif
 109
/*
 * Note that the standard Sun vnode interface doesn't have a vop_lockf(), so
 * this code is totally new.  This came about because HP-UX has lockf()
 * implemented as a system call while Sun has it implemented as a library
 * routine (apparently).  To handle this, we have to translate the lockf()
 * request into an fcntl()-style request, and then translate the results
 * back if necessary.  The translated request is handed to mp_afs_lockctl().
 */
 118 afs_lockf(vp, flag, len, cred, fp, LB, UB)
 119      struct vnode *vp;
 120      int flag;
 121      struct AFS_UCRED *cred;
 122      struct file *fp;
 123      k_off_t len, LB, UB;
 124 {
 125     /*for now, just pretend it works */
 126     struct k_flock flock;
 127     int cmd, code;
 128
 129     /*
 130      * Create a flock structure and translate the lockf request
 131      * into an appropriate looking fcntl() type request for afs_lockctl()
 132      */
 133     flock.l_whence = 0;
 134     flock.l_len = len;
 135     flock.l_start = fp->f_offset;
 136     /* convert negative lengths to positive */
 137     if (flock.l_len < 0) {
 138         flock.l_start += flock.l_len;
 139         flock.l_len = -(flock.l_len);
 140     }
 141     /*
 142      * Adjust values to look like fcntl() requests.
 143      * All locks are write locks, only F_LOCK requests
 144      * are blocking.  F_TEST has to be translated into
 145      * a get lock and then back again.
 146      */
 147     flock.l_type = F_WRLCK;
 148     cmd = F_SETLK;
 149     switch (flag) {
 150     case F_ULOCK:
 151         flock.l_type = F_UNLCK;
 152         break;
 153     case F_LOCK:
 154         cmd = F_SETLKW;
 155         break;
 156     case F_TEST:
 157         cmd = F_GETLK;
 158         break;
 159     }
 160     u.u_error = mp_afs_lockctl(vp, &flock, cmd, fp->f_cred);
 161     if (u.u_error) {
 162         return (u.u_error);     /* some other error code */
 163     }
 164     /*
 165      * if request is F_TEST, and GETLK changed
 166      * the lock type to ULOCK, then return 0, else
 167      * set errno to EACCESS and return.
 168      */
 169     if (flag == F_TEST && flock.l_type != F_UNLCK) {
 170         u.u_error = EACCES;
 171         return (u.u_error);
 172     }
 173     return (0);
 174 }
 175
 176
 177 #if defined(AFS_HPUX1122_ENV)
 178 #include "machine/vm/vmparam.h"
 179 #else
 180 #include "../machine/vmparam.h" /* For KERNELSPACE */
 181 #endif
 182 #include "h/debug.h"
 183 #include "h/types.h"
 184 #include "h/param.h"
 185 #include "h/vmmac.h"
 186 #include "h/time.h"
 187 #include "ufs/inode.h"
 188 #include "ufs/fs.h"
 189 #include "h/dbd.h"
 190 #include "h/vfd.h"
 191 #include "h/region.h"
 192 #include "h/pregion.h"
 193 #include "h/vmmeter.h"
 194 #include "h/user.h"
 195 #include "h/sysinfo.h"
 196 #include "h/pfdat.h"
 197 #include "h/tuneable.h"
 198 #include "h/buf.h"
 199 #include "netinet/in.h"
 200
 201 /* a freelist of one */
 202 struct buf *afs_bread_freebp = 0;
 203
 204 /*
 205  *  Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
 206  *  Thus we can use fake bufs (ie not from the real buffer pool).
 207  */
 208 afs_bread(vp, lbn, bpp)
 209      struct vnode *vp;
 210      daddr_t lbn;
 211      struct buf **bpp;
 212 {
 213     int offset, fsbsize, error;
 214     struct buf *bp;
 215     struct iovec iov;
 216     struct uio uio;
 217
 218     AFS_STATCNT(afs_bread);
 219     fsbsize = vp->v_vfsp->vfs_bsize;
 220     offset = lbn * fsbsize;
 221     if (afs_bread_freebp) {
 222         bp = afs_bread_freebp;
 223         afs_bread_freebp = 0;
 224     } else {
 225         bp = (struct buf *)AFS_KALLOC(sizeof(*bp));
 226         bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
 227     }
 228
 229     iov.iov_base = bp->b_un.b_addr;
 230     iov.iov_len = fsbsize;
 231     uio.afsio_iov = &iov;
 232     uio.afsio_iovcnt = 1;
 233     uio.afsio_seg = AFS_UIOSYS;
 234     uio.afsio_offset = offset;
 235     uio.afsio_resid = fsbsize;
 236     uio.uio_fpflags = 0;
 237     *bpp = 0;
 238
 239     error = afs_read(VTOAFS(vp), &uio, p_cred(u.u_procp), lbn, bpp, 0);
 240     if (error) {
 241         afs_bread_freebp = bp;
 242         return error;
 243     }
 244     if (*bpp) {
 245         afs_bread_freebp = bp;
 246     } else {
 247         *(struct buf **)&bp->b_vp = bp; /* mark as fake */
 248         *bpp = bp;
 249     }
 250     return 0;
 251 }
 252
 253 afs_brelse(vp, bp)
 254      struct vnode *vp;
 255      struct buf *bp;
 256 {
 257     AFS_STATCNT(afs_brelse);
 258
 259     if ((struct buf *)bp->b_vp != bp) { /* not fake */
 260         ufs_brelse(bp->b_vp, bp);
 261     } else if (afs_bread_freebp) {
 262         AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
 263         AFS_KFREE(bp, sizeof(*bp));
 264     } else {
 265         afs_bread_freebp = bp;
 266     }
 267 }
 268
 269
 270 afs_bmap(avc, abn, anvp, anbn)
 271      register struct vcache *avc;
 272      afs_int32 abn, *anbn;
 273      struct vcache **anvp;
 274 {
 275     AFS_STATCNT(afs_bmap);
 276     if (anvp)
 277         *anvp = avc;
 278     if (anbn)
 279         *anbn = abn * (8192 / DEV_BSIZE);       /* in 512 byte units */
 280     return 0;
 281 }
 282
 283 afs_inactive(avc, acred)
 284      register struct vcache *avc;
 285      struct AFS_UCRED *acred;
 286 {
 287     struct vnode *vp = AFSTOV(avc);
 288     ulong_t context;
 289     lock_t *sv_lock;
 290     if (afs_shuttingdown)
 291         return;
 292
 293     /*
 294      * In Solaris and HPUX s800 and HP-UX10.0 they actually call us with
 295      * v_count 1 on last reference!
 296      */
 297     MP_H_SPINLOCK_USAV(vn_h_sl_pool, vp, &sv_lock, &context);
 298     if (avc->vrefCount < 1)
 299         osi_Panic("afs_inactive : v_count < 1\n");
 300
 301     /*
 302      * If more than 1 don't unmap the vnode but do decrement the ref count
 303      */
 304     vp->v_count--;
 305     if (vp->v_count > 0) {
 306         MP_SPINUNLOCK_USAV(sv_lock, context);
 307         return 0;
 308     }
 309     MP_SPINUNLOCK_USAV(sv_lock, context);
 310     afs_InactiveVCache(avc, acred);
 311     return 0;
 312 }
 313
 314
 315 int
 316 mp_afs_open(register struct vnode **avcp, int aflags, struct AFS_UCRED *acred)
 317 {
 318     register int code;
 319
 320     AFS_GLOCK();
 321     code = afs_open(avcp, aflags, acred);
 322     AFS_GUNLOCK();
 323     return (code);
 324 }
 325
 326 int
 327 mp_afs_close(register struct vnode *avcp, int aflags, struct AFS_UCRED *acred)
 328 {
 329     register int code;
 330
 331     AFS_GLOCK();
 332     code = afs_close(avcp, aflags, acred);
 333     AFS_GUNLOCK();
 334     return (code);
 335 }
 336
 337 int
 338 mp_afs_rdwr(register struct vnode *avcp, struct uio *uio, enum uio_rw arw,
 339             int aio, struct AFS_UCRED *acred)
 340 {
 341     register int code;
 342     long save_resid;
 343
 344     AFS_GLOCK();
 345     save_resid = uio->uio_resid;
 346     code = afs_rdwr(avcp, uio, arw, aio, acred);
 347     if (arw == UIO_WRITE && code == ENOSPC) {
 348         /* HP clears code if any data written. */
 349         uio->uio_resid = save_resid;
 350     }
 351     AFS_GUNLOCK();
 352     return (code);
 353 }
 354
 355 int
 356 mp_afs_getattr(register struct vnode *avcp, struct vattr *attrs,
 357                struct AFS_UCRED *acred, enum vsync unused1)
 358 {
 359     register int code;
 360
 361     AFS_GLOCK();
 362     code = afs_getattr(avcp, attrs, acred);
 363     AFS_GUNLOCK();
 364     return (code);
 365 }
 366
 367 int
 368 mp_afs_setattr(register struct vnode *avcp, register struct vattr *attrs,
 369                struct AFS_UCRED *acred, int unused1)
 370 {
 371     register int code;
 372
 373     AFS_GLOCK();
 374     code = afs_setattr(avcp, attrs, acred);
 375     AFS_GUNLOCK();
 376     return (code);
 377 }
 378
 379 int
 380 mp_afs_access(register struct vnode *avcp, int mode, struct AFS_UCRED *acred)
 381 {
 382     register int code;
 383
 384     AFS_GLOCK();
 385     code = afs_access(avcp, mode, acred);
 386     AFS_GUNLOCK();
 387     return (code);
 388 }
 389
 390 int
 391 mp_afs_lookup(register struct vnode *adp, char *aname,
 392               register struct vnode **avcp, struct AFS_UCRED *acred,
 393               struct vnode *unused1)
 394 {
 395     register int code;
 396
 397     AFS_GLOCK();
 398     code = afs_lookup(adp, aname, avcp, acred);
 399     AFS_GUNLOCK();
 400     return (code);
 401 }
 402
 403 int
 404 mp_afs_create(register struct vnode *adp, char *aname, struct vattr *attrs,
 405               enum vcexcl aexcl, int amode, struct vnode **avcp,
 406               struct AFS_UCRED *acred)
 407 {
 408     register int code;
 409
 410     AFS_GLOCK();
 411     code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);
 412     AFS_GUNLOCK();
 413     return (code);
 414 }
 415
 416
 417 int
 418 mp_afs_remove(register struct vnode *adp, char *aname,
 419               struct AFS_UCRED *acred)
 420 {
 421     register int code;
 422
 423     AFS_GLOCK();
 424     code = afs_remove(adp, aname, acred);
 425     AFS_GUNLOCK();
 426     return (code);
 427 }
 428
 429 int
 430 mp_afs_link(register struct vnode *avc, register struct vnode *adp,
 431             char *aname, struct AFS_UCRED *acred)
 432 {
 433     register int code;
 434
 435     AFS_GLOCK();
 436     code = afs_link(avc, adp, aname, acred);
 437     AFS_GUNLOCK();
 438     return (code);
 439 }
 440
 441 int
 442 mp_afs_rename(register struct vnode *aodp, char *aname1,
 443               register struct vnode *andp, char *aname2,
 444               struct AFS_UCRED *acred)
 445 {
 446     register int code;
 447
 448     AFS_GLOCK();
 449     code = afs_rename(aodp, aname1, andp, aname2, acred);
 450     AFS_GUNLOCK();
 451     return (code);
 452 }
 453
 454 int
 455 mp_afs_mkdir(register struct vnode *adp, char *aname, struct vattr *attrs,
 456              register struct vnode **avcp, struct AFS_UCRED *acred)
 457 {
 458     register int code;
 459
 460     AFS_GLOCK();
 461     code = afs_mkdir(adp, aname, attrs, avcp, acred);
 462     AFS_GUNLOCK();
 463     return (code);
 464 }
 465
 466
 467 int
 468 mp_afs_rmdir(register struct vnode *adp, char *aname, struct AFS_UCRED *acred)
 469 {
 470     register int code;
 471
 472     AFS_GLOCK();
 473     code = afs_rmdir(adp, aname, acred);
 474     AFS_GUNLOCK();
 475     return (code);
 476 }
 477
 478
 479 int
 480 mp_afs_readdir(register struct vnode *avc, struct uio *auio,
 481                struct AFS_UCRED *acred)
 482 {
 483     register int code;
 484
 485     AFS_GLOCK();
 486     code = afs_readdir(avc, auio, acred);
 487     AFS_GUNLOCK();
 488     return (code);
 489 }
 490
 491 int
 492 mp_afs_symlink(register struct vnode *adp, char *aname, struct vattr *attrs,
 493                char *atargetName, struct AFS_UCRED *acred)
 494 {
 495     register int code;
 496
 497     AFS_GLOCK();
 498     code = afs_symlink(adp, aname, attrs, atargetName, acred);
 499     AFS_GUNLOCK();
 500     return (code);
 501 }
 502
 503
 504 int
 505 mp_afs_readlink(register struct vnode *avc, struct uio *auio,
 506                 struct AFS_UCRED *acred)
 507 {
 508     register int code;
 509
 510     AFS_GLOCK();
 511     code = afs_readlink(avc, auio, acred);
 512     AFS_GUNLOCK();
 513     return (code);
 514 }
 515
 516 int
 517 mp_afs_fsync(register struct vnode *avc, struct AFS_UCRED *acred, int unused1)
 518 {
 519     register int code;
 520
 521     AFS_GLOCK();
 522     code = afs_fsync(avc, acred);
 523     AFS_GUNLOCK();
 524     return (code);
 525 }
 526
 527 int
 528 mp_afs_bread(register struct vnode *avc, daddr_t lbn, struct buf **bpp,
 529              struct vattr *unused1, struct ucred *unused2)
 530 {
 531     register int code;
 532
 533     AFS_GLOCK();
 534     code = afs_bread(avc, lbn, bpp);
 535     AFS_GUNLOCK();
 536     return (code);
 537 }
 538
 539 int
 540 mp_afs_brelse(register struct vnode *avc, struct buf *bp)
 541 {
 542     register int code;
 543
 544     AFS_GLOCK();
 545     code = afs_brelse(avc, bp);
 546     AFS_GUNLOCK();
 547     return (code);
 548 }
 549
 550
 551 int
 552 mp_afs_inactive(register struct vnode *avc, struct AFS_UCRED *acred)
 553 {
 554     register int code;
 555
 556     AFS_GLOCK();
 557     code = afs_inactive(avc, acred);
 558     AFS_GUNLOCK();
 559     return (code);
 560 }
 561
 562 int
 563 mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd,
 564                struct AFS_UCRED *acred, struct file *unused1, off_t unused2,
 565                off_t unused3)
 566 {
 567     register int code;
 568
 569     AFS_GLOCK();
 570     code = afs_lockctl(avc, af, cmd, acred);
 571     AFS_GUNLOCK();
 572     return (code);
 573 }
 574
 575 int
 576 mp_afs_fid(struct vnode *avc, struct fid **fidpp)
 577 {
 578     register int code;
 579
 580     AFS_GLOCK();
 581     code = afs_fid(avc, fidpp);
 582     AFS_GUNLOCK();
 583     return (code);
 584 }
 585
 586 int
 587 mp_afs_readdir2(register struct vnode *avc, struct uio *auio,
 588                 struct AFS_UCRED *acred)
 589 {
 590     register int code;
 591
 592     AFS_GLOCK();
 593     code = afs_readdir2(avc, auio, acred);
 594     AFS_GUNLOCK();
 595     return (code);
 596 }
 597
 598
/*
 * Vnode operations vector for AFS.
 *
 * The initializer is positional, so the order of the entries has to match
 * the member order of struct vnodeops, which is declared outside this file.
 * The mp_afs_* entries are the wrappers defined above that make the call
 * between AFS_GLOCK() and AFS_GUNLOCK(); the remaining entries are installed
 * directly.
 *
 * NOTE(review): the entry between mp_afs_rmdir and mp_afs_symlink is
 * afs_readdir rather than the mp_afs_readdir wrapper defined above --
 * confirm that this is intentional.
 */
struct vnodeops Afs_vnodeops = {
    mp_afs_open,
    mp_afs_close,
    mp_afs_rdwr,
    afs_ioctl,
    afs_noop,
    mp_afs_getattr,
    mp_afs_setattr,
    mp_afs_access,
    mp_afs_lookup,
    mp_afs_create,
    mp_afs_remove,
    mp_afs_link,
    mp_afs_rename,
    mp_afs_mkdir,
    mp_afs_rmdir,
    afs_readdir,
    mp_afs_symlink,
    mp_afs_readlink,
    mp_afs_fsync,
    mp_afs_inactive,
    afs_bmap,
    afs_hp_strategy,
#if     !defined(AFS_NONFSTRANS)
    /* On HPUX102 the nfs translator calls afs_bread but does
     * not call afs_brelse, hence we would see a memory leak. If the
     * VOP_BREAD() call fails, then nfs does VOP_RDWR() to get
     * the same data: this is the path we follow now, by installing
     * afs_noop for both entries. */
    afs_noop,
    afs_noop,
#else
    mp_afs_bread,
    mp_afs_brelse,
#endif
    afs_badop,                  /* pathsend */
    afs_noop,                   /* setacl */
    afs_noop,                   /* getacl */
    afs_pathconf,
    afs_pathconf,
    mp_afs_lockctl,
    afs_lockf,                  /* lockf */
    mp_afs_fid,
    afs_noop,                   /* fsctl */
    afs_badop,
    afs_pagein,
    afs_pageout,
    NULL,
    NULL,
    afs_prealloc,
    afs_mapdbd,
    afs_mmap,
    afs_cachelimit,
    afs_vm_checkpage,
    afs_vm_fscontiguous,
    afs_vm_stopio,
    afs_read_ahead,
    afs_release,
    afs_unmap,
    afs_swapfs_len,
    mp_afs_readdir2,
    afs_readdir3,
};

/* Pointer to the AFS vnode operations vector above. */
struct vnodeops *afs_ops = &Afs_vnodeops;
 663
/* vnode file operations, and our own */
extern int vno_rw();
extern int vno_ioctl();
extern int vno_select();
extern int afs_closex();
extern int vno_close();

/*
 * File operations vector for AFS: the vno_rw, vno_ioctl and vno_select
 * routines for the first three entries, and afs_close for the last one.
 * NOTE(review): afs_closex and vno_close are declared above but are not
 * referenced by this initializer.
 */
struct fileops afs_fileops = {
    vno_rw,
    vno_ioctl,
    vno_select,
    afs_close,
};
 676
/*
 * Expands to the vfs_bsize field of the vfs that vnode vp belongs to.
 * This repeats, token for token, the definition given near the top of
 * this file.
 */
#define vtoblksz(vp)    ((vp)->v_vfsp->vfs_bsize)
 678
 679 /*
 680  ********************************************************************
 681  ****
 682  ****                   afspgin_setup_io_ranges ()
 683  ****    similar to:    nfspgin_setup_io_ranges ()
 684  ********************************************************************
 685  */
/*
 * Work out which pages to bring in around the faulting page and record the
 * resulting I/O range(s) in vm_info.
 *
 *   vm_info    - pagein state for this fault.  Its dbd and vfd pointers are
 *                cleared here once the dbd type and data have been copied.
 *   bpages     - granularity, in pages, that the ranges are aligned to
 *                (presumably the number of pages per file system block).
 *   isize      - size in bytes used to keep the range from running past
 *                the end of the current file.
 *   startindex - page index of the fault.
 *
 * Depending on the max_num_io value obtained from VM_GET_IO_INFO(), either a
 * single range is set up (expanded upwards, then downwards) or several
 * ranges of at most one block each.
 *
 * Returns 0 when there is nothing to bring in, or when the multiple-I/O
 * setup was abandoned and the caller should retry.  Otherwise returns the
 * page count of the single range, or maxpagein in the multiple-I/O case.
 */
pgcnt_t
afspgin_setup_io_ranges(vfspage_t * vm_info, pgcnt_t bpages, k_off_t isize,
                        pgcnt_t startindex)
{
    pgcnt_t file_offset = VM_FILE_OFFSET(vm_info);
    pgcnt_t minpage;            /* first page to bring in */
    pgcnt_t maxpage;            /* one past last page to bring in */
    pgcnt_t maxpagein;
    pgcnt_t multio_maxpage;
    daddr_t start_blk;
    dbd_t *dbd;
    expnd_flags_t up_reason, down_reason;
    int count = 1;
    int indx = 0;
    int max_num_io;
    int dbdtype;
    preg_t *prp;

    VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);

    /*
     * We do not go past the end of the current pregion nor past the end
     * of the current file.
     */

    /* start from the end of the block that contains the faulting page */
    maxpage = startindex + (bpages - (startindex + file_offset) % bpages);
    maxpage = vm_reset_maxpage(vm_info, maxpage);
    maxpage = MIN(maxpage, (pgcnt_t) btorp(isize) - file_offset);
    maxpage = MIN(maxpage, startindex + maxpagein);
    multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);

    if (!maxpage)
        return (0);

    VASSERT(maxpage >= startindex);

    /*
     * Expanding the fault will create calls to FINDENTRY() for new
     * pages, which will obsolete "dbd", so copy what it points to
     * and clear it to prevent using stale data.
     */

    prp = VM_PRP(vm_info);
    dbdtype = DBD_TYPE(vm_info);
    start_blk = DBD_DATA(vm_info);
    vm_info->dbd = NULL;
    vm_info->vfd = NULL;
    VASSERT(dbdtype != DBD_NONE);

    if (max_num_io == 1) {
        /*
         * We need to set up one I/O: First we attempt to expand the
         * I/O forward. Then we expand the I/O backwards.
         */
        count =
            expand_faultin_up(vm_info, dbdtype, (int)bpages, maxpage, count,
                              startindex, start_blk, &up_reason);
        maxpage = startindex + count;
        VASSERT(maxpage <= startindex + maxpagein);
        /* lower bound: start of the block, but no more than maxpagein pages */
        minpage = startindex - (startindex + file_offset) % bpages;
        minpage = MAX(minpage, maxpage - maxpagein);
        VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
        minpage = vm_minpage(vm_info, minpage);
        VASSERT(minpage <= startindex);
        count =
            expand_faultin_down(vm_info, dbdtype, (int)bpages, minpage, count,
                                &startindex, &start_blk, &down_reason);
        VM_SET_IO_STARTINDX(vm_info, 0, startindex);
        VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
        VM_SET_IO_COUNT(vm_info, 0, count);
        VM_SET_NUM_IO(vm_info, 1);
    }

    if (max_num_io > 1) {
        /*
         * We need to set up multiple I/O information; beginning
         * with the startindex, we will expand upwards. The expansion
         * could stop for one of 2 reasons; we take the appropriate
         * action in each of these cases:
         *      o VM reasons: abort setting up the multiple I/O
         *        information and return to our caller indicating
         *        that "retry" is required.
         *      o pagelimit: set up the next I/O info [we may have
         *        reached multio_maxpage at this point].
         * Note that expansion involves no more than a block at a time;
         * hence it could never stop due to "discontiguous block"
         * reason.
         */
        startindex = minpage = vm_minpage(vm_info, 0);
        for (indx = 0; (indx < max_num_io) && (startindex < multio_maxpage);
             indx++, startindex += count) {
            /* look up the block number recorded for this page */
            dbd = FINDDBD(prp->p_reg, startindex);
            start_blk = dbd->dbd_data;
            maxpage =
                startindex + (bpages - (startindex + file_offset) % bpages);
            maxpage = min(maxpage, multio_maxpage);
            count =
                expand_faultin_up(vm_info, dbdtype, bpages, maxpage,
                                  1 /* count */ ,
                                  startindex, start_blk, &up_reason);
            VM_SET_IO_STARTINDX(vm_info, indx, startindex);
            VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
            VM_SET_IO_COUNT(vm_info, indx, count);
            if (up_reason & VM_REASONS)
                break;
            VASSERT(!(up_reason & NONCONTIGUOUS_BLOCK));
            VASSERT(up_reason & PAGELIMIT);
        }
        /* the loop stopped before covering everything up to multio_maxpage */
        if (startindex < multio_maxpage) {
            VM_MULT_IO_FAILURE(vm_info);
            VM_REINIT_FAULT_DBDVFD(vm_info);
            return (0);         /* retry */
        }
        count = maxpagein;
        VM_SET_NUM_IO(vm_info, indx);
    }

    /*
     * Tell VM where the I/O intends to start.  This may be different
     * from the faulting point.
     */

    VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));

    return (count);

}
 813
 814 /*
 815  ********************************************************************
 816  ****
 817  ****                   afspgin_blkflsh ()
 818  ****   similar to:     nfspgin_blkflsh ()
 819  ********************************************************************
 820  */
 821 retval_t
 822 afspgin_blkflsh(vfspage_t * vm_info, struct vnode * devvp, pgcnt_t * num_4k)
 823 {
 824     int flush_reslt = 0;
 825     pgcnt_t count = *num_4k;
 826     pgcnt_t page_count;
 827     int indx = 0;
 828     int num_io = VM_GET_NUM_IO(vm_info);
 829
 830     /*
 831      * On this blkflush() we don't want to purge the buffer cache and we do
 832      * want to wait, so the flags are '0'.
 833      */
 834
 835     for (indx = 0; indx < num_io; indx++) {
 836         flush_reslt =
 837             blkflush(devvp, (daddr_t) VM_GET_IO_STARTBLK(vm_info, indx),
 838                      ptob(VM_GET_IO_COUNT(vm_info, indx)), 0,
 839                      VM_REGION(vm_info));
 840         if (flush_reslt) {
 841             vm_lock(vm_info);
 842             if (vm_page_now_valid(vm_info, &page_count)) {
 843                 vm_release_memory(vm_info);
 844                 vm_release_structs(vm_info);
 845                 *num_4k = page_count;
 846                 return (VM_PAGE_PRESENT);
 847             }
 848             return (VM_RETRY);
 849         }
 850     }
 851     return (VM_DONE);
 852 }
 853
 854 /*
 855  ********************************************************************
 856  ****
 857  ****                   afspgin_io ()
 858  ****    similar to:    nfspgin_io ()
 859  ********************************************************************
 860  */
/*
 * Perform the page-in I/O for every range recorded in vm_info.
 *
 * vm_info is unlocked with vm_unlock() and each range is then read
 * synchronously with syncpageio() into consecutive addresses starting at
 * VM_MAPPED_ADDR(vm_info).  The per-range result is stored in io[i].error.
 * Afterwards the pregion's p_nextfault is set to startindex + count.
 *
 * Returns the bitwise OR of all the per-range results, so zero means that
 * every range succeeded.
 * NOTE(review): if several ranges fail with different codes, the OR-ed
 * value need not be a valid error number; callers should presumably only
 * test it against zero -- confirm.
 *
 * The bpages and maxpagein arguments, and the vaddr, prp, wrt and space
 * locals, are used only by the disabled "notdef" read-ahead code.
 */
int
afspgin_io(vfspage_t * vm_info, struct vnode *devvp, pgcnt_t bpages,
           pgcnt_t maxpagein, pgcnt_t count)
{
    int i;
    int error = 0;
    caddr_t vaddr = VM_ADDR(vm_info);
    caddr_t virt_addr = VM_MAPPED_ADDR(vm_info);
    pagein_info_t *io = VM_PAGEIN_INFO(vm_info);
    preg_t *prp = VM_PRP(vm_info);
    int wrt = VM_WRT(vm_info);
    space_t space = VM_SPACE(vm_info);
    int num_io = VM_GET_NUM_IO(vm_info);

#ifdef notdef                   /* Not used in AFS */
    /*
     * With VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
     * be used in this case.
     *
     * Unlike UFS, NFS does not start the faulting page I/O
     * asynchronously. Why?  Asynchronous requests are handled by the
     * biod's.  It doesn't make sense to queue up the faulting request
     * behind other asynchronous requests.  This is not true for UFS
     * where the asynchronous request is immediately handled.
     */

    if ((VM_READ_AHEAD_ALLOWED(vm_info)) && (nfs_read_ahead_on)
        && (NFS_DO_READ_AHEAD) && (should_do_read_ahead(prp, vaddr))) {

        pgcnt_t max_rhead_io;
        caddr_t rhead_vaddr;
        pgcnt_t total_rheads_allowed;

        /*
         * Determine the maximum amount of read-ahead I/O.
         */
        total_rheads_allowed = maxpagein - count;

        /*
         * If the count is less than a block, raise it to one.
         */
        if (total_rheads_allowed < bpages)
            total_rheads_allowed = bpages;

        max_rhead_io = total_rheads_allowed;
        rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count * NBPG);
        error =
            nfs_read_ahead(vm_info->vp, prp, wrt, space, rhead_vaddr,
                           &max_rhead_io);

        /*
         * Set the next fault location.  If read_ahead launches any
         * I/O it will adjust it accordingly.
         */
        vm_info->prp->p_nextfault = vm_info->startindex + count;

        /*
         * Now perform the faulting I/O synchronously.
         */
        vm_unlock(vm_info);

        error =
            syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, 0),
                       VM_MAPPED_SPACE(vm_info), VM_MAPPED_ADDR(vm_info),
                       (int)ptob(count), B_READ, devvp,
                       B_vfs_pagein | B_pagebf, VM_REGION(vm_info));
    } else
#endif
    {
        virt_addr = VM_MAPPED_ADDR(vm_info);
        vm_unlock(vm_info);
        for (i = 0; i < num_io; i++) {
            /*
             * REVISIT -- investigate doing asyncpageio().
             */
            error |= (io[i].error =
                      syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, i),
                                 VM_MAPPED_SPACE(vm_info), virt_addr,
                                 (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
                                 B_READ, devvp, B_vfs_pagein | B_pagebf,
                                 VM_REGION(vm_info)));
            /* the next range lands right after this one in memory */
            virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));
        }
        /*
         * Set the next fault location.  If read_ahead launches any
         * I/O it will adjust it accordingly.
         */
        vm_info->prp->p_nextfault = vm_info->startindex + count;
    }

    return (error);
}
 953
 954 /*
 955  ********************************************************************
 956  ****
 957  ****                   afspgin_update_dbd ()
 958  ****    similar to:    nfspgin_update_dbd ()
 959  ********************************************************************
 960  */
 961 void
 962 afspgin_update_dbd(vfspage_t * vm_info, int bsize)
 963 {
 964     k_off_t off;
 965     pgcnt_t count = bsize / NBPG;
 966     k_off_t rem;
 967     pgcnt_t m;
 968     pgcnt_t pgindx;
 969     daddr_t blkno;
 970     int num_io = VM_GET_NUM_IO(vm_info);
 971     int i;
 972
 973     for (i = 0; i < num_io; i++) {
 974
 975         pgindx = VM_GET_IO_STARTINDX(vm_info, i);
 976         off = vnodindx(VM_REGION(vm_info), pgindx);
 977         rem = off % bsize;
 978         blkno = VM_GET_IO_STARTBLK(vm_info, i);
 979
 980         VASSERT(bsize % NBPG == 0);
 981         VASSERT(rem % NBPG == 0);
 982
 983         pgindx -= (pgcnt_t) btop(rem);
 984         blkno -= (daddr_t) btodb(rem);
 985
 986         /*
 987          * This region could start in mid-block.  If so, pgindx
 988          * could be less than 0, so we adjust pgindx and blkno back
 989          * up so that pgindx is 0.
 990          */
 991
 992         if (pgindx < 0) {
 993             pgcnt_t prem;
 994             prem = 0 - pgindx;
 995             pgindx = 0;
 996             count -= prem;
 997             blkno += btodb(ptob(prem));
 998         }
 999
1000         for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
1001              m++, pgindx++, blkno += btodb(NBPG)) {
1002             /*
1003              * Note:  since this only changes one block, it
1004              * assumes only one block was faulted in.  Currently
1005              * this is always true for remote files, and we only
1006              * get here for remote files, so everything is ok.
1007              */
1008             vm_mark_dbd(vm_info, pgindx, blkno);
1009         }
1010     }
1011 }
1012
1013 int
1014 afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
1015      struct vnode *vp;
1016      preg_t *prp;
1017      int wrt;
1018      space_t space;
1019      caddr_t vaddr;
1020      pgcnt_t *ret_startindex;
1021 {
1022     pgcnt_t startindex;
1023     pgcnt_t pgindx = *ret_startindex;
1024     pgcnt_t maxpagein;
1025     struct vnode *devvp;
1026     pgcnt_t count;
1027     daddr_t start_blk = 0;
1028     int bsize;
1029     int error;
1030     k_off_t isize;
1031     int shared;                 /* writable memory mapped file */
1032     retval_t retval = 0;
1033     pgcnt_t ok_dbd_limit = 0;   /* last dbd that we can trust */
1034     pgcnt_t bpages;             /* number of pages per block */
1035     pgcnt_t page_count;
1036     vfspage_t *vm_info = NULL;
1037     int done;
1038
1039     struct vattr va;
1040
1041     caddr_t nvaddr;
1042     space_t nspace;
1043     int change_to_fstore = 0;   /* need to change dbds to DBD_FSTORE */
1044     int flush_start_blk = 0;
1045     int flush_end_blk = 0;
1046
1047     int i, j;
1048
1049     AFS_STATCNT(afs_pagein);
1050     vmemp_lockx();              /* lock down VM empire */
1051
1052     /* Initialize the VM info structure */
1053     done =
1054         vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,
1055                        LGPG_ENABLE);
1056
1057     /* Check to see if we slept and the page was falted in. */
1058     if (done) {
1059         vm_release_structs(vm_info);
1060         vmemp_returnx(1);
1061     }
1062
1063     vp = VM_GET_PAGEIN_VNODE(vm_info);
1064     VASSERT(vp != NULL);
1065     shared = VM_SHARED_OBJECT(vm_info);
1066     VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1067
1068     /*
1069      * Get the devvp and block size for this vnode type
1070      */
1071     devvp = vp;
1072     bsize = vp->v_vfsp->vfs_bsize;
1073     if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
1074         osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");
1075
1076     bpages = (pgcnt_t) btop(bsize);
1077     VASSERT(bpages > 0);
1078     VM_SET_FS_MAX_PAGES(vm_info, bpages);
1079
1080     /* this trace cannot be here because the afs_global lock might not be
1081      * held at this point. We hold the vm global lock throughout
1082      * this procedure ( and not the AFS global lock )
1083      * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
1084      * ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
1085      * ICL_TYPE_LONG, shared);
1086      */
1087     /* Come here if we have to release the region lock before
1088      * locking pages.  This can happen in memreserve() and
1089      * blkflush().
1090      */
1091   retry:
1092     /*
1093      * For remote files like ours, we want to check to see if the file has shrunk.
1094      * If so, we should invalidate any pages past the end.  In the name
1095      * of efficiency, we only do this if the page we want to fault is
1096      * past the end of the file.
1097      */
1098     {
1099         if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1100             VM_ZOMBIE_OBJECT(vm_info);
1101             vm_release_memory(vm_info);
1102             vm_release_structs(vm_info);
1103             vmemp_returnx(0);
1104         }
1105         isize = va.va_size;
1106         if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
1107             /*
1108              * The file has shrunk and someone is trying to access a
1109              * page past the end of the object.  Shrink the object back
1110              * to its currrent size, send a SIGBUS to the faulting
1111              * process and return.
1112              *
1113              * We must release the region lock before calling mtrunc(),
1114              * since mtrunc() locks all the regions that are using this
1115              * file.
1116              */
1117             vm_release_memory(vm_info);
1118             vm_truncate_region(vm_info, isize);
1119             vm_release_structs(vm_info);
1120             vmemp_returnx(-SIGBUS);
1121         }
1122     }
1123
1124     maxpagein = vm_pick_maxpagein(vm_info);
1125     if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
1126         /* Check to see if we should continue faulting.  */
1127         if (vm_page_now_valid(vm_info, &page_count)) {
1128             vm_release_memory(vm_info);
1129             vm_release_structs(vm_info);
1130             vmemp_returnx(page_count);
1131         }
1132     }
1133     if (count = vm_no_io_required(vm_info)) {
1134         /* Release any excess memory.  */
1135         vm_release_memory(vm_info);
1136         vm_release_structs(vm_info);
1137         vmemp_returnx(count);
1138     }
1139 #ifdef OSDEBUG
1140     /*
1141      * We should never have DBD_HOLE pages in a non-MMF region.
1142      */
1143     if (!shared)
1144         VASSERT(dbd->dbd_type != DBD_HOLE);
1145 #endif
1146     VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1147
1148     startindex = *ret_startindex;
1149
1150     /*
1151      * If the page we want is in memory already, take it
1152      */
1153     if (VM_MEMORY_RESERVED(vm_info) < maxpagein) {
1154         /* pick up the rest of memory now.  */
1155         if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
1156             if (vm_page_now_valid(vm_info, &page_count)) {
1157                 vm_release_memory(vm_info);
1158                 vm_release_structs(vm_info);
1159                 vmemp_returnx(page_count);
1160             }
1161             goto retry;
1162         }
1163     }
1164
1165     if (!
1166         (count =
1167          afspgin_setup_io_ranges(vm_info, bpages, isize, startindex))) {
1168         goto retry;
1169     }
1170
1171     startindex = VM_GET_STARTINDX(vm_info);
1172
1173     VASSERT(maxpagein >= count);
1174
1175     /*
1176      * Release the memory we won't need.
1177      */
1178     if (count < maxpagein) {
1179         vm_release_excess_memory(vm_info,
1180                                  (VM_MEMORY_RESERVED(vm_info) - count));
1181     }
1182
1183     retval = afspgin_blkflsh(vm_info, devvp, &count);
1184
1185     if (retval == VM_RETRY) {
1186         goto retry;
1187     }
1188
1189     if (retval == VM_PAGE_PRESENT)
1190         return (count);
1191
1192 #if 0
1193     /*
1194      * The definition of krusage_cntr_t is in h/kmetric.h, which
1195      * is not shipped.  Since it's just statistics, we punt and do
1196      * not update it.  If it's a problem we'll need to get HP to export
1197      * an interface that we can use to increment the counter.
1198      */
1199
1200     /* It's a real fault, not a reclaim */
1201     {
1202         krusage_cntr_t *temp;
1203         temp = kt_cntrp(u.u_kthreadp);
1204         temp->krc_majflt++;
1205     }
1206 #endif
1207
1208     /*
1209      * Tell VM where the I/O intends to start.  This may be different
1210      * from the faulting point.
1211      */
1212
1213     /*
1214      * vm_prepare_io will fill the region with pages and release the
1215      * region lock.
1216      */
1217     vm_prepare_io(vm_info, &count);
1218
1219     /*
1220      * Count may have been adjusted, check to make sure it's non-zero.
1221      */
1222     if (count == 0) {
1223         if (vm_retry(vm_info)) {
1224             goto retry;
1225         }
1226
1227         /*
1228          * Release resources and retry the fault.  Release any excess
1229          * memory.
1230          */
1231
1232         vm_release_memory(vm_info);
1233         vm_release_structs(vm_info);
1234         vmemp_returnx(0);
1235     }
1236
1237     error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);
1238
1239     if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1240         retval = -SIGBUS;
1241         VM_ZOMBIE_OBJECT(vm_info);
1242         goto backout;
1243     }
1244     /*
1245      * For a writable memory mapped file that is remote we must
1246      * detect potential holes in the file and force allocation of
1247      * disk space on the remote system.  Unfortunately, there is
1248      * no easy way to do this, so this gets a little ugly.
1249      */
1250     if (shared && wrt) {
1251         /*
1252          * See if The user wants to write to this page.  Write some
1253          * minimal amount of data back to the remote file to
1254          * force allocation of file space.  We only need to
1255          * write a small amount, since holes are always at
1256          * least one filesystem block in size.
1257          */
1258         error = vm_alloc_hole(vm_info);
1259
1260         /*
1261          * If some sort of I/O error occurred we generate a
1262          * SIGBUS for the process that caused the write,
1263          * undo our page locks, etc and return.
1264          */
1265         if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1266             VM_ZOMBIE_OBJECT(vm_info);
1267             retval = -SIGBUS;
1268             goto backout;
1269         }
1270
1271         /*
1272          * Change these dbds to DBD_FSTORE.  We cannot do it here,
1273          * since the region must be locked, and it is not locked
1274          * at the moment.  We cannot lock the region yet, as we
1275          * first have to release the page locks.
1276          */
1277         change_to_fstore = 1;
1278     }
1279
1280     vm_finish_io(vm_info, count);
1281
1282     /*
1283      * Acquire the lock before we play around with changing the vfd's.
1284      */
1285     vm_lock(vm_info);
1286
1287     if (change_to_fstore)
1288         afspgin_update_dbd(vm_info, bsize);
1289
1290 #if defined(AFS_HPUX110_ENV)
1291     getppdp()->cnt.v_exfod += count;
1292 #else
1293     mpproc_info[getprocindex()].cnt.v_exfod += count;
1294 #endif
1295     vmemp_unlockx();            /* free up VM empire */
1296     *ret_startindex = startindex;
1297
1298     /*
1299      * In case we have any excess memory...
1300      */
1301     if (VM_MEMORY_RESERVED(vm_info))
1302         vm_release_memory(vm_info);
1303     vm_release_structs(vm_info);
1304
1305     return count;
1306
1307   backout:
1308
1309     vm_finish_io_failed(vm_info, count);
1310
1311     vm_lock(vm_info);
1312
1313     vm_undo_validation(vm_info, count);
1314
1315     /*
1316      * In case we have any excess memory...
1317      */
1318     if (VM_MEMORY_RESERVED(vm_info))
1319         vm_release_memory(vm_info);
1320     vm_release_structs(vm_info);
1321
1322     vmemp_unlockx();            /* free up VM empire */
1323     return retval;
1324 }
1325
1326 int
1327 afs_pageout(vp, prp, start, end, flags)
1328      struct vnode *vp;          /* not used */
1329      preg_t *prp;
1330      pgcnt_t start;
1331      pgcnt_t end;
1332      int flags;
1333 {
1334     struct vnode *filevp;
1335     struct vnode *devvp;
1336     pgcnt_t i;
1337     int steal;
1338     int vhand;
1339     int hard;
1340     int *piocnt;                /* wakeup counter used if PAGEOUT_WAIT */
1341     struct ucred *old_cred;
1342     vfspage_t vm_info;
1343     fsdata_t args;
1344
1345     int inode_changed = 0;
1346     int file_is_remote;
1347     struct inode *ip;
1348
1349     AFS_STATCNT(afs_pageout);
1350
1351     steal = (flags & PAGEOUT_FREE);
1352     vhand = (flags & PAGEOUT_VHAND);
1353     hard = (flags & PAGEOUT_HARD);
1354
1355     vmemp_lockx();
1356
1357     /*  Initialize the VM info structure.  */
1358     vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);
1359
1360     /*
1361      * If the region is marked "don't swap", then don't steal any pages
1362      * from it.  We can, however, write dirty pages out to disk (only if
1363      * PAGEOUT_FREE is not set).
1364      */
1365     if (vm_no_pageout(&vm_info)) {
1366         vmemp_unlockx();
1367         return (0);
1368     }
1369
1370     /*
1371      * If caller wants to wait until the I/O is complete.
1372      */
1373     vm_setup_wait_for_io(&vm_info);
1374
1375     filevp = VM_GET_PAGEOUT_VNODE(&vm_info);    /* always page out to back store */
1376     VASSERT(filevp != NULL);
1377
1378     memset((caddr_t) & args, 0, sizeof(fsdata_t));
1379     args.remote_down = 0;       /* assume remote file servers are up */
1380     args.remote = 1;            /* we are remote */
1381     args.bsize = 0;             /* filled up later by afs_vm_checkpage() */
1382
1383     if (filevp->v_fstype == VUFS) {
1384         ip = VTOI(filevp);
1385         devvp = ip->i_devvp;
1386         file_is_remote = 0;
1387     } else {
1388         file_is_remote = 1;
1389         devvp = filevp;
1390
1391         /*
1392          * If we are vhand(), and this is an NFS file, we need to
1393          * see if the NFS server is "down".  If so, we decide
1394          * if we will try to talk to it again, or defer pageouts
1395          * of dirty NFS pages until a future time.
1396          */
1397 #ifdef  notdef
1398         if (vhand && filevp->v_fstype == VNFS && vtomi(filevp)->mi_down
1399             && vtomi(filevp)->mi_hard) {
1400             extern afs_int32 vhand_nfs_retry;
1401             /*
1402              * If there is still time left on our timer, we will
1403              * not talk to this server right now.
1404              */
1405             if (vhand_nfs_retry > 0)
1406                 args.remote_down = 1;
1407         }
1408 #endif
1409     }
1410
1411     /*
1412      * Initialize args.  We set bsize to 0 to tell vfs_vfdcheck() that
1413      * it must get the file size and other attributes if it comes across
1414      * a dirty page.
1415      */
1416     vm_info.fs_data = (caddr_t) & args;
1417
1418     /* this trace cannot be here because the afs_global lock might not be
1419      * held at this point. We hold the vm global lock throughout
1420      * this procedure ( and not the AFS global lock )
1421      * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
1422      * ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
1423      */
1424
1425     i = start;
1426
1427     while (i <= end) {
1428         struct buf *bp;
1429         k_off_t start;
1430         pgcnt_t npages;
1431         k_off_t nbytes;
1432         int error;
1433
1434         extern int pageiodone();
1435         space_t nspace;
1436         caddr_t nvaddr;
1437
1438         /*
1439          * Ask the VM system to find the next run of pages.
1440          */
1441         vm_find_next_range(&vm_info, i, end);
1442
1443         /*
1444          * It's possible that the remote file shrunk in size.  Check the flags
1445          * to see if the request was beyond the end of the file.  If it was,
1446          * truncate the region to the file size and continue.  We could be on a
1447          * run so after trunction continue, there may be some I/O to write
1448          * out.
1449          */
1450         if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
1451             pgcnt_t pglen = (pgcnt_t) btorp(args.isize);
1452
1453             /*
1454              * This page is past the end of the file.  Unlock this page
1455              * (region_trunc will throw it away) and then call
1456              * region_trunc() to invalidate all pages past the new end of
1457              * the file.
1458              */
1459             region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);
1460
1461             /*
1462              * remove the truncation flag.
1463              */
1464             VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);
1465         }
1466
1467         if (VM_NO_PAGEOUT_RUN(&vm_info))
1468             break;
1469
1470         /*
1471          * We have a run of dirty pages [args.start...args.end].
1472          */
1473         VASSERT(filevp->v_fstype != VCDFS);
1474         VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
1475         VASSERT(VM_GET_NUM_IO(&vm_info) == 1);
1476
1477         /*
1478          * We will be doing an I/O on the region, let the VM system know.
1479          */
1480         (void)vm_up_physio_count(&vm_info);
1481
1482         /*
1483          * Okay, get set to perform the I/O.
1484          */
1485         inode_changed = 1;
1486         npages =
1487             (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
1488             VM_START_PAGEOUT_INDX(&vm_info);
1489
1490         /*
1491          * Allocate and initialize an I/O buffer.
1492          */
1493         bp = bswalloc();
1494         vm_init_bp(&vm_info, bp);       /* Let the VM system initialize */
1495
1496         /* Identify this buffer for KI */
1497         bp->b_bptype = B_vfs_pageout | B_pagebf;
1498
1499         if (steal)
1500             bp->b_flags = B_CALL | B_BUSY | B_PAGEOUT;  /* steal pages */
1501         else
1502             bp->b_flags = B_CALL | B_BUSY;      /* keep pages */
1503
1504         /*
1505          * If we are vhand paging over NFS, we will wait for the I/O
1506          * to complete.
1507          */
1508         if (vhand && filevp->v_fstype == VNFS) {
1509             bp->b_flags &= ~B_CALL;
1510         } else {
1511             bp->b_iodone = (int (*)())pageiodone;
1512         }
1513
1514         /*
1515          * Make sure we do not write past the end of the file.
1516          */
1517         nbytes = ptob(npages);
1518         start = vnodindx(VM_REGION(&vm_info), vm_info.start);
1519         if (start + nbytes > args.isize) {
1520 #ifdef OSDEBUG
1521             /*
1522              * The amount we are off better not be bigger than a
1523              * filesystem block.
1524              */
1525             if (start + nbytes - args.isize >= args.bsize) {
1526                 osi_Panic("afs_pageout: remainder too large");
1527             }
1528 #endif
1529             /*
1530              * Reset the size of the I/O as necessary.  For remote
1531              * files, we set the size to the exact number of bytes to
1532              * the end of the file.  For local files, we round this up
1533              * to the nearest DEV_BSIZE chunk since disk I/O must always
1534              * be in multiples of DEV_BSIZE.  In this case, we do not
1535              * bother to zero out the data past the "real" end of the
1536              * file, this is done when the data is read (either through
1537              * mmap() or by normal file system access).
1538              */
1539             if (file_is_remote)
1540                 nbytes = args.isize - start;
1541             else
1542                 nbytes = roundup(args.isize - start, DEV_BSIZE);
1543         }
1544
1545         /*
1546          * Now get ready to perform the I/O
1547          */
1548         if (!vm_protect_pageout(&vm_info, npages)) {
1549             VASSERT(vhand);
1550             vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1551             vm_finish_io_failed(&vm_info, npages);
1552             bswfree(bp);
1553             break;
1554         }
1555         /*
1556          * If this is an NFS write by vhand(), we will not be calling
1557          * pageiodone().  asyncpageio() increments parolemem for us
1558          * if bp->b_iodone is pageiodone, so we must do it manually
1559          * if pageiodone() will not be called automatically.
1560          */
1561         if (!(bp->b_flags & B_CALL) && steal) {
1562             register ulong_t context;
1563
1564             SPINLOCK_USAV(pfdat_lock, context);
1565             parolemem += btorp(nbytes);
1566             SPINUNLOCK_USAV(pfdat_lock, context);
1567         }
1568         blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
1569                  (BX_NOBUFWAIT | BX_PURGE), VM_REGION(&vm_info));
1570
1571         /*
1572          * If vhand is the one paging things out, and this is an NFS
1573          * file, we need to temporarily become a different user so
1574          * that we are not trying to page over NFS as root.  We use
1575          * the user credentials associated with the writable file
1576          * pointer that is in the psuedo-vas for this MMF.
1577          *
1578          * NOTE: we are currently using "va_rss" to store the ucred
1579          *       value in the vas (this should be fixed in 10.0).
1580          */
1581         old_cred = kt_cred(u.u_kthreadp);
1582         if (vhand) {
1583             set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);
1584
1585             /*
1586              * If root was the one who opened the mmf for write,
1587              * va_cred will be NULL.  So reset kt_cred(u.u_kthreadp) to what it
1588              * was.  We will page out as root, but that is the
1589              * correct thing to do in this case anyway.
1590              */
1591             if (kt_cred(u.u_kthreadp) == NULL)
1592                 set_kt_cred(u.u_kthreadp, old_cred);
1593         }
1594
1595         /*
1596          * Really do the I/O.
1597          */
1598         error =
1599             asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
1600                         VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
1601                         (int)nbytes, B_WRITE, devvp);
1602
1603         VASSERT(error == 0);
1604
1605 #ifdef  notdef
1606         /*
1607          * If we are vhand paging over NFS we want to wait for the
1608          * I/O to complete and take the appropriate actions if an
1609          * error is encountered.
1610          */
1611         if (vhand) {
1612             if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
1613                 /*
1614                  * The server is down, ignore this failure, and
1615                  * try again later. (rfscall() has set our retry
1616                  * timer).
1617                  */
1618                 fsdata.remote_down = 1;
1619                 pageiocleanup(bp, 0);
1620
1621                 /*
1622                  * vm_vfdcheck() has cleared the valid bit on the
1623                  * vfds for these pages.  We must go back and set the
1624                  * valid bit, as the pages are really not gone.
1625                  *
1626                  * NOTE: we can do this because we still hold (and have
1627                  * not released) the region lock.
1628                  */
1629                 if (steal)
1630                     vm_undo_invalidation(&vm_info, vm_info.start,
1631                                          vm_info.end);
1632             } else {
1633                 /*
1634                  * The I/O succeeded, or we had an error that we do
1635                  * not want to defer until later.  Call pageidone()
1636                  * to handle things.
1637                  */
1638                 pageiodone(bp);
1639             }
1640         }
1641 #endif
1642
1643         /*
1644          * And restore our credentials to what they were.
1645          */
1646         set_kt_cred(u.u_kthreadp, old_cred);
1647
1648         /*
1649          * If we reserved memory in vfs_vfdcheck(), (only for NFS) we
1650          * can now unreserve it.
1651          */
1652         if (vm_info.vm_flags & PAGEOUT_RESERVED) {
1653             vm_info.vm_flags &= ~PAGEOUT_RESERVED;
1654             vm_release_malloc_memory();
1655         }
1656
1657         /*
1658          * Update statistics
1659          */
1660         if (steal) {
1661             if (flags & PF_DEACT) {
1662 #if defined(AFS_HPUX110_ENV)
1663                 getppdp()->cnt.v_pswpout += npages;
1664 #else
1665                 mpproc_info[getprocindex()].cnt.v_pswpout += npages;
1666 #endif
1667 /*              sar_bswapout += ptod(npages);*/
1668             } else if (vhand) {
1669 #if defined(AFS_HPUX110_ENV)
1670                 getppdp()->cnt.v_pgout++;
1671                 getppdp()->cnt.v_pgpgout += npages;
1672 #else
1673                 mpproc_info[getprocindex()].cnt.v_pgout++;
1674                 mpproc_info[getprocindex()].cnt.v_pgpgout += npages;
1675 #endif
1676             }
1677         }
1678
1679         /*
1680          * If time and patience have delivered enough
1681          * pages, then quit now while we are ahead.
1682          */
1683         if (VM_STOP_PAGING(&vm_info))
1684             break;
1685
1686         i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;
1687     }
1688
1689     vm_finish_pageout(&vm_info);        /* update vhand's stealscan */
1690
1691     vmemp_unlockx();
1692
1693     /*
1694      * If we wanted to wait for the I/O to complete, sleep on piocnt.
1695      * We must decrement it by one first, and then make sure that it
1696      * is non-zero before going to sleep.
1697      */
1698     vm_wait_for_io(&vm_info);
1699
1700     if (inode_changed && !file_is_remote) {
1701         imark(ip, IUPD | ICHG);
1702         iupdat(ip, 0, 0);
1703     }
1704     return 0;
1705 }
1706
1707 int
1708 afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
1709      struct vnode *filevp;
1710      off_t offset;
1711      daddr_t *bn;               /* Block number. */
1712      int flags;                 /* B_READ or B_WRITE */
1713      int *hole;                 /* To be used for read-ahead. */
1714      pgcnt_t *startidx;         /* To be used for read-ahead. */
1715      pgcnt_t *endidx;           /* To be used for read-ahead. */
1716 {
1717     daddr_t lbn, local_bn;
1718     int on;
1719     int err;
1720     long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);
1721
1722     if (startidx)
1723         *startidx = (pgcnt_t) (offset / NBPG);
1724     if (endidx)
1725         *endidx = (pgcnt_t) (offset / NBPG);
1726     if (hole)
1727         *hole = 0;              /* Can't have holes. */
1728     if (bsize <= 0)
1729         osi_Panic("afs_mapdbd: zero size");
1730
1731     lbn = (daddr_t) (offset / bsize);
1732     on = offset % bsize;
1733
1734     err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);
1735     VASSERT(err == 0);
1736
1737     /*
1738      * We can never get a bn less than zero on remote files.
1739      */
1740     VASSERT(local_bn >= 0);
1741
1742     local_bn = local_bn + btodb(on);
1743     *bn = local_bn;
1744
1745     return (0);
1746 }
1747
1748 /*
1749  * Return values:
1750  *      1: The blocks are contiguous.
1751  *      0: The blocks are not contiguous.
1752  */
1753 int
1754 afs_vm_fscontiguous(vp, args, cur_data)
1755      struct vnode *vp;
1756      vfspage_t *args;
1757      u_int cur_data;
1758 {
1759     if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {
1760         return (1);
1761     } else {
1762         return (0);
1763     }
1764 }
1765
1766 /*
1767  * Return values:
1768  *      1: Stop, this page is the last in the block.
1769  *      0: Continue on
1770  * Terminate requests at filesystem block boundaries
1771  */
1772 afs_vm_stopio(vp, args)
1773      struct vnode *vp;
1774      vfspage_t *args;
1775 {
1776     fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1777
1778     if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0) {
1779         return (1);
1780     } else {
1781         return (0);
1782     }
1783 }
1784
1785 /*
1786  *      afs_vm_checkpage is called by the VM while collecting a run of
1787  *      pages on a pageout.  afs_vm_checkpage() is called for each page
1788  *      VM wants to write to disk.
1789  */
1790 afs_vm_checkpage(vp, args, pgindx, cur_data)
1791      struct vnode *vp;
1792      vfspage_t *args;
1793      pgcnt_t pgindx;
1794      int cur_data;
1795 {
1796     fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1797
1798     if (fsdata->remote_down) {  /* never happens for AFS */
1799         /*
1800          * The remote system is down.
1801          */
1802         VASSERT(args->run == 0);
1803         return 1;
1804     }
1805     /*
1806      * A dirty page.  If we have not yet determined the file size and
1807      * other attributes that we need to write out pages (the block
1808      * size and ok_dbd_limit), get that information now.
1809      */
1810     if (fsdata->bsize == 0) {
1811         k_off_t isize;
1812         long bsize;
1813         struct vattr va;
1814         struct vnode *filevp;
1815         /*
1816          * Get the various attributes about the file.  Store them
1817          * in args for the next time around.
1818          */
1819         filevp = args->vp;
1820
1821         bsize = vtoblksz(filevp);
1822         args->maxpgs = (pgcnt_t) btop(bsize);
1823
1824         if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1825             /*
1826              * The VOP_GETATTR() failed.
1827              * we are vhand, and this is a hard mount, we will
1828              * skip dirty pages for a while and try again later.
1829              */
1830             if (args->vm_flags & PAGEOUT_VHAND) {
1831                 VASSERT(args->run == 0);
1832                 return 1;
1833             }
1834             /*
1835              * This is a "soft" mount, or some other error was
1836              * returned from the server.  Mark this region
1837              * as a zombie, and free this dirty page.
1838              */
1839             VM_ZOMBIE_OBJECT(args);
1840
1841             /*
1842              * The caller will see r_zomb and remove the page
1843              * appropriately.
1844              */
1845             return (1);
1846         }
1847         isize = va.va_size;
1848         fsdata->isize = isize;
1849         fsdata->bsize = bsize;
1850         fsdata->remote = 1;
1851     }
1852     /*
1853      * See if the file has shrunk (this could have happened
1854      * asynchronously because of NFS or DUX).  If so, invalidate
1855      * all of the pages past the end of the file. This is only
1856      * needed for remote files, as local files are truncated
1857      * synchronously.
1858      */
1859
1860     if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
1861         /*
1862          * This page is past the end of the file.  Unlock this page
1863          * (region_trunc will throw it away) and then call region_trunc()
1864          * to invalidate all pages past the new end of the file.
1865          */
1866         VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);
1867         return (1);
1868     }
1869 #ifdef notdef
1870     if ((args->vm_flags & PAGEOUT_VHAND)
1871         && (!(args->vm_flags & PAGEOUT_RESERVED))
1872         && (!(VM_IS_ZOMBIE(args)))) {
1873         VASSERT(args->run == 0);
1874         if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
1875             /*
1876              * Got enough memory to pageout.  Mark the fact that we did
1877              * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
1878              * later (in remote_pageout()).
1879              */
1880             args->vm_flags |= PAGEOUT_RESERVED;
1881         } else {
1882             /*
1883              * We do not have enough memory to do this pageout.  By
1884              * definition, we do not yet have a run, so we just unlock
1885              * this page and tell foreach_valid() to continue scanning.
1886              * If we come across another dirty page, we will try to
1887              * reserve memory again.  That is okay, in fact some memory
1888              * may have freed up (as earlier pageouts complete under
1889              * interrupt).
1890              */
1891             return 1;
1892         }
1893     }
1894 #endif
1895     return (0);
1896 }
1897
1898 afs_swapfs_len(bp)
1899      struct buf *bp;
1900 {
1901     long fs_bsize;
1902     long max_size;
1903     long bnrem;
1904
1905     fs_bsize = vtoblksz(bp->b_vp);
1906     /*
1907      * Check to see if we are starting mid block.  If so, then
1908      * we must return the remainder of the block or less depending
1909      * on the length.
1910      */
1911     bnrem = bp->b_offset % fs_bsize;
1912     if (bnrem) {
1913         max_size = fs_bsize - bnrem;
1914     } else {
1915         max_size = fs_bsize;
1916     }
1917
1918     if (bp->b_bcount > max_size) {
1919         return (max_size);
1920     } else {
1921         return (bp->b_bcount);
1922     }
1923 }
1924
1925 afs_mmap(vp, off, size_bytes, access)
1926      struct vnode *vp;
1927      u_int off;
1928 #if defined(AFS_HPUX1111_ENV)
1929      u_long size_bytes;
1930 #else
1931      u_int size_bytes;
1932 #endif
1933      int access;
1934 {
1935     long bsize = vtoblksz(vp);
1936
1937     if (bsize % NBPG != 0) {
1938         return (EINVAL);
1939     }
1940
1941     return (0);
1942 }
1943
1944 afs_cachelimit(vp, len, location)
1945      struct vnode *vp;
1946      k_off_t len;
1947      int *location;
1948 {
1949     /*
1950      * Disk addresses are logical, not physical, so fragments are
1951      * transparent.
1952      */
1953     *location = btorp(len) + 1;
1954 }
1955
/*
 * afs_release -- release entry point; nothing to do.
 *
 * vp is ignored and 0 is always returned.
 */
afs_release(vp)
     struct vnode *vp;
{
    return 0;
}
1961
1962 int
1963 afs_unmap(vp, off, size_bytes, access)
1964      struct vnode *vp;
1965      u_int off;
1966 #if defined(AFS_HPUX1111_ENV)
1967      u_long size_bytes;
1968 #else
1969      u_int size_bytes;
1970 #endif
1971      int access;
1972 {
1973     return 0;
1974 }
1975
1976 int
1977 afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
1978      struct vnode *vp;
1979      preg_t *prp;
1980      int wrt;
1981      space_t space;
1982      caddr_t vaddr;
1983      pgcnt_t *rhead_cnt;
1984 {
1985     printf("afs_read_ahead returning 0 \n");
1986     return 0;
1987 }
1988
/*
 * afs_prealloc -- preallocation entry point; not supported.
 *
 * Logs a message every time it is called, ignores all of its arguments
 * and returns ENOSPC.
 */
int
afs_prealloc(vp, size, ignore_minfree, reserved)
     struct vnode *vp;
     /* DEE: on 11.22 the following parameter is an off_t */
     size_t size;
     int ignore_minfree;
     int reserved;
{
    /* Make calls visible on the console. */
    printf("afs_prealloc returning ENOSPC\n");

    return (ENOSPC);
}
2000
2001 int
2002 afs_ioctl(vp, com, data, flag, cred)
2003      struct vnode *vp;
2004      int com;
2005      caddr_t data;
2006      int flag;
2007      struct ucred *cred;
2008 {
2009     int error;
2010     struct afs_ioctl afsioctl, *ai;
2011
2012     AFS_STATCNT(afs_ioctl);
2013
2014     /* The call must be a VICEIOCTL call */
2015     if (((com >> 8) & 0xff) == 'V') {
2016 #ifdef notdef
2017         /* AFS_COPYIN returns error 14. Copy data in instead */
2018         AFS_COPYIN(data, (caddr_t) & afsioctl, sizeof(afsioctl), error);
2019         if (error)
2020             return (error);
2021 #endif
2022         ai = (struct afs_ioctl *)data;
2023         afsioctl.in = ai->in;
2024         afsioctl.out = ai->out;
2025         afsioctl.in_size = ai->in_size;
2026         afsioctl.out_size = ai->out_size;
2027         error = HandleIoctl(VTOAFS(vp), com, &afsioctl);
2028         return (error);
2029     }
2030     return (ENOTTY);
2031 }
2032
#if defined(AFS_HPUX1111_ENV)
/* looks like even if appl is 32 bit, we need to round to 8 bytes */
/* This had no effect, it must not be being used */

/*
 * roundtoint(x): adds sizeof(long) - 1 and masks off the low bits, i.e.
 *     rounds x up to a multiple of sizeof(long) (this relies on
 *     sizeof(long) being a power of two).
 * reclen(dp): rounded record length for the entry dp: d_namlen name
 *     bytes, plus 1 (presumably the terminating NUL), plus the size of
 *     one u_long, one u_int and two u_shorts.
 */
#define roundtoint(x)   (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
#define reclen(dp)      roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
                                sizeof(u_int) + 2 * sizeof(u_short)))
#else

/*
 * Same two macros for the other releases: rounding is to a multiple of
 * sizeof(int) and the record length has no u_int term.
 */
#define roundtoint(x)   (((x) + (sizeof(int) - 1)) & ~(sizeof(int) - 1))
#define reclen(dp)      roundtoint(((dp)->d_namlen + 1 + (sizeof(u_long)) +\
                                2 * sizeof(u_short)))
#endif
2046
2047 int
2048 afs_readdir(vp, uiop, cred)
2049      struct vnode *vp;
2050      struct uio *uiop;
2051      struct ucred *cred;
2052 {
2053     struct uio auio;
2054     struct iovec aiov;
2055     caddr_t ibuf, obuf, ibufend, obufend;
2056     struct __dirent32 *idp;
2057     struct dirent *odp;
2058     int count, outcount;
2059     dir_off_t offset;
2060     uint64_t tmp_offset;
2061
2062     count = uiop->uio_resid;
2063     /* Allocate temporary space for format conversion */
2064     ibuf = kmem_alloc(2 * count);       /* overkill - fix later */
2065     obuf = kmem_alloc(count + sizeof(struct dirent));
2066     aiov.iov_base = ibuf;
2067     aiov.iov_len = count;
2068     auio.uio_iov = &aiov;
2069     auio.uio_iovcnt = 1;
2070     offset = auio.uio_offset = uiop->uio_offset;
2071     auio.uio_seg = UIOSEG_KERNEL;
2072     auio.uio_resid = count;
2073     auio.uio_fpflags = 0;
2074
2075     u.u_error = mp_afs_readdir2(vp, &auio, cred);
2076     if (u.u_error)
2077         goto out;
2078
2079     /* Convert entries from __dirent32 to dirent format */
2080
2081     for (idp = (struct __dirent32 *)ibuf, odp =
2082          (struct dirent *)obuf, ibufend =
2083          ibuf + (count - auio.uio_resid), obufend = obuf + count;
2084          (caddr_t) idp < ibufend;
2085          idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2086          (struct dirent *)((caddr_t) odp + odp->d_reclen)) {
2087         odp->d_ino = idp->__d_ino;
2088         odp->d_namlen = idp->__d_namlen;
2089         (void)strcpy(odp->d_name, idp->__d_name);
2090         odp->d_reclen = reclen(odp);
2091         if ((caddr_t) odp + odp->d_reclen > obufend)
2092             break;
2093         /* record offset *after* we're sure to use this entry */
2094         memcpy((char *)&tmp_offset, (char *)&idp->__d_off, sizeof tmp_offset);
2095         offset = tmp_offset;
2096     }
2097
2098     outcount = (caddr_t) odp - obuf;
2099     AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2100     if (u.u_error)
2101         goto out;
2102     uiop->uio_offset = offset;
2103   out:
2104     kmem_free(ibuf, count);
2105     kmem_free(obuf, count + sizeof(struct dirent));
2106     return u.u_error;
2107 }
2108
2109
/*
 * roundtolong(x): adds sizeof(long) - 1 and masks off the low bits, i.e.
 *     rounds x up to a multiple of sizeof(long) (this relies on
 *     sizeof(long) being a power of two).
 * reclen_dirent64(dp): rounded record length for the entry dp:
 *     __d_namlen name bytes, plus 1 (presumably the terminating NUL),
 *     plus the size of two u_longs and two u_shorts.
 */
#define roundtolong(x)   (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
#define reclen_dirent64(dp)      roundtolong(((dp)->__d_namlen + 1 + (2*sizeof(u_long)) +\
                                2 * sizeof(u_short)))
2113
2114 int
2115 afs_readdir3(vp, uiop, cred)
2116      struct vnode *vp;
2117      struct uio *uiop;
2118      struct ucred *cred;
2119 {
2120     struct uio auio;
2121     struct iovec aiov;
2122     caddr_t ibuf, obuf, ibufend, obufend;
2123     struct __dirent32 *idp;
2124     struct __dirent64 *odp;
2125     int count, outcount;
2126     dir_off_t offset;
2127
2128     count = uiop->uio_resid;
2129     /* Allocate temporary space for format conversion */
2130     ibuf = kmem_alloc(2 * count);       /* overkill - fix later */
2131     obuf = kmem_alloc(count + sizeof(struct __dirent64));
2132     aiov.iov_base = ibuf;
2133     aiov.iov_len = count;
2134     auio.uio_iov = &aiov;
2135     auio.uio_iovcnt = 1;
2136     offset = auio.uio_offset = uiop->uio_offset;
2137     auio.uio_seg = UIOSEG_KERNEL;
2138     auio.uio_resid = count;
2139     auio.uio_fpflags = 0;
2140
2141     u.u_error = mp_afs_readdir2(vp, &auio, cred);
2142     if (u.u_error)
2143         goto out;
2144
2145     /* Convert entries from __dirent32 to __dirent64 format */
2146
2147     for (idp = (struct __dirent32 *)ibuf, odp =
2148          (struct __dirent64 *)obuf, ibufend =
2149          ibuf + (count - auio.uio_resid), obufend = obuf + count;
2150          (caddr_t) idp < ibufend;
2151          idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2152          (struct __dirent64 *)((caddr_t) odp + odp->__d_reclen)) {
2153         memcpy((char *)&odp->__d_off, (char *)&idp->__d_off,
2154                sizeof odp->__d_off);
2155         odp->__d_ino = idp->__d_ino;
2156         odp->__d_namlen = idp->__d_namlen;
2157         (void)strcpy(odp->__d_name, idp->__d_name);
2158         odp->__d_reclen = reclen_dirent64(odp);
2159         if ((caddr_t) odp + odp->__d_reclen > obufend)
2160             break;
2161         /* record offset *after* we're sure to use this entry */
2162         offset = odp->__d_off;
2163     }
2164
2165     outcount = (caddr_t) odp - obuf;
2166     AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2167     if (u.u_error)
2168         goto out;
2169     uiop->uio_offset = offset;
2170   out:
2171     kmem_free(ibuf, count);
2172     kmem_free(obuf, count + sizeof(struct __dirent64));
2173     return u.u_error;
2174 }
2175
/* Non-zero compiles in the semaphore save-area hash table code below. */
#define AFS_SV_SEMA_HASH 1
/* Non-zero makes the hash table routines printf a trace of their calls. */
#define AFS_SV_SEMA_HASH_DEBUG 0
2178
2179 #if AFS_SV_SEMA_HASH
/* This portion of the code was originally used to implement
 * thread-specific storage for the semaphore save area. However,
 * there were some spare fields in the proc structure, and these are
 * now being used for saving semaphores.  Hence, this portion of
 * the code is no longer used.
 */
2186
2187 /* This portion of the code implements thread specific information.
2188  * The thread id is passed in as the key. The semaphore saved area
2189  * is hashed on this key.
2190  */
2191
/* Why is this hash table required?
 * The AFS code is written in such a way that a GLOCK() is done in
 * one function and the GUNLOCK() is done in another function further
 * down the call chain. The GLOCK() call has to save the current
 * semaphore status before acquiring afs_global_sema. The GUNLOCK()
 * has to release afs_global_sema and reacquire the semaphore status
 * that existed before the corresponding GLOCK(). If GLOCK() and
 * GUNLOCK() were called in the same function, the GLOCK() call could
 * have stored the saved semaphore status in a local variable and the
 * corresponding GUNLOCK() call could have restored the original
 * status from this local variable. But this is not the case with
 * AFS code. Hence, we have to implement a thread-specific semaphore
 * save area. This is implemented as a hash table. The key is the
 * thread id.
 */
2207
2208 /* In order for multithreaded processes to work, the sv_sema structures
2209  * must be saved on a per-thread basis, not a per-process basis.  There
2210  * is no per-thread storage available to hijack in the OS per-thread
2211  * data structures (e.g. struct user) so we revive this code.
2212  * I removed the upper limit on the memory consumption since we don't
2213  * know how many threads there will be.  Now the code first checks the
2214  * freeList.  If that fails it then tries garbage collecting.  If that
2215  * doesn't free up anything then it allocs what it needs.
2216  */
2217
/* Value type stored in the table and key type used to look it up. */
#define ELEMENT         sv_sema_t
#define KEY             tid_t
/* Bucket index for a key: the key modulo the number of buckets. */
#define Hash(xx)        (  (xx) % sizeOfHashTable )
/* Lock helpers: initialize, acquire and release one of the semaphores. */
#define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
#define hashLock(xx)    MP_PSEMA(&xx)
#define hashUnlock(xx)  MP_VSEMA(&xx)

/* One table entry; linked either into a bucket chain or into freeList. */
typedef struct elem {
    struct elem *next;          /* next entry in the same list */
    ELEMENT element;            /* the stored value handed out to callers */
    KEY key;                    /* key this entry was inserted under */
    int refCnt;                 /* holds taken by afsHashInsertFind() and
                                 * dropped by afsHashRelease(); entries at
                                 * zero are reclaimed by the collector */
} Element;

/* One hash bucket: a lock plus the head of its chain of entries. */
typedef struct bucket {
    sema_t lock;
    Element *element;
} Bucket;

static int sizeOfHashTable;     /* number of buckets, set by afsHash() */
static Bucket *hashTable;       /* the bucket array, allocated by afsHash() */

/* Bytes accounted to the table: the bucket array plus entries in use. */
static int currentSize = 0;
static Element *freeList;       /* free list */

#pragma align 64
static sema_t afsHashLock = { 0 };      /* global lock for hash table */

static void afsHashGarbageCollect();
2247
2248 /*
2249 ** The global lock protects the global data structures,
2250 ** e.g. freeList and currentSize.
2251 ** The bucket lock protects the link list hanging off that bucket.
2252 ** The lock hierarchy : one can obtain the bucket lock while holding
2253 ** the global lock, but not vice versa.
2254 */
2255
2256
2257 void
2258 afsHash(int nbuckets)
2259 {                               /* allocate the hash table */
2260     int i;
2261
2262 #if AFS_SV_SEMA_HASH_DEBUG
2263     printf("afsHash: enter\n");
2264 #endif
2265
2266     sizeOfHashTable = nbuckets;
2267     currentSize = nbuckets * sizeof(Bucket);
2268
2269     if (hashTable)
2270         osi_Panic("afs: SEMA Hashtable already created\n");
2271
2272     hashTable = (Bucket *) AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
2273     if (!hashTable)
2274         osi_Panic("afs: cannot create SEMA Hashtable\n");
2275
2276     /* initialize the hash table and associated locks */
2277     memset((char *)hashTable, 0, sizeOfHashTable * sizeof(Bucket));
2278     for (i = 0; i < sizeOfHashTable; i++)
2279         hashLockInit(hashTable[i].lock);
2280     hashLockInit(afsHashLock);
2281
2282 #if AFS_SV_SEMA_HASH_DEBUG
2283     printf("afsHash: exit\n");
2284 #endif
2285 }
2286
2287 ELEMENT *
2288 afsHashInsertFind(KEY key)
2289 {
2290     int index;
2291     Element *ptr;
2292
2293 #if AFS_SV_SEMA_HASH_DEBUG
2294     printf("afsHashInsertFind: %d\n", key);
2295 #endif
2296     if (!hashTable)
2297         osi_Panic("afs: afsHashInsertFind: no hashTable\n");
2298
2299     index = Hash(key);          /* get bucket number */
2300     hashLock(hashTable[index].lock);    /* lock this bucket */
2301     ptr = hashTable[index].element;
2302
2303     /* if it is already there */
2304     while (ptr) {
2305         if (ptr->key == key) {
2306             ptr->refCnt++;      /* hold it */
2307             hashUnlock(hashTable[index].lock);
2308 #if AFS_SV_SEMA_HASH_DEBUG
2309             printf("afsHashInsertFind: %d FOUND\n", key);
2310 #endif
2311             return &(ptr->element);
2312         } else {
2313             ptr = ptr->next;
2314         }
2315     }
2316
2317     hashUnlock(hashTable[index].lock);
2318
2319     /*  if something exists in the freeList, take it from there */
2320     ptr = NULL;
2321     hashLock(afsHashLock);
2322
2323     if (freeList) {
2324         ptr = freeList;         /* reuse entry */
2325         freeList = freeList->next;
2326     } else {
2327         afsHashGarbageCollect();        /* afsHashLock locked */
2328         if (freeList) {
2329             ptr = freeList;     /* reuse entry */
2330             freeList = freeList->next;
2331         } else {
2332             ptr = (Element *) AFS_KALLOC(sizeof(Element));
2333         }
2334     }
2335
2336     currentSize += sizeof(Element);     /* update memory used */
2337     hashUnlock(afsHashLock);
2338
2339     if (!ptr)
2340         osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
2341     /* create new entry */
2342     ptr->key = key;
2343     memset((char *)&ptr->element, 0, sizeof(ptr->element));
2344     ptr->refCnt = 1;            /* this guy */
2345
2346     /* insert new entry in bucket */
2347     hashLock(hashTable[index].lock);    /* lock this bucket */
2348     ptr->next = hashTable[index].element;
2349     hashTable[index].element = ptr;
2350     hashUnlock(hashTable[index].lock);
2351
2352 #if AFS_SV_SEMA_HASH_DEBUG
2353     printf("afsHashInsertFind: %d MADE\n", key);
2354 #endif
2355
2356     return &(ptr->element);
2357 }
2358
2359 ELEMENT *
2360 afsHashFind(KEY key)
2361 {
2362     int index;
2363     Element *ptr;
2364
2365 #if AFS_SV_SEMA_HASH_DEBUG
2366     printf("afsHashFind: %d\n", key);
2367 #endif
2368     if (!hashTable)
2369         osi_Panic("afs: afsHashFind: no hashTable\n");
2370
2371     index = Hash(key);          /* get bucket number */
2372     hashLock(hashTable[index].lock);    /* lock this bucket */
2373     ptr = hashTable[index].element;
2374
2375     /* it should be in the hash table */
2376     while (ptr) {
2377         if (ptr->key == key) {
2378             if (ptr->refCnt <= 0)
2379                 osi_Panic("afs: SEMA HashTable entry already released\n");
2380             hashUnlock(hashTable[index].lock);
2381 #if AFS_SV_SEMA_HASH_DEBUG
2382             printf("afsHashFind: %d FOUND\n", key);
2383 #endif
2384             return &(ptr->element);
2385         } else {
2386             ptr = ptr->next;
2387         }
2388     }
2389
2390     hashUnlock(hashTable[index].lock);
2391     /* it better be in the hash table */
2392     osi_Panic("afs: SEMA HashTable wants non-existent entry \n");
2393     return 0;
2394 }
2395
2396 void
2397 afsHashRelease(KEY key)
2398 {
2399     int index;
2400     Element *ptr;
2401
2402 #if AFS_SV_SEMA_HASH_DEBUG
2403     printf("afsHashRelease: %d\n", key);
2404 #endif
2405     if (!hashTable)
2406         osi_Panic("afs: afsHashRelease: no hashTable\n");
2407
2408     index = Hash(key);          /* get bucket number */
2409     hashLock(hashTable[index].lock);    /* lock this bucket */
2410     ptr = hashTable[index].element;
2411
2412     /* it should be in the hash table */
2413     while (ptr) {
2414         if (ptr->key == key) {
2415             if (ptr->refCnt <= 0)
2416                 osi_Panic("afs: SEMA HashTable entry already released\n");
2417             ptr->refCnt--;      /* release this guy */
2418             hashUnlock(hashTable[index].lock);
2419 #if AFS_SV_SEMA_HASH_DEBUG
2420             printf("afsHashRelease: %d FOUND\n", key);
2421 #endif
2422             return;
2423         } else {
2424             ptr = ptr->next;
2425         }
2426     }
2427
2428     hashUnlock(hashTable[index].lock);
2429     /* it better be in the hash table */
2430     osi_Panic("afs: SEMA HashTable deleting non-existent entry \n");
2431 }
2432
2433 /* this should be called with afsHashLock WRITE locked */
2434 static void
2435 afsHashGarbageCollect()
2436 {
2437     int index;
2438     Element *ptr;
2439     int foundFlag = 0;
2440
2441     if (!hashTable)
2442         osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");
2443
2444     for (index = 0; index < sizeOfHashTable; index++) {
2445         hashLock(hashTable[index].lock);
2446         ptr = hashTable[index].element; /* pick up bucket */
2447
2448         while (ptr && !ptr->refCnt) {
2449             /* insert this element into free list */
2450             Element *temp;
2451             temp = ptr->next;
2452             ptr->next = freeList;
2453             freeList = ptr;
2454
2455             foundFlag = 1;      /* found at least one */
2456             currentSize -= sizeof(Element);
2457             ptr = temp;
2458         }
2459         hashTable[index].element = ptr;
2460
2461         /* scan thru the remaining list */
2462         if (ptr) {
2463             while (ptr->next) {
2464                 if (ptr->next->refCnt == 0) {
2465                     /* collect this element */
2466                     Element *temp;
2467                     temp = ptr->next;
2468                     ptr->next = ptr->next->next;
2469                     temp->next = freeList;
2470                     freeList = temp;
2471                     foundFlag = 1;
2472                     currentSize -= sizeof(Element);
2473                 } else {
2474                     ptr = ptr->next;
2475                 }
2476             }
2477         }
2478         hashUnlock(hashTable[index].lock);
2479     }
2480 #if 0
2481     if (!foundFlag)
2482         osi_Panic("afs: SEMA HashTable full\n");
2483 #endif
2484 }
2485
2486 #endif /* AFS_SV_SEMA_HASH */
2487
2488
2489 afs_hp_strategy(bp)
2490      register struct buf *bp;
2491 {
2492     register afs_int32 code;
2493     struct uio tuio;
2494     struct iovec tiovec[1];
2495     extern caddr_t hdl_kmap_bp();
2496     register struct kthread *t = u.u_kthreadp;
2497
2498     AFS_STATCNT(afs_hp_strategy);
2499     /*
2500      * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
2501      * the I/O.  We must save and restore the count because pageiodone()
2502      * uses b_bcount to determine how many pages to unlock.
2503      *
2504      * Remap the entire range.
2505      */
2506     hdl_kmap_bp(bp);
2507
2508     AFS_GLOCK();
2509     afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER, bp->b_vp,
2510                ICL_TYPE_LONG, (int)bp->b_blkno * DEV_BSIZE, ICL_TYPE_LONG,
2511                bp->b_bcount, ICL_TYPE_LONG, 0);
2512
2513     /* Set up the uio structure */
2514     tuio.afsio_iov = tiovec;
2515     tuio.afsio_iovcnt = 1;
2516     tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
2517     tuio.afsio_seg = AFS_UIOSYS;
2518     tuio.afsio_resid = bp->b_bcount;
2519     tuio.uio_fpflags = 0;
2520     tiovec[0].iov_base = bp->b_un.b_addr;
2521     tiovec[0].iov_len = bp->b_bcount;
2522
2523     /* Do the I/O */
2524     if ((bp->b_flags & B_READ) == B_READ) {
2525         /* read b_bcount bytes into kernel address b_un.b_addr
2526          * starting at byte DEV_BSIZE * b_blkno. Bzero anything
2527          * we can't read, and finally call iodone(bp).  File is
2528          * in bp->b_vp. Credentials are from u area??
2529          */
2530         code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_READ, 0, kt_cred(t));
2531         if (code == 0)
2532             if (tuio.afsio_resid > 0) {
2533                 privlbzero(bvtospace(bp, bp->b_un.b_addr),
2534                            bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
2535                            (size_t) tuio.afsio_resid);
2536
2537             }
2538     } else
2539         code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_WRITE, 0, kt_cred(t));
2540
2541     /* Remap back to the user's space */
2542     hdl_remap_bp(bp);
2543
2544     AFS_GUNLOCK();
2545
2546     iodone(bp);
2547     return code;
2548 }
2549
2550 afs_pathconf(vp, name, resultp, cred)
2551      struct vnode *vp;
2552      int name;
2553      int *resultp;
2554      struct ucred *cred;        /* unused */
2555 {
2556     switch (name) {
2557     case _PC_LINK_MAX:          /* Maximum number of links to a file */
2558         *resultp = 255;         /* an unsigned short on the fileserver */
2559         break;                  /* a unsigned char in the client.... */
2560
2561     case _PC_NAME_MAX:          /* Max length of file name */
2562         *resultp = 255;
2563         break;
2564
2565     case _PC_PATH_MAX:          /* Maximum length of Path Name */
2566         *resultp = 1024;
2567         break;
2568
2569     case _PC_PIPE_BUF:          /* Max atomic write to pipe.  See fifo_vnops */
2570     case _PC_CHOWN_RESTRICTED:  /* Anybody can chown? */
2571     case _PC_NO_TRUNC:          /* No file name truncation on overflow? */
2572         u.u_error = EOPNOTSUPP;
2573         return (EOPNOTSUPP);
2574         break;
2575
2576     case _PC_MAX_CANON: /* TTY buffer size for canonical input */
2577         /* need more work here for pty, ite buffer size, if differ */
2578         if (vp->v_type != VCHR) {
2579             u.u_error = EINVAL;
2580             return (EINVAL);
2581         }
2582         *resultp = CANBSIZ;     /*for tty */
2583         break;
2584
2585     case _PC_MAX_INPUT:
2586         /* need more work here for pty, ite buffer size, if differ */
2587         if (vp->v_type != VCHR) {       /* TTY buffer size */
2588             u.u_error = EINVAL;
2589             return (EINVAL);
2590         }
2591         *resultp = TTYHOG;      /*for tty */
2592         break;
2593
2594     case _PC_VDISABLE:
2595         /* Terminal special characters can be disabled? */
2596         if (vp->v_type != VCHR) {
2597             u.u_error = EINVAL;
2598             return (EINVAL);
2599         }
2600         *resultp = 1;
2601         break;
2602
2603     case _PC_SYNC_IO:
2604         if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2605             *resultp = -1;
2606             return EINVAL;
2607         }
2608         *resultp = 1;           /* Synchronized IO supported for this file */
2609         break;
2610
2611     case _PC_FILESIZEBITS:
2612         if (vp->v_type != VDIR)
2613             return (EINVAL);
2614         *resultp = MAX_SMALL_FILE_BITS;
2615         break;
2616
2617     default:
2618         return (EINVAL);
2619     }
2620
2621     return (0);
2622 }