src/afs/HPUX/osi_vnodeops.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 /* This is a placeholder for routines unique to the port of AFS to hp-ux*/
  11
  12 #include <afsconfig.h>
  13 #include "afs/param.h"
  14
  15
  16 #include "afs/sysincludes.h"    /* Standard vendor system headers */
  17 #include "afsincludes.h"        /* Afs-based standard headers */
  18 #include "afs/afs_stats.h"      /* statistics stuff */
  19
  20 #include <sys/uio.h>
  21 #include <sys/vfs.h>
  22 #include <sys/mount.h>
  23 #include <sys/vnode.h>
  24 #include <sys/pathname.h>
  25
  26 extern struct vfsops Afs_vfsops;
  27 extern int afs_hp_strategy();
  28 extern int afs_bmap(), afs_badop(), afs_noop(), afs_lockf();
  29 extern int afs_pagein();
  30 extern int afs_pageout();
  31 extern int afs_ioctl();
  32 extern int afs_prealloc();
  33 extern int afs_mapdbd();
  34 extern int afs_mmap();
  35 extern int afs_cachelimit();
  36 extern int afs_vm_checkpage();
  37 extern int afs_vm_fscontiguous();
  38 extern int afs_vm_stopio();
  39 extern int afs_read_ahead();
  40 extern int afs_unmap();
  41 extern int afs_release();
  42 extern int afs_swapfs_len();
  43 extern int afs_readdir2();
  44 extern int afs_readdir();
  45 extern int afs_readdir3();
  46 extern int afs_pathconf();
  47 extern int afs_close();
  48
  49 #define vtoblksz(vp)    ((vp)->v_vfsp->vfs_bsize)
  50
  51 #if defined(AFS_HPUX110_ENV)
  52 /* We no longer need to lock on the VM Empire,
  53  * or at least that is what is claimed.
  54  * so we will noopt the vmemp_ routines
  55  * This needs to be looked at closer.
  56  */
  57 #define vmemp_lockx()
  58 #undef  vmemp_returnx
  59 #define vmemp_returnx(a) return(a)
  60 #define vmemp_unlockx()
  61 #endif
  62
  63 #if !defined(AFS_HPUX110_ENV)
  64 /*
  65  * Copy an mbuf to the contiguous area pointed to by cp.
  66  * Skip <off> bytes and copy <len> bytes.
  67  * Returns the number of bytes not transferred.
  68  * The mbuf is NOT changed.
  69  */
  70 int
  71 m_cpytoc(m, off, len, cp)
  72      struct mbuf *m;
  73      int off, len;
  74      caddr_t cp;
  75 {
  76     int ml;
  77
  78     if (m == NULL || off < 0 || len < 0 || cp == NULL)
  79         osi_Panic("m_cpytoc");
  80     while (off && m)
  81         if (m->m_len <= off) {
  82             off -= m->m_len;
  83             m = m->m_next;
  84             continue;
  85         } else
  86             break;
  87     if (m == NULL)
  88         return (len);
  89
  90     ml = MIN(len, m->m_len - off);
  91     memcpy(cp, mtod(m, caddr_t) + off, (u_int) ml);
  92     cp += ml;
  93     len -= ml;
  94     m = m->m_next;
  95
  96     while (len && m) {
  97         ml = m->m_len;
  98         memcpy(cp, mtod(m, caddr_t), (u_int) ml);
  99         cp += ml;
 100         len -= ml;
 101         m = m->m_next;
 102     }
 103
 104     return (len);
 105 }
 106 #endif
 107
 108 /*
 109  *  Note that the standard Sun vnode interface doesn't haven't an vop_lockf(), so this code is
 110  * totally new.  This came about because HP-UX has lockf() implemented as
 111  * a system call while Sun has it implemented as a library (apparently).
 112  * To handle this, we have to translate the lockf() request into an
 113  * fcntl() looking request, and then translate the results back if necessary.
 114  * we call afs_lockctl() directly .
 115  */
 116 afs_lockf(vp, flag, len, cred, fp, LB, UB)
 117      struct vnode *vp;
 118      int flag;
 119      afs_ucred_t *cred;
 120      struct file *fp;
 121      k_off_t len, LB, UB;
 122 {
 123     /*for now, just pretend it works */
 124     struct k_flock flock;
 125     int cmd, code;
 126
 127     /*
 128      * Create a flock structure and translate the lockf request
 129      * into an appropriate looking fcntl() type request for afs_lockctl()
 130      */
 131     flock.l_whence = 0;
 132     flock.l_len = len;
 133     flock.l_start = fp->f_offset;
 134     /* convert negative lengths to positive */
 135     if (flock.l_len < 0) {
 136         flock.l_start += flock.l_len;
 137         flock.l_len = -(flock.l_len);
 138     }
 139     /*
 140      * Adjust values to look like fcntl() requests.
 141      * All locks are write locks, only F_LOCK requests
 142      * are blocking.  F_TEST has to be translated into
 143      * a get lock and then back again.
 144      */
 145     flock.l_type = F_WRLCK;
 146     cmd = F_SETLK;
 147     switch (flag) {
 148     case F_ULOCK:
 149         flock.l_type = F_UNLCK;
 150         break;
 151     case F_LOCK:
 152         cmd = F_SETLKW;
 153         break;
 154     case F_TEST:
 155         cmd = F_GETLK;
 156         break;
 157     }
 158     u.u_error = mp_afs_lockctl(vp, &flock, cmd, fp->f_cred);
 159     if (u.u_error) {
 160         return (u.u_error);     /* some other error code */
 161     }
 162     /*
 163      * if request is F_TEST, and GETLK changed
 164      * the lock type to ULOCK, then return 0, else
 165      * set errno to EACCESS and return.
 166      */
 167     if (flag == F_TEST && flock.l_type != F_UNLCK) {
 168         u.u_error = EACCES;
 169         return (u.u_error);
 170     }
 171     return (0);
 172 }
 173
 174
 175 #if defined(AFS_HPUX1122_ENV)
 176 #include "machine/vm/vmparam.h"
 177 #else
 178 #include "../machine/vmparam.h" /* For KERNELSPACE */
 179 #endif
 180 #include "h/debug.h"
 181 #include "h/types.h"
 182 #if !defined(AFS_HPUX1123_ENV)
 183         /* 11.23 is using 64 bit in many cases */
 184 #define kern_daddr_t daddr_t
 185 #endif
 186 #include "h/param.h"
 187 #include "h/vmmac.h"
 188 #include "h/time.h"
 189 #include "ufs/inode.h"
 190 #include "ufs/fs.h"
 191 #include "h/dbd.h"
 192 #if defined(AFS_HPUX1123_ENV)
 193 dbd_t       *finddbd();
 194 #endif /* AFS_HPUX1123_ENV */
 195 #include "h/vfd.h"
 196 #include "h/region.h"
 197 #include "h/pregion.h"
 198 #include "h/vmmeter.h"
 199 #include "h/user.h"
 200 #include "h/sysinfo.h"
 201 #include "h/pfdat.h"
 202 #if !defined(AFS_HPUX1123_ENV)
 203 #include "h/tuneable.h"
 204 #endif
 205 #include "h/buf.h"
 206 #include "netinet/in.h"
 207
 208 /* a freelist of one */
 209 struct buf *afs_bread_freebp = 0;
 210
 211 /*
 212  *  Only rfs_read calls this, and it only looks at bp->b_un.b_addr.
 213  *  Thus we can use fake bufs (ie not from the real buffer pool).
 214  */
 215 afs_bread(vp, lbn, bpp)
 216      struct vnode *vp;
 217      kern_daddr_t lbn;
 218      struct buf **bpp;
 219 {
 220     int offset, fsbsize, error;
 221     struct buf *bp;
 222     struct iovec iov;
 223     struct uio uio;
 224
 225     AFS_STATCNT(afs_bread);
 226     fsbsize = vp->v_vfsp->vfs_bsize;
 227     offset = lbn * fsbsize;
 228     if (afs_bread_freebp) {
 229         bp = afs_bread_freebp;
 230         afs_bread_freebp = 0;
 231     } else {
 232         bp = (struct buf *)AFS_KALLOC(sizeof(*bp));
 233         bp->b_un.b_addr = (caddr_t) AFS_KALLOC(fsbsize);
 234     }
 235
 236     iov.iov_base = bp->b_un.b_addr;
 237     iov.iov_len = fsbsize;
 238     uio.afsio_iov = &iov;
 239     uio.afsio_iovcnt = 1;
 240     uio.afsio_seg = AFS_UIOSYS;
 241     uio.afsio_offset = offset;
 242     uio.afsio_resid = fsbsize;
 243     uio.uio_fpflags = 0;
 244     *bpp = 0;
 245
 246     error = afs_read(VTOAFS(vp), &uio, p_cred(u.u_procp), 0);
 247     if (error) {
 248         afs_bread_freebp = bp;
 249         return error;
 250     }
 251     if (*bpp) {
 252         afs_bread_freebp = bp;
 253     } else {
 254         *(struct buf **)&bp->b_vp = bp; /* mark as fake */
 255         *bpp = bp;
 256     }
 257     return 0;
 258 }
 259
 260 afs_brelse(vp, bp)
 261      struct vnode *vp;
 262      struct buf *bp;
 263 {
 264     AFS_STATCNT(afs_brelse);
 265
 266     if ((struct buf *)bp->b_vp != bp) { /* not fake */
 267         ufs_brelse(bp->b_vp, bp);
 268     } else if (afs_bread_freebp) {
 269         AFS_KFREE(bp->b_un.b_addr, vp->v_vfsp->vfs_bsize);
 270         AFS_KFREE(bp, sizeof(*bp));
 271     } else {
 272         afs_bread_freebp = bp;
 273     }
 274 }
 275
 276
 277 afs_bmap(avc, abn, anvp, anbn)
 278      struct vcache *avc;
 279      kern_daddr_t abn, *anbn;
 280      struct vcache **anvp;
 281 {
 282     AFS_STATCNT(afs_bmap);
 283     if (anvp)
 284         *anvp = avc;
 285     if (anbn)
 286         *anbn = abn * (8192 / DEV_BSIZE);       /* in 512 byte units */
 287     return 0;
 288 }
 289
 290 afs_inactive(avc, acred)
 291      struct vcache *avc;
 292      afs_ucred_t *acred;
 293 {
 294     struct vnode *vp = AFSTOV(avc);
 295     ulong_t context;
 296     lock_t *sv_lock;
 297     if (afs_shuttingdown)
 298         return;
 299
 300     /*
 301      * In Solaris and HPUX s800 and HP-UX10.0 they actually call us with
 302      * v_count 1 on last reference!
 303      */
 304     MP_H_SPINLOCK_USAV(vn_h_sl_pool, vp, &sv_lock, &context);
 305     if (avc->vrefCount < 1)
 306         osi_Panic("afs_inactive : v_count < 1\n");
 307
 308     /*
 309      * If more than 1 don't unmap the vnode but do decrement the ref count
 310      */
 311     vp->v_count--;
 312     if (vp->v_count > 0) {
 313         MP_SPINUNLOCK_USAV(sv_lock, context);
 314         return 0;
 315     }
 316     MP_SPINUNLOCK_USAV(sv_lock, context);
 317     afs_InactiveVCache(avc, acred);
 318     return 0;
 319 }
 320
 321
 322 int
 323 mp_afs_open(struct vnode **avcp, int aflags, afs_ucred_t *acred)
 324 {
 325     int code;
 326
 327     AFS_GLOCK();
 328     code = afs_open(avcp, aflags, acred);
 329     AFS_GUNLOCK();
 330     return (code);
 331 }
 332
 333 int
 334 mp_afs_close(struct vnode *avcp, int aflags, afs_ucred_t *acred)
 335 {
 336     int code;
 337
 338     AFS_GLOCK();
 339     code = afs_close(avcp, aflags, acred);
 340     AFS_GUNLOCK();
 341     return (code);
 342 }
 343
 344 int
 345 mp_afs_rdwr(struct vnode *avcp, struct uio *uio, enum uio_rw arw,
 346             int aio, afs_ucred_t *acred)
 347 {
 348     int code;
 349     long save_resid;
 350
 351     AFS_GLOCK();
 352     save_resid = uio->uio_resid;
 353     code = afs_rdwr(avcp, uio, arw, aio, acred);
 354     if (arw == UIO_WRITE && code == ENOSPC) {
 355         /* HP clears code if any data written. */
 356         uio->uio_resid = save_resid;
 357     }
 358     AFS_GUNLOCK();
 359     return (code);
 360 }
 361
 362 int
 363 mp_afs_getattr(struct vnode *avcp, struct vattr *attrs,
 364                afs_ucred_t *acred, enum vsync unused1)
 365 {
 366     int code;
 367
 368     AFS_GLOCK();
 369     code = afs_getattr(avcp, attrs, acred);
 370     AFS_GUNLOCK();
 371     return (code);
 372 }
 373
 374 int
 375 mp_afs_setattr(struct vnode *avcp, struct vattr *attrs,
 376                afs_ucred_t *acred, int unused1)
 377 {
 378     int code;
 379
 380     AFS_GLOCK();
 381     code = afs_setattr(avcp, attrs, acred);
 382     AFS_GUNLOCK();
 383     return (code);
 384 }
 385
 386 int
 387 mp_afs_access(struct vnode *avcp, int mode, afs_ucred_t *acred)
 388 {
 389     int code;
 390
 391     AFS_GLOCK();
 392     code = afs_access(avcp, mode, acred);
 393     AFS_GUNLOCK();
 394     return (code);
 395 }
 396
 397 int
 398 mp_afs_lookup(struct vnode *adp, char *aname,
 399               struct vnode **avcp, afs_ucred_t *acred,
 400               struct vnode *unused1)
 401 {
 402     int code;
 403
 404     AFS_GLOCK();
 405     code = afs_lookup(adp, aname, avcp, acred);
 406     AFS_GUNLOCK();
 407     return (code);
 408 }
 409
 410 int
 411 mp_afs_create(struct vnode *adp, char *aname, struct vattr *attrs,
 412               enum vcexcl aexcl, int amode, struct vnode **avcp,
 413               afs_ucred_t *acred)
 414 {
 415     int code;
 416
 417     AFS_GLOCK();
 418     code = afs_create(adp, aname, attrs, aexcl, amode, avcp, acred);
 419     AFS_GUNLOCK();
 420     return (code);
 421 }
 422
 423
 424 int
 425 mp_afs_remove(struct vnode *adp, char *aname,
 426               afs_ucred_t *acred)
 427 {
 428     int code;
 429
 430     AFS_GLOCK();
 431     code = afs_remove(adp, aname, acred);
 432     AFS_GUNLOCK();
 433     return (code);
 434 }
 435
 436 int
 437 mp_afs_link(struct vnode *avc, struct vnode *adp,
 438             char *aname, afs_ucred_t *acred)
 439 {
 440     int code;
 441
 442     AFS_GLOCK();
 443     code = afs_link(avc, adp, aname, acred);
 444     AFS_GUNLOCK();
 445     return (code);
 446 }
 447
 448 int
 449 mp_afs_rename(struct vnode *aodp, char *aname1,
 450               struct vnode *andp, char *aname2,
 451               afs_ucred_t *acred)
 452 {
 453     int code;
 454
 455     AFS_GLOCK();
 456     code = afs_rename(aodp, aname1, andp, aname2, acred);
 457     AFS_GUNLOCK();
 458     return (code);
 459 }
 460
 461 int
 462 mp_afs_mkdir(struct vnode *adp, char *aname, struct vattr *attrs,
 463              struct vnode **avcp, afs_ucred_t *acred)
 464 {
 465     int code;
 466
 467     AFS_GLOCK();
 468     code = afs_mkdir(adp, aname, attrs, avcp, acred);
 469     AFS_GUNLOCK();
 470     return (code);
 471 }
 472
 473
 474 int
 475 mp_afs_rmdir(struct vnode *adp, char *aname, afs_ucred_t *acred)
 476 {
 477     int code;
 478
 479     AFS_GLOCK();
 480     code = afs_rmdir(adp, aname, acred);
 481     AFS_GUNLOCK();
 482     return (code);
 483 }
 484
 485
 486 int
 487 mp_afs_readdir(struct vnode *avc, struct uio *auio,
 488                afs_ucred_t *acred)
 489 {
 490     int code;
 491
 492     AFS_GLOCK();
 493     code = afs_readdir(avc, auio, acred);
 494     AFS_GUNLOCK();
 495     return (code);
 496 }
 497
 498 int
 499 mp_afs_symlink(struct vnode *adp, char *aname, struct vattr *attrs,
 500                char *atargetName, afs_ucred_t *acred)
 501 {
 502     int code;
 503
 504     AFS_GLOCK();
 505     code = afs_symlink(adp, aname, attrs, atargetName, acred);
 506     AFS_GUNLOCK();
 507     return (code);
 508 }
 509
 510
 511 int
 512 mp_afs_readlink(struct vnode *avc, struct uio *auio,
 513                 afs_ucred_t *acred)
 514 {
 515     int code;
 516
 517     AFS_GLOCK();
 518     code = afs_readlink(avc, auio, acred);
 519     AFS_GUNLOCK();
 520     return (code);
 521 }
 522
 523 int
 524 mp_afs_fsync(struct vnode *avc, afs_ucred_t *acred, int unused1)
 525 {
 526     int code;
 527
 528     AFS_GLOCK();
 529     code = afs_fsync(avc, acred);
 530     AFS_GUNLOCK();
 531     return (code);
 532 }
 533
 534 int
 535 mp_afs_bread(struct vnode *avc, kern_daddr_t lbn, struct buf **bpp,
 536              struct vattr *unused1, struct ucred *unused2)
 537 {
 538     int code;
 539
 540     AFS_GLOCK();
 541     code = afs_bread(avc, lbn, bpp);
 542     AFS_GUNLOCK();
 543     return (code);
 544 }
 545
 546 int
 547 mp_afs_brelse(struct vnode *avc, struct buf *bp)
 548 {
 549     int code;
 550
 551     AFS_GLOCK();
 552     code = afs_brelse(avc, bp);
 553     AFS_GUNLOCK();
 554     return (code);
 555 }
 556
 557
 558 int
 559 mp_afs_inactive(struct vnode *avc, afs_ucred_t *acred)
 560 {
 561     int code;
 562
 563     AFS_GLOCK();
 564     code = afs_inactive(avc, acred);
 565     AFS_GUNLOCK();
 566     return (code);
 567 }
 568
 569 int
 570 mp_afs_lockctl(struct vnode *avc, struct flock *af, int cmd,
 571                afs_ucred_t *acred, struct file *unused1, off_t unused2,
 572                off_t unused3)
 573 {
 574     int code;
 575
 576     AFS_GLOCK();
 577     code = afs_lockctl(avc, af, cmd, acred);
 578     AFS_GUNLOCK();
 579     return (code);
 580 }
 581
 582 int
 583 mp_afs_fid(struct vnode *avc, struct fid **fidpp)
 584 {
 585     int code;
 586
 587     AFS_GLOCK();
 588     code = afs_fid(avc, fidpp);
 589     AFS_GUNLOCK();
 590     return (code);
 591 }
 592
 593 int
 594 mp_afs_readdir2(struct vnode *avc, struct uio *auio,
 595                 afs_ucred_t *acred)
 596 {
 597     int code;
 598
 599     AFS_GLOCK();
 600     code = afs_readdir2(avc, auio, acred);
 601     AFS_GUNLOCK();
 602     return (code);
 603 }
 604
 605
 606 struct vnodeops Afs_vnodeops = {
 607     mp_afs_open,
 608     mp_afs_close,
 609     mp_afs_rdwr,
 610     afs_ioctl,
 611     afs_noop,
 612     mp_afs_getattr,
 613     mp_afs_setattr,
 614     mp_afs_access,
 615     mp_afs_lookup,
 616     mp_afs_create,
 617     mp_afs_remove,
 618     mp_afs_link,
 619     mp_afs_rename,
 620     mp_afs_mkdir,
 621     mp_afs_rmdir,
 622     afs_readdir,
 623     mp_afs_symlink,
 624     mp_afs_readlink,
 625     mp_afs_fsync,
 626     mp_afs_inactive,
 627     afs_bmap,
 628     afs_hp_strategy,
 629 #if     !defined(AFS_NONFSTRANS)
 630     /* on HPUX102 the nfs translator calls afs_bread but does
 631      * not call afs_brelse. Hence we see a memory leak. If the
 632      * VOP_BREAD() call fails, then nfs does VOP_RDWR() to get
 633      * the same data : this is the path we follow now. */
 634     afs_noop,
 635     afs_noop,
 636 #else
 637     mp_afs_bread,
 638     mp_afs_brelse,
 639 #endif
 640     afs_badop,                  /* pathsend */
 641     afs_noop,                   /* setacl */
 642     afs_noop,                   /* getacl */
 643     afs_pathconf,
 644     afs_pathconf,
 645     mp_afs_lockctl,
 646     afs_lockf,                  /* lockf */
 647     mp_afs_fid,
 648     afs_noop,                   /*fsctl */
 649     afs_badop,
 650     afs_pagein,
 651     afs_pageout,
 652     NULL,
 653     NULL,
 654     afs_prealloc,
 655     afs_mapdbd,
 656     afs_mmap,
 657     afs_cachelimit,
 658     afs_vm_checkpage,
 659     afs_vm_fscontiguous,
 660     afs_vm_stopio,
 661     afs_read_ahead,
 662     afs_release,
 663     afs_unmap,
 664     afs_swapfs_len,
 665     mp_afs_readdir2,
 666     afs_readdir3,
 667 };
 668
 669 struct vnodeops *afs_ops = &Afs_vnodeops;
 670
 671 /* vnode file operations, and our own */
 672 extern int vno_rw();
 673 extern int vno_ioctl();
 674 extern int vno_select();
 675 extern int afs_closex();
 676 extern int vno_close();
 677 struct fileops afs_fileops = {
 678     vno_rw,
 679     vno_ioctl,
 680     vno_select,
 681     afs_close,
 682 };
 683
 684 #define vtoblksz(vp)    ((vp)->v_vfsp->vfs_bsize)
 685
 686 /*
 687  ********************************************************************
 688  ****
 689  ****                   afspgin_setup_io_ranges ()
 690  ****    similar to:    nfspgin_setup_io_ranges ()
 691  ********************************************************************
 692  */
 693 pgcnt_t
 694 afspgin_setup_io_ranges(vfspage_t * vm_info, pgcnt_t bpages, k_off_t isize,
 695                         pgcnt_t startindex)
 696 {
 697     pgcnt_t file_offset = VM_FILE_OFFSET(vm_info);
 698     pgcnt_t minpage;            /* first page to bring in */
 699     pgcnt_t maxpage;            /* one past last page to bring in */
 700     pgcnt_t maxpagein;
 701     pgcnt_t multio_maxpage;
 702     kern_daddr_t start_blk;
 703     dbd_t *dbd;
 704     expnd_flags_t up_reason, down_reason;
 705     int count = 1;
 706     int indx = 0;
 707     int max_num_io;
 708     int dbdtype;
 709     preg_t *prp;
 710
 711     VM_GET_IO_INFO(vm_info, maxpagein, max_num_io);
 712
 713     /*
 714      * We do not go past the end of the current pregion nor past the end
 715      * of the current file.
 716      */
 717
 718     maxpage = startindex + (bpages - (startindex + file_offset) % bpages);
 719     maxpage = vm_reset_maxpage(vm_info, maxpage);
 720     maxpage = MIN(maxpage, (pgcnt_t) btorp(isize) - file_offset);
 721     maxpage = MIN(maxpage, startindex + maxpagein);
 722     multio_maxpage = maxpage = vm_maxpage(vm_info, maxpage);
 723
 724     if (!maxpage)
 725         return (0);
 726
 727     VASSERT(maxpage >= startindex);
 728
 729     /*
 730      * Expanding the fault will create calls to FINDENTRY() for new
 731      * pages, which will obsolete "dbd", so copy what it points to
 732      * and clear it to prevent using stale data.
 733      */
 734
 735     prp = VM_PRP(vm_info);
 736     dbdtype = DBD_TYPE(vm_info);
 737     start_blk = DBD_DATA(vm_info);
 738     vm_info->dbd = NULL;
 739     vm_info->vfd = NULL;
 740     VASSERT(dbdtype != DBD_NONE);
 741
 742     if (max_num_io == 1) {
 743         /*
 744          * We need to set up one I/O: First we attempt to expand the
 745          * I/O forward. Then we expand the I/O backwards.
 746          */
 747         count =
 748             expand_faultin_up(vm_info, dbdtype, (int)bpages, maxpage, count,
 749                               startindex, start_blk, &up_reason);
 750         maxpage = startindex + count;
 751         VASSERT(maxpage <= startindex + maxpagein);
 752         minpage = startindex - (startindex + file_offset) % bpages;
 753         minpage = MAX(minpage, maxpage - maxpagein);
 754         VASSERT(startindex >= VM_BASE_OFFSET(vm_info));
 755         minpage = vm_minpage(vm_info, minpage);
 756         VASSERT(minpage <= startindex);
 757         count =
 758             expand_faultin_down(vm_info, dbdtype, (int)bpages, minpage, count,
 759                                 &startindex, &start_blk, &down_reason);
 760         VM_SET_IO_STARTINDX(vm_info, 0, startindex);
 761         VM_SET_IO_STARTBLK(vm_info, 0, start_blk);
 762         VM_SET_IO_COUNT(vm_info, 0, count);
 763         VM_SET_NUM_IO(vm_info, 1);
 764     }
 765
 766     if (max_num_io > 1) {
 767         /*
 768          * We need to set up multiple I/O information; beginning
 769          * with the startindex, we will expand upwards. The expansion
 770          * could stop for one of 2 reasons; we take the appropriate
 771          * action in each of these cases:
 772          *      o VM reasons: abort setting up the multiple I/O
 773          *        information and return to our caller indicating
 774          *        that "retry" is required.
 775          *      o pagelimit: set up the next I/O info [we may have
 776          *        reached multio_maxpage at this point].
 777          * Note that expansion involves no more than a block at a time;
 778          * hence it could never stop due to "discontiguous block"
 779          * reason.
 780          */
 781         startindex = minpage = vm_minpage(vm_info, 0);
 782         for (indx = 0; (indx < max_num_io) && (startindex < multio_maxpage);
 783              indx++, startindex += count) {
 784             dbd = FINDDBD(prp->p_reg, startindex);
 785             start_blk = dbd->dbd_data;
 786             maxpage =
 787                 startindex + (bpages - (startindex + file_offset) % bpages);
 788             maxpage = min(maxpage, multio_maxpage);
 789             count =
 790                 expand_faultin_up(vm_info, dbdtype, bpages, maxpage,
 791                                   1 /* count */ ,
 792                                   startindex, start_blk, &up_reason);
 793             VM_SET_IO_STARTINDX(vm_info, indx, startindex);
 794             VM_SET_IO_STARTBLK(vm_info, indx, start_blk);
 795             VM_SET_IO_COUNT(vm_info, indx, count);
 796             if (up_reason & VM_REASONS)
 797                 break;
 798             VASSERT(!(up_reason & NONCONTIGUOUS_BLOCK));
 799             VASSERT(up_reason & PAGELIMIT);
 800         }
 801         if (startindex < multio_maxpage) {
 802             VM_MULT_IO_FAILURE(vm_info);
 803             VM_REINIT_FAULT_DBDVFD(vm_info);
 804             return (0);         /* retry */
 805         }
 806         count = maxpagein;
 807         VM_SET_NUM_IO(vm_info, indx);
 808     }
 809
 810     /*
 811      * Tell VM where the I/O intends to start.  This may be different
 812      * from the faulting point.
 813      */
 814
 815     VM_SET_STARTINDX(vm_info, VM_GET_IO_STARTINDX(vm_info, 0));
 816
 817     return (count);
 818
 819 }
 820
 821 /*
 822  ********************************************************************
 823  ****
 824  ****                   afspgin_blkflsh ()
 825  ****   similar to:     nfspgin_blkflsh ()
 826  ********************************************************************
 827  */
 828 retval_t
 829 afspgin_blkflsh(vfspage_t * vm_info, struct vnode * devvp, pgcnt_t * num_4k)
 830 {
 831     int flush_reslt = 0;
 832     pgcnt_t count = *num_4k;
 833     pgcnt_t page_count;
 834     int indx = 0;
 835     int num_io = VM_GET_NUM_IO(vm_info);
 836
 837     /*
 838      * On this blkflush() we don't want to purge the buffer cache and we do
 839      * want to wait, so the flags are '0'.
 840      */
 841
 842     for (indx = 0; indx < num_io; indx++) {
 843         flush_reslt =
 844             blkflush(devvp, (kern_daddr_t) VM_GET_IO_STARTBLK(vm_info, indx),
 845                      ptob(VM_GET_IO_COUNT(vm_info, indx)), 0,
 846                      VM_REGION(vm_info));
 847         if (flush_reslt) {
 848             vm_lock(vm_info);
 849             if (vm_page_now_valid(vm_info, &page_count)) {
 850                 vm_release_memory(vm_info);
 851                 vm_release_structs(vm_info);
 852                 *num_4k = page_count;
 853                 return (VM_PAGE_PRESENT);
 854             }
 855             return (VM_RETRY);
 856         }
 857     }
 858     return (VM_DONE);
 859 }
 860
 861 /*
 862  ********************************************************************
 863  ****
 864  ****                   afspgin_io ()
 865  ****    similar to:    nfspgin_io ()
 866  ********************************************************************
 867  */
 868 int
 869 afspgin_io(vfspage_t * vm_info, struct vnode *devvp, pgcnt_t bpages,
 870            pgcnt_t maxpagein, pgcnt_t count)
 871 {
 872     int i;
 873     int error = 0;
 874     caddr_t vaddr = VM_ADDR(vm_info);
 875     caddr_t virt_addr = VM_MAPPED_ADDR(vm_info);
 876     pagein_info_t *io = VM_PAGEIN_INFO(vm_info);
 877     preg_t *prp = VM_PRP(vm_info);
 878     int wrt = VM_WRT(vm_info);
 879     space_t space = VM_SPACE(vm_info);
 880     int num_io = VM_GET_NUM_IO(vm_info);
 881
 882 #ifdef notdef                   /* Not used in AFS */
 883     /*
 884      * With VM_READ_AHEAD_ALLOWED() macro, check if read-ahead should
 885      * be used in this case.
 886      *
 887      * Unlike UFS, NFS does not start the faulting page I/O
 888      * asynchronously. Why?  Asynchronous requests are handled by the
 889      * biod's.  It doesn't make sense to queue up the faulting request
 890      * behind other asynchrnous requests.  This is not true for UFS
 891      * where the asynchrnous request is immediately handled.
 892      */
 893
 894     if ((VM_READ_AHEAD_ALLOWED(vm_info)) && (nfs_read_ahead_on)
 895         && (NFS_DO_READ_AHEAD) && (should_do_read_ahead(prp, vaddr))) {
 896
 897         pgcnt_t max_rhead_io;
 898         caddr_t rhead_vaddr;
 899         pgcnt_t total_rheads_allowed;
 900
 901         /*
 902          * Determine the maximum amount of read-ahead I/O.
 903          */
 904         total_rheads_allowed = maxpagein - count;
 905
 906         /*
 907          * If the count is less than a block, raise it to one.
 908          */
 909         if (total_rheads_allowed < bpages)
 910             total_rheads_allowed = bpages;
 911
 912         max_rhead_io = total_rheads_allowed;
 913         rhead_vaddr = VM_MAPPED_ADDR(vm_info) + (count * NBPG);
 914         error =
 915             nfs_read_ahead(vm_info->vp, prp, wrt, space, rhead_vaddr,
 916                            &max_rhead_io);
 917
 918         /*
 919          * Set the next fault location.  If read_ahead launches any
 920          * I/O it will adjust it accordingly.
 921          */
 922         vm_info->prp->p_nextfault = vm_info->startindex + count;
 923
 924         /*
 925          * Now perform the faulting I/O synchronously.
 926          */
 927         vm_unlock(vm_info);
 928
 929         error =
 930             syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, 0),
 931                        VM_MAPPED_SPACE(vm_info), VM_MAPPED_ADDR(vm_info),
 932                        (int)ptob(count), B_READ, devvp,
 933                        B_vfs_pagein | B_pagebf, VM_REGION(vm_info));
 934     } else
 935 #endif
 936     {
 937         virt_addr = VM_MAPPED_ADDR(vm_info);
 938         vm_unlock(vm_info);
 939         for (i = 0; i < num_io; i++) {
 940             /*
 941              * REVISIT -- investigate doing asyncpageio().
 942              */
 943             error |= (io[i].error =
 944                       syncpageio((swblk_t) VM_GET_IO_STARTBLK(vm_info, i),
 945                                  VM_MAPPED_SPACE(vm_info), virt_addr,
 946                                  (int)ptob(VM_GET_IO_COUNT(vm_info, i)),
 947                                  B_READ, devvp, B_vfs_pagein | B_pagebf,
 948                                  VM_REGION(vm_info)));
 949             virt_addr += ptob(VM_GET_IO_COUNT(vm_info, i));
 950         }
 951         /*
 952          * Set the next fault location.  If read_ahead launches any
 953          * I/O it will adjust it accordingly.
 954          */
 955         vm_info->prp->p_nextfault = vm_info->startindex + count;
 956     }
 957
 958     return (error);
 959 }
 960
 961 /*
 962  ********************************************************************
 963  ****
 964  ****                   afspgin_update_dbd ()
 965  ****    similar to:    nfspgin_update_dbd ()
 966  ********************************************************************
 967  */
 968 void
 969 afspgin_update_dbd(vfspage_t * vm_info, int bsize)
 970 {
 971     k_off_t off;
 972     pgcnt_t count = bsize / NBPG;
 973     k_off_t rem;
 974     pgcnt_t m;
 975     pgcnt_t pgindx;
 976     kern_daddr_t blkno;
 977     int num_io = VM_GET_NUM_IO(vm_info);
 978     int i;
 979
 980     for (i = 0; i < num_io; i++) {
 981
 982         pgindx = VM_GET_IO_STARTINDX(vm_info, i);
 983         off = vnodindx(VM_REGION(vm_info), pgindx);
 984         rem = off % bsize;
 985         blkno = VM_GET_IO_STARTBLK(vm_info, i);
 986
 987         VASSERT(bsize % NBPG == 0);
 988         VASSERT(rem % NBPG == 0);
 989
 990         pgindx -= (pgcnt_t) btop(rem);
 991         blkno -= (kern_daddr_t) btodb(rem);
 992
 993         /*
 994          * This region could start in mid-block.  If so, pgindx
 995          * could be less than 0, so we adjust pgindx and blkno back
 996          * up so that pgindx is 0.
 997          */
 998
 999         if (pgindx < 0) {
1000             pgcnt_t prem;
1001             prem = 0 - pgindx;
1002             pgindx = 0;
1003             count -= prem;
1004             blkno += btodb(ptob(prem));
1005         }
1006
1007         for (m = 0; m < count && pgindx < VM_REGION_SIZE(vm_info);
1008              m++, pgindx++, blkno += btodb(NBPG)) {
1009             /*
1010              * Note:  since this only changes one block, it
1011              * assumes only one block was faulted in.  Currently
1012              * this is always true for remote files, and we only
1013              * get here for remote files, so everything is ok.
1014              */
1015             vm_mark_dbd(vm_info, pgindx, blkno);
1016         }
1017     }
1018 }
1019
1020 int
1021 afs_pagein(vp, prp, wrt, space, vaddr, ret_startindex)
1022      struct vnode *vp;
1023      preg_t *prp;
1024      int wrt;
1025      space_t space;
1026      caddr_t vaddr;
1027      pgcnt_t *ret_startindex;
1028 {
1029     pgcnt_t startindex;
1030     pgcnt_t pgindx = *ret_startindex;
1031     pgcnt_t maxpagein;
1032     struct vnode *devvp;
1033     pgcnt_t count;
1034     kern_daddr_t start_blk = 0;
1035     int bsize;
1036     int error;
1037     k_off_t isize;
1038     int shared;                 /* writable memory mapped file */
1039     retval_t retval = 0;
1040     pgcnt_t ok_dbd_limit = 0;   /* last dbd that we can trust */
1041     pgcnt_t bpages;             /* number of pages per block */
1042     pgcnt_t page_count;
1043     vfspage_t *vm_info = NULL;
1044     int done;
1045
1046     struct vattr va;
1047
1048     caddr_t nvaddr;
1049     space_t nspace;
1050     int change_to_fstore = 0;   /* need to change dbds to DBD_FSTORE */
1051     int flush_start_blk = 0;
1052     int flush_end_blk = 0;
1053
1054     int i, j;
1055
1056     AFS_STATCNT(afs_pagein);
1057     vmemp_lockx();              /* lock down VM empire */
1058
1059     /* Initialize the VM info structure */
1060     done =
1061         vm_pagein_init(&vm_info, prp, pgindx, space, vaddr, wrt, 0,
1062                        LGPG_ENABLE);
1063
1064     /* Check to see if we slept and the page was falted in. */
1065     if (done) {
1066         vm_release_structs(vm_info);
1067         vmemp_returnx(1);
1068     }
1069
1070     vp = VM_GET_PAGEIN_VNODE(vm_info);
1071     VASSERT(vp != NULL);
1072     shared = VM_SHARED_OBJECT(vm_info);
1073     VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1074
1075     /*
1076      * Get the devvp and block size for this vnode type
1077      */
1078     devvp = vp;
1079     bsize = vp->v_vfsp->vfs_bsize;
1080     if (bsize <= 0 || (bsize & (DEV_BSIZE - 1)))
1081         osi_Panic("afs_pagein: bsize is zero or not a multiple of DEV_BSIZE");
1082
1083     bpages = (pgcnt_t) btop(bsize);
1084     VASSERT(bpages > 0);
1085     VM_SET_FS_MAX_PAGES(vm_info, bpages);
1086
1087     /* this trace cannot be here because the afs_global lock might not be
1088      * held at this point. We hold the vm global lock throughout
1089      * this procedure ( and not the AFS global lock )
1090      * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEIN, ICL_TYPE_POINTER, (afs_int32) vp,
1091      * ICL_TYPE_LONG, DBD_TYPE(vm_info), ICL_TYPE_LONG, bpages,
1092      * ICL_TYPE_LONG, shared);
1093      */
1094     /* Come here if we have to release the region lock before
1095      * locking pages.  This can happen in memreserve() and
1096      * blkflush().
1097      */
1098   retry:
1099     /*
1100      * For remote files like ours, we want to check to see if the file has shrunk.
1101      * If so, we should invalidate any pages past the end.  In the name
1102      * of efficiency, we only do this if the page we want to fault is
1103      * past the end of the file.
1104      */
1105     {
1106         if (VOP_GETATTR(vp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1107             VM_ZOMBIE_OBJECT(vm_info);
1108             vm_release_memory(vm_info);
1109             vm_release_structs(vm_info);
1110             vmemp_returnx(0);
1111         }
1112         isize = va.va_size;
1113         if (vnodindx(VM_REGION(vm_info), pgindx) >= isize) {
1114             /*
1115              * The file has shrunk and someone is trying to access a
1116              * page past the end of the object.  Shrink the object back
1117              * to its currrent size, send a SIGBUS to the faulting
1118              * process and return.
1119              *
1120              * We must release the region lock before calling mtrunc(),
1121              * since mtrunc() locks all the regions that are using this
1122              * file.
1123              */
1124             vm_release_memory(vm_info);
1125             vm_truncate_region(vm_info, isize);
1126             vm_release_structs(vm_info);
1127             vmemp_returnx(-SIGBUS);
1128         }
1129     }
1130
1131     maxpagein = vm_pick_maxpagein(vm_info);
1132     if (vm_wait_for_memory(vm_info, maxpagein, 1)) {
1133         /* Check to see if we should continue faulting.  */
1134         if (vm_page_now_valid(vm_info, &page_count)) {
1135             vm_release_memory(vm_info);
1136             vm_release_structs(vm_info);
1137             vmemp_returnx(page_count);
1138         }
1139     }
1140     if (count = vm_no_io_required(vm_info)) {
1141         /* Release any excess memory.  */
1142         vm_release_memory(vm_info);
1143         vm_release_structs(vm_info);
1144         vmemp_returnx(count);
1145     }
1146 #ifdef OSDEBUG
1147     /*
1148      * We should never have DBD_HOLE pages in a non-MMF region.
1149      */
1150     if (!shared)
1151         VASSERT(dbd->dbd_type != DBD_HOLE);
1152 #endif
1153     VASSERT(DBD_TYPE(vm_info) != DBD_NONE);
1154
1155     startindex = *ret_startindex;
1156
1157     /*
1158      * If the page we want is in memory already, take it
1159      */
1160     if (VM_MEMORY_RESERVED(vm_info) < maxpagein) {
1161         /* pick up the rest of memory now.  */
1162         if (vm_wait_for_memory(vm_info, maxpagein, 0)) {
1163             if (vm_page_now_valid(vm_info, &page_count)) {
1164                 vm_release_memory(vm_info);
1165                 vm_release_structs(vm_info);
1166                 vmemp_returnx(page_count);
1167             }
1168             goto retry;
1169         }
1170     }
1171
1172     if (!
1173         (count =
1174          afspgin_setup_io_ranges(vm_info, bpages, isize, startindex))) {
1175         goto retry;
1176     }
1177
1178     startindex = VM_GET_STARTINDX(vm_info);
1179
1180     VASSERT(maxpagein >= count);
1181
1182     /*
1183      * Release the memory we won't need.
1184      */
1185     if (count < maxpagein) {
1186         vm_release_excess_memory(vm_info,
1187                                  (VM_MEMORY_RESERVED(vm_info) - count));
1188     }
1189
1190     retval = afspgin_blkflsh(vm_info, devvp, &count);
1191
1192     if (retval == VM_RETRY) {
1193         goto retry;
1194     }
1195
1196     if (retval == VM_PAGE_PRESENT)
1197         return (count);
1198
1199 #if 0
1200     /*
1201      * The definition of krusage_cntr_t is in h/kmetric.h, which
1202      * is not shipped.  Since it's just statistics, we punt and do
1203      * not update it.  If it's a problem we'll need to get HP to export
1204      * an interface that we can use to increment the counter.
1205      */
1206
1207     /* It's a real fault, not a reclaim */
1208     {
1209         krusage_cntr_t *temp;
1210         temp = kt_cntrp(u.u_kthreadp);
1211         temp->krc_majflt++;
1212     }
1213 #endif
1214
1215     /*
1216      * Tell VM where the I/O intends to start.  This may be different
1217      * from the faulting point.
1218      */
1219
1220     /*
1221      * vm_prepare_io will fill the region with pages and release the
1222      * region lock.
1223      */
1224     vm_prepare_io(vm_info, &count);
1225
1226     /*
1227      * Count may have been adjusted, check to make sure it's non-zero.
1228      */
1229     if (count == 0) {
1230         if (vm_retry(vm_info)) {
1231             goto retry;
1232         }
1233
1234         /*
1235          * Release resources and retry the fault.  Release any excess
1236          * memory.
1237          */
1238
1239         vm_release_memory(vm_info);
1240         vm_release_structs(vm_info);
1241         vmemp_returnx(0);
1242     }
1243
1244     error = afspgin_io(vm_info, devvp, bpages, maxpagein, count);
1245
1246     if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1247         retval = -SIGBUS;
1248         VM_ZOMBIE_OBJECT(vm_info);
1249         goto backout;
1250     }
1251     /*
1252      * For a writable memory mapped file that is remote we must
1253      * detect potential holes in the file and force allocation of
1254      * disk space on the remote system.  Unfortunately, there is
1255      * no easy way to do this, so this gets a little ugly.
1256      */
1257     if (shared && wrt) {
1258         /*
1259          * See if The user wants to write to this page.  Write some
1260          * minimal amount of data back to the remote file to
1261          * force allocation of file space.  We only need to
1262          * write a small amount, since holes are always at
1263          * least one filesystem block in size.
1264          */
1265         error = vm_alloc_hole(vm_info);
1266
1267         /*
1268          * If some sort of I/O error occurred we generate a
1269          * SIGBUS for the process that caused the write,
1270          * undo our page locks, etc and return.
1271          */
1272         if ((VM_IS_ZOMBIE(vm_info)) || (error)) {
1273             VM_ZOMBIE_OBJECT(vm_info);
1274             retval = -SIGBUS;
1275             goto backout;
1276         }
1277
1278         /*
1279          * Change these dbds to DBD_FSTORE.  We cannot do it here,
1280          * since the region must be locked, and it is not locked
1281          * at the moment.  We cannot lock the region yet, as we
1282          * first have to release the page locks.
1283          */
1284         change_to_fstore = 1;
1285     }
1286
1287     vm_finish_io(vm_info, count);
1288
1289     /*
1290      * Acquire the lock before we play around with changing the vfd's.
1291      */
1292     vm_lock(vm_info);
1293
1294     if (change_to_fstore)
1295         afspgin_update_dbd(vm_info, bsize);
1296
1297 #if defined(AFS_HPUX110_ENV)
1298     getppdp()->cnt.v_exfod += count;
1299 #else
1300     mpproc_info[getprocindex()].cnt.v_exfod += count;
1301 #endif
1302     vmemp_unlockx();            /* free up VM empire */
1303     *ret_startindex = startindex;
1304
1305     /*
1306      * In case we have any excess memory...
1307      */
1308     if (VM_MEMORY_RESERVED(vm_info))
1309         vm_release_memory(vm_info);
1310     vm_release_structs(vm_info);
1311
1312     return count;
1313
1314   backout:
1315
1316     vm_finish_io_failed(vm_info, count);
1317
1318     vm_lock(vm_info);
1319
1320     vm_undo_validation(vm_info, count);
1321
1322     /*
1323      * In case we have any excess memory...
1324      */
1325     if (VM_MEMORY_RESERVED(vm_info))
1326         vm_release_memory(vm_info);
1327     vm_release_structs(vm_info);
1328
1329     vmemp_unlockx();            /* free up VM empire */
1330     return retval;
1331 }
1332
1333 int
1334 afs_pageout(vp, prp, start, end, flags)
1335      struct vnode *vp;          /* not used */
1336      preg_t *prp;
1337      pgcnt_t start;
1338      pgcnt_t end;
1339      int flags;
1340 {
1341     struct vnode *filevp;
1342     struct vnode *devvp;
1343     pgcnt_t i;
1344     int steal;
1345     int vhand;
1346     int hard;
1347     int *piocnt;                /* wakeup counter used if PAGEOUT_WAIT */
1348     struct ucred *old_cred;
1349     vfspage_t vm_info;
1350     fsdata_t args;
1351
1352     int inode_changed = 0;
1353     int file_is_remote;
1354     struct inode *ip;
1355
1356     AFS_STATCNT(afs_pageout);
1357
1358     steal = (flags & PAGEOUT_FREE);
1359     vhand = (flags & PAGEOUT_VHAND);
1360     hard = (flags & PAGEOUT_HARD);
1361
1362     vmemp_lockx();
1363
1364     /*  Initialize the VM info structure.  */
1365     vm_pageout_init(&vm_info, prp, start, end, 0, 0, 0, flags);
1366
1367     /*
1368      * If the region is marked "don't swap", then don't steal any pages
1369      * from it.  We can, however, write dirty pages out to disk (only if
1370      * PAGEOUT_FREE is not set).
1371      */
1372     if (vm_no_pageout(&vm_info)) {
1373         vmemp_unlockx();
1374         return (0);
1375     }
1376
1377     /*
1378      * If caller wants to wait until the I/O is complete.
1379      */
1380     vm_setup_wait_for_io(&vm_info);
1381
1382     filevp = VM_GET_PAGEOUT_VNODE(&vm_info);    /* always page out to back store */
1383     VASSERT(filevp != NULL);
1384
1385     memset((caddr_t) & args, 0, sizeof(fsdata_t));
1386     args.remote_down = 0;       /* assume remote file servers are up */
1387     args.remote = 1;            /* we are remote */
1388     args.bsize = 0;             /* filled up later by afs_vm_checkpage() */
1389
1390     if (filevp->v_fstype == VUFS) {
1391         ip = VTOI(filevp);
1392         devvp = ip->i_devvp;
1393         file_is_remote = 0;
1394     } else {
1395         file_is_remote = 1;
1396         devvp = filevp;
1397
1398         /*
1399          * If we are vhand(), and this is an NFS file, we need to
1400          * see if the NFS server is "down".  If so, we decide
1401          * if we will try to talk to it again, or defer pageouts
1402          * of dirty NFS pages until a future time.
1403          */
1404 #ifdef  notdef
1405         if (vhand && filevp->v_fstype == VNFS && vtomi(filevp)->mi_down
1406             && vtomi(filevp)->mi_hard) {
1407             extern afs_int32 vhand_nfs_retry;
1408             /*
1409              * If there is still time left on our timer, we will
1410              * not talk to this server right now.
1411              */
1412             if (vhand_nfs_retry > 0)
1413                 args.remote_down = 1;
1414         }
1415 #endif
1416     }
1417
1418     /*
1419      * Initialize args.  We set bsize to 0 to tell vfs_vfdcheck() that
1420      * it must get the file size and other attributes if it comes across
1421      * a dirty page.
1422      */
1423     vm_info.fs_data = (caddr_t) & args;
1424
1425     /* this trace cannot be here because the afs_global lock might not be
1426      * held at this point. We hold the vm global lock throughout
1427      * this procedure ( and not the AFS global lock )
1428      * afs_Trace4(afs_iclSetp, CM_TRACE_HPPAGEOUT, ICL_TYPE_POINTER, (afs_int32) filevp,
1429      * ICL_TYPE_LONG, start, ICL_TYPE_LONG, end, ICL_TYPE_LONG, flags);
1430      */
1431
1432     i = start;
1433
1434     while (i <= end) {
1435         struct buf *bp;
1436         k_off_t start;
1437         pgcnt_t npages;
1438         k_off_t nbytes;
1439         int error;
1440
1441         extern int pageiodone();
1442         space_t nspace;
1443         caddr_t nvaddr;
1444
1445         /*
1446          * Ask the VM system to find the next run of pages.
1447          */
1448         vm_find_next_range(&vm_info, i, end);
1449
1450         /*
1451          * It's possible that the remote file shrunk in size.  Check the flags
1452          * to see if the request was beyond the end of the file.  If it was,
1453          * truncate the region to the file size and continue.  We could be on a
1454          * run so after trunction continue, there may be some I/O to write
1455          * out.
1456          */
1457         if (VM_FS_FLAGS(&vm_info) & PAGEOUT_TRUNCATE) {
1458             pgcnt_t pglen = (pgcnt_t) btorp(args.isize);
1459
1460             /*
1461              * This page is past the end of the file.  Unlock this page
1462              * (region_trunc will throw it away) and then call
1463              * region_trunc() to invalidate all pages past the new end of
1464              * the file.
1465              */
1466             region_trunc(VM_REGION(&vm_info), pglen, pglen + 1);
1467
1468             /*
1469              * remove the truncation flag.
1470              */
1471             VM_UNSETFS_FLAGS(&vm_info, PAGEOUT_TRUNCATE);
1472         }
1473
1474         if (VM_NO_PAGEOUT_RUN(&vm_info))
1475             break;
1476
1477         /*
1478          * We have a run of dirty pages [args.start...args.end].
1479          */
1480         VASSERT(filevp->v_fstype != VCDFS);
1481         VASSERT((filevp->v_vfsp->vfs_flag & VFS_RDONLY) == 0);
1482         VASSERT(VM_GET_NUM_IO(&vm_info) == 1);
1483
1484         /*
1485          * We will be doing an I/O on the region, let the VM system know.
1486          */
1487         (void)vm_up_physio_count(&vm_info);
1488
1489         /*
1490          * Okay, get set to perform the I/O.
1491          */
1492         inode_changed = 1;
1493         npages =
1494             (VM_END_PAGEOUT_INDX(&vm_info) + 1) -
1495             VM_START_PAGEOUT_INDX(&vm_info);
1496
1497         /*
1498          * Allocate and initialize an I/O buffer.
1499          */
1500         bp = bswalloc();
1501         vm_init_bp(&vm_info, bp);       /* Let the VM system initialize */
1502
1503         /* Identify this buffer for KI */
1504         bp->b_bptype = B_vfs_pageout | B_pagebf;
1505
1506         if (steal)
1507             bp->b_flags = B_CALL | B_BUSY | B_PAGEOUT;  /* steal pages */
1508         else
1509             bp->b_flags = B_CALL | B_BUSY;      /* keep pages */
1510
1511         /*
1512          * If we are vhand paging over NFS, we will wait for the I/O
1513          * to complete.
1514          */
1515         if (vhand && filevp->v_fstype == VNFS) {
1516             bp->b_flags &= ~B_CALL;
1517         } else {
1518             bp->b_iodone = (int (*)())pageiodone;
1519         }
1520
1521         /*
1522          * Make sure we do not write past the end of the file.
1523          */
1524         nbytes = ptob(npages);
1525         start = vnodindx(VM_REGION(&vm_info), vm_info.start);
1526         if (start + nbytes > args.isize) {
1527 #ifdef OSDEBUG
1528             /*
1529              * The amount we are off better not be bigger than a
1530              * filesystem block.
1531              */
1532             if (start + nbytes - args.isize >= args.bsize) {
1533                 osi_Panic("afs_pageout: remainder too large");
1534             }
1535 #endif
1536             /*
1537              * Reset the size of the I/O as necessary.  For remote
1538              * files, we set the size to the exact number of bytes to
1539              * the end of the file.  For local files, we round this up
1540              * to the nearest DEV_BSIZE chunk since disk I/O must always
1541              * be in multiples of DEV_BSIZE.  In this case, we do not
1542              * bother to zero out the data past the "real" end of the
1543              * file, this is done when the data is read (either through
1544              * mmap() or by normal file system access).
1545              */
1546             if (file_is_remote)
1547                 nbytes = args.isize - start;
1548             else
1549                 nbytes = roundup(args.isize - start, DEV_BSIZE);
1550         }
1551
1552         /*
1553          * Now get ready to perform the I/O
1554          */
1555         if (!vm_protect_pageout(&vm_info, npages)) {
1556             VASSERT(vhand);
1557             vm_undo_invalidation(&vm_info, vm_info.start, vm_info.end);
1558             vm_finish_io_failed(&vm_info, npages);
1559             bswfree(bp);
1560             break;
1561         }
1562         /*
1563          * If this is an NFS write by vhand(), we will not be calling
1564          * pageiodone().  asyncpageio() increments parolemem for us
1565          * if bp->b_iodone is pageiodone, so we must do it manually
1566          * if pageiodone() will not be called automatically.
1567          */
1568         if (!(bp->b_flags & B_CALL) && steal) {
1569             ulong_t context;
1570
1571             SPINLOCK_USAV(pfdat_lock, context);
1572             parolemem += btorp(nbytes);
1573             SPINUNLOCK_USAV(pfdat_lock, context);
1574         }
1575         blkflush(devvp, VM_START_PAGEOUT_BLK(&vm_info), (long)nbytes,
1576                  (BX_NOBUFWAIT | BX_PURGE), VM_REGION(&vm_info));
1577
1578         /*
1579          * If vhand is the one paging things out, and this is an NFS
1580          * file, we need to temporarily become a different user so
1581          * that we are not trying to page over NFS as root.  We use
1582          * the user credentials associated with the writable file
1583          * pointer that is in the psuedo-vas for this MMF.
1584          *
1585          * NOTE: we are currently using "va_rss" to store the ucred
1586          *       value in the vas (this should be fixed in 10.0).
1587          */
1588         old_cred = kt_cred(u.u_kthreadp);
1589         if (vhand) {
1590 #if defined(AFS_HPUX1123_ENV)
1591                 /*
1592                  * DEE - 1123 does not have the vas.h, and it looks
1593                  * we should never be called with a NFS type file anyway.
1594                  * so where did this come from? Was it copied from NFS?
1595                  * I assume it was, so we will add an assert for now
1596                  * and see if the code runs at all.
1597                  */
1598                 VASSERT(filevp->v_fstype != VNFS);
1599 #else
1600             set_kt_cred(u.u_kthreadp, filevp->v_vas->va_cred);
1601
1602             /*
1603              * If root was the one who opened the mmf for write,
1604              * va_cred will be NULL.  So reset kt_cred(u.u_kthreadp) to what it
1605              * was.  We will page out as root, but that is the
1606              * correct thing to do in this case anyway.
1607              */
1608             if (kt_cred(u.u_kthreadp) == NULL)
1609                 set_kt_cred(u.u_kthreadp, old_cred);
1610 #endif
1611         }
1612
1613         /*
1614          * Really do the I/O.
1615          */
1616         error =
1617             asyncpageio(bp, VM_START_PAGEOUT_BLK(&vm_info),
1618                         VM_MAPPED_SPACE(&vm_info), VM_MAPPED_ADDR(&vm_info),
1619                         (int)nbytes, B_WRITE, devvp);
1620
1621         VASSERT(error == 0);
1622
1623 #ifdef  notdef
1624         /*
1625          * If we are vhand paging over NFS we want to wait for the
1626          * I/O to complete and take the appropriate actions if an
1627          * error is encountered.
1628          */
1629         if (vhand) {
1630             if (waitforpageio(bp) && nfs_mi_harddown(filevp)) {
1631                 /*
1632                  * The server is down, ignore this failure, and
1633                  * try again later. (rfscall() has set our retry
1634                  * timer).
1635                  */
1636                 fsdata.remote_down = 1;
1637                 pageiocleanup(bp, 0);
1638
1639                 /*
1640                  * vm_vfdcheck() has cleared the valid bit on the
1641                  * vfds for these pages.  We must go back and set the
1642                  * valid bit, as the pages are really not gone.
1643                  *
1644                  * NOTE: we can do this because we still hold (and have
1645                  * not released) the region lock.
1646                  */
1647                 if (steal)
1648                     vm_undo_invalidation(&vm_info, vm_info.start,
1649                                          vm_info.end);
1650             } else {
1651                 /*
1652                  * The I/O succeeded, or we had an error that we do
1653                  * not want to defer until later.  Call pageidone()
1654                  * to handle things.
1655                  */
1656                 pageiodone(bp);
1657             }
1658         }
1659 #endif
1660
1661         /*
1662          * And restore our credentials to what they were.
1663          */
1664         set_kt_cred(u.u_kthreadp, old_cred);
1665
1666         /*
1667          * If we reserved memory in vfs_vfdcheck(), (only for NFS) we
1668          * can now unreserve it.
1669          */
1670         if (vm_info.vm_flags & PAGEOUT_RESERVED) {
1671             vm_info.vm_flags &= ~PAGEOUT_RESERVED;
1672             vm_release_malloc_memory();
1673         }
1674
1675         /*
1676          * Update statistics
1677          */
1678         if (steal) {
1679             if (flags & PF_DEACT) {
1680 #if defined(AFS_HPUX110_ENV)
1681                 getppdp()->cnt.v_pswpout += npages;
1682 #else
1683                 mpproc_info[getprocindex()].cnt.v_pswpout += npages;
1684 #endif
1685 /*              sar_bswapout += ptod(npages);*/
1686             } else if (vhand) {
1687 #if defined(AFS_HPUX110_ENV)
1688                 getppdp()->cnt.v_pgout++;
1689                 getppdp()->cnt.v_pgpgout += npages;
1690 #else
1691                 mpproc_info[getprocindex()].cnt.v_pgout++;
1692                 mpproc_info[getprocindex()].cnt.v_pgpgout += npages;
1693 #endif
1694             }
1695         }
1696
1697         /*
1698          * If time and patience have delivered enough
1699          * pages, then quit now while we are ahead.
1700          */
1701         if (VM_STOP_PAGING(&vm_info))
1702             break;
1703
1704         i = VM_END_PAGEOUT_INDX(&vm_info) - VM_BASE_OFFSET(&vm_info) + 1;
1705     }
1706
1707     vm_finish_pageout(&vm_info);        /* update vhand's stealscan */
1708
1709     vmemp_unlockx();
1710
1711     /*
1712      * If we wanted to wait for the I/O to complete, sleep on piocnt.
1713      * We must decrement it by one first, and then make sure that it
1714      * is non-zero before going to sleep.
1715      */
1716     vm_wait_for_io(&vm_info);
1717
1718     if (inode_changed && !file_is_remote) {
1719         imark(ip, IUPD | ICHG);
1720         iupdat(ip, 0, 0);
1721     }
1722     return 0;
1723 }
1724
1725 int
1726 afs_mapdbd(filevp, offset, bn, flags, hole, startidx, endidx)
1727      struct vnode *filevp;
1728      off_t offset;
1729      kern_daddr_t *bn;          /* Block number. */
1730      int flags;                 /* B_READ or B_WRITE */
1731      int *hole;                 /* To be used for read-ahead. */
1732      pgcnt_t *startidx;         /* To be used for read-ahead. */
1733      pgcnt_t *endidx;           /* To be used for read-ahead. */
1734 {
1735     kern_daddr_t lbn, local_bn;
1736     int on;
1737     int err;
1738     long bsize = vtoblksz(filevp) & ~(DEV_BSIZE - 1);
1739
1740     if (startidx)
1741         *startidx = (pgcnt_t) (offset / NBPG);
1742     if (endidx)
1743         *endidx = (pgcnt_t) (offset / NBPG);
1744     if (hole)
1745         *hole = 0;              /* Can't have holes. */
1746     if (bsize <= 0)
1747         osi_Panic("afs_mapdbd: zero size");
1748
1749     lbn = (kern_daddr_t) (offset / bsize);
1750     on = offset % bsize;
1751
1752     err = VOP_BMAP(filevp, lbn, NULL, &local_bn, flags);
1753     VASSERT(err == 0);
1754
1755     /*
1756      * We can never get a bn less than zero on remote files.
1757      */
1758     VASSERT(local_bn >= 0);
1759
1760     local_bn = local_bn + btodb(on);
1761     *bn = local_bn;
1762
1763     return (0);
1764 }
1765
1766 /*
1767  * Return values:
1768  *      1: The blocks are contiguous.
1769  *      0: The blocks are not contiguous.
1770  */
1771 int
1772 afs_vm_fscontiguous(vp, args, cur_data)
1773      struct vnode *vp;
1774      vfspage_t *args;
1775      u_int cur_data;
1776 {
1777     if (cur_data == (VM_END_PAGEOUT_BLK(args) + btodb(NBPG))) {
1778         return (1);
1779     } else {
1780         return (0);
1781     }
1782 }
1783
1784 /*
1785  * Return values:
1786  *      1: Stop, this page is the last in the block.
1787  *      0: Continue on
1788  * Terminate requests at filesystem block boundaries
1789  */
1790 afs_vm_stopio(vp, args)
1791      struct vnode *vp;
1792      vfspage_t *args;
1793 {
1794     fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1795
1796 #if defined(AFS_HPUX1123_ENV)
1797         uint64_t tmpdb;
1798         tmpdb = VM_END_PAGEOUT_BLK(args);
1799
1800         if ((dbtob(tmpdb) + NBPG) % (fsdata->bsize) == 0)
1801 #else
1802     if ((dbtob(VM_END_PAGEOUT_BLK(args)) + NBPG) % (fsdata->bsize) == 0)
1803 #endif /* AFS_HPUX1123_ENV */
1804         {
1805         return (1);
1806     } else {
1807         return (0);
1808     }
1809 }
1810
1811 /*
1812  *      afs_vm_checkpage is called by the VM while collecting a run of
1813  *      pages on a pageout.  afs_vm_checkpage() is called for each page
1814  *      VM wants to write to disk.
1815  */
1816 afs_vm_checkpage(vp, args, pgindx, cur_data)
1817      struct vnode *vp;
1818      vfspage_t *args;
1819      pgcnt_t pgindx;
1820      int cur_data;
1821 {
1822     fsdata_t *fsdata = (fsdata_t *) args->fs_data;
1823
1824     if (fsdata->remote_down) {  /* never happens for AFS */
1825         /*
1826          * The remote system is down.
1827          */
1828         VASSERT(args->run == 0);
1829         return 1;
1830     }
1831     /*
1832      * A dirty page.  If we have not yet determined the file size and
1833      * other attributes that we need to write out pages (the block
1834      * size and ok_dbd_limit), get that information now.
1835      */
1836     if (fsdata->bsize == 0) {
1837         k_off_t isize;
1838         long bsize;
1839         struct vattr va;
1840         struct vnode *filevp;
1841         /*
1842          * Get the various attributes about the file.  Store them
1843          * in args for the next time around.
1844          */
1845         filevp = args->vp;
1846
1847         bsize = vtoblksz(filevp);
1848         args->maxpgs = (pgcnt_t) btop(bsize);
1849
1850         if (VOP_GETATTR(filevp, &va, kt_cred(u.u_kthreadp), VIFSYNC) != 0) {
1851             /*
1852              * The VOP_GETATTR() failed.
1853              * we are vhand, and this is a hard mount, we will
1854              * skip dirty pages for a while and try again later.
1855              */
1856             if (args->vm_flags & PAGEOUT_VHAND) {
1857                 VASSERT(args->run == 0);
1858                 return 1;
1859             }
1860             /*
1861              * This is a "soft" mount, or some other error was
1862              * returned from the server.  Mark this region
1863              * as a zombie, and free this dirty page.
1864              */
1865             VM_ZOMBIE_OBJECT(args);
1866
1867             /*
1868              * The caller will see r_zomb and remove the page
1869              * appropriately.
1870              */
1871             return (1);
1872         }
1873         isize = va.va_size;
1874         fsdata->isize = isize;
1875         fsdata->bsize = bsize;
1876         fsdata->remote = 1;
1877     }
1878     /*
1879      * See if the file has shrunk (this could have happened
1880      * asynchronously because of NFS or DUX).  If so, invalidate
1881      * all of the pages past the end of the file. This is only
1882      * needed for remote files, as local files are truncated
1883      * synchronously.
1884      */
1885
1886     if (vnodindx(VM_REGION(args), pgindx) > fsdata->isize) {
1887         /*
1888          * This page is past the end of the file.  Unlock this page
1889          * (region_trunc will throw it away) and then call region_trunc()
1890          * to invalidate all pages past the new end of the file.
1891          */
1892         VM_SETFS_FLAGS(args, PAGEOUT_TRUNCATE);
1893         return (1);
1894     }
1895 #ifdef notdef
1896     if ((args->vm_flags & PAGEOUT_VHAND)
1897         && (!(args->vm_flags & PAGEOUT_RESERVED))
1898         && (!(VM_IS_ZOMBIE(args)))) {
1899         VASSERT(args->run == 0);
1900         if (vm_reserve_malloc_memory(NFS_PAGEOUT_MEM)) {
1901             /*
1902              * Got enough memory to pageout.  Mark the fact that we did
1903              * a sysprocmemreserve(), so that we can sysprocmemunreserve() it
1904              * later (in remote_pageout()).
1905              */
1906             args->vm_flags |= PAGEOUT_RESERVED;
1907         } else {
1908             /*
1909              * We do not have enough memory to do this pageout.  By
1910              * definition, we do not yet have a run, so we just unlock
1911              * this page and tell foreach_valid() to continue scanning.
1912              * If we come across another dirty page, we will try to
1913              * reserve memory again.  That is okay, in fact some memory
1914              * may have freed up (as earlier pageouts complete under
1915              * interrupt).
1916              */
1917             return 1;
1918         }
1919     }
1920 #endif
1921     return (0);
1922 }
1923
1924 afs_swapfs_len(bp)
1925      struct buf *bp;
1926 {
1927     long fs_bsize;
1928     long max_size;
1929     long bnrem;
1930
1931     fs_bsize = vtoblksz(bp->b_vp);
1932     /*
1933      * Check to see if we are starting mid block.  If so, then
1934      * we must return the remainder of the block or less depending
1935      * on the length.
1936      */
1937     bnrem = bp->b_offset % fs_bsize;
1938     if (bnrem) {
1939         max_size = fs_bsize - bnrem;
1940     } else {
1941         max_size = fs_bsize;
1942     }
1943
1944     if (bp->b_bcount > max_size) {
1945         return (max_size);
1946     } else {
1947         return (bp->b_bcount);
1948     }
1949 }
1950
1951 afs_mmap(vp, off, size_bytes, access)
1952      struct vnode *vp;
1953      u_int off;
1954 #if defined(AFS_HPUX1111_ENV)
1955      u_long size_bytes;
1956 #else
1957      u_int size_bytes;
1958 #endif
1959      int access;
1960 {
1961     long bsize = vtoblksz(vp);
1962
1963     if (bsize % NBPG != 0) {
1964         return (EINVAL);
1965     }
1966
1967     return (0);
1968 }
1969
1970 afs_cachelimit(vp, len, location)
1971      struct vnode *vp;
1972      k_off_t len;
1973      int *location;
1974 {
1975     /*
1976      * Disk addresses are logical, not physical, so fragments are
1977      * transparent.
1978      */
1979     *location = btorp(len) + 1;
1980 }
1981
/*
 * afs_release
 *
 * Release entry point for vp.  Nothing is done here; the routine
 * always reports success (0).
 */
int
afs_release(vp)
     struct vnode *vp;
{
    return 0;
}
1987
1988 int
1989 afs_unmap(vp, off, size_bytes, access)
1990      struct vnode *vp;
1991      u_int off;
1992 #if defined(AFS_HPUX1111_ENV)
1993      u_long size_bytes;
1994 #else
1995      u_int size_bytes;
1996 #endif
1997      int access;
1998 {
1999     return 0;
2000 }
2001
2002 int
2003 afs_read_ahead(vp, prp, wrt, space, vaddr, rhead_cnt)
2004      struct vnode *vp;
2005      preg_t *prp;
2006      int wrt;
2007      space_t space;
2008      caddr_t vaddr;
2009      pgcnt_t *rhead_cnt;
2010 {
2011     printf("afs_read_ahead returning 0 \n");
2012     return 0;
2013 }
2014
/*
 * afs_prealloc
 *
 * Preallocation entry point.  No space is ever preallocated: the
 * routine prints a message and fails with ENOSPC regardless of its
 * arguments.
 */
int
afs_prealloc(vp, size, ignore_minfree, reserved)
     struct vnode *vp;
      /* DEE on 11.22 following is off_t */
     size_t size;
     int ignore_minfree;
     int reserved;
{
    int result = ENOSPC;

    printf("afs_prealloc returning ENOSPC\n");
    return result;
}
2026
2027 int
2028 afs_ioctl(vp, com, data, flag, cred)
2029      struct vnode *vp;
2030      int com;
2031      caddr_t data;
2032      int flag;
2033      struct ucred *cred;
2034 {
2035     int error;
2036     struct afs_ioctl afsioctl, *ai;
2037
2038     AFS_STATCNT(afs_ioctl);
2039
2040     /* The call must be a VICEIOCTL call */
2041     if (((com >> 8) & 0xff) == 'V') {
2042 #ifdef notdef
2043         /* AFS_COPYIN returns error 14. Copy data in instead */
2044         AFS_COPYIN(data, (caddr_t) & afsioctl, sizeof(afsioctl), error);
2045         if (error)
2046             return (error);
2047 #endif
2048         ai = (struct afs_ioctl *)data;
2049         afsioctl.in = ai->in;
2050         afsioctl.out = ai->out;
2051         afsioctl.in_size = ai->in_size;
2052         afsioctl.out_size = ai->out_size;
2053         error = HandleIoctl(VTOAFS(vp), com, &afsioctl);
2054         return (error);
2055     }
2056     return (ENOTTY);
2057 }
2058
/*
 * Record-length helpers used when building struct dirent entries.
 *
 * roundtoint(x) rounds x up to the next multiple of the alignment unit
 * (sizeof(long) when AFS_HPUX1111_ENV is defined, sizeof(int) otherwise).
 * reclen(dp) is the rounded size of an entry: the name plus its NUL
 * terminator plus the fixed-size fields that precede the name.
 */
#if defined(AFS_HPUX1111_ENV)
/* looks like even if appl is 32 bit, we need to round to 8 bytes */
/* This had no effect, it must not be being used */

#define roundtoint(x)   (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
#define reclen(dp)      roundtoint((dp)->d_namlen + 1 + sizeof(u_long) + \
                                   sizeof(u_int) + 2 * sizeof(u_short))
#else

#define roundtoint(x)   (((x) + sizeof(int) - 1) & ~(sizeof(int) - 1))
#define reclen(dp)      roundtoint((dp)->d_namlen + 1 + sizeof(u_long) + \
                                   2 * sizeof(u_short))
#endif
2072
2073 int
2074 afs_readdir(vp, uiop, cred)
2075      struct vnode *vp;
2076      struct uio *uiop;
2077      struct ucred *cred;
2078 {
2079     struct uio auio;
2080     struct iovec aiov;
2081     caddr_t ibuf, obuf, ibufend, obufend;
2082     struct __dirent32 *idp;
2083     struct dirent *odp;
2084     int count, outcount;
2085     dir_off_t offset;
2086     uint64_t tmp_offset;
2087
2088     count = uiop->uio_resid;
2089     /* Allocate temporary space for format conversion */
2090     ibuf = kmem_alloc(2 * count);       /* overkill - fix later */
2091     obuf = kmem_alloc(count + sizeof(struct dirent));
2092     aiov.iov_base = ibuf;
2093     aiov.iov_len = count;
2094     auio.uio_iov = &aiov;
2095     auio.uio_iovcnt = 1;
2096     offset = auio.uio_offset = uiop->uio_offset;
2097     auio.uio_seg = UIOSEG_KERNEL;
2098     auio.uio_resid = count;
2099     auio.uio_fpflags = 0;
2100
2101     u.u_error = mp_afs_readdir2(vp, &auio, cred);
2102     if (u.u_error)
2103         goto out;
2104
2105     /* Convert entries from __dirent32 to dirent format */
2106
2107     for (idp = (struct __dirent32 *)ibuf, odp =
2108          (struct dirent *)obuf, ibufend =
2109          ibuf + (count - auio.uio_resid), obufend = obuf + count;
2110          (caddr_t) idp < ibufend;
2111          idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2112          (struct dirent *)((caddr_t) odp + odp->d_reclen)) {
2113         odp->d_ino = idp->__d_ino;
2114         odp->d_namlen = idp->__d_namlen;
2115         (void)strcpy(odp->d_name, idp->__d_name);
2116         odp->d_reclen = reclen(odp);
2117         if ((caddr_t) odp + odp->d_reclen > obufend)
2118             break;
2119         /* record offset *after* we're sure to use this entry */
2120         memcpy((char *)&tmp_offset, (char *)&idp->__d_off, sizeof tmp_offset);
2121         offset = tmp_offset;
2122     }
2123
2124     outcount = (caddr_t) odp - obuf;
2125     AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2126     if (u.u_error)
2127         goto out;
2128     uiop->uio_offset = offset;
2129   out:
2130     kmem_free(ibuf, count);
2131     kmem_free(obuf, count + sizeof(struct dirent));
2132     return u.u_error;
2133 }
2134
2135
/*
 * roundtolong(x) rounds x up to the next multiple of sizeof(long).
 * reclen_dirent64(dp) is the rounded size of an entry whose name length
 * is dp->__d_namlen: the name plus its NUL terminator, two u_long
 * sized fields and two u_short sized fields.
 */
#define roundtolong(x)   (((x) + sizeof(long) - 1) & ~(sizeof(long) - 1))
#define reclen_dirent64(dp)      roundtolong((dp)->__d_namlen + 1 + \
                                             2 * sizeof(u_long) + \
                                             2 * sizeof(u_short))
2139
2140 int
2141 afs_readdir3(vp, uiop, cred)
2142      struct vnode *vp;
2143      struct uio *uiop;
2144      struct ucred *cred;
2145 {
2146     struct uio auio;
2147     struct iovec aiov;
2148     caddr_t ibuf, obuf, ibufend, obufend;
2149     struct __dirent32 *idp;
2150     struct __dirent64 *odp;
2151     int count, outcount;
2152     dir_off_t offset;
2153
2154     count = uiop->uio_resid;
2155     /* Allocate temporary space for format conversion */
2156     ibuf = kmem_alloc(2 * count);       /* overkill - fix later */
2157     obuf = kmem_alloc(count + sizeof(struct __dirent64));
2158     aiov.iov_base = ibuf;
2159     aiov.iov_len = count;
2160     auio.uio_iov = &aiov;
2161     auio.uio_iovcnt = 1;
2162     offset = auio.uio_offset = uiop->uio_offset;
2163     auio.uio_seg = UIOSEG_KERNEL;
2164     auio.uio_resid = count;
2165     auio.uio_fpflags = 0;
2166
2167     u.u_error = mp_afs_readdir2(vp, &auio, cred);
2168     if (u.u_error)
2169         goto out;
2170
2171     /* Convert entries from __dirent32 to __dirent64 format */
2172
2173     for (idp = (struct __dirent32 *)ibuf, odp =
2174          (struct __dirent64 *)obuf, ibufend =
2175          ibuf + (count - auio.uio_resid), obufend = obuf + count;
2176          (caddr_t) idp < ibufend;
2177          idp = (struct __dirent32 *)((caddr_t) idp + idp->__d_reclen), odp =
2178          (struct __dirent64 *)((caddr_t) odp + odp->__d_reclen)) {
2179         memcpy((char *)&odp->__d_off, (char *)&idp->__d_off,
2180                sizeof odp->__d_off);
2181         odp->__d_ino = idp->__d_ino;
2182         odp->__d_namlen = idp->__d_namlen;
2183         (void)strcpy(odp->__d_name, idp->__d_name);
2184         odp->__d_reclen = reclen_dirent64(odp);
2185         if ((caddr_t) odp + odp->__d_reclen > obufend)
2186             break;
2187         /* record offset *after* we're sure to use this entry */
2188         offset = odp->__d_off;
2189     }
2190
2191     outcount = (caddr_t) odp - obuf;
2192     AFS_UIOMOVE(obuf, outcount, UIO_READ, uiop, u.u_error);
2193     if (u.u_error)
2194         goto out;
2195     uiop->uio_offset = offset;
2196   out:
2197     kmem_free(ibuf, count);
2198     kmem_free(obuf, count + sizeof(struct __dirent64));
2199     return u.u_error;
2200 }
2201
/* Non-zero compiles in the semaphore save-area hash table code below. */
#define AFS_SV_SEMA_HASH 1
/* Non-zero enables printf() tracing inside the hash table routines. */
#define AFS_SV_SEMA_HASH_DEBUG 0
2204
2205 #if AFS_SV_SEMA_HASH
/* This portion of the code was originally used to implement
 * thread specific storage for the semaphore save area. However,
 * there were some spare fields in the proc structure, and these
 * are now being used for saving the semaphores.  Hence, this portion
 * of the code is no longer used.  (But see the later note below: the
 * code has since been revived for per-thread storage.)
 */
2212
2213 /* This portion of the code implements thread specific information.
2214  * The thread id is passed in as the key. The semaphore saved area
2215  * is hashed on this key.
2216  */
2217
/* Why is this hash table required?
 * The AFS code is written in such a way that a GLOCK() is done in
 * one function and the GUNLOCK() is done in another function further
 * down the call chain. The GLOCK() call has to save the current
 * semaphore status before acquiring afs_global_sema. The GUNLOCK()
 * has to release afs_global_sema and reacquire the semaphore status
 * that existed before the corresponding GLOCK(). If GLOCK() and
 * GUNLOCK() were called in the same function, the GLOCK() call could
 * have stored the saved semaphore status in a local variable and the
 * corresponding GUNLOCK() call could have restored the original
 * status from this local variable. But this is not the case with
 * AFS code. Hence, we have to implement a thread specific semaphore
 * save area. This is implemented as a hash table. The key is the
 * thread id.
 */
2233
2234 /* In order for multithreaded processes to work, the sv_sema structures
2235  * must be saved on a per-thread basis, not a per-process basis.  There
2236  * is no per-thread storage available to hijack in the OS per-thread
2237  * data structures (e.g. struct user) so we revive this code.
2238  * I removed the upper limit on the memory consumption since we don't
2239  * know how many threads there will be.  Now the code first checks the
2240  * freeList.  If that fails it then tries garbage collecting.  If that
2241  * doesn't free up anything then it allocs what it needs.
2242  */
2243
/* Payload type stored in each hash entry, and the type of the lookup key. */
#define ELEMENT         sv_sema_t
#define KEY             tid_t
/*
 * Map a key to a bucket index.
 * NOTE(review): if tid_t is signed, a negative key would yield a
 * negative index -- confirm that keys are never negative.
 */
#define Hash(xx)        (  (xx) % sizeOfHashTable )
/* Thin wrappers around the kernel semaphore primitives used as locks. */
#define hashLockInit(xx) initsema(&xx,1, FILESYS_SEMA_PRI, FILESYS_SEMA_ORDER)
#define hashLock(xx)    MP_PSEMA(&xx)
#define hashUnlock(xx)  MP_VSEMA(&xx)

/* One hash entry; it is linked either on a bucket chain or on freeList. */
typedef struct elem {
    struct elem *next;          /* next entry on the same chain */
    ELEMENT element;            /* the saved area handed out to callers */
    KEY key;                    /* key this entry was created for */
    int refCnt;                 /* raised by afsHashInsertFind(), dropped by
                                 * afsHashRelease(); 0 means collectable */
} Element;

/* One hash bucket: a lock plus the head of its entry chain. */
typedef struct bucket {
    sema_t lock;
    Element *element;
} Bucket;

static int sizeOfHashTable;     /* number of buckets, set by afsHash() */
static Bucket *hashTable;       /* bucket array, allocated by afsHash() */

static int currentSize = 0;     /* running byte count kept by the routines below */
static Element *freeList;       /* free list */

#pragma align 64
static sema_t afsHashLock = { 0 };      /* global lock for hash table */

/* Moves every entry whose refCnt is 0 from the buckets onto freeList. */
static void afsHashGarbageCollect();
2273
2274 /*
2275 ** The global lock protects the global data structures,
2276 ** e.g. freeList and currentSize.
2277 ** The bucket lock protects the link list hanging off that bucket.
2278 ** The lock hierarchy : one can obtain the bucket lock while holding
2279 ** the global lock, but not vice versa.
2280 */
2281
2282
2283 void
2284 afsHash(int nbuckets)
2285 {                               /* allocate the hash table */
2286     int i;
2287
2288 #if AFS_SV_SEMA_HASH_DEBUG
2289     printf("afsHash: enter\n");
2290 #endif
2291
2292     sizeOfHashTable = nbuckets;
2293     currentSize = nbuckets * sizeof(Bucket);
2294
2295     if (hashTable)
2296         osi_Panic("afs: SEMA Hashtable already created\n");
2297
2298     hashTable = (Bucket *) AFS_KALLOC(sizeOfHashTable * sizeof(Bucket));
2299     if (!hashTable)
2300         osi_Panic("afs: cannot create SEMA Hashtable\n");
2301
2302     /* initialize the hash table and associated locks */
2303     memset(hashTable, 0, sizeOfHashTable * sizeof(Bucket));
2304     for (i = 0; i < sizeOfHashTable; i++)
2305         hashLockInit(hashTable[i].lock);
2306     hashLockInit(afsHashLock);
2307
2308 #if AFS_SV_SEMA_HASH_DEBUG
2309     printf("afsHash: exit\n");
2310 #endif
2311 }
2312
2313 ELEMENT *
2314 afsHashInsertFind(KEY key)
2315 {
2316     int index;
2317     Element *ptr;
2318
2319 #if AFS_SV_SEMA_HASH_DEBUG
2320     printf("afsHashInsertFind: %d\n", key);
2321 #endif
2322     if (!hashTable)
2323         osi_Panic("afs: afsHashInsertFind: no hashTable\n");
2324
2325     index = Hash(key);          /* get bucket number */
2326     hashLock(hashTable[index].lock);    /* lock this bucket */
2327     ptr = hashTable[index].element;
2328
2329     /* if it is already there */
2330     while (ptr) {
2331         if (ptr->key == key) {
2332             ptr->refCnt++;      /* hold it */
2333             hashUnlock(hashTable[index].lock);
2334 #if AFS_SV_SEMA_HASH_DEBUG
2335             printf("afsHashInsertFind: %d FOUND\n", key);
2336 #endif
2337             return &(ptr->element);
2338         } else {
2339             ptr = ptr->next;
2340         }
2341     }
2342
2343     hashUnlock(hashTable[index].lock);
2344
2345     /*  if something exists in the freeList, take it from there */
2346     ptr = NULL;
2347     hashLock(afsHashLock);
2348
2349     if (freeList) {
2350         ptr = freeList;         /* reuse entry */
2351         freeList = freeList->next;
2352     } else {
2353         afsHashGarbageCollect();        /* afsHashLock locked */
2354         if (freeList) {
2355             ptr = freeList;     /* reuse entry */
2356             freeList = freeList->next;
2357         } else {
2358             ptr = (Element *) AFS_KALLOC(sizeof(Element));
2359         }
2360     }
2361
2362     currentSize += sizeof(Element);     /* update memory used */
2363     hashUnlock(afsHashLock);
2364
2365     if (!ptr)
2366         osi_Panic("afs: SEMA Hashtable cannot create new entry\n");
2367     /* create new entry */
2368     ptr->key = key;
2369     memset(&ptr->element, 0, sizeof(ptr->element));
2370     ptr->refCnt = 1;            /* this guy */
2371
2372     /* insert new entry in bucket */
2373     hashLock(hashTable[index].lock);    /* lock this bucket */
2374     ptr->next = hashTable[index].element;
2375     hashTable[index].element = ptr;
2376     hashUnlock(hashTable[index].lock);
2377
2378 #if AFS_SV_SEMA_HASH_DEBUG
2379     printf("afsHashInsertFind: %d MADE\n", key);
2380 #endif
2381
2382     return &(ptr->element);
2383 }
2384
2385 ELEMENT *
2386 afsHashFind(KEY key)
2387 {
2388     int index;
2389     Element *ptr;
2390
2391 #if AFS_SV_SEMA_HASH_DEBUG
2392     printf("afsHashFind: %d\n", key);
2393 #endif
2394     if (!hashTable)
2395         osi_Panic("afs: afsHashFind: no hashTable\n");
2396
2397     index = Hash(key);          /* get bucket number */
2398     hashLock(hashTable[index].lock);    /* lock this bucket */
2399     ptr = hashTable[index].element;
2400
2401     /* it should be in the hash table */
2402     while (ptr) {
2403         if (ptr->key == key) {
2404             if (ptr->refCnt <= 0)
2405                 osi_Panic("afs: SEMA HashTable entry already released\n");
2406             hashUnlock(hashTable[index].lock);
2407 #if AFS_SV_SEMA_HASH_DEBUG
2408             printf("afsHashFind: %d FOUND\n", key);
2409 #endif
2410             return &(ptr->element);
2411         } else {
2412             ptr = ptr->next;
2413         }
2414     }
2415
2416     hashUnlock(hashTable[index].lock);
2417     /* it better be in the hash table */
2418     osi_Panic("afs: SEMA HashTable wants non-existent entry \n");
2419     return 0;
2420 }
2421
2422 void
2423 afsHashRelease(KEY key)
2424 {
2425     int index;
2426     Element *ptr;
2427
2428 #if AFS_SV_SEMA_HASH_DEBUG
2429     printf("afsHashRelease: %d\n", key);
2430 #endif
2431     if (!hashTable)
2432         osi_Panic("afs: afsHashRelease: no hashTable\n");
2433
2434     index = Hash(key);          /* get bucket number */
2435     hashLock(hashTable[index].lock);    /* lock this bucket */
2436     ptr = hashTable[index].element;
2437
2438     /* it should be in the hash table */
2439     while (ptr) {
2440         if (ptr->key == key) {
2441             if (ptr->refCnt <= 0)
2442                 osi_Panic("afs: SEMA HashTable entry already released\n");
2443             ptr->refCnt--;      /* release this guy */
2444             hashUnlock(hashTable[index].lock);
2445 #if AFS_SV_SEMA_HASH_DEBUG
2446             printf("afsHashRelease: %d FOUND\n", key);
2447 #endif
2448             return;
2449         } else {
2450             ptr = ptr->next;
2451         }
2452     }
2453
2454     hashUnlock(hashTable[index].lock);
2455     /* it better be in the hash table */
2456     osi_Panic("afs: SEMA HashTable deleting non-existent entry \n");
2457 }
2458
2459 /* this should be called with afsHashLock WRITE locked */
2460 static void
2461 afsHashGarbageCollect()
2462 {
2463     int index;
2464     Element *ptr;
2465     int foundFlag = 0;
2466
2467     if (!hashTable)
2468         osi_Panic("afs: afsHashGarbageCollect: no hashTable\n");
2469
2470     for (index = 0; index < sizeOfHashTable; index++) {
2471         hashLock(hashTable[index].lock);
2472         ptr = hashTable[index].element; /* pick up bucket */
2473
2474         while (ptr && !ptr->refCnt) {
2475             /* insert this element into free list */
2476             Element *temp;
2477             temp = ptr->next;
2478             ptr->next = freeList;
2479             freeList = ptr;
2480
2481             foundFlag = 1;      /* found at least one */
2482             currentSize -= sizeof(Element);
2483             ptr = temp;
2484         }
2485         hashTable[index].element = ptr;
2486
2487         /* scan thru the remaining list */
2488         if (ptr) {
2489             while (ptr->next) {
2490                 if (ptr->next->refCnt == 0) {
2491                     /* collect this element */
2492                     Element *temp;
2493                     temp = ptr->next;
2494                     ptr->next = ptr->next->next;
2495                     temp->next = freeList;
2496                     freeList = temp;
2497                     foundFlag = 1;
2498                     currentSize -= sizeof(Element);
2499                 } else {
2500                     ptr = ptr->next;
2501                 }
2502             }
2503         }
2504         hashUnlock(hashTable[index].lock);
2505     }
2506 #if 0
2507     if (!foundFlag)
2508         osi_Panic("afs: SEMA HashTable full\n");
2509 #endif
2510 }
2511
2512 #endif /* AFS_SV_SEMA_HASH */
2513
2514
2515 afs_hp_strategy(bp)
2516      struct buf *bp;
2517 {
2518     afs_int32 code;
2519     struct uio tuio;
2520     struct iovec tiovec[1];
2521     extern caddr_t hdl_kmap_bp();
2522     struct kthread *t = u.u_kthreadp;
2523
2524     AFS_STATCNT(afs_hp_strategy);
2525     /*
2526      * hdl_kmap_bp() saves "b_bcount" and restores it in hdl_remap_bp() after
2527      * the I/O.  We must save and restore the count because pageiodone()
2528      * uses b_bcount to determine how many pages to unlock.
2529      *
2530      * Remap the entire range.
2531      */
2532     hdl_kmap_bp(bp);
2533
2534     AFS_GLOCK();
2535     afs_Trace4(afs_iclSetp, CM_TRACE_HPSTRAT, ICL_TYPE_POINTER, bp->b_vp,
2536                ICL_TYPE_LONG, (int)bp->b_blkno * DEV_BSIZE, ICL_TYPE_LONG,
2537                bp->b_bcount, ICL_TYPE_LONG, 0);
2538
2539     /* Set up the uio structure */
2540     tuio.afsio_iov = tiovec;
2541     tuio.afsio_iovcnt = 1;
2542     tuio.afsio_offset = DEV_BSIZE * bp->b_blkno;
2543     tuio.afsio_seg = AFS_UIOSYS;
2544     tuio.afsio_resid = bp->b_bcount;
2545     tuio.uio_fpflags = 0;
2546     tiovec[0].iov_base = bp->b_un.b_addr;
2547     tiovec[0].iov_len = bp->b_bcount;
2548
2549     /* Do the I/O */
2550     if ((bp->b_flags & B_READ) == B_READ) {
2551         /* read b_bcount bytes into kernel address b_un.b_addr
2552          * starting at byte DEV_BSIZE * b_blkno. Bzero anything
2553          * we can't read, and finally call iodone(bp).  File is
2554          * in bp->b_vp. Credentials are from u area??
2555          */
2556         code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_READ, 0, kt_cred(t));
2557         if (code == 0)
2558             if (tuio.afsio_resid > 0) {
2559                 privlbzero(bvtospace(bp, bp->b_un.b_addr),
2560                            bp->b_un.b_addr + bp->b_bcount - tuio.afsio_resid,
2561                            (size_t) tuio.afsio_resid);
2562
2563             }
2564     } else
2565         code = afs_rdwr(VTOAFS(bp->b_vp), &tuio, UIO_WRITE, 0, kt_cred(t));
2566
2567     /* Remap back to the user's space */
2568     hdl_remap_bp(bp);
2569
2570     AFS_GUNLOCK();
2571
2572     iodone(bp);
2573     return code;
2574 }
2575
2576 afs_pathconf(vp, name, resultp, cred)
2577      struct vnode *vp;
2578      int name;
2579      int *resultp;
2580      struct ucred *cred;        /* unused */
2581 {
2582     switch (name) {
2583     case _PC_LINK_MAX:          /* Maximum number of links to a file */
2584         *resultp = 255;         /* an unsigned short on the fileserver */
2585         break;                  /* a unsigned char in the client.... */
2586
2587     case _PC_NAME_MAX:          /* Max length of file name */
2588         *resultp = 255;
2589         break;
2590
2591     case _PC_PATH_MAX:          /* Maximum length of Path Name */
2592         *resultp = 1024;
2593         break;
2594
2595     case _PC_PIPE_BUF:          /* Max atomic write to pipe.  See fifo_vnops */
2596     case _PC_CHOWN_RESTRICTED:  /* Anybody can chown? */
2597     case _PC_NO_TRUNC:          /* No file name truncation on overflow? */
2598         u.u_error = EOPNOTSUPP;
2599         return (EOPNOTSUPP);
2600         break;
2601
2602     case _PC_MAX_CANON: /* TTY buffer size for canonical input */
2603         /* need more work here for pty, ite buffer size, if differ */
2604         if (vp->v_type != VCHR) {
2605             u.u_error = EINVAL;
2606             return (EINVAL);
2607         }
2608         *resultp = CANBSIZ;     /*for tty */
2609         break;
2610
2611     case _PC_MAX_INPUT:
2612         /* need more work here for pty, ite buffer size, if differ */
2613         if (vp->v_type != VCHR) {       /* TTY buffer size */
2614             u.u_error = EINVAL;
2615             return (EINVAL);
2616         }
2617         *resultp = TTYHOG;      /*for tty */
2618         break;
2619
2620     case _PC_VDISABLE:
2621         /* Terminal special characters can be disabled? */
2622         if (vp->v_type != VCHR) {
2623             u.u_error = EINVAL;
2624             return (EINVAL);
2625         }
2626         *resultp = 1;
2627         break;
2628
2629     case _PC_SYNC_IO:
2630         if ((vp->v_type != VREG) && (vp->v_type != VBLK)) {
2631             *resultp = -1;
2632             return EINVAL;
2633         }
2634         *resultp = 1;           /* Synchronized IO supported for this file */
2635         break;
2636
2637     case _PC_FILESIZEBITS:
2638         if (vp->v_type != VDIR)
2639             return (EINVAL);
2640         *resultp = MAX_SMALL_FILE_BITS;
2641         break;
2642
2643     default:
2644         return (EINVAL);
2645     }
2646
2647     return (0);
2648 }