DAFS: Wait for exclusive ops in VFreeBitMapEntry_r

[openafs.git] / src / vol / volume.c
diff --git a/src/vol/volume.c b/src/vol/volume.c

index 938a836..1217ec6 100644 (file)
--- a/src/vol/volume.c
+++ b/src/vol/volume.c
@@ -1,12 +1,12 @@
 /*
  * Copyright 2000, International Business Machines Corporation and others.
  * All Rights Reserved.
- * 
+ *
  * This software has been released under the terms of the IBM Public
  * License.  For details, see the LICENSE file in the top-level source
  * directory or online at http://www.openafs.org/dl/license10.html
  *
- * Portions Copyright (c) 2006 Sine Nomine Associates
+ * Portions Copyright (c) 2005-2008 Sine Nomine Associates
  */
 
 /* 1/1/89: NB:  this stuff is all going to be replaced.  Don't take it too seriously */
@@ -21,12 +21,12 @@
 #include <afsconfig.h>
 #include <afs/param.h>
 
-RCSID
-    ("$Header$");
+#include <roken.h>
 
 #include <rx/xdr.h>
 #include <afs/afsint.h>
 #include <ctype.h>
+#include <signal.h>
 #ifndef AFS_NT40_ENV
 #include <sys/param.h>
 #if !defined(AFS_SGI_ENV)
@@ -46,7 +46,7 @@ RCSID
 #endif
 #endif
 #else /* AFS_VFSINCL_ENV */
-#if !defined(AFS_AIX_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV)
+#if !defined(AFS_AIX_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV) && !defined(AFS_ARM_DARWIN_ENV)
 #include <sys/fs.h>
 #endif
 #endif /* AFS_VFSINCL_ENV */
@@ -123,14 +123,12 @@ RCSID
 #include "vnode.h"
 #include "volume.h"
 #include "partition.h"
-#ifdef AFS_PTHREAD_ENV
-#include <assert.h>
-#else /* AFS_PTHREAD_ENV */
-#include "afs/assert.h"
-#endif /* AFS_PTHREAD_ENV */
+#include "volume_inline.h"
+#include "common.h"
+#include "afs/afs_assert.h"
 #include "vutils.h"
 #ifndef AFS_NT40_ENV
-#include <dir/dir.h>
+#include <afs/dir.h>
 #include <unistd.h>
 #endif
 
@@ -153,24 +151,37 @@ pthread_mutex_t vol_glock_mutex;
 pthread_mutex_t vol_trans_mutex;
 pthread_cond_t vol_put_volume_cond;
 pthread_cond_t vol_sleep_cond;
+pthread_cond_t vol_init_attach_cond;
+pthread_cond_t vol_vinit_cond;
 int vol_attach_threads = 1;
 #endif /* AFS_PTHREAD_ENV */
 
+/* start-time configurable I/O parameters */
+ih_init_params vol_io_params;
+
 #ifdef AFS_DEMAND_ATTACH_FS
 pthread_mutex_t vol_salvsync_mutex;
+
+/*
+ * Set this to 1 to disallow SALVSYNC communication in all threads; used
+ * during shutdown, since the salvageserver may have gone away.
+ */
+static volatile sig_atomic_t vol_disallow_salvsync = 0;
 #endif /* AFS_DEMAND_ATTACH_FS */
 
+/**
+ * has VShutdown_r been called / is VShutdown_r running?
+ */
+static int vol_shutting_down = 0;
+
 #ifdef AFS_OSF_ENV
 extern void *calloc(), *realloc();
 #endif
 
-/*@printflike@*/ extern void Log(const char *format, ...);
-
 /* Forward declarations */
-static Volume *attach2(Error * ec, VolId vid, char *path,
-                      register struct VolumeHeader *header,
-                      struct DiskPartition *partp, Volume * vp, 
-                      int isbusy, int mode);
+static Volume *attach2(Error * ec, VolId volumeId, char *path,
+                      struct DiskPartition64 *partp, Volume * vp,
+                      int isbusy, int mode, int *acheckedOut);
 static void ReallyFreeVolume(Volume * vp);
 #ifdef AFS_DEMAND_ATTACH_FS
 static void FreeVolume(Volume * vp);
@@ -179,31 +190,36 @@ static void FreeVolume(Volume * vp);
 static void VScanUpdateList(void);
 #endif /* !AFS_DEMAND_ATTACH_FS */
 static void VInitVolumeHeaderCache(afs_uint32 howMany);
-static int GetVolumeHeader(register Volume * vp);
-static void ReleaseVolumeHeader(register struct volHeader *hd);
-static void FreeVolumeHeader(register Volume * vp);
-static void AddVolumeToHashTable(register Volume * vp, int hashid);
-static void DeleteVolumeFromHashTable(register Volume * vp);
+static int GetVolumeHeader(Volume * vp);
+static void ReleaseVolumeHeader(struct volHeader *hd);
+static void FreeVolumeHeader(Volume * vp);
+static void AddVolumeToHashTable(Volume * vp, int hashid);
+static void DeleteVolumeFromHashTable(Volume * vp);
+#if 0
 static int VHold(Volume * vp);
+#endif
 static int VHold_r(Volume * vp);
 static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class);
-static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp,
-                         char **namep);
 static void VReleaseVolumeHandles_r(Volume * vp);
 static void VCloseVolumeHandles_r(Volume * vp);
 static void LoadVolumeHeader(Error * ec, Volume * vp);
-static int VCheckOffline(register Volume * vp);
-static int VCheckDetach(register Volume * vp);
-static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags);
-static int VolumeExternalName_r(VolumeId volumeId, char * name, size_t len);
+static int VCheckOffline(Volume * vp);
+static int VCheckDetach(Volume * vp);
+static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId,
+                          Volume * hint, const struct timespec *ts);
 
 int LogLevel;                  /* Vice loglevel--not defined as extern so that it will be
                                 * defined when not linked with vice, XXXX */
 ProgramType programType;       /* The type of program using the package */
+static VolumePackageOptions vol_opts;
 
 /* extended volume package statistics */
 VolPkgStats VStats;
 
+#ifdef VOL_LOCK_DEBUG
+pthread_t vol_glock_holder = 0;
+#endif
+
 
 #define VOLUME_BITMAP_GROWSIZE 16      /* bytes, => 128vnodes */
                                        /* Must be a multiple of 4 (1 word) !! */
@@ -233,7 +249,7 @@ VolPkgStats VStats;
 /*
  * when possible, don't just reorder single elements, but reorder
  * entire chains of elements at once.  a chain of elements that
- * exceed the element previous to the pivot by at least CHAIN_THRESH 
+ * exceed the element previous to the pivot by at least CHAIN_THRESH
  * accesses are moved in front of the chain whose elements have at
  * least CHAIN_THRESH less accesses than the pivot element
  */
@@ -271,20 +287,77 @@ ffs(x)
 #endif /* !AFS_HAVE_FFS */
 
 #ifdef AFS_PTHREAD_ENV
+/**
+ * disk partition queue element
+ */
 typedef struct diskpartition_queue_t {
-    struct rx_queue queue;
-    struct DiskPartition * diskP;
+    struct rx_queue queue;             /**< queue header */
+    struct DiskPartition64 *diskP;     /**< disk partition table entry */
 } diskpartition_queue_t;
+
+#ifndef AFS_DEMAND_ATTACH_FS
+
 typedef struct vinitvolumepackage_thread_t {
     struct rx_queue queue;
     pthread_cond_t thread_done_cv;
     int n_threads_complete;
 } vinitvolumepackage_thread_t;
 static void * VInitVolumePackageThread(void * args);
+
+#else  /* AFS_DEMAND_ATTACH_FS */
+#define VINIT_BATCH_MAX_SIZE 512
+
+/**
+ * disk partition work queue
+ */
+struct partition_queue {
+    struct rx_queue head;              /**< diskpartition_queue_t queue */
+    pthread_mutex_t mutex;             /**< guards head while entries are checked or removed */
+    pthread_cond_t cv;                 /**< created and destroyed together with mutex */
+};
+
+/**
+ * volume parameters for preattach
+ */
+struct volume_init_batch {
+    struct rx_queue queue;               /**< queue header */
+    int thread;                          /**< number of the worker thread that filled this batch */
+    int last;                            /**< nonzero on the final batch a worker posts */
+    int size;                            /**< number of batch[] slots in use */
+    Volume *batch[VINIT_BATCH_MAX_SIZE]; /**< volume objects awaiting preattach */
+};
+
+/**
+ * volume parameters work queue
+ */
+struct volume_init_queue {
+    struct rx_queue head;                /**< volume_init_batch queue */
+    pthread_mutex_t mutex;               /**< held while batches are appended or removed */
+    pthread_cond_t cv;                   /**< broadcast after each append; consumer waits here when empty */
+};
+
+/**
+ * volume init worker thread parameters
+ */
+struct vinitvolumepackage_thread_param {
+    int nthreads;                        /**< total number of worker threads */
+    int thread;                          /**< this worker's number, counted from 1 */
+    struct partition_queue *pq;          /**< queue of partitions to scan */
+    struct volume_init_queue *vq;        /**< queue the worker posts volume batches to */
+};
+
+static void *VInitVolumePackageThread(void *args);
+static struct DiskPartition64 *VInitNextPartition(struct partition_queue *pq);
+static VolId VInitNextVolumeId(DIR *dirp);
+static int VInitPreAttachVolumes(int nthreads, struct volume_init_queue *vq);
+
+#endif /* !AFS_DEMAND_ATTACH_FS */
 #endif /* AFS_PTHREAD_ENV */
 
-static int VAttachVolumesByPartition(struct DiskPartition *diskP, 
+#ifndef AFS_DEMAND_ATTACH_FS
+static int VAttachVolumesByPartition(struct DiskPartition64 *diskP,
                                     int * nAttached, int * nUnattached);
+#endif /* !AFS_DEMAND_ATTACH_FS */
 
 
 #ifdef AFS_DEMAND_ATTACH_FS
@@ -340,15 +413,15 @@ static int VCheckFree(Volume * vp);
 /* VByP List */
 static void AddVolumeToVByPList_r(Volume * vp);
 static void DeleteVolumeFromVByPList_r(Volume * vp);
-static void VVByPListBeginExclusive_r(struct DiskPartition * dp);
-static void VVByPListEndExclusive_r(struct DiskPartition * dp);
-static void VVByPListWait_r(struct DiskPartition * dp);
+static void VVByPListBeginExclusive_r(struct DiskPartition64 * dp);
+static void VVByPListEndExclusive_r(struct DiskPartition64 * dp);
+static void VVByPListWait_r(struct DiskPartition64 * dp);
 
 /* online salvager */
-static int VCheckSalvage(register Volume * vp);
-static int VUpdateSalvagePriority_r(Volume * vp);
+static int VCheckSalvage(Volume * vp);
+#if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT)
 static int VScheduleSalvage_r(Volume * vp);
-static int VCancelSalvage_r(Volume * vp, int reason);
+#endif
 
 /* Volume hash table */
 static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp);
@@ -356,18 +429,9 @@ static void VHashBeginExclusive_r(VolumeHashChainHead * head);
 static void VHashEndExclusive_r(VolumeHashChainHead * head);
 static void VHashWait_r(VolumeHashChainHead * head);
 
-/* Volume state machine */
-static void VCreateReservation_r(Volume * vp);
-static void VCancelReservation_r(Volume * vp);
-static void VWaitStateChange_r(Volume * vp);
-static void VWaitExclusiveState_r(Volume * vp);
-static int IsExclusiveState(VolState state);
-static int IsErrorState(VolState state);
-static int IsValidState(VolState state);
-
 /* shutdown */
-static int ShutdownVByPForPass_r(struct DiskPartition * dp, int pass);
-static int ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+static int ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass);
+static int ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
                                struct rx_queue ** idx);
 static void ShutdownController(vshutdown_thread_t * params);
 static void ShutdownCreateSchedule(vshutdown_thread_t * params);
@@ -375,31 +439,35 @@ static void ShutdownCreateSchedule(vshutdown_thread_t * params);
 /* VLRU */
 static void VLRU_ComputeConstants(void);
 static void VInitVLRU(void);
-static void VLRU_Init_Node_r(volatile Volume * vp);
-static void VLRU_Add_r(volatile Volume * vp);
-static void VLRU_Delete_r(volatile Volume * vp);
-static void VLRU_UpdateAccess_r(volatile Volume * vp);
+static void VLRU_Init_Node_r(Volume * vp);
+static void VLRU_Add_r(Volume * vp);
+static void VLRU_Delete_r(Volume * vp);
+static void VLRU_UpdateAccess_r(Volume * vp);
 static void * VLRU_ScannerThread(void * args);
 static void VLRU_Scan_r(int idx);
 static void VLRU_Promote_r(int idx);
 static void VLRU_Demote_r(int idx);
-static void VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append);
+static void VLRU_SwitchQueues(Volume * vp, int new_idx, int append);
 
 /* soft detach */
-static int VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh);
-static int VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh);
-static int VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh);
+static int VCheckSoftDetach(Volume * vp, afs_uint32 thresh);
+static int VCheckSoftDetachCandidate(Volume * vp, afs_uint32 thresh);
+static int VSoftDetachVolume_r(Volume * vp, afs_uint32 thresh);
+
+
+pthread_key_t VThread_key;
+VThreadOptions_t VThread_defaults = {
+    0                           /**< allow salvsync */
+};
 #endif /* AFS_DEMAND_ATTACH_FS */
 
 
-struct Lock vol_listLock;      /* Lock obtained when listing volumes:  
-                                * prevents a volume from being missed 
-                                * if the volume is attached during a 
+struct Lock vol_listLock;      /* Lock obtained when listing volumes:
+                                * prevents a volume from being missed
+                                * if the volume is attached during a
                                 * list volumes */
 
 
-static int TimeZoneCorrection; /* Number of seconds west of GMT */
-
 /* Common message used when the volume goes off line */
 char *VSalvageMessage =
     "Files in this volume are currently unavailable; call operations";
@@ -410,6 +478,7 @@ int VInit;                  /* 0 - uninitialized,
                                 * 3 - initialized, all volumes have been attached, and
                                 * VConnectFS() has completed. */
 
+static int vinit_attach_abort = 0;
 
 bit32 VolumeCacheCheck;                /* Incremented everytime a volume goes on line--
                                 * used to stamp volume headers and in-core
@@ -424,52 +493,146 @@ bit32 VolumeCacheCheck;          /* Incremented everytime a volume goes on line--
 /* Startup routines                                */
 /***************************************************/
 
+#if defined(FAST_RESTART) && defined(AFS_DEMAND_ATTACH_FS)
+# error FAST_RESTART and DAFS are incompatible. For the DAFS equivalent \
+        of FAST_RESTART, use the -unsafe-nosalvage fileserver argument
+#endif
+
+/**
+ * Fill a VolumePackageOptions struct with the defaults for a program type.
+ *
+ * Always call this on a VolumePackageOptions struct first, then set any
+ * specific options you want, then call VInitVolumePackage2.
+ *
+ * Defaults that differ by program type:
+ *  - fileServer:    may schedule salvages, may use SALVSYNC
+ *  - salvageServer: may use FSSYNC
+ *  - volumeServer:  may schedule salvages, may use FSSYNC, vnode counts of 0
+ *  - anything else: none of the above
+ *
+ * Every caller gets volcache 0, no interrupt_rxcall hook, and both offline
+ * timeouts set to -1; unsafe_attach is 1 only when built with FAST_RESTART.
+ * Program types other than volumeServer get vnode counts of 5.
+ *
+ * @param[in]  pt   caller's program type
+ * @param[out] opts volume package options
+ */
+void
+VOptDefaults(ProgramType pt, VolumePackageOptions *opts)
+{
+    /* which of the program types with non-baseline defaults is asking? */
+    int is_fs = (pt == fileServer);
+    int is_salv = (pt == salvageServer);
+    int is_volser = (pt == volumeServer);
+
+    /* vnode counts: 0 for the volume server, 5 for every other program */
+    opts->nLargeVnodes = is_volser ? 0 : 5;
+    opts->nSmallVnodes = is_volser ? 0 : 5;
+    opts->volcache = 0;
+
+    /* capabilities: fileServer    -> schedule salvages, SALVSYNC
+     *               salvageServer -> FSSYNC
+     *               volumeServer  -> schedule salvages, FSSYNC
+     *               anything else -> none */
+    opts->canScheduleSalvage = (is_fs || is_volser) ? 1 : 0;
+    opts->canUseFSSYNC = (is_salv || is_volser) ? 1 : 0;
+    opts->canUseSALVSYNC = is_fs ? 1 : 0;
+
+    /* no rx call interruption hook; both offline timeouts left at -1 */
+    opts->interrupt_rxcall = NULL;
+    opts->offline_timeout = -1;
+    opts->offline_shutdown_timeout = -1;
+
+    /* unsafe attach defaults to on only in FAST_RESTART builds */
+#ifdef FAST_RESTART
+    opts->unsafe_attach = 1;
+#else /* !FAST_RESTART */
+    opts->unsafe_attach = 0;
+#endif /* !FAST_RESTART */
+}
+
+/**
+ * Set VInit to a certain value, and signal waiters.
+ *
+ * @param[in] value  the value to set VInit to
+ *
+ * @pre VOL_LOCK held
+ */
+static void
+VSetVInit_r(int value)
+{
+    VInit = value;                     /* store the new state first ... */
+    CV_BROADCAST(&vol_vinit_cond);     /* ... then wake every waiter on vol_vinit_cond */
+}
+
+static_inline void
+VLogOfflineTimeout(const char *type, afs_int32 timeout)
+{
+    /* Log the client-interruption policy for one timeout; negative means no log line. */
+    if (timeout > 0) {
+       const char *plural = (timeout == 1) ? "" : "s";
+
+       Log("VInitVolumePackage: Interrupting clients accessing %s "
+           "after %ld second%s\n", type, (long)timeout, plural);
+    } else if (timeout == 0) {
+       Log("VInitVolumePackage: Interrupting clients accessing %s "
+           "immediately\n", type);
+    }
+}
+
 int
-VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVnodes,
-                  int connect, afs_uint32 volcache)
+VInitVolumePackage2(ProgramType pt, VolumePackageOptions * opts)
 {
     int errors = 0;            /* Number of errors while finding vice partitions. */
-    struct timeval tv;
-    struct timezone tz;
 
     programType = pt;
+    vol_opts = *opts;
+
+#ifndef AFS_PTHREAD_ENV
+    if (opts->offline_timeout != -1 || opts->offline_shutdown_timeout != -1) {
+       Log("VInitVolumePackage: offline_timeout and/or "
+           "offline_shutdown_timeout was specified, but the volume package "
+           "does not support these for LWP builds\n");
+       return -1;
+    }
+#endif
+    VLogOfflineTimeout("volumes going offline", opts->offline_timeout);
+    VLogOfflineTimeout("volumes going offline during shutdown",
+                       opts->offline_shutdown_timeout);
 
-#ifdef AFS_DEMAND_ATTACH_FS
     memset(&VStats, 0, sizeof(VStats));
     VStats.hdr_cache_size = 200;
-#endif
 
     VInitPartitionPackage();
     VInitVolumeHash();
-    VInitVnHashByVolume();
 #ifdef AFS_DEMAND_ATTACH_FS
     if (programType == fileServer) {
        VInitVLRU();
     } else {
        VLRU_SetOptions(VLRU_SET_ENABLED, 0);
     }
+    osi_Assert(pthread_key_create(&VThread_key, NULL) == 0);
 #endif
 
-#ifdef AFS_PTHREAD_ENV
-    assert(pthread_mutex_init(&vol_glock_mutex, NULL) == 0);
-    assert(pthread_mutex_init(&vol_trans_mutex, NULL) == 0);
-    assert(pthread_cond_init(&vol_put_volume_cond, NULL) == 0);
-    assert(pthread_cond_init(&vol_sleep_cond, NULL) == 0);
-#else /* AFS_PTHREAD_ENV */
+    MUTEX_INIT(&vol_glock_mutex, "vol glock", MUTEX_DEFAULT, 0);
+    MUTEX_INIT(&vol_trans_mutex, "vol trans", MUTEX_DEFAULT, 0);
+    CV_INIT(&vol_put_volume_cond, "vol put", CV_DEFAULT, 0);
+    CV_INIT(&vol_sleep_cond, "vol sleep", CV_DEFAULT, 0);
+    CV_INIT(&vol_init_attach_cond, "vol init attach", CV_DEFAULT, 0);
+    CV_INIT(&vol_vinit_cond, "vol init", CV_DEFAULT, 0);
+#ifndef AFS_PTHREAD_ENV
     IOMGR_Initialize();
 #endif /* AFS_PTHREAD_ENV */
     Lock_Init(&vol_listLock);
 
     srandom(time(0));          /* For VGetVolumeInfo */
-    gettimeofday(&tv, &tz);
-    TimeZoneCorrection = tz.tz_minuteswest * 60;
 
 #ifdef AFS_DEMAND_ATTACH_FS
-    assert(pthread_mutex_init(&vol_salvsync_mutex, NULL) == 0);
+    MUTEX_INIT(&vol_salvsync_mutex, "salvsync", MUTEX_DEFAULT, 0);
 #endif /* AFS_DEMAND_ATTACH_FS */
 
-    /* Ok, we have done enough initialization that fileserver can 
-     * start accepting calls, even though the volumes may not be 
+    /* Ok, we have done enough initialization that fileserver can
+     * start accepting calls, even though the volumes may not be
      * available just yet.
      */
     VInit = 1;
@@ -485,41 +648,110 @@ VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVno
     }
 #endif
 #if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT)
-    if (programType == fileServer) {
+    if (VCanUseSALVSYNC()) {
        /* establish a connection to the salvager at this point */
-       assert(VConnectSALV() != 0);
+       osi_Assert(VConnectSALV() != 0);
     }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
-    if (volcache > VStats.hdr_cache_size)
-       VStats.hdr_cache_size = volcache;
+    if (opts->volcache > VStats.hdr_cache_size)
+       VStats.hdr_cache_size = opts->volcache;
     VInitVolumeHeaderCache(VStats.hdr_cache_size);
 
-    VInitVnodes(vLarge, nLargeVnodes);
-    VInitVnodes(vSmall, nSmallVnodes);
+    VInitVnodes(vLarge, opts->nLargeVnodes);
+    VInitVnodes(vSmall, opts->nSmallVnodes);
 
 
     errors = VAttachPartitions();
     if (errors)
        return -1;
 
-    if (programType == fileServer) {
-       struct DiskPartition *diskP;
-#ifdef AFS_PTHREAD_ENV
+    if (programType != fileServer) {
+        errors = VInitAttachVolumes(programType);
+        if (errors) {
+            return -1;
+        }
+    }
+
+#ifdef FSSYNC_BUILD_CLIENT
+    if (VCanUseFSSYNC()) {
+       if (!VConnectFS()) {
+#ifdef AFS_DEMAND_ATTACH_FS
+           if (programType == salvageServer) {
+               Log("Unable to connect to file server; aborted\n");
+               exit(1);
+           }
+#endif /* AFS_DEMAND_ATTACH_FS */
+           Log("Unable to connect to file server; will retry at need\n");
+       }
+    }
+#endif /* FSSYNC_BUILD_CLIENT */
+    return 0;
+}
+
+
+#if !defined(AFS_PTHREAD_ENV)
+/**
+ * Attach volumes in vice partitions
+ *
+ * @param[in]  pt         calling program type
+ *
+ * @return 0
+ * @note This is the original, non-threaded version of attach partitions.
+ *
+ * @post VInit state is 2
+ */
+int
+VInitAttachVolumes(ProgramType pt)
+{
+    osi_Assert(VInit==1);              /* only valid while VInit is still 1 */
+    if (pt == fileServer) {
+       struct DiskPartition64 *diskP;
+       /* Walk DiskPartitionList and attach the volumes of each partition in turn */
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           int nAttached = 0, nUnattached = 0;        /* per-partition counters filled in by the callee */
+           osi_Assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
+       }
+    }
+    VOL_LOCK;
+    VSetVInit_r(2);                    /* Initialized; reached for every program type, not only fileServer */
+    LWP_NoYieldSignal(VInitAttachVolumes);     /* the LWP event is this function's own address */
+    VOL_UNLOCK;
+    return 0;
+}
+#endif /* !AFS_PTHREAD_ENV */
+
+#if defined(AFS_PTHREAD_ENV) && !defined(AFS_DEMAND_ATTACH_FS)
+/**
+ * Attach volumes in vice partitions
+ *
+ * @param[in]  pt         calling program type
+ *
+ * @return 0
+ * @note Threaded version of attach partitions.
+ *
+ * @post VInit state is 2
+ */
+int
+VInitAttachVolumes(ProgramType pt)
+{
+    osi_Assert(VInit==1);
+    if (pt == fileServer) {
+       struct DiskPartition64 *diskP;
        struct vinitvolumepackage_thread_t params;
        struct diskpartition_queue_t * dpq;
        int i, threads, parts;
        pthread_t tid;
        pthread_attr_t attrs;
 
-       assert(pthread_cond_init(&params.thread_done_cv,NULL) == 0);
+       CV_INIT(&params.thread_done_cv, "thread done", CV_DEFAULT, 0);
        queue_Init(&params);
        params.n_threads_complete = 0;
 
        /* create partition work queue */
        for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
            dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
-           assert(dpq != NULL);
+           osi_Assert(dpq != NULL);
            dpq->diskP = diskP;
            queue_Append(&params,dpq);
        }
@@ -528,88 +760,52 @@ VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVno
 
        if (threads > 1) {
            /* spawn off a bunch of initialization threads */
-           assert(pthread_attr_init(&attrs) == 0);
-           assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+           osi_Assert(pthread_attr_init(&attrs) == 0);
+           osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
 
            Log("VInitVolumePackage: beginning parallel fileserver startup\n");
-#ifdef AFS_DEMAND_ATTACH_FS
-           Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n",
-               threads, parts);
-#else /* AFS_DEMAND_ATTACH_FS */
            Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n",
                threads, parts);
-#endif /* AFS_DEMAND_ATTACH_FS */
 
            VOL_LOCK;
            for (i=0; i < threads; i++) {
-               assert(pthread_create
+                AFS_SIGSET_DECL;
+                AFS_SIGSET_CLEAR();
+               osi_Assert(pthread_create
                       (&tid, &attrs, &VInitVolumePackageThread,
                        &params) == 0);
+                AFS_SIGSET_RESTORE();
            }
 
            while(params.n_threads_complete < threads) {
-               pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex);
+               VOL_CV_WAIT(&params.thread_done_cv);
            }
            VOL_UNLOCK;
 
-           assert(pthread_attr_destroy(&attrs) == 0);
+           osi_Assert(pthread_attr_destroy(&attrs) == 0);
        } else {
            /* if we're only going to run one init thread, don't bother creating
             * another LWP */
            Log("VInitVolumePackage: beginning single-threaded fileserver startup\n");
-#ifdef AFS_DEMAND_ATTACH_FS
-           Log("VInitVolumePackage: using 1 thread to pre-attach volumes on %d partition(s)\n",
-               parts);
-#else /* AFS_DEMAND_ATTACH_FS */
            Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n",
                parts);
-#endif /* AFS_DEMAND_ATTACH_FS */
 
            VInitVolumePackageThread(&params);
        }
 
-       assert(pthread_cond_destroy(&params.thread_done_cv) == 0);
-
-#else /* AFS_PTHREAD_ENV */
-       DIR *dirp;
-       struct dirent *dp;
-
-       /* Attach all the volumes in this partition */
-       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-           int nAttached = 0, nUnattached = 0;
-           assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
-       }
-#endif /* AFS_PTHREAD_ENV */
-    }
-
-    VInit = 2;                 /* Initialized, and all volumes have been attached */
-#ifdef FSSYNC_BUILD_CLIENT
-    if (programType == volumeUtility && connect) {
-       if (!VConnectFS()) {
-           Log("Unable to connect to file server; aborted\n");
-           exit(1);
-       }
-    }
-#ifdef AFS_DEMAND_ATTACH_FS
-    else if (programType == salvageServer) {
-       if (!VConnectFS()) {
-           Log("Unable to connect to file server; aborted\n");
-           exit(1);
-       }
+       CV_DESTROY(&params.thread_done_cv);
     }
-#endif /* AFS_DEMAND_ATTACH_FS */
-#endif /* FSSYNC_BUILD_CLIENT */
+    VOL_LOCK;
+    VSetVInit_r(2);                    /* Initialized, and all volumes have been attached */
+    CV_BROADCAST(&vol_init_attach_cond);
+    VOL_UNLOCK;
     return 0;
 }
 
-#ifdef AFS_PTHREAD_ENV
 static void *
 VInitVolumePackageThread(void * args) {
-    int errors = 0;            /* Number of errors while finding vice partitions. */
 
-    DIR *dirp;
-    struct dirent *dp;
-    struct DiskPartition *diskP;
+    struct DiskPartition64 *diskP;
     struct vinitvolumepackage_thread_t * params;
     struct diskpartition_queue_t * dpq;
 
@@ -621,29 +817,314 @@ VInitVolumePackageThread(void * args) {
     while (queue_IsNotEmpty(params)) {
         int nAttached = 0, nUnattached = 0;
 
+        if (vinit_attach_abort) {
+            Log("Aborting initialization\n");
+            goto done;
+        }
+
         dpq = queue_First(params,diskpartition_queue_t);
        queue_Remove(dpq);
        VOL_UNLOCK;
        diskP = dpq->diskP;
        free(dpq);
 
-       assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
+       osi_Assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
 
        VOL_LOCK;
     }
 
+done:
     params->n_threads_complete++;
-    pthread_cond_signal(&params->thread_done_cv);
+    CV_SIGNAL(&params->thread_done_cv);
     VOL_UNLOCK;
     return NULL;
 }
-#endif /* AFS_PTHREAD_ENV */
+#endif /* AFS_PTHREAD_ENV && !AFS_DEMAND_ATTACH_FS */
+
+#if defined(AFS_DEMAND_ATTACH_FS)
+/**
+ * Attach volumes in vice partitions
+ *
+ * @param[in]  pt         calling program type
+ *
+ * @return 0
+ * @note Threaded version of attach partitions.
+ *
+ * @post VInit state is 2
+ */
+int
+VInitAttachVolumes(ProgramType pt)
+{
+    osi_Assert(VInit==1);              /* only valid while VInit is still 1 */
+    if (pt == fileServer) {
+       /* pq hands partitions to the scanner threads; vq carries their batches back */
+       struct DiskPartition64 *diskP;
+       struct partition_queue pq;
+        struct volume_init_queue vq;
+
+       int i, threads, parts;
+       pthread_t tid;
+       pthread_attr_t attrs;
+
+       /* create partition work queue: one entry per DiskPartitionList element */
+        queue_Init(&pq);
+       CV_INIT(&(pq.cv), "partq", CV_DEFAULT, 0);
+       MUTEX_INIT(&(pq.mutex), "partq", MUTEX_DEFAULT, 0);
+       for (parts = 0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
+           struct diskpartition_queue_t *dp;
+           dp = (struct diskpartition_queue_t*)malloc(sizeof(struct diskpartition_queue_t));
+           osi_Assert(dp != NULL);
+           dp->diskP = diskP;
+           queue_Append(&pq, dp);
+       }
+
+        /* number of worker threads: vol_attach_threads, capped at the partition count (0 if none) */
+       threads = MIN(parts, vol_attach_threads);
+
+        /* create volume work queue */
+        queue_Init(&vq);
+       CV_INIT(&(vq.cv), "volq", CV_DEFAULT, 0);
+       MUTEX_INIT(&(vq.mutex), "volq", MUTEX_DEFAULT, 0);
+
+        osi_Assert(pthread_attr_init(&attrs) == 0);
+        osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+
+        Log("VInitVolumePackage: beginning parallel fileserver startup\n");
+        Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n",
+               threads, parts);
+
+        /* create detached threads to scan disk partitions; each gets its own heap-allocated params */
+       for (i=0; i < threads; i++) {
+           struct vinitvolumepackage_thread_param *params;
+            AFS_SIGSET_DECL;
+
+            params = (struct vinitvolumepackage_thread_param *)malloc(sizeof(struct vinitvolumepackage_thread_param));
+            osi_Assert(params);
+            params->pq = &pq;
+            params->vq = &vq;
+            params->nthreads = threads;
+            params->thread = i+1;              /* thread numbers are counted from 1 */
+
+            AFS_SIGSET_CLEAR();
+           osi_Assert(pthread_create (&tid, &attrs, &VInitVolumePackageThread, (void*)params) == 0);
+            AFS_SIGSET_RESTORE();
+       }
+
+        VInitPreAttachVolumes(threads, &vq);   /* consume batches here until every worker posts its last one */
+
+        osi_Assert(pthread_attr_destroy(&attrs) == 0);
+       CV_DESTROY(&pq.cv);            /* both queues are torn down only after the consumer has returned */
+       MUTEX_DESTROY(&pq.mutex);
+       CV_DESTROY(&vq.cv);
+       MUTEX_DESTROY(&vq.mutex);
+    }
+
+    VOL_LOCK;
+    VSetVInit_r(2);                    /* Initialized; for fileServer, scanned volumes are now pre-attached */
+    CV_BROADCAST(&vol_init_attach_cond);
+    VOL_UNLOCK;
+
+    return 0;
+}
+
+/**
+ * Volume package initialization worker thread. Scan partitions for volume
+ * header files. Gather batches of volume ids and dispatch them to
+ * the main thread to be preattached.  The volume preattachment is done
+ * in the main thread to avoid global volume lock contention.
+ */
+static void *
+VInitVolumePackageThread(void *args)
+{
+    struct vinitvolumepackage_thread_param *params;
+    struct DiskPartition64 *partition;
+    struct partition_queue *pq;
+    struct volume_init_queue *vq;
+    struct volume_init_batch *vb;
+
+    osi_Assert(args);
+    params = (struct vinitvolumepackage_thread_param *)args;
+    pq = params->pq;
+    vq = params->vq;
+    osi_Assert(pq);
+    osi_Assert(vq);
+
+    vb = (struct volume_init_batch*)malloc(sizeof(struct volume_init_batch));  /* first, empty batch */
+    osi_Assert(vb);
+    vb->thread = params->thread;
+    vb->last = 0;
+    vb->size = 0;
+
+    Log("Scanning partitions on thread %d of %d\n", params->thread, params->nthreads);
+    while((partition = VInitNextPartition(pq))) {      /* NULL ends the scan: queue empty or abort requested */
+        DIR *dirp;
+        VolId vid;
+
+        Log("Partition %s: pre-attaching volumes\n", partition->name);
+        dirp = opendir(VPartitionPath(partition));
+        if (!dirp) {
+            Log("opendir on Partition %s failed, errno=%d!\n", partition->name, errno);
+            continue;          /* skip this partition; the remaining ones are still scanned */
+        }
+        while ((vid = VInitNextVolumeId(dirp))) {      /* 0 means no further volume header was found */
+            Volume *vp = (Volume*)malloc(sizeof(Volume));      /* zeroed below, then minimally filled in */
+            osi_Assert(vp);
+            memset(vp, 0, sizeof(Volume));
+            vp->device = partition->device;
+            vp->partition = partition;
+            vp->hashid = vid;
+            queue_Init(&vp->vnode_list);
+            queue_Init(&vp->rx_call_list);
+           CV_INIT(&V_attachCV(vp), "partattach", CV_DEFAULT, 0);
+
+            vb->batch[vb->size++] = vp;
+            if (vb->size == VINIT_BATCH_MAX_SIZE) {    /* batch full: post it to the consumer */
+               MUTEX_ENTER(&vq->mutex);
+                queue_Append(vq, vb);
+               CV_BROADCAST(&vq->cv);
+               MUTEX_EXIT(&vq->mutex);
+                /* the posted batch is no longer touched here; start a fresh one */
+                vb = (struct volume_init_batch*)malloc(sizeof(struct volume_init_batch));
+                osi_Assert(vb);
+                vb->thread = params->thread;
+                vb->size = 0;
+                vb->last = 0;
+            }
+        }
+        closedir(dirp);
+    }
+
+    vb->last = 1;              /* final (possibly empty) batch marks this thread as finished */
+    MUTEX_ENTER(&vq->mutex);
+    queue_Append(vq, vb);
+    CV_BROADCAST(&vq->cv);
+    MUTEX_EXIT(&vq->mutex);
+
+    Log("Partition scan thread %d of %d ended\n", params->thread, params->nthreads);
+    free(params);              /* args is released here, after its last use */
+    return NULL;
+}
+
+/**
+ * Read next element from the pre-populated partition list.
+ */
+static struct DiskPartition64*
+VInitNextPartition(struct partition_queue *pq)
+{
+    struct diskpartition_queue_t *entry = NULL;        /* dequeued element, if any */
+    struct DiskPartition64 *next;
+
+    if (vinit_attach_abort) {
+        Log("Aborting volume preattach thread.\n");
+        return NULL;
+    }
+
+    /* pop the head of the work queue while holding the queue mutex */
+    MUTEX_ENTER(&pq->mutex);
+    if (!queue_IsEmpty(pq)) {
+        entry = queue_First(pq, diskpartition_queue_t);
+        queue_Remove(entry);
+    }
+    MUTEX_EXIT(&pq->mutex);
+
+    if (entry == NULL) {
+        return NULL;           /* no partitions left to hand out */
+    }
+    osi_Assert(entry->diskP);
+
+    next = entry->diskP;
+    free(entry);
+    return next;
+}
+
+/**
+ * Find next volume id on the partition.
+ */
+static VolId
+VInitNextVolumeId(DIR *dirp)
+{
+    struct dirent *entry;
+
+    for (entry = readdir(dirp); entry != NULL; entry = readdir(dirp)) {
+        char *name = entry->d_name;
+        char *suffix;
+        VolId id;
+        if (vinit_attach_abort) {
+            Log("Aborting volume preattach thread.\n");
+            return 0;
+        }
+        suffix = strrchr(name, '.');
+        if (name[0] != 'V' || suffix == NULL || strcmp(suffix, VHDREXT) != 0)
+            continue;          /* name does not look like a volume header file */
+        id = VolumeNumber(name);
+        if (id)
+            return id;
+        Log("Warning: bogus volume header file: %s\n", name);
+    }
+    return 0;                  /* directory exhausted without finding another id */
+}
+
+/**
+ * Preattach volumes in batches to avoid lock contention.  Drains vq until each
+ * of the nthreads scanner threads has posted a batch marked last; returns 0.
+ */
+static int
+VInitPreAttachVolumes(int nthreads, struct volume_init_queue *vq)
+{
+    struct volume_init_batch *vb;
+    int i;
+
+    while (nthreads) {
+        /* dequeue next batch; re-test after each wakeup, as a wait may return with the queue still empty */
+        MUTEX_ENTER(&vq->mutex);
+        while (queue_IsEmpty(vq)) {
+            CV_WAIT(&vq->cv, &vq->mutex);
+        }
+        vb = queue_First(vq, volume_init_batch);
+        queue_Remove(vb);
+        MUTEX_EXIT(&vq->mutex);
+
+        if (vb->size) {
+            VOL_LOCK;
+            for (i = 0; i<vb->size; i++) {
+                Volume *vp = vb->batch[i];
+                Volume *dup;
+                Error ec = 0;
+                dup = VLookupVolume_r(&ec, vp->hashid, NULL);
+                if (!ec && !dup) {
+                    /* put the volume onto the hash table and bring it up to the pre-attached state */
+                    AddVolumeToHashTable(vp, vp->hashid);
+                    AddVolumeToVByPList_r(vp);
+                    VLRU_Init_Node_r(vp);
+                    VChangeState_r(vp, VOL_STATE_PREATTACHED);
+                    continue;
+                }
+                if (ec) {
+                    Log("Error looking up volume, code=%d\n", ec);
+                } else {
+                    Log("Warning: Duplicate volume id %lu detected.\n", (unsigned long)vp->hashid);
+                }
+                /* rejected vp was never added to any table or list; release it instead of leaking it */
+                CV_DESTROY(&V_attachCV(vp));
+                free(vp);
+            }
+            VOL_UNLOCK;
+        }
+        if (vb->last) {
+            nthreads--;        /* one more scanner thread has finished */
+        }
+        free(vb);
+    }
+    return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
 
+#if !defined(AFS_DEMAND_ATTACH_FS)
 /*
  * attach all volumes on a given disk partition
  */
 static int
-VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nUnattached)
+VAttachVolumesByPartition(struct DiskPartition64 *diskP, int * nAttached, int * nUnattached)
 {
   DIR * dirp;
   struct dirent * dp;
@@ -659,16 +1140,17 @@ VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nU
   while ((dp = readdir(dirp))) {
     char *p;
     p = strrchr(dp->d_name, '.');
+
+    if (vinit_attach_abort) {
+      Log("Partition %s: abort attach volumes\n", diskP->name);
+      goto done;
+    }
+
     if (p != NULL && strcmp(p, VHDREXT) == 0) {
       Error error;
       Volume *vp;
-#ifdef AFS_DEMAND_ATTACH_FS
-      vp = VPreAttachVolumeByName(&error, diskP->name, dp->d_name,
-                                  V_VOLUPD);
-#else /* AFS_DEMAND_ATTACH_FS */
       vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
                               V_VOLUPD);
-#endif /* AFS_DEMAND_ATTACH_FS */
       (*(vp ? nAttached : nUnattached))++;
       if (error == VOFFLINE)
        Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
@@ -677,19 +1159,18 @@ VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nU
            diskP->name, VolumeNumber(dp->d_name),
            dp->d_name);
       }
-#if !defined(AFS_DEMAND_ATTACH_FS)
       if (vp) {
        VPutVolume(vp);
       }
-#endif /* AFS_DEMAND_ATTACH_FS */
     }
   }
 
   Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached);
+done:
   closedir(dirp);
   return ret;
 }
-
+#endif /* !AFS_DEMAND_ATTACH_FS */
 
 /***************************************************/
 /* Shutdown routines                               */
@@ -748,14 +1229,13 @@ VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nU
  *   shutdown all remaining volumes
  */
 
+#ifdef AFS_DEMAND_ATTACH_FS
+
 void
 VShutdown_r(void)
 {
     int i;
-    register Volume *vp, *np;
-    register afs_int32 code;
-#ifdef AFS_DEMAND_ATTACH_FS
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
     struct diskpartition_queue_t * dpq;
     vshutdown_thread_t params;
     pthread_t tid;
@@ -763,20 +1243,28 @@ VShutdown_r(void)
 
     memset(&params, 0, sizeof(vshutdown_thread_t));
 
+    if (VInit < 2) {
+        Log("VShutdown:  aborting attach volumes\n");
+        vinit_attach_abort = 1;
+        VOL_CV_WAIT(&vol_init_attach_cond);
+    }
+
     for (params.n_parts=0, diskP = DiskPartitionList;
         diskP; diskP = diskP->next, params.n_parts++);
 
-    Log("VShutdown:  shutting down on-line volumes on %d partition%s...\n", 
+    Log("VShutdown:  shutting down on-line volumes on %d partition%s...\n",
        params.n_parts, params.n_parts > 1 ? "s" : "");
 
+    vol_shutting_down = 1;
+
     if (vol_attach_threads > 1) {
        /* prepare for parallel shutdown */
        params.n_threads = vol_attach_threads;
-       assert(pthread_mutex_init(&params.lock, NULL) == 0);
-       assert(pthread_cond_init(&params.cv, NULL) == 0);
-       assert(pthread_cond_init(&params.master_cv, NULL) == 0);
-       assert(pthread_attr_init(&attrs) == 0);
-       assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+       MUTEX_INIT(&params.lock, "params", MUTEX_DEFAULT, 0);
+       CV_INIT(&params.cv, "params", CV_DEFAULT, 0);
+       CV_INIT(&params.master_cv, "params master", CV_DEFAULT, 0);
+       osi_Assert(pthread_attr_init(&attrs) == 0);
+       osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
        queue_Init(&params);
 
        /* setup the basic partition information structures for
@@ -798,15 +1286,15 @@ VShutdown_r(void)
            }
            Log("VShutdown: partition %s has %d volumes with attached headers\n",
                VPartitionPath(diskP), count);
-               
+
 
            /* build up the pass 0 shutdown work queue */
            dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
-           assert(dpq != NULL);
+           osi_Assert(dpq != NULL);
            dpq->diskP = diskP;
            queue_Prepend(&params, dpq);
 
-           params.part_pass_head[diskP->device] = queue_First(&diskP->vol_list, rx_queue);
+           params.part_pass_head[diskP->index] = queue_First(&diskP->vol_list, rx_queue);
        }
 
        Log("VShutdown:  beginning parallel fileserver shutdown\n");
@@ -814,47 +1302,47 @@ VShutdown_r(void)
            vol_attach_threads, params.n_parts, params.n_parts > 1 ? "s" : "" );
 
        /* do pass 0 shutdown */
-       assert(pthread_mutex_lock(&params.lock) == 0);
+       MUTEX_ENTER(&params.lock);
        for (i=0; i < params.n_threads; i++) {
-           assert(pthread_create
+           osi_Assert(pthread_create
                   (&tid, &attrs, &VShutdownThread,
                    &params) == 0);
        }
-       
+
        /* wait for all the pass 0 shutdowns to complete */
        while (params.n_threads_complete < params.n_threads) {
-           assert(pthread_cond_wait(&params.master_cv, &params.lock) == 0);
+           CV_WAIT(&params.master_cv, &params.lock);
        }
        params.n_threads_complete = 0;
        params.pass = 1;
-       assert(pthread_cond_broadcast(&params.cv) == 0);
-       assert(pthread_mutex_unlock(&params.lock) == 0);
+       CV_BROADCAST(&params.cv);
+       MUTEX_EXIT(&params.lock);
 
        Log("VShutdown:  pass 0 completed using the 1 thread per partition algorithm\n");
        Log("VShutdown:  starting passes 1 through 3 using finely-granular mp-fast algorithm\n");
 
        /* run the parallel shutdown scheduler. it will drop the glock internally */
        ShutdownController(&params);
-       
+
        /* wait for all the workers to finish pass 3 and terminate */
        while (params.pass < 4) {
-           assert(pthread_cond_wait(&params.cv, &vol_glock_mutex) == 0);
+           VOL_CV_WAIT(&params.cv);
        }
-       
-       assert(pthread_attr_destroy(&attrs) == 0);
-       assert(pthread_cond_destroy(&params.cv) == 0);
-       assert(pthread_cond_destroy(&params.master_cv) == 0);
-       assert(pthread_mutex_destroy(&params.lock) == 0);
+
+       osi_Assert(pthread_attr_destroy(&attrs) == 0);
+       CV_DESTROY(&params.cv);
+       CV_DESTROY(&params.master_cv);
+       MUTEX_DESTROY(&params.lock);
 
        /* drop the VByPList exclusive reservations */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
            VVByPListEndExclusive_r(diskP);
            Log("VShutdown:  %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
                VPartitionPath(diskP),
-               params.stats[0][diskP->device],
-               params.stats[1][diskP->device],
-               params.stats[2][diskP->device],
-               params.stats[3][diskP->device]);
+               params.stats[0][diskP->index],
+               params.stats[1][diskP->index],
+               params.stats[2][diskP->index],
+               params.stats[3][diskP->index]);
        }
 
        Log("VShutdown:  shutdown finished using %d threads\n", params.n_threads);
@@ -869,34 +1357,77 @@ VShutdown_r(void)
     }
 
     Log("VShutdown:  complete.\n");
+}
+
 #else /* AFS_DEMAND_ATTACH_FS */
-    Log("VShutdown:  shutting down on-line volumes...\n");
-    for (i = 0; i < VolumeHashTable.Size; i++) {
-       /* try to hold first volume in the hash table */
-       for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
-           code = VHold_r(vp);
-           if (code == 0) {
-               if (LogLevel >= 5)
-                   Log("VShutdown:  Attempting to take volume %u offline.\n",
+
+void
+VShutdown_r(void)
+{
+    int i;
+    Volume *vp, *np;
+    afs_int32 code;
+
+    if (VInit < 2) {
+        Log("VShutdown:  aborting attach volumes\n");
+        vinit_attach_abort = 1;
+#ifdef AFS_PTHREAD_ENV
+        VOL_CV_WAIT(&vol_init_attach_cond);
+#else
+        LWP_WaitProcess(VInitAttachVolumes);
+#endif /* AFS_PTHREAD_ENV */
+    }
+
+    Log("VShutdown:  shutting down on-line volumes...\n");
+    vol_shutting_down = 1;
+    for (i = 0; i < VolumeHashTable.Size; i++) {
+       /* try to hold first volume in the hash table */
+       for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
+           code = VHold_r(vp);
+           if (code == 0) {
+               if (LogLevel >= 5)
+                   Log("VShutdown:  Attempting to take volume %u offline.\n",
                        vp->hashid);
-               
+
                /* next, take the volume offline (drops reference count) */
                VOffline_r(vp, "File server was shut down");
            }
        }
     }
     Log("VShutdown:  complete.\n");
-#endif /* AFS_DEMAND_ATTACH_FS */
 }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 
 void
 VShutdown(void)
 {
+    osi_Assert(VInit>0);
     VOL_LOCK;
     VShutdown_r();
     VOL_UNLOCK;
 }
 
+/**
+ * stop new activity (e.g. SALVSYNC) from occurring
+ *
+ * Use this to make the volume package less busy; for example, during
+ * shutdown. This doesn't actually shutdown/detach anything in the
+ * volume package, but prevents certain processes from occurring.
+ * Currently it only sets vol_disallow_salvsync under DAFS and is a no-op
+ * otherwise. In theory, we could also use this to prevent new volume
+ * attachment, or prevent other programs from checking out volumes, etc.
+ */
+void
+VSetTranquil(void)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* make sure we don't try to contact the salvageserver, since it may
+     * not be around anymore */
+    vol_disallow_salvsync = 1;
+#endif
+}
+
 #ifdef AFS_DEMAND_ATTACH_FS
 /*
  * demand attach fs
@@ -906,7 +1437,7 @@ static void
 ShutdownController(vshutdown_thread_t * params)
 {
     /* XXX debug */
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
     Device id;
     vshutdown_thread_t shadow;
 
@@ -925,12 +1456,12 @@ ShutdownController(vshutdown_thread_t * params)
        Log("ShutdownController:  n_threads_complete=%d, n_parts_done_pass=%d\n",
            shadow.n_threads_complete, shadow.n_parts_done_pass);
        for (diskP = DiskPartitionList; diskP; diskP=diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            Log("ShutdownController:  part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n",
-               id, 
+               id,
                diskP->vol_list.len,
-               shadow.part_thread_target[id], 
-               shadow.part_done_pass[id], 
+               shadow.part_thread_target[id],
+               shadow.part_done_pass[id],
                shadow.part_pass_head[id]);
        }
 
@@ -943,7 +1474,7 @@ ShutdownController(vshutdown_thread_t * params)
 
 /* create the shutdown thread work schedule.
  * this scheduler tries to implement fairness
- * by allocating at least 1 thread to each 
+ * by allocating at least 1 thread to each
  * partition with volumes to be shutdown,
  * and then it attempts to allocate remaining
  * threads based upon the amount of work left
@@ -951,7 +1482,7 @@ ShutdownController(vshutdown_thread_t * params)
 static void
 ShutdownCreateSchedule(vshutdown_thread_t * params)
 {
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
     int sum, thr_workload, thr_left;
     int part_residue[VOLMAXPARTS+1];
     Device id;
@@ -961,7 +1492,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
     for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
        sum += diskP->vol_list.len;
     }
-    
+
     params->schedule_version++;
     params->vol_remaining = sum;
 
@@ -979,7 +1510,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
     /* for fairness, give every partition with volumes remaining
      * at least one thread */
     for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
-       id = diskP->device;
+       id = diskP->index;
        if (diskP->vol_list.len) {
            params->part_thread_target[id] = 1;
            thr_left--;
@@ -993,7 +1524,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
        int delta;
 
        for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            delta = (diskP->vol_list.len / thr_workload) -
                params->part_thread_target[id];
            if (delta < 0) {
@@ -1013,12 +1544,12 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
     if (thr_left) {
        /* try to assign any leftover threads to partitions that
         * had volume lengths closer to needing thread_target+1 */
-       int max_residue, max_id;
+       int max_residue, max_id = 0;
 
        /* compute the residues */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-           id = diskP->device;
-           part_residue[id] = diskP->vol_list.len - 
+           id = diskP->index;
+           part_residue[id] = diskP->vol_list.len -
                (params->part_thread_target[id] * thr_workload);
        }
 
@@ -1027,7 +1558,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
        while (thr_left) {
            max_residue = 0;
            for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-               id = diskP->device;
+               id = diskP->index;
                if (part_residue[id] > max_residue) {
                    max_residue = part_residue[id];
                    max_id = id;
@@ -1050,7 +1581,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
        if (thr_left >= params->n_parts) {
            alloc = thr_left / params->n_parts;
            for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-               id = diskP->device;
+               id = diskP->index;
                params->part_thread_target[id] += alloc;
                thr_left -= alloc;
            }
@@ -1058,7 +1589,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
 
        /* finish off the last of the threads */
        for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            params->part_thread_target[id]++;
            thr_left--;
        }
@@ -1069,51 +1600,49 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
 static void *
 VShutdownThread(void * args)
 {
-    struct rx_queue *qp;
-    Volume * vp;
     vshutdown_thread_t * params;
-    int part, code, found, pass, schedule_version_save, count;
-    struct DiskPartition *diskP;
+    int found, pass, schedule_version_save, count;
+    struct DiskPartition64 *diskP;
     struct diskpartition_queue_t * dpq;
     Device id;
 
     params = (vshutdown_thread_t *) args;
 
     /* acquire the shutdown pass 0 lock */
-    assert(pthread_mutex_lock(&params->lock) == 0);
+    MUTEX_ENTER(&params->lock);
 
     /* if there's still pass 0 work to be done,
      * get a work entry, and do a pass 0 shutdown */
     if (queue_IsNotEmpty(params)) {
        dpq = queue_First(params, diskpartition_queue_t);
        queue_Remove(dpq);
-       assert(pthread_mutex_unlock(&params->lock) == 0);
+       MUTEX_EXIT(&params->lock);
        diskP = dpq->diskP;
        free(dpq);
-       id = diskP->device;
+       id = diskP->index;
 
        count = 0;
        while (ShutdownVolumeWalk_r(diskP, 0, &params->part_pass_head[id]))
            count++;
-       params->stats[0][diskP->device] = count;
-       assert(pthread_mutex_lock(&params->lock) == 0);
+       params->stats[0][diskP->index] = count;
+       MUTEX_ENTER(&params->lock);
     }
 
     params->n_threads_complete++;
     if (params->n_threads_complete == params->n_threads) {
-      /* notify control thread that all workers have completed pass 0 */
-      assert(pthread_cond_signal(&params->master_cv) == 0);
+       /* notify control thread that all workers have completed pass 0 */
+       CV_SIGNAL(&params->master_cv);
     }
     while (params->pass == 0) {
-      assert(pthread_cond_wait(&params->cv, &params->lock) == 0);
+       CV_WAIT(&params->cv, &params->lock);
     }
 
     /* switch locks */
-    assert(pthread_mutex_unlock(&params->lock) == 0);
+    MUTEX_EXIT(&params->lock);
     VOL_LOCK;
 
     pass = params->pass;
-    assert(pass > 0);
+    osi_Assert(pass > 0);
 
     /* now escalate through the more complicated shutdowns */
     while (pass <= 3) {
@@ -1121,19 +1650,19 @@ VShutdownThread(void * args)
        found = 0;
        /* find a disk partition to work on */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            if (params->part_thread_target[id] && !params->part_done_pass[id]) {
                params->part_thread_target[id]--;
                found = 1;
                break;
            }
        }
-       
+
        if (!found) {
-           /* hmm. for some reason the controller thread couldn't find anything for 
+           /* hmm. for some reason the controller thread couldn't find anything for
             * us to do. let's see if there's anything we can do */
            for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-               id = diskP->device;
+               id = diskP->index;
                if (diskP->vol_list.len && !params->part_done_pass[id]) {
                    found = 1;
                    break;
@@ -1147,7 +1676,7 @@ VShutdownThread(void * args)
                }
            }
        }
-       
+
        /* do work on this partition until either the controller
         * creates a new schedule, or we run out of things to do
         * on this partition */
@@ -1184,7 +1713,7 @@ VShutdownThread(void * args)
                    params->n_parts_done_pass = 0;
                    params->pass++;
                    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-                       id = diskP->device;
+                       id = diskP->index;
                        params->part_done_pass[id] = 0;
                        params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue);
                    }
@@ -1193,19 +1722,19 @@ VShutdownThread(void * args)
                    ShutdownCreateSchedule(params);
 
                    /* wake up all the workers */
-                   assert(pthread_cond_broadcast(&params->cv) == 0);
+                   CV_BROADCAST(&params->cv);
 
                    VOL_UNLOCK;
                    Log("VShutdown:  pass %d completed using %d threads on %d partitions\n",
                        pass, params->n_threads, params->n_parts);
                    VOL_LOCK;
                } else {
-                   assert(pthread_cond_wait(&params->cv, &vol_glock_mutex) == 0);
+                   VOL_CV_WAIT(&params->cv);
                }
            }
            pass = params->pass;
        }
-       
+
        /* for fairness */
        VOL_UNLOCK;
        pthread_yield();
@@ -1217,14 +1746,14 @@ VShutdownThread(void * args)
     return NULL;
 }
 
-/* shut down all volumes on a given disk partition 
+/* shut down all volumes on a given disk partition
  *
  * note that this function will not allow mp-fast
  * shutdown of a partition */
 int
-VShutdownByPartition_r(struct DiskPartition * dp)
+VShutdownByPartition_r(struct DiskPartition64 * dp)
 {
-    int pass, retVal;
+    int pass;
     int pass_stats[4];
     int total;
 
@@ -1235,7 +1764,7 @@ VShutdownByPartition_r(struct DiskPartition * dp)
     VVByPListBeginExclusive_r(dp);
 
     /* pick the low-hanging fruit first,
-     * then do the complicated ones last 
+     * then do the complicated ones last
      * (has the advantage of keeping
      *  in-use volumes up until the bitter end) */
     for (pass = 0, total=0; pass < 4; pass++) {
@@ -1249,7 +1778,7 @@ VShutdownByPartition_r(struct DiskPartition * dp)
     Log("VShutdownByPartition:  shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
        total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]);
 
-    return retVal;
+    return 0;
 }
 
 /* internal shutdown functionality
@@ -1258,13 +1787,13 @@ VShutdownByPartition_r(struct DiskPartition * dp)
  * 0 to only "shutdown" {pre,un}attached and error state volumes
  * 1 to also shutdown attached volumes w/ volume header loaded
  * 2 to also shutdown attached volumes w/o volume header loaded
- * 3 to also shutdown exclusive state volumes 
+ * 3 to also shutdown exclusive state volumes
  *
  * caller MUST hold exclusive access on the hash chain
  * because we drop vol_glock_mutex internally
- * 
- * this function is reentrant for passes 1--3 
- * (e.g. multiple threads can cooperate to 
+ *
+ * this function is reentrant for passes 1--3
+ * (e.g. multiple threads can cooperate to
  *  shutdown a partition mp-fast)
  *
  * pass 0 is not scaleable because the volume state data is
@@ -1273,10 +1802,10 @@ VShutdownByPartition_r(struct DiskPartition * dp)
  * traversal
  */
 static int
-ShutdownVByPForPass_r(struct DiskPartition * dp, int pass)
+ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass)
 {
     struct rx_queue * q = queue_First(&dp->vol_list, rx_queue);
-    register int i = 0;
+    int i = 0;
 
     while (ShutdownVolumeWalk_r(dp, pass, &q))
        i++;
@@ -1288,7 +1817,7 @@ ShutdownVByPForPass_r(struct DiskPartition * dp, int pass)
  * returns 1 if a volume was shutdown in this pass,
  * 0 otherwise */
 static int
-ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
                     struct rx_queue ** idx)
 {
     struct rx_queue *qp, *nqp;
@@ -1298,11 +1827,12 @@ ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
 
     for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) {
        vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list));
-       
+
        switch (pass) {
        case 0:
            if ((V_attachState(vp) != VOL_STATE_UNATTACHED) &&
                (V_attachState(vp) != VOL_STATE_ERROR) &&
+               (V_attachState(vp) != VOL_STATE_DELETED) &&
                (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
                break;
            }
@@ -1312,7 +1842,7 @@ ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
                break;
            }
        case 2:
-           if (IsExclusiveState(V_attachState(vp))) {
+           if (VIsExclusiveState(V_attachState(vp))) {
                break;
            }
        case 3:
@@ -1346,17 +1876,19 @@ VShutdownVolume_r(Volume * vp)
     /* wait for other blocking ops to finish */
     VWaitExclusiveState_r(vp);
 
-    assert(IsValidState(V_attachState(vp)));
-    
+    osi_Assert(VIsValidState(V_attachState(vp)));
+
     switch(V_attachState(vp)) {
     case VOL_STATE_SALVAGING:
-       /* make sure salvager knows we don't want
-        * the volume back */
-       VCancelSalvage_r(vp, SALVSYNC_SHUTDOWN);
+       /* Leave salvaging volumes alone. Any in-progress salvages will
+        * continue working after viced shuts down. This is intentional.
+        */
+
     case VOL_STATE_PREATTACHED:
     case VOL_STATE_ERROR:
        VChangeState_r(vp, VOL_STATE_UNATTACHED);
     case VOL_STATE_UNATTACHED:
+    case VOL_STATE_DELETED:
        break;
     case VOL_STATE_GOING_OFFLINE:
     case VOL_STATE_SHUTTING_DOWN:
@@ -1371,8 +1903,10 @@ VShutdownVolume_r(Volume * vp)
            VOffline_r(vp, "File server was shut down");
        }
        break;
+    default:
+       break;
     }
-    
+
     VCancelReservation_r(vp);
     vp = NULL;
     return 0;
@@ -1408,13 +1942,8 @@ ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
        return;
     }
 
-    if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
-       *ec = VSALVAGE;
-       FDH_REALLYCLOSE(fdP);
-       return;
-    }
     vsn = (struct versionStamp *)to;
-    if (FDH_READ(fdP, to, size) != size || vsn->magic != magic) {
+    if (FDH_PREAD(fdP, to, size, 0) != size || vsn->magic != magic) {
        *ec = VSALVAGE;
        FDH_REALLYCLOSE(fdP);
        return;
@@ -1440,12 +1969,7 @@ WriteVolumeHeader_r(Error * ec, Volume * vp)
        *ec = VSALVAGE;
        return;
     }
-    if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
-       *ec = VSALVAGE;
-       FDH_REALLYCLOSE(fdP);
-       return;
-    }
-    if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)))
+    if (FDH_PWRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)), 0)
        != sizeof(V_disk(vp))) {
        *ec = VSALVAGE;
        FDH_REALLYCLOSE(fdP);
@@ -1464,7 +1988,7 @@ void
 VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
 {
 
-    memset((char *)dh, 0, sizeof(VolumeDiskHeader_t));
+    memset(dh, 0, sizeof(VolumeDiskHeader_t));
     dh->stamp = h->stamp;
     dh->id = h->id;
     dh->parent = h->parent;
@@ -1492,14 +2016,14 @@ VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
  * Converts an on-disk representation of a volume header to
  * the in-memory representation of a volume header.
  *
- * Makes the assumption that AFS has *always* 
+ * Makes the assumption that AFS has *always*
  * zero'd the volume header file so that high parts of inode
  * numbers are 0 in older (SGI EFS) volume header files.
  */
 void
 DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
 {
-    memset((char *)h, 0, sizeof(VolumeHeader_t));
+    memset(h, 0, sizeof(VolumeHeader_t));
     h->stamp = dh->stamp;
     h->id = dh->id;
     h->parent = dh->parent;
@@ -1531,75 +2055,149 @@ DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
 /***************************************************/
 
 #ifdef AFS_DEMAND_ATTACH_FS
-/* pre-attach a volume given its path 
+/**
+ * pre-attach a volume given its path.
  *
- * a pre-attached volume will only have its partition
- * and hashid fields initialized
+ * @param[out] ec         outbound error code
+ * @param[in]  partition  partition path string
+ * @param[in]  name       volume id string
+ *
+ * @return volume object pointer
+ *
+ * @note A pre-attached volume will only have its partition
+ *       and hashid fields initialized.  At first call to
+ *       VGetVolume, the volume will be fully attached.
  *
- * at first call to VGetVolume, the volume will be
- * fully attached
  */
 Volume *
-VPreAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
+VPreAttachVolumeByName(Error * ec, char *partition, char *name)
 {
     Volume * vp;
     VOL_LOCK;
-    vp = VPreAttachVolumeByName_r(ec, partition, name, mode);
+    vp = VPreAttachVolumeByName_r(ec, partition, name);
     VOL_UNLOCK;
     return vp;
 }
 
+/**
+ * pre-attach a volume by name (thin wrapper over VPreAttachVolumeById_r).
+ *
+ * @param[out] ec         outbound error code
+ * @param[in]  partition  path to vice partition
+ * @param[in]  name       volume id string, converted via VolumeNumber()
+ *
+ * @return volume object pointer
+ *
+ * @pre VOL_LOCK held
+ *
+ * @internal volume package internal use only.
+ */
+Volume *
+VPreAttachVolumeByName_r(Error * ec, char *partition, char *name)
+{
+    VolId volumeId = VolumeNumber(name);
+
+    return VPreAttachVolumeById_r(ec, partition, volumeId);
+}
+
+/**
+ * pre-attach a volume given its path and numeric volume id.
+ *
+ * @param[out] ec          error code return
+ * @param[in]  partition   path to vice partition
+ * @param[in]  volumeId    numeric volume id
+ *
+ * @return volume object pointer
+ *
+ * @pre VOL_LOCK held
+ *
+ * @internal volume package internal use only.
+ */
 Volume *
-VPreAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
+VPreAttachVolumeById_r(Error * ec,
+                      char * partition,
+                      VolId volumeId)
 {
-    register Volume *vp = NULL;
-    int fd, n;
-    struct afs_stat status;
-    struct DiskPartition *partp;
-    char path[64];
-    int isbusy = 0;
-    VolId volumeId;
+    Volume *vp;
+    struct DiskPartition64 *partp;
+
     *ec = 0;
 
-    assert(programType == fileServer);
+    osi_Assert(programType == fileServer);
 
     if (!(partp = VGetPartition_r(partition, 0))) {
        *ec = VNOVOL;
-       Log("VPreAttachVolume:  Error getting partition (%s)\n", partition);
+       Log("VPreAttachVolumeById_r:  Error getting partition (%s)\n", partition);
        return NULL;
     }
 
-    volumeId = VolumeNumber(name);
-
     vp = VLookupVolume_r(ec, volumeId, NULL);
     if (*ec) {
        return NULL;
     }
 
-    return VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+    return VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
 }
 
-/* pre-attach a volume given its partition and volume id
+/**
+ * preattach a volume.
+ *
+ * @param[out] ec     outbound error code
+ * @param[in]  partp  pointer to partition object
+ * @param[in]  vp     pointer to volume object
+ * @param[in]  vid    volume id
+ *
+ * @return volume object pointer
  *
- * if vp == NULL, then a new vp is created
- * if vp != NULL, then we assumed it is already on the hash chain
+ * @pre VOL_LOCK is held.
+ *
+ * @warning Returned volume object pointer does not have to
+ *          equal the pointer passed in as argument vp.  There
+ *          are potential race conditions which can result in
+ *          the pointers having different values.  It is up to
+ *          the caller to make sure that references are handled
+ *          properly in this case.
+ *
+ * @note If there is already a volume object registered with
+ *       the same volume id, its pointer MUST be passed as
+ *       argument vp.  Failure to do so will result in a silent
+ *       failure to preattach.
+ *
+ * @internal volume package internal use only.
  */
-Volume * 
-VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp, 
-                      Volume * vp, int vid)
+Volume *
+VPreAttachVolumeByVp_r(Error * ec,
+                      struct DiskPartition64 * partp,
+                      Volume * vp,
+                      VolId vid)
 {
     Volume *nvp = NULL;
 
     *ec = 0;
 
     /* check to see if pre-attach already happened */
-    if (vp && 
-       (V_attachState(vp) != VOL_STATE_UNATTACHED) && 
-       !IsErrorState(V_attachState(vp))) {
+    if (vp &&
+       (V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+       (V_attachState(vp) != VOL_STATE_DELETED) &&
+       (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
+       !VIsErrorState(V_attachState(vp))) {
+       /*
+        * pre-attach is a no-op in all but the following cases:
+        *
+        *   - volume is unattached
+        *   - volume is in an error state
+        *   - volume is pre-attached
+        */
+       Log("VPreattachVolumeByVp_r: volume %u not in quiescent state\n", vid);
        goto done;
     } else if (vp) {
        /* we're re-attaching a volume; clear out some old state */
        memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage));
+
+       if (V_partition(vp) != partp) {
+           /* XXX potential race */
+           DeleteVolumeFromVByPList_r(vp);
+       }
     } else {
        /* if we need to allocate a new Volume struct,
         * go ahead and drop the vol glock, otherwise
@@ -1609,15 +2207,19 @@ VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp,
 
        /* allocate the volume structure */
        vp = nvp = (Volume *) malloc(sizeof(Volume));
-       assert(vp != NULL);
+       osi_Assert(vp != NULL);
        memset(vp, 0, sizeof(Volume));
-       assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
+       queue_Init(&vp->vnode_list);
+       queue_Init(&vp->rx_call_list);
+       CV_INIT(&V_attachCV(vp), "vp attach", CV_DEFAULT, 0);
     }
 
     /* link the volume with its associated vice partition */
     vp->device = partp->device;
     vp->partition = partp;
+
     vp->hashid = vid;
+    vp->specialStatus = 0;
 
     /* if we dropped the lock, reacquire the lock,
      * check for pre-attach races, and then add
@@ -1634,7 +2236,7 @@ VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp,
            vp = nvp;
            goto done;
        } else {
-         /* hack to make up for VChangeState_r() decrementing 
+         /* hack to make up for VChangeState_r() decrementing
           * the old state counter */
          VStats.state_levels[0]++;
        }
@@ -1648,7 +2250,7 @@ VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp,
     VChangeState_r(vp, VOL_STATE_PREATTACHED);
 
     if (LogLevel >= 5)
-       Log("VPreAttachVolumeById_r:  volume %u pre-attached\n", vp->hashid);
+       Log("VPreAttachVolumeByVp_r:  volume %u pre-attached\n", vp->hashid);
 
   done:
     if (*ec)
@@ -1675,21 +2277,19 @@ VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
 Volume *
 VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 {
-    register Volume *vp = NULL, *svp = NULL;
-    int fd, n;
-    struct afs_stat status;
-    struct VolumeDiskHeader diskHeader;
-    struct VolumeHeader iheader;
-    struct DiskPartition *partp;
+    Volume *vp = NULL;
+    struct DiskPartition64 *partp;
     char path[64];
     int isbusy = 0;
     VolId volumeId;
+    int checkedOut;
 #ifdef AFS_DEMAND_ATTACH_FS
     VolumeStats stats_save;
+    Volume *svp = NULL;
 #endif /* AFS_DEMAND_ATTACH_FS */
 
     *ec = 0;
-   
+
     volumeId = VolumeNumber(name);
 
     if (!(partp = VGetPartition_r(partition, 0))) {
@@ -1698,8 +2298,8 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
        goto done;
     }
 
-    if (programType == volumeUtility) {
-       assert(VInit == 3);
+    if (VRequiresPartLock()) {
+       osi_Assert(VInit == 3);
        VLockPartition_r(partition);
     } else if (programType == fileServer) {
 #ifdef AFS_DEMAND_ATTACH_FS
@@ -1726,21 +2326,22 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
            VWaitExclusiveState_r(vp);
 
            /* at this point state must be one of:
-            *   UNATTACHED,
-            *   ATTACHED,
-            *   SHUTTING_DOWN,
-            *   GOING_OFFLINE,
-            *   SALVAGING,
-            *   ERROR
+            *   - UNATTACHED
+            *   - ATTACHED
+            *   - SHUTTING_DOWN
+            *   - GOING_OFFLINE
+            *   - SALVAGING
+            *   - ERROR
+            *   - DELETED
             */
 
            if (vp->specialStatus == VBUSY)
                isbusy = 1;
-           
+
            /* if it's already attached, see if we can return it */
            if (V_attachState(vp) == VOL_STATE_ATTACHED) {
                VGetVolumeByVp_r(ec, vp);
-               if (V_inUse(vp)) {
+               if (V_inUse(vp) == fileServer) {
                    VCancelReservation_r(vp);
                    return vp;
                }
@@ -1762,23 +2363,24 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
        }
 
        /* pre-attach volume if it hasn't been done yet */
-       if (!vp || 
+       if (!vp ||
            (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+           (V_attachState(vp) == VOL_STATE_DELETED) ||
            (V_attachState(vp) == VOL_STATE_ERROR)) {
            svp = vp;
-           vp = VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+           vp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
            if (*ec) {
                return NULL;
            }
        }
 
-       assert(vp != NULL);
+       osi_Assert(vp != NULL);
 
-       /* handle pre-attach races 
+       /* handle pre-attach races
         *
         * multiple threads can race to pre-attach a volume,
         * but we can't let them race beyond that
-        * 
+        *
         * our solution is to let the first thread to bring
         * the volume into an exclusive state win; the other
         * threads just wait until it finishes bringing the
@@ -1810,7 +2412,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 #else /* AFS_DEMAND_ATTACH_FS */
        vp = VGetVolume_r(ec, volumeId);
        if (vp) {
-           if (V_inUse(vp))
+           if (V_inUse(vp) == fileServer)
                return vp;
            if (vp->specialStatus == VBUSY)
                isbusy = 1;
@@ -1828,68 +2430,39 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 
     VOL_UNLOCK;
 
-    strcat(path, "/");
+    strcat(path, OS_DIRSEP);
     strcat(path, name);
-    if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
-       Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
-       if (fd > -1)
-           close(fd);
-       *ec = VNOVOL;
-       VOL_LOCK;
-       goto done;
-    }
-    n = read(fd, &diskHeader, sizeof(diskHeader));
-    close(fd);
-    if (n != sizeof(diskHeader)
-       || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
-       Log("VAttachVolume: Error reading volume header %s\n", path);
-       *ec = VSALVAGE;
-       VOL_LOCK;
-       goto done;
-    }
-    if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
-       Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
-       *ec = VSALVAGE;
-       VOL_LOCK;
-       goto done;
-    }
-
-    DiskToVolumeHeader(&iheader, &diskHeader);
-#ifdef FSSYNC_BUILD_CLIENT
-    if (programType == volumeUtility && mode != V_SECRETLY && mode != V_PEEK) {
-        VOL_LOCK;
-       if (FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_NEEDVOLUME, mode, NULL)
-           != SYNC_OK) {
-           Log("VAttachVolume: attach of volume %u apparently denied by file server\n", iheader.id);
-           *ec = VNOVOL;       /* XXXX */
-           goto done;
-       }
-       VOL_UNLOCK;
-    }
-#endif
 
     if (!vp) {
       vp = (Volume *) calloc(1, sizeof(Volume));
-      assert(vp != NULL);
+      osi_Assert(vp != NULL);
+      vp->hashid = volumeId;
       vp->device = partp->device;
       vp->partition = partp;
+      queue_Init(&vp->vnode_list);
+      queue_Init(&vp->rx_call_list);
 #ifdef AFS_DEMAND_ATTACH_FS
-      assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
+      CV_INIT(&V_attachCV(vp), "vp attach", CV_DEFAULT, 0);
 #endif /* AFS_DEMAND_ATTACH_FS */
     }
 
     /* attach2 is entered without any locks, and returns
      * with vol_glock_mutex held */
-    vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
+    vp = attach2(ec, volumeId, path, partp, vp, isbusy, mode, &checkedOut);
 
-    if (programType == volumeUtility && vp) {
+    if (VCanUseFSSYNC() && vp) {
 #ifdef AFS_DEMAND_ATTACH_FS
+       if ((mode == V_VOLUPD) || (VolumeWriteable(vp) && (mode == V_CLONE))) {
+           /* mark volume header as in use so that volser crashes lead to a
+            * salvage attempt */
+           VUpdateVolume_r(ec, vp, 0);
+       }
        /* for dafs, we should tell the fileserver, except for V_PEEK
          * where we know it is not necessary */
        if (mode == V_PEEK) {
            vp->needsPutBack = 0;
        } else {
-           vp->needsPutBack = 1;
+           vp->needsPutBack = VOL_PUTBACK;
        }
 #else /* !AFS_DEMAND_ATTACH_FS */
        /* duplicate computation in fssync.c about whether the server
@@ -1900,32 +2473,32 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
            || (!VolumeWriteable(vp) && (mode == V_CLONE || mode == V_DUMP)))
            vp->needsPutBack = 0;
        else
-           vp->needsPutBack = 1;
+           vp->needsPutBack = VOL_PUTBACK;
 #endif /* !AFS_DEMAND_ATTACH_FS */
     }
-    /* OK, there's a problem here, but one that I don't know how to
-     * fix right now, and that I don't think should arise often.
-     * Basically, we should only put back this volume to the server if
-     * it was given to us by the server, but since we don't have a vp,
-     * we can't run the VolumeWriteable function to find out as we do
-     * above when computing vp->needsPutBack.  So we send it back, but
-     * there's a path in VAttachVolume on the server which may abort
-     * if this volume doesn't have a header.  Should be pretty rare
-     * for all of that to happen, but if it does, probably the right
-     * fix is for the server to allow the return of readonly volumes
-     * that it doesn't think are really checked out. */
 #ifdef FSSYNC_BUILD_CLIENT
-    if (programType == volumeUtility && vp == NULL &&
-       mode != V_SECRETLY && mode != V_PEEK) {
-       FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_ON, 0, NULL);
-    } else 
+    /* Only give back the vol to the fileserver if we checked it out; attach2
+     * will set checkedOut only if we successfully checked it out from the
+     * fileserver. */
+    if (VCanUseFSSYNC() && vp == NULL && checkedOut) {
+
+#ifdef AFS_DEMAND_ATTACH_FS
+        /* If we couldn't attach but we scheduled a salvage, we already
+         * notified the fileserver; don't online it now */
+        if (*ec != VSALVAGING)
+#endif /* AFS_DEMAND_ATTACH_FS */
+       FSYNC_VolOp(volumeId, partition, FSYNC_VOL_ON, 0, NULL);
+    } else
 #endif
     if (programType == fileServer && vp) {
-       V_needsCallback(vp) = 0;
-#ifdef notdef
-       if (VInit >= 2 && V_BreakVolumeCallbacks) {
-           Log("VAttachVolume: Volume %u was changed externally; breaking callbacks\n", V_id(vp));
-           (*V_BreakVolumeCallbacks) (V_id(vp));
+#ifdef AFS_DEMAND_ATTACH_FS
+       /*
+        * we can get here in cases where we don't "own"
+        * the volume (e.g. volume owned by a utility).
+        * short circuit around potential disk header races.
+        */
+       if (V_attachState(vp) != VOL_STATE_ATTACHED) {
+           goto done;
        }
 #endif
        VUpdateVolume_r(ec, vp, 0);
@@ -1959,15 +2532,16 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
            Log("VOnline:  volume %u (%s) attached and online\n", V_id(vp),
                V_name(vp));
     }
+
   done:
-    if (programType == volumeUtility) {
+    if (VRequiresPartLock()) {
        VUnlockPartition_r(partition);
     }
     if (*ec) {
 #ifdef AFS_DEMAND_ATTACH_FS
-       if (vp) {
-           V_attachState(vp) = VOL_STATE_ERROR;
-           assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0);
+       /* attach failed; make sure we're in error state */
+       if (vp && !VIsErrorState(V_attachState(vp))) {
+           VChangeState_r(vp, VOL_STATE_ERROR);
        }
 #endif /* AFS_DEMAND_ATTACH_FS */
        return NULL;
@@ -1987,21 +2561,19 @@ static Volume *
 VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
 {
     char name[VMAXPATHLEN];
-    int fd, n, reserve = 0;
-    struct afs_stat status;
-    struct VolumeDiskHeader diskHeader;
-    struct VolumeHeader iheader;
-    struct DiskPartition *partp;
+    int reserve = 0;
+    struct DiskPartition64 *partp;
     char path[64];
     int isbusy = 0;
     VolId volumeId;
-    Volume * nvp;
+    Volume * nvp = NULL;
     VolumeStats stats_save;
+    int checkedOut;
     *ec = 0;
 
     /* volume utility should never call AttachByVp */
-    assert(programType == fileServer);
-   
+    osi_Assert(programType == fileServer);
+
     volumeId = vp->hashid;
     partp = vp->partition;
     VolumeExternalName_r(volumeId, name, sizeof(name));
@@ -2015,7 +2587,7 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
     /* if it's already attached, see if we can return it */
     if (V_attachState(vp) == VOL_STATE_ATTACHED) {
        VGetVolumeByVp_r(ec, vp);
-       if (V_inUse(vp)) {
+       if (V_inUse(vp) == fileServer) {
            return vp;
        } else {
            if (vp->specialStatus == VBUSY)
@@ -2029,10 +2601,11 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
     }
 
     /* pre-attach volume if it hasn't been done yet */
-    if (!vp || 
+    if (!vp ||
        (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+       (V_attachState(vp) == VOL_STATE_DELETED) ||
        (V_attachState(vp) == VOL_STATE_ERROR)) {
-       nvp = VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+       nvp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
        if (*ec) {
            return NULL;
        }
@@ -2042,8 +2615,8 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
            vp = nvp;
        }
     }
-    
-    assert(vp != NULL);
+
+    osi_Assert(vp != NULL);
     VChangeState_r(vp, VOL_STATE_ATTACHING);
 
     /* restore monotonically increasing stats */
@@ -2051,54 +2624,32 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
 
     *ec = 0;
 
-
-    /* compute path to disk header, 
-     * read in header, 
-     * and verify magic and version stamps */
+    /* compute path to disk header */
     strcpy(path, VPartitionPath(partp));
 
     VOL_UNLOCK;
 
-    strcat(path, "/");
+    strcat(path, OS_DIRSEP);
     strcat(path, name);
-    if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
-       Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
-       if (fd > -1)
-           close(fd);
-       *ec = VNOVOL;
-       VOL_LOCK;
-       goto done;
-    }
-    n = read(fd, &diskHeader, sizeof(diskHeader));
-    close(fd);
-    if (n != sizeof(diskHeader)
-       || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
-       Log("VAttachVolume: Error reading volume header %s\n", path);
-       *ec = VSALVAGE;
-       VOL_LOCK;
-       goto done;
-    }
-    if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
-       Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
-       *ec = VSALVAGE;
-       VOL_LOCK;
-       goto done;
-    }
-
-    /* convert on-disk header format to in-memory header format */
-    DiskToVolumeHeader(&iheader, &diskHeader);
 
     /* do volume attach
      *
      * NOTE: attach2 is entered without any locks, and returns
      * with vol_glock_mutex held */
-    vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
+    vp = attach2(ec, volumeId, path, partp, vp, isbusy, mode, &checkedOut);
 
-    if (*ec || vp == NULL) {
+    /*
+     * in the event that an error was encountered, or
+     * the volume was not brought to an attached state
+     * for any reason, skip to the end.  We cannot
+     * safely call VUpdateVolume unless we "own" it.
+     */
+    if (*ec ||
+       (vp == NULL) ||
+       (V_attachState(vp) != VOL_STATE_ATTACHED)) {
        goto done;
     }
 
-    V_needsCallback(vp) = 0;
     VUpdateVolume_r(ec, vp, 0);
     if (*ec) {
        Log("VAttachVolume: Error updating volume %u\n", vp->hashid);
@@ -2134,7 +2685,7 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
        reserve = 0;
     }
     if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) {
-       if (vp && !IsErrorState(V_attachState(vp))) {
+       if (vp && !VIsErrorState(V_attachState(vp))) {
            VChangeState_r(vp, VOL_STATE_ERROR);
        }
        return NULL;
@@ -2142,42 +2693,208 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
        return vp;
     }
 }
+
+/**
+ * lock a volume on disk (non-blocking).
+ *
+ * @param[in] vp  The volume to lock
+ * @param[in] locktype READ_LOCK or WRITE_LOCK
+ *
+ * @return operation status
+ *  @retval 0 success, lock was obtained
+ *  @retval EBUSY a conflicting lock was held by another process
+ *  @retval EIO   error acquiring lock
+ *
+ * @pre If we're in the fileserver, vp is in an exclusive state
+ *
+ * @pre vp is not already locked
+ *
+ * @post on success VOL_LOCKED is set in V_attachFlags(vp); on failure the
+ *       attach flags are left unchanged
+ *
+ * @note the lock itself is taken by VLockVolumeByIdNB() using vp->hashid and
+ *       vp->partition; its return code is passed back unmodified
+ */
+static int
+VLockVolumeNB(Volume *vp, int locktype)
+{
+    int code;
+
+    osi_Assert(programType != fileServer || VIsExclusiveState(V_attachState(vp)));
+    osi_Assert(!(V_attachFlags(vp) & VOL_LOCKED));
+
+    code = VLockVolumeByIdNB(vp->hashid, vp->partition, locktype);
+    if (code == 0) {    /* lock obtained: record it so VUnlockVolume's assert holds */
+       V_attachFlags(vp) |= VOL_LOCKED;
+    }
+
+    return code;
+}
+
+/**
+ * unlock a volume on disk that was locked with VLockVolumeNB.
+ *
+ * @param[in] vp  volume to unlock
+ *
+ * @pre If we're in the fileserver, vp is in an exclusive state
+ *
+ * @pre vp has already been locked
+ *
+ * @post VOL_LOCKED is cleared in V_attachFlags(vp)
+ *
+ * @note the unlock itself is done by VUnlockVolumeById() using vp->hashid
+ *       and vp->partition; both preconditions are enforced with osi_Assert
+ */
+static void
+VUnlockVolume(Volume *vp)
+{
+    osi_Assert(programType != fileServer || VIsExclusiveState(V_attachState(vp)));
+    osi_Assert((V_attachFlags(vp) & VOL_LOCKED));
+
+    VUnlockVolumeById(vp->hashid, vp->partition);
+
+    V_attachFlags(vp) &= ~VOL_LOCKED;   /* forget the lock we just released */
+}
 #endif /* AFS_DEMAND_ATTACH_FS */
 
-/*
- * called without any locks held
- * returns with vol_glock_mutex held
- */
-private Volume * 
-attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * header,
-       struct DiskPartition * partp, register Volume * vp, int isbusy, int mode)
-{
-    vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
-    IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent,
-           header->largeVnodeIndex);
-    IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header->parent,
-           header->smallVnodeIndex);
-    IH_INIT(vp->diskDataHandle, partp->device, header->parent,
-           header->volumeInfo);
-    IH_INIT(vp->linkHandle, partp->device, header->parent, header->linkTable);
-    vp->shuttingDown = 0;
-    vp->goingOffline = 0;
-    vp->nUsers = 1;
+/**
+ * read in a vol header, possibly lock the vol header, and possibly check out
+ * the vol header from the fileserver, as part of volume attachment.
+ *
+ * @param[out] ec     error code
+ * @param[in] vp      volume pointer object
+ * @param[in] partp   disk partition object of the attaching partition
+ * @param[in] mode    attachment mode such as V_VOLUPD, V_DUMP, etc (see
+ *                    volume.h)
+ * @param[in] peek    1 to just try to read in the volume header and make sure
+ *                    we don't try to lock the vol, or check it out from
+ *                    FSSYNC or anything like that; 0 otherwise, for 'normal'
+ *                    operation
+ * @param[out] acheckedOut   If we successfully checked-out the volume from
+ *                           the fileserver (if we needed to), this is set
+ *                           to 1, otherwise it is untouched.
+ *
+ * @note As part of DAFS volume attachment, the volume header may be either
+ *       read- or write-locked to ensure mutual exclusion of certain volume
+ *       operations. In some cases in order to determine whether we need to
+ *       read- or write-lock the header, we need to read in the header to see
+ *       if the volume is RW or not. So, if we read in the header under a
+ *       read-lock and determine that we actually need a write-lock on the
+ *       volume header, this function will drop the read lock, acquire a write
+ *       lock, and read the header in again.
+ */
+static void
+attach_volume_header(Error *ec, Volume *vp, struct DiskPartition64 *partp,
+                     int mode, int peek, int *acheckedOut)
+{
+    struct VolumeDiskHeader diskHeader;
+    struct VolumeHeader header;
+    int code;
+    int first_try = 1;
+    int lock_tries = 0, checkout_tries = 0;
+    int retry;
+    VolumeId volid = vp->hashid;
+#ifdef FSSYNC_BUILD_CLIENT
+    int checkout, done_checkout = 0;
+#endif /* FSSYNC_BUILD_CLIENT */
 #ifdef AFS_DEMAND_ATTACH_FS
-    vp->stats.last_attach = FT_ApproxTime();
-    vp->stats.attaches++;
+    int locktype = 0, use_locktype = -1;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ retry:
+    retry = 0;
+    *ec = 0;
+
+    if (lock_tries > VOL_MAX_CHECKOUT_RETRIES) {
+       Log("VAttachVolume: retried too many times trying to lock header for "
+           "vol %lu part %s; giving up\n", afs_printable_uint32_lu(volid),
+           VPartitionPath(partp));
+       *ec = VNOVOL;
+       goto done;
+    }
+    if (checkout_tries > VOL_MAX_CHECKOUT_RETRIES) {
+       Log("VAttachVolume: retried too many times trying to checkout "
+           "vol %lu part %s; giving up\n", afs_printable_uint32_lu(volid),
+           VPartitionPath(partp));
+       *ec = VNOVOL;
+       goto done;
+    }
+
+    if (VReadVolumeDiskHeader(volid, partp, NULL)) {
+       /* short-circuit the 'volume does not exist' case */
+       *ec = VNOVOL;
+       goto done;
+    }
+
+#ifdef FSSYNC_BUILD_CLIENT
+    checkout = !done_checkout;
+    done_checkout = 1;
+    if (!peek && checkout && VMustCheckoutVolume(mode)) {
+        SYNC_response res;
+        memset(&res, 0, sizeof(res));
+
+       if (FSYNC_VolOp(volid, VPartitionPath(partp), FSYNC_VOL_NEEDVOLUME, mode, &res)
+           != SYNC_OK) {
+
+            if (res.hdr.reason == FSYNC_SALVAGE) {
+                Log("VAttachVolume: file server says volume %lu is salvaging\n",
+                     afs_printable_uint32_lu(volid));
+                *ec = VSALVAGING;
+            } else {
+               Log("VAttachVolume: attach of volume %lu apparently denied by file server\n",
+                     afs_printable_uint32_lu(volid));
+               *ec = VNOVOL;   /* XXXX */
+            }
+           goto done;
+       }
+       *acheckedOut = 1;
+    }
 #endif
 
-    VOL_LOCK;
 #ifdef AFS_DEMAND_ATTACH_FS
-    IncUInt64(&VStats.attaches);
-#endif
-    vp->cacheCheck = ++VolumeCacheCheck;
-    /* just in case this ever rolls over */
-    if (!vp->cacheCheck)
-       vp->cacheCheck = ++VolumeCacheCheck;
-    GetVolumeHeader(vp);
-    VOL_UNLOCK;
+    if (use_locktype < 0) {
+       /* don't know whether vol is RO or RW; assume it's RO and we can retry
+        * if it turns out to be RW */
+       locktype = VVolLockType(mode, 0);
+
+    } else {
+       /* a previous try says we should use use_locktype to lock the volume,
+        * so use that */
+       locktype = use_locktype;
+    }
+
+    if (!peek && locktype) {
+       code = VLockVolumeNB(vp, locktype);
+       if (code) {
+           if (code == EBUSY) {
+               Log("VAttachVolume: another program has vol %lu locked\n",
+                   afs_printable_uint32_lu(volid));
+           } else {
+               Log("VAttachVolume: error %d trying to lock vol %lu\n",
+                   code, afs_printable_uint32_lu(volid));
+           }
+
+           *ec = VNOVOL;
+           goto done;
+       }
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    code = VReadVolumeDiskHeader(volid, partp, &diskHeader);
+    if (code) {
+       if (code == EIO) {
+           *ec = VSALVAGE;
+       } else {
+           *ec = VNOVOL;
+       }
+       goto done;
+    }
+
+    DiskToVolumeHeader(&header, &diskHeader);
+
+    IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header.parent,
+           header.largeVnodeIndex);
+    IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header.parent,
+           header.smallVnodeIndex);
+    IH_INIT(vp->diskDataHandle, partp->device, header.parent,
+           header.volumeInfo);
+    IH_INIT(vp->linkHandle, partp->device, header.parent, header.linkTable);
+
+    if (first_try) {
+       /* only need to do this once */
+       VOL_LOCK;
+       GetVolumeHeader(vp);
+       VOL_UNLOCK;
+    }
 
 #if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
     /* demand attach changes the V_PEEK mechanism
@@ -2189,13 +2906,13 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
      *  to demand attach fileservers.  However, I'm trying
      *  to limit the number of common code changes)
      */
-    if (programType != fileServer && mode == V_PEEK) {
+    if (VCanUseFSSYNC() && (mode == V_PEEK || peek)) {
        SYNC_response res;
        res.payload.len = sizeof(VolumeDiskData);
        res.payload.buf = &vp->header->diskstuff;
 
-       if (FSYNC_VolOp(volumeId,
-                       VPartitionPath(partp),
+       if (FSYNC_VolOp(vp->hashid,
+                       partp->name,
                        FSYNC_VOL_QUERY_HDR,
                        FSYNC_WHATEVER,
                        &res) == SYNC_OK) {
@@ -2213,51 +2930,265 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
     IncUInt64(&vp->stats.hdr_loads);
     VOL_UNLOCK;
 #endif /* AFS_DEMAND_ATTACH_FS */
-    
+
     if (*ec) {
-       Log("VAttachVolume: Error reading diskDataHandle vol header %s; error=%u\n", path, *ec);
+       Log("VAttachVolume: Error reading diskDataHandle header for vol %lu; "
+           "error=%u\n", afs_printable_uint32_lu(volid), *ec);
+       goto done;
     }
 
- disk_header_loaded:
-
 #ifdef AFS_DEMAND_ATTACH_FS
-    if (!*ec) {
-
-       /* check for pending volume operations */
-       if (vp->pending_vol_op) {
-           /* see if the pending volume op requires exclusive access */
-           if (!VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
-               /* mark the volume down */
-               *ec = VOFFLINE;
-               VChangeState_r(vp, VOL_STATE_UNATTACHED);
-               if (V_offlineMessage(vp)[0] == '\0')
-                   strlcpy(V_offlineMessage(vp),
-                           "A volume utility is running.", 
-                           sizeof(V_offlineMessage(vp)));
-               V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
-
-               /* check to see if we should set the specialStatus flag */
-               if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) {
-                   vp->specialStatus = VBUSY;
-               }
-           }
-       }
+# ifdef FSSYNC_BUILD_CLIENT
+ disk_header_loaded:
+# endif /* FSSYNC_BUILD_CLIENT */
 
-       V_attachFlags(vp) |= VOL_HDR_LOADED;
+    /* if the lock type we actually used to lock the volume is different than
+     * the lock type we should have used, retry with the lock type we should
+     * use */
+    use_locktype = VVolLockType(mode, VolumeWriteable(vp));
+    if (locktype != use_locktype) {
+       retry = 1;
+       lock_tries++;
     }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
-    if (!*ec) {
-       struct IndexFileHeader iHead;
+    *ec = 0;
 
-#if OPENAFS_VOL_STATS
-       /*
-        * We just read in the diskstuff part of the header.  If the detailed
+ done:
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
+    if (!peek && *ec == 0 && retry == 0 && VMustCheckoutVolume(mode)) {
+
+       code = FSYNC_VerifyCheckout(volid, VPartitionPath(partp), FSYNC_VOL_NEEDVOLUME, mode);
+
+       if (code == SYNC_DENIED) {
+           /* must retry checkout; fileserver no longer thinks we have
+            * the volume */
+           retry = 1;
+           checkout_tries++;
+           done_checkout = 0;
+
+       } else if (code != SYNC_OK) {
+           *ec = VNOVOL;
+       }
+    }
+#endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */
+
+    if (*ec || retry) {
+       /* either we are about to jump back to 'retry' for another pass, or we
+        * encountered an error; clean up in either case */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       if ((V_attachFlags(vp) & VOL_LOCKED)) {
+           VUnlockVolume(vp);
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
+       if (vp->linkHandle) {
+           IH_RELEASE(vp->vnodeIndex[vLarge].handle);
+           IH_RELEASE(vp->vnodeIndex[vSmall].handle);
+           IH_RELEASE(vp->diskDataHandle);
+           IH_RELEASE(vp->linkHandle);
+       }
+    }
+
+    if (*ec) {
+       return;
+    }
+    if (retry) {
+       first_try = 0;
+       goto retry;
+    }
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+static void
+attach_check_vop(Error *ec, VolumeId volid, struct DiskPartition64 *partp,
+                 Volume *vp, int *acheckedOut)
+{   /*
+     * Examine vp->pending_vol_op (if any) and decide whether the volume may
+     * stay online while that volume operation runs.
+     *
+     * ec          - out: cleared to 0 on entry; set to VOFFLINE when the
+     *               pending op resolves to FSSYNC_VolOpRunningOffline, or to
+     *               whatever attach_volume_header() reports if the header
+     *               peek fails
+     * volid       - not referenced in this function body
+     * partp       - partition, passed through to attach_volume_header()
+     * vp          - volume being attached
+     * acheckedOut - passed through to attach_volume_header()
+     *
+     * Locking: VOL_LOCK is acquired and released in here; every return path
+     * in this function leaves it released.  NOTE(review): presumably the
+     * caller must not hold VOL_LOCK on entry -- confirm.
+     */
+    *ec = 0;
+
+    if (vp->pending_vol_op) {
+
+       VOL_LOCK;
+
+       if (vp->pending_vol_op->vol_op_state == FSSYNC_VolOpRunningUnknown) {
+           int code;   /* 1: leave online, 0: take offline, other: need header */
+           code = VVolOpLeaveOnlineNoHeader_r(vp, vp->pending_vol_op);
+           if (code == 1) {
+               vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOnline;
+           } else if (code == 0) {
+               vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOffline;
+
+           } else {
+               /* we need the vol header to determine if the volume can be
+                * left online for the vop, so... get the header */
+
+               VOL_UNLOCK;
+
+               /* attach header with peek=1 to avoid checking out the volume
+                * or locking it; we just want the header info, we're not
+                * messing with the volume itself at all */
+               attach_volume_header(ec, vp, partp, V_PEEK, 1, acheckedOut);
+               if (*ec) {
+                   return;     /* VOL_LOCK was already dropped above */
+               }
+
+               VOL_LOCK;
+
+               if (VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+                   vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOnline;
+               } else {
+                   vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOffline;
+               }
+
+               /* make sure we grab a new vol header and re-open stuff on
+                * actual attachment; we can't keep the data we grabbed, since
+                * it was not done under a lock and thus not safe */
+               FreeVolumeHeader(vp);
+               VReleaseVolumeHandles_r(vp);
+           }
+       }
+       /* see if the pending volume op requires exclusive access */
+       switch (vp->pending_vol_op->vol_op_state) {
+       case FSSYNC_VolOpPending:
+           /* this should never happen */
+           osi_Assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpPending);
+           break;
+
+       case FSSYNC_VolOpRunningUnknown:
+           /* this should never happen; we resolved 'unknown' above */
+           osi_Assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown);
+           break;
+
+       case FSSYNC_VolOpRunningOffline:
+           /* mark the volume down */
+           *ec = VOFFLINE;
+           VChangeState_r(vp, VOL_STATE_UNATTACHED);
+
+           /* do not set V_offlineMessage here; we don't have ownership of
+            * the volume (and probably do not have the header loaded), so we
+            * can't alter the disk header */
+
+           /* check to see if we should set the specialStatus flag */
+           if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) {
+               vp->specialStatus = VBUSY;
+           }
+           break;
+
+       default:        /* e.g. FSSYNC_VolOpRunningOnline: no action taken here */
+           break;
+       }
+
+       VOL_UNLOCK;
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/**
+ * volume attachment helper function.
+ *
+ * @param[out] ec      error code
+ * @param[in] volumeId volume ID of the attaching volume
+ * @param[in] path     full path to the volume header .vol file
+ * @param[in] partp    disk partition object for the attaching partition
+ * @param[in] vp       volume object; vp->hashid, vp->device, vp->partition,
+ *                     vp->vnode_list, vp->rx_call_list, and V_attachCV (for
+ *                     DAFS) should already be initialized
+ * @param[in] isbusy   1 if vp->specialStatus should be set to VBUSY; that is,
+ *                     if there is a volume operation running for this volume
+ *                     that should set the volume to VBUSY during its run. 0
+ *                     otherwise. (see VVolOpSetVBusy_r)
+ * @param[in] mode     attachment mode such as V_VOLUPD, V_DUMP, etc (see
+ *                     volume.h)
+ * @param[out] acheckedOut   If we successfully checked-out the volume from
+ *                           the fileserver (if we needed to), this is set
+ *                           to 1, otherwise it is 0.
+ *
+ * @return pointer to the semi-attached volume object
+ *  @retval NULL an error occurred (check value of *ec)
+ *  @retval vp volume successfully attaching
+ *
+ * @pre no locks held
+ *
+ * @post VOL_LOCK held
+ */
+static Volume *
+attach2(Error * ec, VolId volumeId, char *path, struct DiskPartition64 *partp,
+        Volume * vp, int isbusy, int mode, int *acheckedOut)
+{
+    /* have we read in the header successfully? */
+    int read_header = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* should we FreeVolume(vp) instead of VCheckFree(vp) in the error
+     * cleanup? */
+    int forcefree = 0;
+
+    /* in the case of an error, to what state should the volume be
+     * transitioned? */
+    VolState error_state = VOL_STATE_ERROR;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    *ec = 0;
+
+    vp->vnodeIndex[vLarge].handle = NULL;
+    vp->vnodeIndex[vSmall].handle = NULL;
+    vp->diskDataHandle = NULL;
+    vp->linkHandle = NULL;
+
+    *acheckedOut = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    attach_check_vop(ec, volumeId, partp, vp, acheckedOut);
+    if (!*ec) {
+       attach_volume_header(ec, vp, partp, mode, 0, acheckedOut);
+    }
+#else
+    attach_volume_header(ec, vp, partp, mode, 0, acheckedOut);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+    if (*ec == VNOVOL) {
+       /* if the volume doesn't exist, skip straight to 'error' so we don't
+        * request a salvage */
+       goto unlocked_error;
+    }
+
+    if (!*ec) {
+       read_header = 1;
+
+       vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
+       vp->shuttingDown = 0;
+       vp->goingOffline = 0;
+       vp->nUsers = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+       vp->stats.last_attach = FT_ApproxTime();
+       vp->stats.attaches++;
+#endif
+
+       VOL_LOCK;
+       IncUInt64(&VStats.attaches);
+       vp->cacheCheck = ++VolumeCacheCheck;
+       /* just in case this ever rolls over */
+       if (!vp->cacheCheck)
+           vp->cacheCheck = ++VolumeCacheCheck;
+       VOL_UNLOCK;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       V_attachFlags(vp) |= VOL_HDR_LOADED;
+       vp->stats.last_hdr_load = vp->stats.last_attach;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+
+    if (!*ec) {
+       struct IndexFileHeader iHead;
+
+#if OPENAFS_VOL_STATS
+       /*
+        * We just read in the diskstuff part of the header.  If the detailed
         * volume stats area has not yet been initialized, we should bzero the
         * area and mark it as initialized.
         */
        if (!(V_stat_initialized(vp))) {
-           memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
+           memset((V_stat_area(vp)), 0, VOL_STATS_BYTES);
            V_stat_initialized(vp) = 1;
        }
 #endif /* OPENAFS_VOL_STATS */
@@ -2299,27 +3230,22 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
 #if defined(AFS_DEMAND_ATTACH_FS)
     if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) {
         VOL_LOCK;
-       if (programType == fileServer) {
-           VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
-           vp->nUsers = 0;
-           *ec = VSALVAGING;
-       } else {
+       if (!VCanScheduleSalvage()) {
            Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
-           FreeVolume(vp);
-           *ec = VSALVAGE;
        }
-       return NULL;
+       VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+                                                 VOL_SALVAGE_NO_OFFLINE);
+       vp->nUsers = 0;
+
+       goto locked_error;
     } else if (*ec) {
        /* volume operation in progress */
-       VOL_LOCK;
-       return NULL;
+       goto unlocked_error;
     }
 #else /* AFS_DEMAND_ATTACH_FS */
     if (*ec) {
        Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
-        VOL_LOCK;
-       FreeVolume(vp);
-       return NULL;
+       goto unlocked_error;
     }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
@@ -2328,58 +3254,66 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
            vp->specialStatus = 0;
         VOL_LOCK;
 #if defined(AFS_DEMAND_ATTACH_FS)
-       if (programType == fileServer) {
-           VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
-           vp->nUsers = 0;
-           *ec = VSALVAGING;
-       } else {
+       if (!VCanScheduleSalvage()) {
            Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
-           FreeVolume(vp);
-           *ec = VSALVAGE;
        }
+       VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER |
+                                                  VOL_SALVAGE_NO_OFFLINE);
+       vp->nUsers = 0;
+
 #else /* AFS_DEMAND_ATTACH_FS */
-       FreeVolume(vp);
        *ec = VSALVAGE;
 #endif /* AFS_DEMAND_ATTACH_FS */
-       return NULL;
+
+       goto locked_error;
     }
 
     VOL_LOCK;
-    if (programType == fileServer) {
-#ifndef FAST_RESTART
-       if (V_inUse(vp) && VolumeWriteable(vp)) {
-           if (!V_needsSalvaged(vp)) {
-               V_needsSalvaged(vp) = 1;
-               VUpdateVolume_r(ec, vp, 0);
-           }
+    vp->nextVnodeUnique = V_uniquifier(vp);
+
+    if (VShouldCheckInUse(mode) && V_inUse(vp) && VolumeWriteable(vp)) {
+       if (!V_needsSalvaged(vp)) {
+           V_needsSalvaged(vp) = 1;
+           VUpdateVolume_r(ec, vp, 0);
+       }
 #if defined(AFS_DEMAND_ATTACH_FS)
-           VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
-           vp->nUsers = 0;
-           *ec = VSALVAGING;
-#else /* AFS_DEMAND_ATTACH_FS */
+       if (!VCanScheduleSalvage()) {
            Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
-           FreeVolume(vp);
-           *ec = VSALVAGE;
-#endif /* AFS_DEMAND_ATTACH_FS */
-           return NULL;
        }
-#endif /* FAST_RESTART */
+       VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER |
+                                                  VOL_SALVAGE_NO_OFFLINE);
+       vp->nUsers = 0;
+
+#else /* AFS_DEMAND_ATTACH_FS */
+       Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
+       *ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+       goto locked_error;
+    }
+
+    if (programType == fileServer && V_destroyMe(vp) == DESTROY_ME) {
+       /* Only check destroyMe if we are the fileserver, since the
+        * volserver et al sometimes need to work with volumes with
+        * destroyMe set. Examples are 'temporary' volumes the
+        * volserver creates, and when we create a volume (destroyMe
+        * is set on creation; sometimes a separate volserver
+        * transaction is created to clear destroyMe).
+        */
 
-       if (V_destroyMe(vp) == DESTROY_ME) {
 #if defined(AFS_DEMAND_ATTACH_FS)
-           /* schedule a salvage so the volume goes away on disk */
-           VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
-           VChangeState_r(vp, VOL_STATE_ERROR);
-           vp->nUsers = 0;
+       /* schedule a salvage so the volume goes away on disk */
+       VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+                                                 VOL_SALVAGE_NO_OFFLINE);
+       VChangeState_r(vp, VOL_STATE_ERROR);
+       vp->nUsers = 0;
+       forcefree = 1;
 #endif /* AFS_DEMAND_ATTACH_FS */
-           FreeVolume(vp);
-           Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
-           *ec = VNOVOL;
-           return NULL;
-       }
+       Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
+       *ec = VNOVOL;
+       goto locked_error;
     }
 
-    vp->nextVnodeUnique = V_uniquifier(vp);
     vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL;
 #ifndef BITMAP_LATER
     if (programType == fileServer && VolumeWriteable(vp)) {
@@ -2388,132 +3322,495 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
            VGetBitmap_r(ec, vp, i);
            if (*ec) {
 #ifdef AFS_DEMAND_ATTACH_FS
-               VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+               VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+                                                         VOL_SALVAGE_NO_OFFLINE);
                vp->nUsers = 0;
-               *ec = VSALVAGING;
-#else /* AFS_DEMAND_ATTACH_FS */
-               FreeVolume(vp);
 #endif /* AFS_DEMAND_ATTACH_FS */
                Log("VAttachVolume: error getting bitmap for volume (%s)\n",
                    path);
-               return NULL;
+               goto locked_error;
            }
        }
     }
 #endif /* BITMAP_LATER */
 
+    if (VInit >= 2 && V_needsCallback(vp)) {
+       if (V_BreakVolumeCallbacks) {
+           Log("VAttachVolume: Volume %lu was changed externally; breaking callbacks\n",
+               afs_printable_uint32_lu(V_id(vp)));
+           V_needsCallback(vp) = 0;
+           VOL_UNLOCK;
+           (*V_BreakVolumeCallbacks) (V_id(vp));
+           VOL_LOCK;
+
+           VUpdateVolume_r(ec, vp, 0);
+       }
+#ifdef FSSYNC_BUILD_CLIENT
+       else if (VCanUseFSSYNC()) {
+           afs_int32 fsync_code;
+
+           V_needsCallback(vp) = 0;
+           VOL_UNLOCK;
+           fsync_code = FSYNC_VolOp(V_id(vp), NULL, FSYNC_VOL_BREAKCBKS, FSYNC_WHATEVER, NULL);
+           VOL_LOCK;
+
+           if (fsync_code) {
+               V_needsCallback(vp) = 1;
+               Log("Error trying to tell the fileserver to break callbacks for "
+                   "changed volume %lu; error code %ld\n",
+                   afs_printable_uint32_lu(V_id(vp)),
+                   afs_printable_int32_ld(fsync_code));
+           } else {
+               VUpdateVolume_r(ec, vp, 0);
+           }
+       }
+#endif /* FSSYNC_BUILD_CLIENT */
+
+       if (*ec) {
+           Log("VAttachVolume: error %d clearing needsCallback on volume "
+               "%lu; needs salvage\n", (int)*ec,
+               afs_printable_uint32_lu(V_id(vp)));
+#ifdef AFS_DEMAND_ATTACH_FS
+           VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+                                                     VOL_SALVAGE_NO_OFFLINE);
+           vp->nUsers = 0;
+#else /* !AFS_DEMAND_ATTACH_FS */
+           *ec = VSALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
+           goto locked_error;
+       }
+    }
+
     if (programType == fileServer) {
        if (vp->specialStatus)
            vp->specialStatus = 0;
        if (V_blessed(vp) && V_inService(vp) && !V_needsSalvaged(vp)) {
-           V_inUse(vp) = 1;
+           V_inUse(vp) = fileServer;
            V_offlineMessage(vp)[0] = '\0';
        }
+       if (!V_inUse(vp)) {
+           *ec = VNOVOL;
+#ifdef AFS_DEMAND_ATTACH_FS
+           /* Put the vol into PREATTACHED state, so if someone tries to
+            * access it again, we try to attach, see that we're not blessed,
+            * and give a VNOVOL error again. Putting it into UNATTACHED state
+            * would result in a VOFFLINE error instead. */
+           error_state = VOL_STATE_PREATTACHED;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+           /* mimic e.g. GetVolume errors */
+           if (!V_blessed(vp)) {
+               Log("Volume %lu offline: not blessed\n", afs_printable_uint32_lu(V_id(vp)));
+               FreeVolumeHeader(vp);
+           } else if (!V_inService(vp)) {
+               Log("Volume %lu offline: not in service\n", afs_printable_uint32_lu(V_id(vp)));
+               FreeVolumeHeader(vp);
+           } else {
+               Log("Volume %lu offline: needs salvage\n", afs_printable_uint32_lu(V_id(vp)));
+               *ec = VSALVAGE;
+#ifdef AFS_DEMAND_ATTACH_FS
+               error_state = VOL_STATE_ERROR;
+               /* see if we can recover */
+               VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
+#endif
+           }
+#ifdef AFS_DEMAND_ATTACH_FS
+           vp->nUsers = 0;
+#endif
+           goto locked_error;
+       }
+    } else {
+#ifdef AFS_DEMAND_ATTACH_FS
+       if ((mode != V_PEEK) && (mode != V_SECRETLY))
+           V_inUse(vp) = programType;
+#endif /* AFS_DEMAND_ATTACH_FS */
+       V_checkoutMode(vp) = mode;
     }
 
     AddVolumeToHashTable(vp, V_id(vp));
 #ifdef AFS_DEMAND_ATTACH_FS
-    AddVolumeToVByPList_r(vp);
-    VLRU_Add_r(vp);
-    VChangeState_r(vp, VOL_STATE_ATTACHED);
+    if (VCanUnlockAttached() && (V_attachFlags(vp) & VOL_LOCKED)) {
+       VUnlockVolume(vp);
+    }
+    if ((programType != fileServer) ||
+       (V_inUse(vp) == fileServer)) {
+       AddVolumeToVByPList_r(vp);
+       VLRU_Add_r(vp);
+       VChangeState_r(vp, VOL_STATE_ATTACHED);
+    } else {
+       VChangeState_r(vp, VOL_STATE_UNATTACHED);
+    }
 #endif
+
     return vp;
+
+unlocked_error:
+    VOL_LOCK;
+locked_error:
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (!VIsErrorState(V_attachState(vp))) {
+       VChangeState_r(vp, error_state);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    if (read_header) {
+       VReleaseVolumeHandles_r(vp);
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VCheckSalvage(vp);
+    if (forcefree) {
+       FreeVolume(vp);
+    } else {
+       VCheckFree(vp);
+    }
+#else /* !AFS_DEMAND_ATTACH_FS */
+    FreeVolume(vp);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+    return NULL;
 }
 
 /* Attach an existing volume.
    The volume also normally goes online at this time.
    An offline volume must be reattached to make it go online.
  */
-
-Volume *
-VAttachVolume(Error * ec, VolumeId volumeId, int mode)
+
+Volume *
+VAttachVolume(Error * ec, VolumeId volumeId, int mode)
+{
+    Volume *retVal;
+    VOL_LOCK;
+    retVal = VAttachVolume_r(ec, volumeId, mode);
+    VOL_UNLOCK;
+    return retVal;
+}
+
+Volume *
+VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
+{
+    char *part, *name;
+    VGetVolumePath(ec, volumeId, &part, &name);
+    if (*ec) {
+       Volume *vp;
+       Error error;
+       vp = VGetVolume_r(&error, volumeId);
+       if (vp) {
+           osi_Assert(V_inUse(vp) == 0);
+           VDetachVolume_r(ec, vp);
+       }
+       return NULL;
+    }
+    return VAttachVolumeByName_r(ec, part, name, mode);
+}
+
+/* Increment a reference count to a volume, sans context swaps.  Requires
+ * possibly reading the volume header in from the disk, since there's
+ * an invariant in the volume package that nUsers>0 ==> vp->header is valid.
+ *
+ * N.B. This call can fail if we can't read in the header!!  In this case
+ * we still guarantee we won't context swap, but the ref count won't be
+ * incremented (otherwise we'd violate the invariant).
+ */
+/* NOTE: with the demand attach fileserver extensions, the global lock
+ * is dropped within VHold */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VHold_r(Volume * vp)
+{
+    Error error;
+
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
+
+    LoadVolumeHeader(&error, vp);
+    if (error) {
+       VCancelReservation_r(vp);
+       return error;
+    }
+    vp->nUsers++;
+    VCancelReservation_r(vp);
+    return 0;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VHold_r(Volume * vp)
+{
+    Error error;
+
+    LoadVolumeHeader(&error, vp);
+    if (error)
+       return error;
+    vp->nUsers++;
+    return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/**** volume timeout-related stuff ****/
+
+#ifdef AFS_PTHREAD_ENV
+
+static struct timespec *shutdown_timeout;
+static pthread_once_t shutdown_timeout_once = PTHREAD_ONCE_INIT;
+
+static_inline int
+VTimedOut(const struct timespec *ts)
+{
+    struct timeval tv;
+    int code;
+
+    if (ts->tv_sec == 0) {
+       /* short-circuit; this will have always timed out */
+       return 1;
+    }
+
+    code = gettimeofday(&tv, NULL);
+    if (code) {
+       Log("Error %d from gettimeofday, assuming we have not timed out\n", errno);
+       /* assume no timeout; failure mode is we just wait longer than normal
+        * instead of returning errors when we shouldn't */
+       return 0;
+    }
+
+    if (tv.tv_sec < ts->tv_sec ||
+        (tv.tv_sec == ts->tv_sec && tv.tv_usec*1000 < ts->tv_nsec)) {
+
+       return 0;
+    }
+
+    return 1;
+}
+
+/**
+ * Calculate an absolute timeout.
+ *
+ * @param[out] ts  A timeout that is "timeout" seconds from now; if we return
+ *                 NULL, the memory is not touched
+ * @param[in]  timeout  How long the timeout should be from now
+ *
+ * @return timeout to use
+ *  @retval NULL      no timeout; wait forever
+ *  @retval non-NULL  the given value for "ts"
+ *
+ * @internal
+ */
+static struct timespec *
+VCalcTimeout(struct timespec *ts, afs_int32 timeout)
+{
+    struct timeval now;
+    int code;
+
+    if (timeout < 0) {
+       return NULL;
+    }
+
+    if (timeout == 0) {
+       ts->tv_sec = ts->tv_nsec = 0;
+       return ts;
+    }
+
+    code = gettimeofday(&now, NULL);
+    if (code) {
+       Log("Error %d from gettimeofday, falling back to 'forever' timeout\n", errno);
+       return NULL;
+    }
+
+    ts->tv_sec = now.tv_sec + timeout;
+    ts->tv_nsec = now.tv_usec * 1000;
+
+    return ts;
+}
+
+/**
+ * Initialize the shutdown_timeout global.
+ */
+static void
+VShutdownTimeoutInit(void)
+{
+    struct timespec *ts;
+
+    ts = malloc(sizeof(*ts));
+
+    shutdown_timeout = VCalcTimeout(ts, vol_opts.offline_shutdown_timeout);
+
+    if (!shutdown_timeout) {
+       free(ts);
+    }
+}
+
+/**
+ * Figure out the timeout that should be used for waiting for offline volumes.
+ *
+ * @param[out] ats  Storage space for a local timeout value if needed
+ *
+ * @return The timeout value that should be used
+ *   @retval NULL      No timeout; wait forever for offlining volumes
+ *   @retval non-NULL  A pointer to the absolute time that should be used as
+ *                     the deadline for waiting for offlining volumes.
+ *
+ * @note If we return non-NULL, the pointer we return may or may not be the
+ *       same as "ats"
+ */
+static const struct timespec *
+VOfflineTimeout(struct timespec *ats)
+{
+    if (vol_shutting_down) {
+       osi_Assert(pthread_once(&shutdown_timeout_once, VShutdownTimeoutInit) == 0);
+       return shutdown_timeout;
+    } else {
+       return VCalcTimeout(ats, vol_opts.offline_timeout);
+    }
+}
+
+#else /* AFS_PTHREAD_ENV */
+
+/* Waiting a certain amount of time for offlining volumes is not supported
+ * for LWP due to a lack of primitives. So, we never time out */
+# define VTimedOut(x) (0)
+# define VOfflineTimeout(x) (NULL)
+
+#endif /* !AFS_PTHREAD_ENV */
+
+#if 0
+static int
+VHold(Volume * vp)
+{
+    int retVal;
+    VOL_LOCK;
+    retVal = VHold_r(vp);
+    VOL_UNLOCK;
+    return retVal;
+}
+#endif
+
+static afs_int32
+VIsGoingOffline_r(struct Volume *vp)
+{
+    afs_int32 code = 0;
+
+    if (vp->goingOffline) {
+       if (vp->specialStatus) {
+           code = vp->specialStatus;
+       } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
+           code = VNOVOL;
+       } else {
+           code = VOFFLINE;
+       }
+    }
+
+    return code;
+}
+
+/**
+ * Tell the caller if a volume is waiting to go offline.
+ *
+ * @param[in] vp  The volume we want to know about
+ *
+ * @return volume status
+ *   @retval 0 volume is not waiting to go offline, go ahead and use it
+ *   @retval nonzero volume is waiting to go offline; give the returned code
+ *           as an error to anyone accessing the volume
+ *
+ * @pre VOL_LOCK is NOT held
+ * @pre caller holds a heavyweight reference on vp
+ */
+afs_int32
+VIsGoingOffline(struct Volume *vp)
 {
-    Volume *retVal;
+    afs_int32 code;
+
     VOL_LOCK;
-    retVal = VAttachVolume_r(ec, volumeId, mode);
+    code = VIsGoingOffline_r(vp);
     VOL_UNLOCK;
-    return retVal;
-}
 
-Volume *
-VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
-{
-    char *part, *name;
-    GetVolumePath(ec, volumeId, &part, &name);
-    if (*ec) {
-       register Volume *vp;
-       Error error;
-       vp = VGetVolume_r(&error, volumeId);
-       if (vp) {
-           assert(V_inUse(vp) == 0);
-           VDetachVolume_r(ec, vp);
-       }
-       return NULL;
-    }
-    return VAttachVolumeByName_r(ec, part, name, mode);
+    return code;
 }
 
-/* Increment a reference count to a volume, sans context swaps.  Requires
- * possibly reading the volume header in from the disk, since there's
- * an invariant in the volume package that nUsers>0 ==> vp->header is valid.
+/**
+ * Register an RX call with a volume.
  *
- * N.B. This call can fail if we can't read in the header!!  In this case
- * we still guarantee we won't context swap, but the ref count won't be
- * incremented (otherwise we'd violate the invariant).
+ * @param[inout] ec        Error code; if unset when passed in, may be set if
+ *                         the volume starts going offline
+ * @param[out]   client_ec @see GetVolume
+ * @param[in] vp   Volume struct
+ * @param[in] cbv  VCallByVol struct containing the RX call to register
+ *
+ * @pre VOL_LOCK held
+ * @pre caller holds heavy ref on vp
+ *
+ * @internal
  */
-/* NOTE: with the demand attach fileserver extensions, the global lock
- * is dropped within VHold */
-#ifdef AFS_DEMAND_ATTACH_FS
-static int
-VHold_r(register Volume * vp)
+static void
+VRegisterCall_r(Error *ec, Error *client_ec, Volume *vp, struct VCallByVol *cbv)
 {
-    Error error;
+    if (vp && cbv) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       if (!*ec) {
+           /* just in case the volume started going offline after we got the
+            * reference to it... otherwise, if the volume started going
+            * offline right at the end of GetVolume(), we might race with the
+            * RX call scanner, and return success and add our cbv to the
+            * rx_call_list _after_ the scanner has scanned the list. */
+           *ec = VIsGoingOffline_r(vp);
+           if (client_ec) {
+               *client_ec = *ec;
+           }
+       }
 
-    VCreateReservation_r(vp);
-    VWaitExclusiveState_r(vp);
+       while (V_attachState(vp) == VOL_STATE_SCANNING_RXCALLS) {
+           VWaitStateChange_r(vp);
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
 
-    LoadVolumeHeader(&error, vp);
-    if (error) {
-       VCancelReservation_r(vp);
-       return error;
+       queue_Prepend(&vp->rx_call_list, cbv);
     }
-    vp->nUsers++;
-    VCancelReservation_r(vp);
-    return 0;
 }
-#else /* AFS_DEMAND_ATTACH_FS */
-static int
-VHold_r(register Volume * vp)
-{
-    Error error;
 
-    LoadVolumeHeader(&error, vp);
-    if (error)
-       return error;
-    vp->nUsers++;
-    return 0;
-}
+/**
+ * Deregister an RX call with a volume.
+ *
+ * @param[in] vp   Volume struct
+ * @param[in] cbv  VCallByVol struct containing the RX call to deregister
+ *
+ * @pre VOL_LOCK held
+ * @pre caller holds heavy ref on vp
+ *
+ * @internal
+ */
+static void
+VDeregisterCall_r(Volume *vp, struct VCallByVol *cbv)
+{
+    if (cbv && queue_IsOnQueue(cbv)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       while (V_attachState(vp) == VOL_STATE_SCANNING_RXCALLS) {
+           VWaitStateChange_r(vp);
+       }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
-static int
-VHold(register Volume * vp)
-{
-    int retVal;
-    VOL_LOCK;
-    retVal = VHold_r(vp);
-    VOL_UNLOCK;
-    return retVal;
+       queue_Remove(cbv);
+    }
 }
 
-
 /***************************************************/
 /* get and put volume routines                     */
 /***************************************************/
 
+/**
+ * put back a heavyweight reference to a volume object.
+ *
+ * @param[in] vp  volume object pointer
+ *
+ * @pre VOL_LOCK held
+ *
+ * @post heavyweight volume reference put back.
+ *       depending on state, volume may have been taken offline,
+ *       detached, salvaged, freed, etc.
+ *
+ * @internal volume package internal use only
+ */
 void
-VPutVolume_r(register Volume * vp)
+VPutVolume_r(Volume * vp)
 {
-    assert(--vp->nUsers >= 0);
+    osi_Assert(--vp->nUsers >= 0);
     if (vp->nUsers == 0) {
        VCheckOffline(vp);
        ReleaseVolumeHeader(vp->header);
@@ -2529,13 +3826,29 @@ VPutVolume_r(register Volume * vp)
 }
 
 void
-VPutVolume(register Volume * vp)
+VPutVolume(Volume * vp)
 {
     VOL_LOCK;
     VPutVolume_r(vp);
     VOL_UNLOCK;
 }
 
+/**
+ * Puts a volume reference obtained with VGetVolumeWithCall.
+ *
+ * @param[in] vp  Volume struct
+ * @param[in] cbv VCallByVol struct given to VGetVolumeWithCall, or NULL if none
+ *
+ * @pre VOL_LOCK is NOT held
+ */
+void
+VPutVolumeWithCall(Volume *vp, struct VCallByVol *cbv)
+{
+    VOL_LOCK;
+    VDeregisterCall_r(vp, cbv);
+    VPutVolume_r(vp);
+    VOL_UNLOCK;
+}
 
 /* Get a pointer to an attached volume.  The pointer is returned regardless
    of whether or not the volume is in service or on/off line.  An error
@@ -2550,28 +3863,78 @@ VGetVolume(Error * ec, Error * client_ec, VolId volumeId)
     return retVal;
 }
 
+/**
+ * Get a volume reference associated with an RX call.
+ *
+ * @param[out] ec @see GetVolume
+ * @param[out] client_ec @see GetVolume
+ * @param[in] volumeId @see GetVolume
+ * @param[in] ts  How long to wait for going-offline volumes (absolute time).
+ *                If NULL, wait forever. If ts->tv_sec == 0, return immediately
+ *                with an error if the volume is going offline.
+ * @param[in] cbv Contains an RX call to be associated with this volume
+ *                reference. This call may be interrupted if the volume is
+ *                requested to go offline while we hold a ref on it. Give NULL
+ *                to not associate an RX call with this reference.
+ *
+ * @return @see GetVolume
+ *
+ * @note for LWP builds, ts must be NULL
+ *
+ * @note A reference obtained with this function MUST be put back with
+ *       VPutVolumeWithCall
+ */
+Volume *
+VGetVolumeWithCall(Error * ec, Error * client_ec, VolId volumeId,
+                   const struct timespec *ts, struct VCallByVol *cbv)
+{
+    Volume *retVal;
+    VOL_LOCK;
+    retVal = GetVolume(ec, client_ec, volumeId, NULL, ts);
+    VRegisterCall_r(ec, client_ec, retVal, cbv);
+    VOL_UNLOCK;
+    return retVal;
+}
+
 Volume *
 VGetVolume_r(Error * ec, VolId volumeId)
 {
-    return GetVolume(ec, NULL, volumeId, NULL, 0);
+    return GetVolume(ec, NULL, volumeId, NULL, NULL);
 }
 
 /* try to get a volume we've previously looked up */
 /* for demand attach fs, caller MUST NOT hold a ref count on vp */
-Volume * 
+Volume *
 VGetVolumeByVp_r(Error * ec, Volume * vp)
 {
-    return GetVolume(ec, NULL, vp->hashid, vp, 0);
+    return GetVolume(ec, NULL, vp->hashid, vp, NULL);
 }
 
-/* private interface for getting a volume handle
- * volumeId must be provided.
- * hint is an optional parameter to speed up hash lookups
- * flags is not used at this time
+/**
+ * private interface for getting a volume handle
+ *
+ * @param[out] ec         error code (0 if no error)
+ * @param[out] client_ec  wire error code to be given to clients
+ * @param[in]  volumeId   ID of the volume we want
+ * @param[in]  hint       optional hint for hash lookups, or NULL
+ * @param[in]  timeout    absolute deadline for waiting for the volume to go
+ *                        offline, if it is going offline. NULL to wait forever.
+ *
+ * @return a volume handle for the specified volume
+ *  @retval NULL an error occurred, or the volume is in such a state that
+ *               we cannot load a header or return any volume struct
+ *
+ * @note for DAFS, caller must NOT hold a ref count on 'hint'
+ *
+ * @note 'timeout' is only checked if the volume is actually going offline; so
+ *       if you pass timeout->tv_sec = 0, this will exhibit typical
+ *       nonblocking behavior.
+ *
+ * @note for LWP builds, 'timeout' must be NULL
  */
-/* for demand attach fs, caller MUST NOT hold a ref count on hint */
 static Volume *
-GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags)
+GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint,
+          const struct timespec *timeout)
 {
     Volume *vp = hint;
     /* pull this profiling/debugging code out of regular builds */
@@ -2583,10 +3946,25 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
 #else
 #define VGET_CTR_INC(x)
 #endif
-
 #ifdef AFS_DEMAND_ATTACH_FS
     Volume *avp, * rvp = hint;
+#endif
 
+    /*
+     * if VInit is zero, the volume package dynamic
+     * data structures have not been initialized yet,
+     * and we must immediately return an error
+     */
+    if (VInit == 0) {
+       vp = NULL;
+       *ec = VOFFLINE;
+       if (client_ec) {
+           *client_ec = VOFFLINE;
+       }
+       goto not_inited;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
     if (rvp) {
        VCreateReservation_r(rvp);
     }
@@ -2632,7 +4010,7 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
 
        VGET_CTR_INC(V3);
        IncUInt64(&VStats.hdr_gets);
-       
+
 #ifdef AFS_DEMAND_ATTACH_FS
        /* block if someone else is performing an exclusive op on this volume */
        if (rvp != vp) {
@@ -2643,22 +4021,39 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
 
        /* short circuit with VNOVOL in the following circumstances:
         *
-        *   VOL_STATE_ERROR
-        *   VOL_STATE_SHUTTING_DOWN
+        *   - VOL_STATE_ERROR
+        *   - VOL_STATE_SHUTTING_DOWN or VOL_STATE_GOING_OFFLINE
         */
        if ((V_attachState(vp) == VOL_STATE_ERROR) ||
-           (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN)) {
+           (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN) ||
+           (V_attachState(vp) == VOL_STATE_GOING_OFFLINE)) {
            *ec = VNOVOL;
            vp = NULL;
            break;
        }
 
+       /*
+        * short circuit with VOFFLINE for VOL_STATE_UNATTACHED and
+        *                    VNOVOL   for VOL_STATE_DELETED
+        */
+       if ((V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+           (V_attachState(vp) == VOL_STATE_DELETED)) {
+          if (vp->specialStatus) {
+              *ec = vp->specialStatus;
+          } else if (V_attachState(vp) == VOL_STATE_DELETED) {
+              *ec = VNOVOL;
+          } else {
+              *ec = VOFFLINE;
+          }
+           vp = NULL;
+           break;
+       }
+
        /* allowable states:
-        *   UNATTACHED
-        *   PREATTACHED
-        *   ATTACHED
-        *   GOING_OFFLINE
-        *   SALVAGING
+        *   - PREATTACHED
+        *   - ATTACHED
+        *   - SALVAGING
+        *   - SALVAGE_REQ
         */
 
        if (vp->salvage.requested) {
@@ -2701,8 +4096,7 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
            }
        }
 
-       if ((V_attachState(vp) == VOL_STATE_SALVAGING) ||
-           (*ec == VSALVAGING)) {
+       if (VIsSalvaging(vp) || (*ec == VSALVAGING)) {
            if (client_ec) {
                /* see CheckVnode() in afsfileprocs.c for an explanation
                 * of this error code logic */
@@ -2719,6 +4113,50 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
        }
 #endif
 
+#ifdef AFS_DEMAND_ATTACH_FS
+       /*
+        * this test MUST happen after VAttachVolumeByVp, so vol_op_state is
+        * not VolOpRunningUnknown (attach2 would have converted it to Online
+        * or Offline)
+        */
+
+         /* only valid before/during demand attachment */
+         osi_Assert(!vp->pending_vol_op || vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown);
+
+         /* deny getvolume due to running mutually exclusive vol op */
+         if (vp->pending_vol_op && vp->pending_vol_op->vol_op_state==FSSYNC_VolOpRunningOffline) {
+          /*
+           * volume cannot remain online during this volume operation.
+           * notify client.
+           */
+          if (vp->specialStatus) {
+              /*
+               * special status codes outrank normal VOFFLINE code
+               */
+              *ec = vp->specialStatus;
+              if (client_ec) {
+                  *client_ec = vp->specialStatus;
+              }
+          } else {
+              if (client_ec) {
+                  /* see CheckVnode() in afsfileprocs.c for an explanation
+                   * of this error code logic */
+                  afs_uint32 now = FT_ApproxTime();
+                  if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
+                      *client_ec = VBUSY;
+                  } else {
+                      *client_ec = VRESTARTING;
+                  }
+              }
+              *ec = VOFFLINE;
+          }
+          VChangeState_r(vp, VOL_STATE_UNATTACHED);
+          FreeVolumeHeader(vp);
+          vp = NULL;
+          break;
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
        LoadVolumeHeader(ec, vp);
        if (*ec) {
            VGET_CTR_INC(V6);
@@ -2728,9 +4166,8 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
                Log("Volume %u: couldn't reread volume header\n",
                    vp->hashid);
 #ifdef AFS_DEMAND_ATTACH_FS
-           if (programType == fileServer) {
-               VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
-               *ec = VSALVAGING;
+           if (VCanScheduleSalvage()) {
+               VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
            } else {
                FreeVolume(vp);
                vp = NULL;
@@ -2742,32 +4179,6 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
            break;
        }
 
-#ifdef AFS_DEMAND_ATTACH_FS
-       if (vp->pending_vol_op && !VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
-           if (client_ec) {
-               /* see CheckVnode() in afsfileprocs.c for an explanation
-                * of this error code logic */
-               afs_uint32 now = FT_ApproxTime();
-               if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
-                   *client_ec = VBUSY;
-               } else {
-                   *client_ec = VRESTARTING;
-               }
-           }
-           *ec = VOFFLINE;
-           ReleaseVolumeHeader(vp->header);
-           vp = NULL;
-           break;
-       }
-
-       if (V_attachState(vp) == VOL_STATE_UNATTACHED) {
-           *ec = VOFFLINE;
-           ReleaseVolumeHeader(vp->header);
-           vp = NULL;
-           break;
-       }
-#endif /* AFS_DEMAND_ATTACH_FS */
-       
        VGET_CTR_INC(V7);
        if (vp->shuttingDown) {
            VGET_CTR_INC(V8);
@@ -2779,18 +4190,25 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
        if (programType == fileServer) {
            VGET_CTR_INC(V9);
            if (vp->goingOffline) {
-               VGET_CTR_INC(V10);
+               if (timeout && VTimedOut(timeout)) {
+                   /* we've timed out; don't wait for the vol */
+               } else {
+                   VGET_CTR_INC(V10);
 #ifdef AFS_DEMAND_ATTACH_FS
-               /* wait for the volume to go offline */
-               if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
-                   VWaitStateChange_r(vp);
-               }
+                   /* wait for the volume to go offline */
+                   if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+                       VTimedWaitStateChange_r(vp, timeout, NULL);
+                   }
 #elif defined(AFS_PTHREAD_ENV)
-               assert(pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex) == 0);
+                   VOL_CV_TIMEDWAIT(&vol_put_volume_cond, timeout, NULL);
 #else /* AFS_PTHREAD_ENV */
-               LWP_WaitProcess(VPutVolume);
+                   /* LWP has no timed wait, so the caller better not be
+                    * expecting one */
+                   osi_Assert(!timeout);
+                   LWP_WaitProcess(VPutVolume);
 #endif /* AFS_PTHREAD_ENV */
-               continue;
+                   continue;
+               }
            }
            if (vp->specialStatus) {
                VGET_CTR_INC(V11);
@@ -2798,7 +4216,7 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
            } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
                VGET_CTR_INC(V12);
                *ec = VNOVOL;
-           } else if (V_inUse(vp) == 0) {
+           } else if (V_inUse(vp) == 0 || vp->goingOffline) {
                VGET_CTR_INC(V13);
                *ec = VOFFLINE;
            } else {
@@ -2832,7 +4250,8 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
     }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
-    assert(vp || *ec);
+ not_inited:
+    osi_Assert(vp || *ec);
     return vp;
 }
 
@@ -2844,10 +4263,12 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
 /* caller MUST hold a heavyweight ref on vp */
 #ifdef AFS_DEMAND_ATTACH_FS
 void
-VTakeOffline_r(register Volume * vp)
+VTakeOffline_r(Volume * vp)
 {
-    assert(vp->nUsers > 0);
-    assert(programType == fileServer);
+    Error error;
+
+    osi_Assert(vp->nUsers > 0);
+    osi_Assert(programType == fileServer);
 
     VCreateReservation_r(vp);
     VWaitExclusiveState_r(vp);
@@ -2855,15 +4276,15 @@ VTakeOffline_r(register Volume * vp)
     vp->goingOffline = 1;
     V_needsSalvaged(vp) = 1;
 
-    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+    VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
     VCancelReservation_r(vp);
 }
 #else /* AFS_DEMAND_ATTACH_FS */
 void
-VTakeOffline_r(register Volume * vp)
+VTakeOffline_r(Volume * vp)
 {
-    assert(vp->nUsers > 0);
-    assert(programType == fileServer);
+    osi_Assert(vp->nUsers > 0);
+    osi_Assert(programType == fileServer);
 
     vp->goingOffline = 1;
     V_needsSalvaged(vp) = 1;
@@ -2871,55 +4292,246 @@ VTakeOffline_r(register Volume * vp)
 #endif /* AFS_DEMAND_ATTACH_FS */
 
 void
-VTakeOffline(register Volume * vp)
+VTakeOffline(Volume * vp)
 {
     VOL_LOCK;
     VTakeOffline_r(vp);
     VOL_UNLOCK;
 }
 
-/* Force the volume offline, set the salvage flag.  No further references to
- * the volume through the volume package will be honored. */
-/* for demand attach, caller MUST hold ref count on vp */
+/**
+ * force a volume offline.
+ *
+ * @param[in] vp     volume object pointer
+ * @param[in] flags  flags (see note below)
+ *
+ * @note the flag VOL_FORCEOFF_NOUPDATE is a recursion control flag
+ *       used when VUpdateVolume_r needs to call VForceOffline_r
+ *       (which in turn would normally call VUpdateVolume_r)
+ *
+ * @see VUpdateVolume_r
+ *
+ * @pre VOL_LOCK must be held.
+ *      for DAFS, caller must hold ref.
+ *
+ * @note for DAFS, it _is safe_ to call this function from an
+ *       exclusive state
+ *
+ * @post needsSalvaged flag is set.
+ *       for DAFS, salvage is requested.
+ *       no further references to the volume through the volume
+ *       package will be honored.
+ *       all file descriptor and vnode caches are invalidated.
+ *
+ * @warning this is a heavy-handed interface.  it results in
+ *          a volume going offline regardless of the current
+ *          reference count state.
+ *
+ * @internal  volume package internal use only
+ */
 void
 VForceOffline_r(Volume * vp, int flags)
 {
     Error error;
-    if (!V_inUse(vp))
+    if (!V_inUse(vp)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       VChangeState_r(vp, VOL_STATE_ERROR);
+#endif
        return;
+    }
+
     strcpy(V_offlineMessage(vp),
           "Forced offline due to internal error: volume needs to be salvaged");
     Log("Volume %u forced offline:  it needs salvaging!\n", V_id(vp));
+
     V_inUse(vp) = 0;
     vp->goingOffline = 0;
     V_needsSalvaged(vp) = 1;
     if (!(flags & VOL_FORCEOFF_NOUPDATE)) {
-       VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT | VOL_UPDATE_NOFORCEOFF);
+       VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF);
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#ifdef AFS_PTHREAD_ENV
+    CV_BROADCAST(&vol_put_volume_cond);
+#else /* AFS_PTHREAD_ENV */
+    LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+
+    VReleaseVolumeHandles_r(vp);
+}
+
+/**
+ * force a volume offline.
+ *
+ * @param[in] vp  volume object pointer
+ *
+ * @see VForceOffline_r
+ */
+void
+VForceOffline(Volume * vp)
+{
+    VOL_LOCK;
+    VForceOffline_r(vp, 0);
+    VOL_UNLOCK;
+}
+
+/**
+ * Iterate over the RX calls associated with a volume, and interrupt them.
+ *
+ * @param[in] vp The volume whose RX calls we want to scan
+ *
+ * @pre VOL_LOCK held
+ */
+static void
+VScanCalls_r(struct Volume *vp)
+{
+    struct VCallByVol *cbv, *ncbv;
+    afs_int32 err;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif
+
+    if (queue_IsEmpty(&vp->rx_call_list))
+       return; /* no calls to interrupt */
+    if (!vol_opts.interrupt_rxcall)
+       return; /* we have no function with which to interrupt calls */
+    err = VIsGoingOffline_r(vp);
+    if (!err)
+       return; /* we're not going offline anymore */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VWaitExclusiveState_r(vp);
+    state_save = VChangeState_r(vp, VOL_STATE_SCANNING_RXCALLS);
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    for(queue_Scan(&vp->rx_call_list, cbv, ncbv, VCallByVol)) {
+       if (LogLevel > 0) {
+           struct rx_peer *peer;
+           char hoststr[16];
+           peer = rx_PeerOf(rx_ConnectionOf(cbv->call));
+
+           Log("Offlining volume %lu while client %s:%u is trying to read "
+               "from it; kicking client off with error %ld\n",
+               (long unsigned) vp->hashid,
+               afs_inet_ntoa_r(rx_HostOf(peer), hoststr),
+               (unsigned) ntohs(rx_PortOf(peer)),
+               (long) err);
+       }
+       (*vol_opts.interrupt_rxcall) (cbv->call, err);
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    VChangeState_r(vp, state_save);
+#endif /* AFS_DEMAND_ATTACH_FS */
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * Wait for a vp to go offline.
+ *
+ * @param[out] ec 1 if a salvage on the volume has been requested and
+ *                salvok == 0, 0 otherwise
+ * @param[in] vp  The volume to wait for
+ * @param[in] salvok  If 0, we return immediately with *ec = 1 if the volume
+ *                    has been requested to salvage. Otherwise we keep waiting
+ *                    until the volume has gone offline.
+ *
+ * @pre VOL_LOCK held
+ * @pre caller holds a lightweight ref on vp
+ *
+ * @note DAFS only
+ */
+static void
+VWaitForOfflineByVp_r(Error *ec, struct Volume *vp, int salvok)
+{
+    struct timespec timeout_ts;
+    const struct timespec *ts;
+    int timedout = 0;
+
+    ts = VOfflineTimeout(&timeout_ts);
+
+    *ec = 0;
+
+    while (!VIsOfflineState(V_attachState(vp)) && !timedout) {
+       if (!salvok && vp->salvage.requested) {
+           *ec = 1;
+           return;
+       }
+       VTimedWaitStateChange_r(vp, ts, &timedout);
+    }
+    if (!timedout) {
+       /* we didn't time out, so the volume must be offline, so we're done */
+       return;
+    }
+
+    /* If we got here, we timed out waiting for the volume to go offline.
+     * Kick off the accessing RX calls and wait again */
+
+    VScanCalls_r(vp);
+
+    while (!VIsOfflineState(V_attachState(vp))) {
+       if (!salvok && vp->salvage.requested) {
+           *ec = 1;
+           return;
+       }
+
+       VWaitStateChange_r(vp);
+    }
+}
+
+#else /* AFS_DEMAND_ATTACH_FS */
+
+/**
+ * Wait for a volume to go offline.
+ *
+ * @pre VOL_LOCK held
+ *
+ * @note non-DAFS only (for DAFS, use VWaitForOfflineByVp_r)
+ */
+static void
+VWaitForOffline_r(Error *ec, VolumeId volid)
+{
+    struct Volume *vp;
+    const struct timespec *ts;
+#ifdef AFS_PTHREAD_ENV
+    struct timespec timeout_ts;
+#endif
+
+    ts = VOfflineTimeout(&timeout_ts);
+
+    vp = GetVolume(ec, NULL, volid, NULL, ts);
+    if (!vp) {
+       /* error occurred so bad that we can't even get a vp; we have no
+        * information on the vol so we don't know whether to wait, so just
+        * return */
+       return;
     }
-#ifdef AFS_DEMAND_ATTACH_FS
-#ifdef SALVSYNC_BUILD_CLIENT
-    if (programType == fileServer) {
-       VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+    if (!VIsGoingOffline_r(vp)) {
+       /* volume is no longer going offline, so we're done */
+       VPutVolume_r(vp);
+       return;
     }
-#endif
-    VChangeState_r(vp, VOL_STATE_ERROR);
-#endif /* AFS_DEMAND_ATTACH_FS */
-#ifdef AFS_PTHREAD_ENV
-    assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
-    LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
 
-    VReleaseVolumeHandles_r(vp);
-}
+    /* If we got here, we timed out waiting for the volume to go offline.
+     * Kick off the accessing RX calls and wait again */
 
-void
-VForceOffline(Volume * vp)
-{
-    VOL_LOCK;
-    VForceOffline_r(vp, 0);
-    VOL_UNLOCK;
+    VScanCalls_r(vp);
+    VPutVolume_r(vp);
+    vp = NULL;
+
+    vp = VGetVolume_r(ec, volid);
+    if (vp) {
+       /* In case it was reattached... */
+       VPutVolume_r(vp);
+    }
 }
+#endif /* !AFS_DEMAND_ATTACH_FS */
 
 /* The opposite of VAttachVolume.  The volume header is written to disk, with
    the inUse bit turned off.  A copy of the header is maintained in memory,
@@ -2929,9 +4541,11 @@ void
 VOffline_r(Volume * vp, char *message)
 {
     Error error;
+#ifndef AFS_DEMAND_ATTACH_FS
     VolumeId vid = V_id(vp);
+#endif
 
-    assert(programType != volumeUtility);
+    osi_Assert(programType != volumeUtility && programType != volumeServer);
     if (!V_inUse(vp)) {
        VPutVolume_r(vp);
        return;
@@ -2945,20 +4559,73 @@ VOffline_r(Volume * vp, char *message)
     VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
     VCreateReservation_r(vp);
     VPutVolume_r(vp);
-
-    /* wait for the volume to go offline */
-    if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
-       VWaitStateChange_r(vp);
-    }
+    VWaitForOfflineByVp_r(&error, vp, 1);
     VCancelReservation_r(vp);
 #else /* AFS_DEMAND_ATTACH_FS */
     VPutVolume_r(vp);
-    vp = VGetVolume_r(&error, vid);    /* Wait for it to go offline */
-    if (vp)                    /* In case it was reattached... */
-       VPutVolume_r(vp);
+    VWaitForOffline_r(&error, vid);
 #endif /* AFS_DEMAND_ATTACH_FS */
 }
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * Take a volume offline in order to perform a volume operation.
+ *
+ * @param[inout] ec       address in which to store error code
+ * @param[in]    vp       volume object pointer
+ * @param[in]    message  volume offline status message
+ *
+ * @pre
+ *    - VOL_LOCK is held
+ *    - caller MUST hold a heavyweight ref on vp
+ *
+ * @post
+ *    - volume is taken offline
+ *    - if possible, volume operation is promoted to running state
+ *    - on failure, *ec is set to nonzero
+ *
+ * @note Although this function does not return any value, it may
+ *       still fail to promote our pending volume operation to
+ *       a running state.  Any caller MUST check the value of *ec,
+ *       and MUST NOT blindly assume success.
+ *
+ * @warning if the caller does not hold a lightweight ref on vp,
+ *          then it MUST NOT reference vp after this function
+ *          returns to the caller.
+ *
+ * @internal volume package internal use only
+ */
+void
+VOfflineForVolOp_r(Error *ec, Volume *vp, char *message)
+{
+    int salvok = 1;
+    osi_Assert(vp->pending_vol_op);
+    if (!V_inUse(vp)) {
+       VPutVolume_r(vp);
+        *ec = 1;
+       return;
+    }
+    if (V_offlineMessage(vp)[0] == '\0')
+       strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
+    V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
+    vp->goingOffline = 1;
+    VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
+    VCreateReservation_r(vp);
+    VPutVolume_r(vp);
+
+    if (vp->pending_vol_op->com.programType != salvageServer) {
+        /* do not give corrupted volumes to the volserver */
+       salvok = 0;
+    }
+
+    *ec = 0;
+    VWaitForOfflineByVp_r(ec, vp, salvok);
+
+    VCancelReservation_r(vp);
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 void
 VOffline(Volume * vp, char *message)
 {
@@ -2977,23 +4644,35 @@ VOffline(Volume * vp, char *message)
 void
 VDetachVolume_r(Error * ec, Volume * vp)
 {
+#ifdef FSSYNC_BUILD_CLIENT
     VolumeId volume;
-    struct DiskPartition *tpartp;
-    int notifyServer, useDone;
+    struct DiskPartition64 *tpartp;
+    int notifyServer = 0;
+    int  useDone = FSYNC_VOL_ON;
 
-    *ec = 0;                   /* always "succeeds" */
-    if (programType == volumeUtility) {
+    if (VCanUseFSSYNC()) {
        notifyServer = vp->needsPutBack;
-       useDone = (V_destroyMe(vp) == DESTROY_ME);
+       if (V_destroyMe(vp) == DESTROY_ME)
+           useDone = FSYNC_VOL_LEAVE_OFF;
+#ifdef AFS_DEMAND_ATTACH_FS
+       else if (!V_blessed(vp) || !V_inService(vp))
+           useDone = FSYNC_VOL_LEAVE_OFF;
+#endif
     }
     tpartp = vp->partition;
     volume = V_id(vp);
+#endif /* FSSYNC_BUILD_CLIENT */
+
+    *ec = 0;                   /* always "succeeds" */
     DeleteVolumeFromHashTable(vp);
     vp->shuttingDown = 1;
 #ifdef AFS_DEMAND_ATTACH_FS
     DeleteVolumeFromVByPList_r(vp);
     VLRU_Delete_r(vp);
     VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN);
+#else
+    if (programType != fileServer)
+       V_inUse(vp) = 0;
 #endif /* AFS_DEMAND_ATTACH_FS */
     VPutVolume_r(vp);
     /* Will be detached sometime in the future--this is OK since volume is offline */
@@ -3002,34 +4681,33 @@ VDetachVolume_r(Error * ec, Volume * vp)
      * is not technically detached until the refcounts reach zero
      */
 #ifdef FSSYNC_BUILD_CLIENT
-    if (programType == volumeUtility && notifyServer) {
-       /* 
-        * Note:  The server is not notified in the case of a bogus volume 
-        * explicitly to make it possible to create a volume, do a partial 
-        * restore, then abort the operation without ever putting the volume 
-        * online.  This is essential in the case of a volume move operation 
-        * between two partitions on the same server.  In that case, there 
-        * would be two instances of the same volume, one of them bogus, 
-        * which the file server would attempt to put on line 
+    if (VCanUseFSSYNC() && notifyServer) {
+       if (notifyServer == VOL_PUTBACK_DELETE) {
+           /* Only send FSYNC_VOL_DONE if the volume was actually deleted.
+            * volserver code will set needsPutBack to VOL_PUTBACK_DELETE
+            * to signify a deleted volume. */
+           useDone = FSYNC_VOL_DONE;
+       }
+       /*
+        * Note:  The server is not notified in the case of a bogus volume
+        * explicitly to make it possible to create a volume, do a partial
+        * restore, then abort the operation without ever putting the volume
+        * online.  This is essential in the case of a volume move operation
+        * between two partitions on the same server.  In that case, there
+        * would be two instances of the same volume, one of them bogus,
+        * which the file server would attempt to put on line
         */
-       if (useDone) {
-           /* don't put online */
-           FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_DONE, 0, NULL);
-       } else {
-           /* fs can use it again */
-           FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_ON, 0, NULL);
-
-           /* XXX this code path is only hit by volume utilities, thus
-            * V_BreakVolumeCallbacks will always be NULL.  if we really
-            * want to break callbacks in this path we need to use FSYNC_VolOp() */
+       FSYNC_VolOp(volume, tpartp->name, useDone, 0, NULL);
+       /* XXX this code path is only hit by volume utilities, thus
+        * V_BreakVolumeCallbacks will always be NULL.  if we really
+        * want to break callbacks in this path we need to use FSYNC_VolOp() */
 #ifdef notdef
-           /* Dettaching it so break all callbacks on it */
-           if (V_BreakVolumeCallbacks) {
-               Log("volume %u detached; breaking all call backs\n", volume);
-               (*V_BreakVolumeCallbacks) (volume);
-           }
-#endif
+       /* Detaching it so break all callbacks on it */
+       if (V_BreakVolumeCallbacks) {
+           Log("volume %u detached; breaking all call backs\n", volume);
+           (*V_BreakVolumeCallbacks) (volume);
        }
+#endif
     }
 #endif /* FSSYNC_BUILD_CLIENT */
 }
@@ -3063,10 +4741,10 @@ VCloseVolumeHandles_r(Volume * vp)
     /* demand attach fs
      *
      * XXX need to investigate whether we can perform
-     * DFlushVolume outside of vol_glock_mutex... 
+     * DFlushVolume outside of vol_glock_mutex...
      *
      * VCloseVnodeFiles_r drops the glock internally */
-    DFlushVolume(V_id(vp));
+    DFlushVolume(vp->hashid);
     VCloseVnodeFiles_r(vp);
 
 #ifdef AFS_DEMAND_ATTACH_FS
@@ -3074,7 +4752,7 @@ VCloseVolumeHandles_r(Volume * vp)
 #endif
 
     /* Too time consuming and unnecessary for the volserver */
-    if (programType != volumeUtility) {
+    if (programType == fileServer) {
        IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
        IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
        IH_CONDSYNC(vp->diskDataHandle);
@@ -3089,6 +4767,10 @@ VCloseVolumeHandles_r(Volume * vp)
     IH_REALLYCLOSE(vp->linkHandle);
 
 #ifdef AFS_DEMAND_ATTACH_FS
+    if ((V_attachFlags(vp) & VOL_LOCKED)) {
+       VUnlockVolume(vp);
+    }
+
     VOL_LOCK;
     VChangeState_r(vp, state_save);
 #endif
@@ -3096,7 +4778,7 @@ VCloseVolumeHandles_r(Volume * vp)
 
 /* For both VForceOffline and VOffline, we close all relevant handles.
  * For VOffline, if we re-attach the volume, the files may possible be
- * different than before. 
+ * different than before.
  */
 /* for demand attach, caller MUST hold a ref count on vp */
 static void
@@ -3110,7 +4792,7 @@ VReleaseVolumeHandles_r(Volume * vp)
 
     /* XXX need to investigate whether we can perform
      * DFlushVolume outside of vol_glock_mutex... */
-    DFlushVolume(V_id(vp));
+    DFlushVolume(vp->hashid);
 
     VReleaseVnodeFiles_r(vp); /* releases the glock internally */
 
@@ -3119,7 +4801,7 @@ VReleaseVolumeHandles_r(Volume * vp)
 #endif
 
     /* Too time consuming and unnecessary for the volserver */
-    if (programType != volumeUtility) {
+    if (programType == fileServer) {
        IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
        IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
        IH_CONDSYNC(vp->diskDataHandle);
@@ -3134,6 +4816,10 @@ VReleaseVolumeHandles_r(Volume * vp)
     IH_RELEASE(vp->linkHandle);
 
 #ifdef AFS_DEMAND_ATTACH_FS
+    if ((V_attachFlags(vp) & VOL_LOCKED)) {
+       VUnlockVolume(vp);
+    }
+
     VOL_LOCK;
     VChangeState_r(vp, state_save);
 #endif
@@ -3180,7 +4866,7 @@ VUpdateVolume_r(Error * ec, Volume * vp, int flags)
     if (*ec) {
        Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
            V_id(vp), V_name(vp));
-       /* try to update on-disk header, 
+       /* try to update on-disk header,
         * while preventing infinite recursion */
        if (!(flags & VOL_UPDATE_NOFORCEOFF)) {
            VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE);
@@ -3216,9 +4902,9 @@ VSyncVolume_r(Error * ec, Volume * vp, int flags)
        VOL_UNLOCK;
 #endif
        fdP = IH_OPEN(V_diskDataHandle(vp));
-       assert(fdP != NULL);
+       osi_Assert(fdP != NULL);
        code = FDH_SYNC(fdP);
-       assert(code == 0);
+       osi_Assert(code == 0);
        FDH_CLOSE(fdP);
 #ifdef AFS_DEMAND_ATTACH_FS
        VOL_LOCK;
@@ -3283,40 +4969,68 @@ ReallyFreeVolume(Volume * vp)
  * returns 1 if volume was freed, 0 otherwise */
 #ifdef AFS_DEMAND_ATTACH_FS
 static int
-VCheckDetach(register Volume * vp)
+VCheckDetach(Volume * vp)
 {
     int ret = 0;
+    Error ec = 0;
 
     if (vp->nUsers || vp->nWaiters)
        return ret;
 
     if (vp->shuttingDown) {
        ret = 1;
+       if ((programType != fileServer) &&
+           (V_inUse(vp) == programType) &&
+           ((V_checkoutMode(vp) == V_VOLUPD) ||
+            (V_checkoutMode(vp) == V_SECRETLY) ||
+            ((V_checkoutMode(vp) == V_CLONE) &&
+             (VolumeWriteable(vp))))) {
+           V_inUse(vp) = 0;
+           VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
+           if (ec) {
+               Log("VCheckDetach: volume header update for volume %u "
+                   "failed with errno %d\n", vp->hashid, errno);
+           }
+       }
        VReleaseVolumeHandles_r(vp);
        VCheckSalvage(vp);
        ReallyFreeVolume(vp);
        if (programType == fileServer) {
-           assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+           CV_BROADCAST(&vol_put_volume_cond);
        }
     }
     return ret;
 }
 #else /* AFS_DEMAND_ATTACH_FS */
 static int
-VCheckDetach(register Volume * vp)
+VCheckDetach(Volume * vp)
 {
     int ret = 0;
+    Error ec = 0;
 
     if (vp->nUsers)
        return ret;
 
     if (vp->shuttingDown) {
        ret = 1;
+       if ((programType != fileServer) &&
+           (V_inUse(vp) == programType) &&
+           ((V_checkoutMode(vp) == V_VOLUPD) ||
+            (V_checkoutMode(vp) == V_SECRETLY) ||
+            ((V_checkoutMode(vp) == V_CLONE) &&
+             (VolumeWriteable(vp))))) {
+           V_inUse(vp) = 0;
+           VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
+           if (ec) {
+               Log("VCheckDetach: volume header update for volume %u failed with errno %d\n",
+                   vp->hashid, errno);
+           }
+       }
        VReleaseVolumeHandles_r(vp);
        ReallyFreeVolume(vp);
        if (programType == fileServer) {
 #if defined(AFS_PTHREAD_ENV)
-           assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+           CV_BROADCAST(&vol_put_volume_cond);
 #else /* AFS_PTHREAD_ENV */
            LWP_NoYieldSignal(VPutVolume);
 #endif /* AFS_PTHREAD_ENV */
@@ -3330,25 +5044,25 @@ VCheckDetach(register Volume * vp)
  * return 1 if volume went offline, 0 otherwise */
 #ifdef AFS_DEMAND_ATTACH_FS
 static int
-VCheckOffline(register Volume * vp)
+VCheckOffline(Volume * vp)
 {
-    Volume * rvp = NULL;
     int ret = 0;
 
     if (vp->goingOffline && !vp->nUsers) {
        Error error;
-       assert(programType == fileServer);
-       assert((V_attachState(vp) != VOL_STATE_ATTACHED) &&
+       osi_Assert(programType == fileServer);
+       osi_Assert((V_attachState(vp) != VOL_STATE_ATTACHED) &&
               (V_attachState(vp) != VOL_STATE_FREED) &&
               (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
-              (V_attachState(vp) != VOL_STATE_UNATTACHED));
+              (V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+              (V_attachState(vp) != VOL_STATE_DELETED));
 
        /* valid states:
         *
         * VOL_STATE_GOING_OFFLINE
         * VOL_STATE_SHUTTING_DOWN
-        * IsErrorState(V_attachState(vp))
-        * IsExclusiveState(V_attachState(vp))
+        * VIsErrorState(V_attachState(vp))
+        * VIsExclusiveState(V_attachState(vp))
         */
 
        VCreateReservation_r(vp);
@@ -3366,11 +5080,14 @@ VCheckOffline(register Volume * vp)
        VCloseVolumeHandles_r(vp);
 
        if (LogLevel) {
-           Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
-               V_name(vp));
-           if (V_offlineMessage(vp)[0])
-               Log(" (%s)", V_offlineMessage(vp));
-           Log("\n");
+           if (V_offlineMessage(vp)[0]) {
+               Log("VOffline: Volume %lu (%s) is now offline (%s)\n",
+                   afs_printable_uint32_lu(V_id(vp)), V_name(vp),
+                   V_offlineMessage(vp));
+           } else {
+               Log("VOffline: Volume %lu (%s) is now offline\n",
+                   afs_printable_uint32_lu(V_id(vp)), V_name(vp));
+           }
        }
 
        /* invalidate the volume header cache entry */
@@ -3378,7 +5095,7 @@ VCheckOffline(register Volume * vp)
 
        /* if nothing changed state to error or salvaging,
         * drop state to unattached */
-       if (!IsErrorState(V_attachState(vp))) {
+       if (!VIsErrorState(V_attachState(vp))) {
            VChangeState_r(vp, VOL_STATE_UNATTACHED);
        }
        VCancelReservation_r(vp);
@@ -3388,14 +5105,13 @@ VCheckOffline(register Volume * vp)
 }
 #else /* AFS_DEMAND_ATTACH_FS */
 static int
-VCheckOffline(register Volume * vp)
+VCheckOffline(Volume * vp)
 {
-    Volume * rvp = NULL;
     int ret = 0;
 
     if (vp->goingOffline && !vp->nUsers) {
        Error error;
-       assert(programType == fileServer);
+       osi_Assert(programType == fileServer);
 
        ret = 1;
        vp->goingOffline = 0;
@@ -3403,15 +5119,18 @@ VCheckOffline(register Volume * vp)
        VUpdateVolume_r(&error, vp, 0);
        VCloseVolumeHandles_r(vp);
        if (LogLevel) {
-           Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
-               V_name(vp));
-           if (V_offlineMessage(vp)[0])
-               Log(" (%s)", V_offlineMessage(vp));
-           Log("\n");
+           if (V_offlineMessage(vp)[0]) {
+               Log("VOffline: Volume %lu (%s) is now offline (%s)\n",
+                   afs_printable_uint32_lu(V_id(vp)), V_name(vp),
+                   V_offlineMessage(vp));
+           } else {
+               Log("VOffline: Volume %lu (%s) is now offline\n",
+                   afs_printable_uint32_lu(V_id(vp)), V_name(vp));
+           }
        }
        FreeVolumeHeader(vp);
 #ifdef AFS_PTHREAD_ENV
-       assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+       CV_BROADCAST(&vol_put_volume_cond);
 #else /* AFS_PTHREAD_ENV */
        LWP_NoYieldSignal(VPutVolume);
 #endif /* AFS_PTHREAD_ENV */
@@ -3432,17 +5151,37 @@ VCheckOffline(register Volume * vp)
  * from free()ing the Volume struct during an async i/o op */
 
 /* register with the async volume op ref counter */
-static void
-VCreateReservation_r(Volume * vp)
-{
-    vp->nWaiters++;
-}
+/* VCreateReservation_r moved into inline code header because it
+ * is now needed in vnode.c -- tkeiser 11/20/2007
+ */
 
-/* unregister with the async volume op ref counter */
-static void
+/**
+ * decrement volume-package internal refcount.
+ *
+ * @param vp  volume object pointer
+ *
+ * @internal volume package internal use only
+ *
+ * @pre
+ *    @arg VOL_LOCK is held
+ *    @arg lightweight refcount held
+ *
+ * @post volume waiters refcount is decremented; volume may
+ *       have been deallocated/shutdown/offlined/salvaged/
+ *       whatever during the process
+ *
+ * @warning once you have tossed your last reference (you can acquire
+ *          lightweight refs recursively) it is NOT SAFE to reference
+ *          a volume object pointer ever again
+ *
+ * @see VCreateReservation_r
+ *
+ * @note DEMAND_ATTACH_FS only
+ */
+void
 VCancelReservation_r(Volume * vp)
 {
-    assert(--vp->nWaiters >= 0);
+    osi_Assert(--vp->nWaiters >= 0);
     if (vp->nWaiters == 0) {
        VCheckOffline(vp);
        if (!VCheckDetach(vp)) {
@@ -3460,8 +5199,8 @@ VCheckFree(Volume * vp)
     int ret = 0;
     if ((vp->nUsers == 0) &&
        (vp->nWaiters == 0) &&
-       !(V_attachFlags(vp) & (VOL_IN_HASH | 
-                              VOL_ON_VBYP_LIST | 
+       !(V_attachFlags(vp) & (VOL_IN_HASH |
+                              VOL_ON_VBYP_LIST |
                               VOL_IS_BUSY |
                               VOL_ON_VLRU))) {
        ReallyFreeVolume(vp);
@@ -3477,6 +5216,21 @@ VCheckFree(Volume * vp)
 /***************************************************/
 
 #ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * register a volume operation on a given volume.
+ *
+ * @param[in] vp       volume object
+ * @param[in] vopinfo  volume operation info object
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @post volume operation info object attached to volume object.
+ *       volume operation statistics updated.
+ *
+ * @note by "attached" we mean a copy of the passed in object is made
+ *
+ * @internal volume package internal use only
+ */
 int
 VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 {
@@ -3484,7 +5238,7 @@ VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 
     /* attach a vol op info node to the volume struct */
     info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info));
-    assert(info != NULL);
+    osi_Assert(info != NULL);
     memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info));
     vp->pending_vol_op = info;
 
@@ -3496,8 +5250,19 @@ VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
     return 0;
 }
 
+/**
+ * deregister the volume operation attached to this volume.
+ *
+ * @param[in] vp  volume object pointer
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @post the volume operation info object is detached from the volume object
+ *
+ * @internal volume package internal use only
+ */
 int
-VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
+VDeregisterVolOp_r(Volume * vp)
 {
     if (vp->pending_vol_op) {
        free(vp->pending_vol_op);
@@ -3507,22 +5272,99 @@ VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
+/**
+ * determine whether it is safe to leave a volume online during
+ * the volume operation described by the vopinfo object.
+ *
+ * @param[in] vp        volume object
+ * @param[in] vopinfo   volume operation info object
+ *
+ * @return whether it is safe to leave volume online
+ *    @retval 0  it is NOT SAFE to leave the volume online
+ *    @retval 1  it is safe to leave the volume online during the operation
+ *
+ * @pre
+ *    @arg VOL_LOCK is held
+ *    @arg disk header attached to vp (heavyweight ref on vp will guarantee
+ *         this condition is met)
+ *
+ * @internal volume package internal use only
+ */
 int
 VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 {
-    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+    return (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline ||
+           (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
            (vopinfo->com.reason == V_READONLY ||
             (!VolumeWriteable(vp) &&
              (vopinfo->com.reason == V_CLONE ||
-              vopinfo->com.reason == V_DUMP))));
+              vopinfo->com.reason == V_DUMP)))));
+}
+
+/**
+ * same as VVolOpLeaveOnline_r, but does not require a volume with an attached
+ * header.
+ *
+ * @param[in] vp        volume object
+ * @param[in] vopinfo   volume operation info object
+ *
+ * @return whether it is safe to leave volume online
+ *    @retval 0  it is NOT SAFE to leave the volume online
+ *    @retval 1  it is safe to leave the volume online during the operation
+ *    @retval -1 unsure; volume header is required in order to know whether or
+ *               not it is safe to leave the volume online
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @internal volume package internal use only
+ */
+int
+VVolOpLeaveOnlineNoHeader_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
+{
+    /* follow the logic in VVolOpLeaveOnline_r; this is the same, except
+     * assume that we don't know VolumeWriteable; return -1 if the answer
+     * depends on VolumeWriteable */
+
+    if (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline) {
+       return 1;
+    }
+    if (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+        vopinfo->com.reason == V_READONLY) {
+
+       return 1;
+    }
+    if (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+        (vopinfo->com.reason == V_CLONE ||
+         vopinfo->com.reason == V_DUMP)) {
+
+       /* must know VolumeWriteable */
+       return -1;
+    }
+    return 0;
 }
 
+/**
+ * determine whether VBUSY should be set during this volume operation.
+ *
+ * @param[in] vp        volume object
+ * @param[in] vopinfo   volume operation info object
+ *
+ * @return whether VBUSY should be set
+ *   @retval 0  VBUSY does NOT need to be set
+ *   @retval 1  VBUSY SHOULD be set
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @internal volume package internal use only
+ */
 int
 VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 {
-    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+    return ((vopinfo->com.command == FSYNC_VOL_OFF &&
+           vopinfo->com.reason == FSYNC_SALVAGE) ||
+           (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
            (vopinfo->com.reason == V_CLONE ||
-            vopinfo->com.reason == V_DUMP));
+            vopinfo->com.reason == V_DUMP)));
 }
 
 
@@ -3530,78 +5372,268 @@ VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 /* online salvager routines                        */
 /***************************************************/
 #if defined(AFS_DEMAND_ATTACH_FS)
-#define SALVAGE_PRIO_UPDATE_INTERVAL 3      /* number of seconds between prio updates */
-#define SALVAGE_COUNT_MAX 16                /* number of online salvages we
-                                            * allow before moving the volume
-                                            * into a permanent error state
-                                            *
-                                            * once this threshold is reached,
-                                            * the operator will have to manually
-                                            * issue a 'bos salvage' to bring
-                                            * the volume back online
-                                            */
-
-/* check to see if we should salvage this volume
- * returns 1 if salvage scheduled, 0 otherwise */
+
+/**
+ * offline a volume to let it be salvaged.
+ *
+ * @param[in] vp  Volume to offline
+ *
+ * @return whether we offlined the volume successfully
+ *  @retval 0 volume was not offlined
+ *  @retval 1 volume is now offline
+ *
+ * @note This is similar to VCheckOffline, but slightly different. We do not
+ *       deal with vp->goingOffline, and we try to avoid touching the volume
+ *       header except just to set needsSalvaged
+ *
+ * @pre VOL_LOCK held
+ * @pre vp->nUsers == 0
+ * @pre V_attachState(vp) == VOL_STATE_SALVAGE_REQ
+ */
+static int
+VOfflineForSalvage_r(struct Volume *vp)
+{
+    Error error;
+
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
+
+    if (vp->nUsers || V_attachState(vp) == VOL_STATE_SALVAGING) {
+       /* Someone's using the volume, or someone got to scheduling the salvage
+        * before us. I don't think either of these should be possible, as we
+        * should gain no new heavyweight references while we're trying to
+        * salvage, but just to be sure... */
+       VCancelReservation_r(vp);
+       return 0;
+    }
+
+    VChangeState_r(vp, VOL_STATE_OFFLINING);
+
+    VLRU_Delete_r(vp);
+    if (vp->header) {
+       V_needsSalvaged(vp) = 1;
+       /* ignore error; updating needsSalvaged is just best effort */
+       VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF);
+    }
+    VCloseVolumeHandles_r(vp);
+
+    FreeVolumeHeader(vp);
+
+    /* volume has been effectively offlined; we can mark it in the SALVAGING
+     * state now, which lets FSSYNC give it away */
+    VChangeState_r(vp, VOL_STATE_SALVAGING);
+
+    VCancelReservation_r(vp);
+
+    return 1;
+}
+
+/**
+ * check whether a salvage needs to be performed on this volume.
+ *
+ * @param[in] vp   pointer to volume object
+ *
+ * @return status code
+ *    @retval 0 no salvage scheduled
+ *    @retval 1 a salvage has been scheduled with the salvageserver
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @post if salvage request flag is set and nUsers and nWaiters are zero,
+ *       then a salvage will be requested
+ *
+ * @note this is one of the event handlers called by VCancelReservation_r
+ *
+ * @note the caller must check if the volume needs to be freed after calling
+ *       this; the volume may not have any references or be on any lists after
+ *       we return, and we do not free it
+ *
+ * @see VCancelReservation_r
+ *
+ * @internal volume package internal use only.
+ */
 static int
-VCheckSalvage(register Volume * vp)
+VCheckSalvage(Volume * vp)
 {
     int ret = 0;
-#ifdef SALVSYNC_BUILD_CLIENT
-    if (vp->nUsers || vp->nWaiters)
+#if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT)
+    if (vp->nUsers)
+       return ret;
+    if (!vp->salvage.requested) {
+       return ret;
+    }
+
+    /* prevent recursion; some of the code below creates and removes
+     * lightweight refs, which can call VCheckSalvage */
+    if (vp->salvage.scheduling) {
        return ret;
+    }
+    vp->salvage.scheduling = 1;
+
+    if (V_attachState(vp) == VOL_STATE_SALVAGE_REQ) {
+       if (!VOfflineForSalvage_r(vp)) {
+           vp->salvage.scheduling = 0;
+           return ret;
+       }
+    }
+
     if (vp->salvage.requested) {
        VScheduleSalvage_r(vp);
        ret = 1;
     }
-#endif /* SALVSYNC_BUILD_CLIENT */
+    vp->salvage.scheduling = 0;
+#endif /* SALVSYNC_BUILD_CLIENT || FSSYNC_BUILD_CLIENT */
     return ret;
 }
 
-/*
- * request that a salvage be performed once
- * ref counts reach zero
+/**
+ * request volume salvage.
+ *
+ * @param[out] ec      computed client error code
+ * @param[in]  vp      volume object pointer
+ * @param[in]  reason  reason code (passed to salvageserver via SALVSYNC)
+ * @param[in]  flags   see flags note below
+ *
+ * @note flags:
+ *       VOL_SALVAGE_INVALIDATE_HEADER causes volume header cache entry
+ *                                     to be invalidated.
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post volume state is changed.
+ *       for fileserver, salvage will be requested once refcount reaches zero.
+ *
+ * @return operation status code
+ *   @retval 0  volume salvage will occur
+ *   @retval 1  volume salvage could not be scheduled
+ *
+ * @note DAFS only
+ *
+ * @note in the fileserver, this call does not synchronously schedule a volume
+ *       salvage. rather, it sets volume state so that when volume refcounts
+ *       reach zero, a volume salvage will occur. by "refcounts", we mean both
+ *       nUsers and nWaiters must be zero.
+ *
+ * @internal volume package internal use only.
  */
 int
-VRequestSalvage_r(Volume * vp, int reason, int flags)
+VRequestSalvage_r(Error * ec, Volume * vp, int reason, int flags)
 {
-#ifdef SALVSYNC_BUILD_CLIENT
-    if (programType != fileServer)
+    int code = 0;
+    /*
+     * for DAFS volume utilities that are not supposed to schedule salvages,
+     * just transition to error state instead
+     */
+    if (!VCanScheduleSalvage()) {
+       VChangeState_r(vp, VOL_STATE_ERROR);
+       *ec = VSALVAGE;
        return 1;
+    }
+
+    if (programType != fileServer && !VCanUseFSSYNC()) {
+        VChangeState_r(vp, VOL_STATE_ERROR);
+        *ec = VSALVAGE;
+        return 1;
+    }
 
     if (!vp->salvage.requested) {
        vp->salvage.requested = 1;
        vp->salvage.reason = reason;
        vp->stats.last_salvage = FT_ApproxTime();
-       if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
-           ReleaseVolumeHeader(vp->header);
-       }
+
+       /* Note that it is not possible for us to reach this point if a
+        * salvage is already running on this volume (even if the fileserver
+        * was restarted during the salvage). If a salvage were running, the
+        * salvager would have write-locked the volume header file, so when
+        * we tried to lock the volume header, the lock would have failed,
+        * and we would have failed during attachment prior to calling
+        * VRequestSalvage. So we know that we can schedule salvages without
+        * fear of a salvage already running for this volume. */
+
        if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
-           VChangeState_r(vp, VOL_STATE_SALVAGING);
+
+           /* if we don't need to offline the volume, we can go directly
+            * to SALVAGING. SALVAGING says the volume is offline and is
+            * either salvaging or ready to be handed to the salvager.
+            * SALVAGE_REQ says that we want to salvage the volume, but we
+            * are waiting for it to go offline first. */
+           if (flags & VOL_SALVAGE_NO_OFFLINE) {
+               VChangeState_r(vp, VOL_STATE_SALVAGING);
+           } else {
+               VChangeState_r(vp, VOL_STATE_SALVAGE_REQ);
+               if (vp->nUsers == 0) {
+                   /* normally VOfflineForSalvage_r would be called from
+                    * PutVolume et al when nUsers reaches 0, but if
+                    * it's already 0, just do it ourselves, since PutVolume
+                    * isn't going to get called */
+                   VOfflineForSalvage_r(vp);
+               }
+           }
+           *ec = VSALVAGING;
        } else {
            Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
+
+           /* make sure neither VScheduleSalvage_r nor
+            * VUpdateSalvagePriority_r try to schedule another salvage */
+           vp->salvage.requested = vp->salvage.scheduled = 0;
+
            VChangeState_r(vp, VOL_STATE_ERROR);
+           *ec = VSALVAGE;
+           code = 1;
+       }
+       if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
+           /* Instead of ReleaseVolumeHeader, we do FreeVolumeHeader()
+               so that the next VAttachVolumeByVp_r() invocation
+               of attach2() will pull in a cached header
+               entry and fail, then load a fresh one from disk and attach
+               it to the volume.
+           */
+           FreeVolumeHeader(vp);
        }
     }
-#endif /* SALVSYNC_BUILD_CLIENT */
-    return 0;
+    return code;
 }
 
-/*
- * update salvage priority
+/**
+ * update salvageserver scheduling priority for a volume.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @return operation status
+ *   @retval 0  success
+ *   @retval 1  request denied, or SALVSYNC communications failure
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post in-core salvage priority counter is incremented.  if at least
+ *       SALVAGE_PRIO_UPDATE_INTERVAL seconds have elapsed since the
+ *       last SALVSYNC_RAISEPRIO request, we contact the salvageserver
+ *       to update its priority queue.  if no salvage is scheduled,
+ *       this function is a no-op.
+ *
+ * @note DAFS fileserver only
+ *
+ * @note this should be called whenever a VGetVolume fails due to a
+ *       pending salvage request
+ *
+ * @todo should set exclusive state and drop glock around salvsync call
+ *
+ * @internal volume package internal use only.
  */
-static int
+int
 VUpdateSalvagePriority_r(Volume * vp)
 {
-    int code, ret=0;
+    int ret=0;
+
+#ifdef SALVSYNC_BUILD_CLIENT
     afs_uint32 now;
+    int code;
 
-#ifdef SALVSYNC_BUILD_CLIENT
     vp->salvage.prio++;
     now = FT_ApproxTime();
 
     /* update the salvageserver priority queue occasionally so that
-     * frequently requested volumes get moved to the head of the queue 
+     * frequently requested volumes get moved to the head of the queue
      */
     if ((vp->salvage.scheduled) &&
        (vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) {
@@ -3621,17 +5653,94 @@ VUpdateSalvagePriority_r(Volume * vp)
 }
 
 
-/*
- * schedule a salvage with the salvage server
+#if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT)
+
+/* A couple of little helper functions. These return true if we tried to
+ * use this mechanism to schedule a salvage, false if we haven't tried.
+ * If we did try a salvage then the results are contained in code.
+ */
+
+static_inline int
+try_SALVSYNC(Volume *vp, char *partName, int *code) {
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (VCanUseSALVSYNC()) {
+       Log("Scheduling salvage for volume %lu on part %s over SALVSYNC\n",
+           afs_printable_uint32_lu(vp->hashid), partName);
+
+       /* can't use V_id() since there's no guarantee
+        * we have the disk data header at this point */
+       *code = SALVSYNC_SalvageVolume(vp->hashid,
+                                      partName,
+                                      SALVSYNC_SALVAGE,
+                                      vp->salvage.reason,
+                                      vp->salvage.prio,
+                                      NULL);
+       return 1;
+    }
+#endif
+    return 0;
+}
+
+static_inline int
+try_FSSYNC(Volume *vp, char *partName, int *code) {
+#ifdef FSSYNC_BUILD_CLIENT
+    if (VCanUseFSSYNC()) {
+       Log("Scheduling salvage for volume %lu on part %s over FSSYNC\n",
+           afs_printable_uint32_lu(vp->hashid), partName);
+
+       /*
+        * If we aren't the fileserver, tell the fileserver the volume
+        * needs to be salvaged. We could directly tell the
+        * salvageserver, but the fileserver keeps track of some stats
+        * related to salvages, and handles some other salvage-related
+        * complications for us.
+         */
+        *code = FSYNC_VolOp(vp->hashid, partName,
+                            FSYNC_VOL_FORCE_ERROR, FSYNC_SALVAGE, NULL);
+       return 1;
+    }
+#endif /* FSSYNC_BUILD_CLIENT */
+    return 0;
+}
+
+/**
+ * schedule a salvage with the salvage server or fileserver.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @return operation status
+ *    @retval 0 salvage scheduled successfully
+ *    @retval 1 salvage not scheduled, or SALVSYNC/FSSYNC com error
+ *
+ * @pre
+ *    @arg VOL_LOCK is held.
+ *    @arg nUsers and nWaiters should be zero.
+ *
+ * @post salvageserver or fileserver is sent a salvage request
+ *
+ * @note If we are the fileserver, the request will be sent to the salvage
+ * server over SALVSYNC. If we are not the fileserver, the request will be
+ * sent to the fileserver over FSSYNC (FSYNC_VOL_FORCE_ERROR/FSYNC_SALVAGE).
+ *
+ * @note the caller must check if the volume needs to be freed after calling
+ *       this; the volume may not have any references or be on any lists after
+ *       we return, and we do not free it
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
  */
 static int
 VScheduleSalvage_r(Volume * vp)
 {
-    int code, ret=0;
-#ifdef SALVSYNC_BUILD_CLIENT
+    int ret=0;
+    int code = 0;
     VolState state_save;
+    VThreadOptions_t * thread_opts;
     char partName[16];
 
+    osi_Assert(VCanUseSALVSYNC() || VCanUseFSSYNC());
+
     if (vp->nWaiters || vp->nUsers) {
        return 1;
     }
@@ -3640,34 +5749,53 @@ VScheduleSalvage_r(Volume * vp)
     if (vp->stats.salvages >= SALVAGE_COUNT_MAX)
        return 1;
 
+    /*
+     * don't perform salvsync ops on certain threads
+     */
+    thread_opts = pthread_getspecific(VThread_key);
+    if (thread_opts == NULL) {
+       thread_opts = &VThread_defaults;
+    }
+    if (thread_opts->disallow_salvsync || vol_disallow_salvsync) {
+       return 1;
+    }
+
+    if (vp->salvage.scheduled) {
+       return ret;
+    }
+
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
+
+    /*
+     * XXX the scheduling process should really be done asynchronously
+     *     to avoid fssync deadlocks
+     */
     if (!vp->salvage.scheduled) {
-       /* if we haven't previously scheduled a salvage, do so now 
+       /* if we haven't previously scheduled a salvage, do so now
         *
         * set the volume to an exclusive state and drop the lock
         * around the SALVSYNC call
         */
        strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName));
        state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
-       V_attachFlags(vp) |= VOL_IS_BUSY;
        VOL_UNLOCK;
 
-       /* can't use V_id() since there's no guarantee
-        * we have the disk data header at this point */
-       code = SALVSYNC_SalvageVolume(vp->hashid,
-                                     partName,
-                                     SALVSYNC_SALVAGE,
-                                     vp->salvage.reason,
-                                     vp->salvage.prio,
-                                     NULL);
+       osi_Assert(try_SALVSYNC(vp, partName, &code) ||
+              try_FSSYNC(vp, partName, &code));
+
        VOL_LOCK;
        VChangeState_r(vp, state_save);
-       V_attachFlags(vp) &= ~(VOL_IS_BUSY);
 
        if (code == SYNC_OK) {
            vp->salvage.scheduled = 1;
-           vp->stats.salvages++;
            vp->stats.last_salvage_req = FT_ApproxTime();
-           IncUInt64(&VStats.salvages);
+           if (VCanUseSALVSYNC()) {
+               /* don't record these stats for non-fileservers; let the
+                * fileserver take care of these */
+               vp->stats.salvages++;
+               IncUInt64(&VStats.salvages);
+           }
        } else {
            ret = 1;
            switch(code) {
@@ -3675,49 +5803,48 @@ VScheduleSalvage_r(Volume * vp)
            case SYNC_COM_ERROR:
                break;
            case SYNC_DENIED:
-               Log("VScheduleSalvage_r:  SALVSYNC request denied\n");
+               Log("VScheduleSalvage_r: Salvage request for volume %lu "
+                   "denied\n", afs_printable_uint32_lu(vp->hashid));
                break;
            default:
-               Log("VScheduleSalvage_r:  SALVSYNC unknown protocol error\n");
+               Log("VScheduleSalvage_r: Salvage request for volume %lu "
+                   "received unknown protocol error %d\n",
+                   afs_printable_uint32_lu(vp->hashid), code);
                break;
            }
-       }
-    }
-#endif /* SALVSYNC_BUILD_CLIENT */
-    return ret;
-}
 
-/*
- * cancel a scheduled salvage operation
- */
-static int
-VCancelSalvage_r(Volume * vp, int reason)
-{
-    int code, ret = 0;
-
-#ifdef SALVSYNC_BUILD_CLIENT
-    if (vp->salvage.scheduled) {
-       code = SALVSYNC_SalvageVolume(vp->hashid,
-                                     VPartitionPath(vp->partition),
-                                     SALVSYNC_CANCEL,
-                                     reason,
-                                     0,
-                                     NULL);
-       if (code == SYNC_OK) {
-           vp->salvage.scheduled = 0;
-       } else {
-           ret = 1;
+           if (VCanUseFSSYNC()) {
+               VChangeState_r(vp, VOL_STATE_ERROR);
+           }
        }
     }
-#endif /* SALVSYNC_BUILD_CLIENT */
+
+    /* NB: this is cancelling the reservation we obtained above, but we do
+     * not call VCancelReservation_r, since that may trigger the vp dtor,
+     * possibly free'ing the vp. We need to keep the vp around after
+     * this, as the caller may reference vp without any refs. Instead, it
+     * is the duty of the caller to inspect 'vp' after we return to see if
+     * it needs to be freed. */
+    osi_Assert(--vp->nWaiters >= 0);
     return ret;
 }
+#endif /* SALVSYNC_BUILD_CLIENT || FSSYNC_BUILD_CLIENT */
 
-/* This must be called by any volume utility which needs to run while the
-   file server is also running.  This is separated from VInitVolumePackage so
-   that a utility can fork--and each of the children can independently
-   initialize communication with the file server */
 #ifdef SALVSYNC_BUILD_CLIENT
+
+/**
+ * connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @post connection to salvageserver SYNC service established
+ *
+ * @see VConnectSALV_r
+ * @see VDisconnectSALV
+ * @see VReconnectSALV
+ */
 int
 VConnectSALV(void)
 {
@@ -3728,32 +5855,93 @@ VConnectSALV(void)
     return retVal;
 }
 
+/**
+ * connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post connection to salvageserver SYNC service established
+ *
+ * @see VConnectSALV
+ * @see VDisconnectSALV_r
+ * @see VReconnectSALV_r
+ * @see SALVSYNC_clientInit
+ *
+ * @internal volume package internal use only.
+ */
 int
 VConnectSALV_r(void)
 {
-    assert((programType != salvageServer) &&
-          (programType != volumeUtility));
     return SALVSYNC_clientInit();
 }
 
+/**
+ * disconnect from the salvageserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 success
+ *
+ * @pre client should have a live connection to the salvageserver
+ *
+ * @post connection to salvageserver SYNC service destroyed
+ *
+ * @see VDisconnectSALV_r
+ * @see VConnectSALV
+ * @see VReconnectSALV
+ */
 int
 VDisconnectSALV(void)
 {
-    int retVal;
     VOL_LOCK;
     VDisconnectSALV_r();
     VOL_UNLOCK;
-    return retVal;
+    return 0;
 }
 
+/**
+ * disconnect from the salvageserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 success
+ *
+ * @pre
+ *    @arg VOL_LOCK is held.
+ *    @arg client should have a live connection to the salvageserver.
+ *
+ * @post connection to salvageserver SYNC service destroyed
+ *
+ * @see VDisconnectSALV
+ * @see VConnectSALV_r
+ * @see VReconnectSALV_r
+ * @see SALVSYNC_clientFinis
+ *
+ * @internal volume package internal use only.
+ */
 int
 VDisconnectSALV_r(void)
-{ 
-    assert((programType != salvageServer) &&
-          (programType != volumeUtility));
+{
     return SALVSYNC_clientFinis();
 }
 
+/**
+ * disconnect and then re-connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre client should have a live connection to the salvageserver
+ *
+ * @post old connection is dropped, and a new one is established
+ *
+ * @see VConnectSALV
+ * @see VDisconnectSALV
+ * @see VReconnectSALV_r
+ */
 int
 VReconnectSALV(void)
 {
@@ -3764,11 +5952,29 @@ VReconnectSALV(void)
     return retVal;
 }
 
+/**
+ * disconnect and then re-connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre
+ *    @arg VOL_LOCK is held.
+ *    @arg client should have a live connection to the salvageserver.
+ *
+ * @post old connection is dropped, and a new one is established
+ *
+ * @see VConnectSALV_r
+ * @see VDisconnectSALV
+ * @see VReconnectSALV
+ * @see SALVSYNC_clientReconnect
+ *
+ * @internal volume package internal use only.
+ */
 int
 VReconnectSALV_r(void)
 {
-    assert((programType != salvageServer) &&
-          (programType != volumeUtility));
     return SALVSYNC_clientReconnect();
 }
 #endif /* SALVSYNC_BUILD_CLIENT */
@@ -3780,10 +5986,27 @@ VReconnectSALV_r(void)
 /***************************************************/
 
 /* This must be called by any volume utility which needs to run while the
-   file server is also running.  This is separated from VInitVolumePackage so
+   file server is also running.  This is separated from VInitVolumePackage2 so
    that a utility can fork--and each of the children can independently
    initialize communication with the file server */
 #ifdef FSSYNC_BUILD_CLIENT
+/**
+ * connect to the fileserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre
+ *    @arg VInit must equal 2.
+ *    @arg Program Type must not be fileserver or salvager.
+ *
+ * @post connection to fileserver SYNC service established
+ *
+ * @see VConnectFS_r
+ * @see VDisconnectFS
+ * @see VChildProcReconnectFS
+ */
 int
 VConnectFS(void)
 {
@@ -3794,28 +6017,78 @@ VConnectFS(void)
     return retVal;
 }
 
+/**
+ * connect to the fileserver SYNC service.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre
+ *    @arg VInit must equal 2.
+ *    @arg Program Type must not be fileserver or salvager.
+ *    @arg VOL_LOCK is held.
+ *
+ * @post connection to fileserver SYNC service established
+ *
+ * @see VConnectFS
+ * @see VDisconnectFS_r
+ * @see VChildProcReconnectFS_r
+ *
+ * @internal volume package internal use only.
+ */
 int
 VConnectFS_r(void)
 {
     int rc;
-    assert((VInit == 2) && 
+    osi_Assert((VInit == 2) &&
           (programType != fileServer) &&
           (programType != salvager));
     rc = FSYNC_clientInit();
-    if (rc)
-       VInit = 3;
+    if (rc) {
+       VSetVInit_r(3);
+    }
     return rc;
 }
 
+/**
+ * disconnect from the fileserver SYNC service.
+ *
+ * @pre
+ *    @arg client should have a live connection to the fileserver.
+ *    @arg VOL_LOCK is held.
+ *    @arg Program Type must not be fileserver or salvager.
+ *
+ * @post connection to fileserver SYNC service destroyed
+ *
+ * @see VDisconnectFS
+ * @see VConnectFS_r
+ * @see VChildProcReconnectFS_r
+ *
+ * @internal volume package internal use only.
+ */
 void
 VDisconnectFS_r(void)
 {
-    assert((programType != fileServer) &&
+    osi_Assert((programType != fileServer) &&
           (programType != salvager));
     FSYNC_clientFinis();
-    VInit = 2;
+    VSetVInit_r(2);
 }
 
+/**
+ * disconnect from the fileserver SYNC service.
+ *
+ * @pre
+ *    @arg client should have a live connection to the fileserver.
+ *    @arg Program Type must not be fileserver or salvager.
+ *
+ * @post connection to fileserver SYNC service destroyed
+ *
+ * @see VDisconnectFS_r
+ * @see VConnectFS
+ * @see VChildProcReconnectFS
+ */
 void
 VDisconnectFS(void)
 {
@@ -3824,12 +6097,48 @@ VDisconnectFS(void)
     VOL_UNLOCK;
 }
 
-static int
+/**
+ * connect to the fileserver SYNC service from a child process following a fork.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre
+ *    @arg VOL_LOCK is held.
+ *    @arg current FSYNC handle is shared with a parent process
+ *
+ * @post current FSYNC handle is discarded and a new connection to the
+ *       fileserver SYNC service is established
+ *
+ * @see VChildProcReconnectFS
+ * @see VConnectFS_r
+ * @see VDisconnectFS_r
+ *
+ * @internal volume package internal use only.
+ */
+int
 VChildProcReconnectFS_r(void)
 {
     return FSYNC_clientChildProcReconnect();
 }
 
+/**
+ * connect to the fileserver SYNC service from a child process following a fork.
+ *
+ * @return operation status
+ *    @retval 0 failure
+ *    @retval 1 success
+ *
+ * @pre current FSYNC handle is shared with a parent process
+ *
+ * @post current FSYNC handle is discarded and a new connection to the
+ *       fileserver SYNC service is established
+ *
+ * @see VChildProcReconnectFS_r
+ * @see VConnectFS
+ * @see VDisconnectFS
+ */
 int
 VChildProcReconnectFS(void)
 {
@@ -3846,20 +6155,34 @@ VChildProcReconnectFS(void)
 /* volume bitmap routines                          */
 /***************************************************/
 
+/**
+ * allocate a vnode bitmap number for the vnode
+ *
+ * @param[out] ec  error code
+ * @param[in] vp   volume object pointer
+ * @param[in] index vnode index number for the vnode
+ * @param[in] flags flag values described in note
+ *
+ * @note for DAFS, flags parameter controls locking behavior.
+ * If (flags & VOL_ALLOC_BITMAP_WAIT) is set, then this function
+ * will create a reservation and block on any other exclusive
+ * operations.  Otherwise, this function assumes the caller
+ * already has exclusive access to vp, and we just change the
+ * volume state.
+ *
+ * @pre VOL_LOCK held
+ *
+ * @return bit number allocated
+ */
 /*
- * For demand attach fs, flags parameter controls
- * locking behavior.  If (flags & VOL_ALLOC_BITMAP_WAIT)
- * is set, then this function will create a reservation
- * and block on any other exclusive operations.  Otherwise,
- * this function assumes the caller already has exclusive
- * access to vp, and we just change the volume state.
- */
-VnodeId
-VAllocBitmapEntry_r(Error * ec, Volume * vp, 
+
+ */
+int
+VAllocBitmapEntry_r(Error * ec, Volume * vp,
                    struct vnodeIndex *index, int flags)
 {
-    VnodeId ret;
-    register byte *bp, *ep;
+    int ret = 0;
+    byte *bp, *ep;
 #ifdef AFS_DEMAND_ATTACH_FS
     VolState state_save;
 #endif /* AFS_DEMAND_ATTACH_FS */
@@ -3869,7 +6192,7 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp,
     /* This test is probably redundant */
     if (!VolumeWriteable(vp)) {
        *ec = (bit32) VREADONLY;
-       return 0;
+       return ret;
     }
 
 #ifdef AFS_DEMAND_ATTACH_FS
@@ -3898,9 +6221,9 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp,
                    VOL_UNLOCK;
                    sleep(2);
                    VOL_LOCK;
-#else /* AFS_PTHREAD_ENV */
+#else /* !AFS_PTHREAD_ENV */
                    IOMGR_Sleep(2);
-#endif /* AFS_DEMAND_ATTACH_FS */
+#endif /* !AFS_PTHREAD_ENV */
                }
            }
        }
@@ -3914,14 +6237,12 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp,
                VGetBitmap_r(ec, vp, i);
                if (*ec) {
 #ifdef AFS_DEMAND_ATTACH_FS
-                   VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
-                   *ec = VSALVAGING;
+                   VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
 #else /* AFS_DEMAND_ATTACH_FS */
                    DeleteVolumeFromHashTable(vp);
                    vp->shuttingDown = 1;       /* Let who has it free it. */
                    vp->specialStatus = 0;
 #endif /* AFS_DEMAND_ATTACH_FS */
-                   ret = NULL;
                    goto done;
                }
            }
@@ -3946,7 +6267,7 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp,
                bp++;
            o = ffs(~*bp) - 1;  /* ffs is documented in BSTRING(3) */
            *bp |= (1 << o);
-           ret = (VnodeId) ((bp - index->bitmap) * 8 + o);
+           ret = ((bp - index->bitmap) * 8 + o);
 #ifdef AFS_DEMAND_ATTACH_FS
            VOL_LOCK;
 #endif /* AFS_DEMAND_ATTACH_FS */
@@ -3957,7 +6278,7 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp,
     /* No bit map entry--must grow bitmap */
     bp = (byte *)
        realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE);
-    assert(bp != NULL);
+    osi_Assert(bp != NULL);
     index->bitmap = bp;
     bp += index->bitmapSize;
     memset(bp, 0, VOLUME_BITMAP_GROWSIZE);
@@ -3979,10 +6300,10 @@ VAllocBitmapEntry_r(Error * ec, Volume * vp,
     return ret;
 }
 
-VnodeId
-VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
+int
+VAllocBitmapEntry(Error * ec, Volume * vp, struct vnodeIndex * index)
 {
-    VnodeId retVal;
+    int retVal;
     VOL_LOCK;
     retVal = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT);
     VOL_UNLOCK;
@@ -3990,32 +6311,77 @@ VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
 }
 
+/**
+ * clear the allocation bit for a vnode in a vnode index bitmap.
+ *
+ * @param[out] ec         outbound error code; 0 on success, VNOVNODE if
+ *                        bitNumber lies beyond the end of the bitmap
+ * @param[in]  vp         volume object that owns index
+ * @param[in]  index      vnode index whose bitmap is updated
+ * @param[in]  bitNumber  number of the bit to clear
+ * @param[in]  flags      for demand attach fs, VOL_FREE_BITMAP_WAIT makes
+ *                        this function create a reservation on vp and
+ *                        wait out any exclusive volume state first
+ *
+ * @note the reservation is cancelled only when this call created it, i.e.
+ *       only when VOL_FREE_BITMAP_WAIT was passed.
+ */
 void
-VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
-                  unsigned bitNumber)
+VFreeBitMapEntry_r(Error * ec, Volume *vp, struct vnodeIndex *index,
+                  unsigned bitNumber, int flags)
 {
     unsigned int offset;
 
     *ec = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (flags & VOL_FREE_BITMAP_WAIT) {
+       /* VAllocBitmapEntry_r allocs bitmap entries under an exclusive volume
+        * state, so ensure we're not in an exclusive volume state when we update
+        * the bitmap */
+       VCreateReservation_r(vp);
+       VWaitExclusiveState_r(vp);
+    }
+#endif
+
 #ifdef BITMAP_LATER
     if (!index->bitmap)
-       return;
+       goto done;
 #endif /* BITMAP_LATER */
+
     offset = bitNumber >> 3;
     if (offset >= index->bitmapSize) {
        *ec = VNOVNODE;
-       return;
+       goto done;
     }
     if (offset < index->bitmapOffset)
        index->bitmapOffset = offset & ~3;      /* Truncate to nearest bit32 */
     *(index->bitmap + offset) &= ~(1 << (bitNumber & 0x7));
+
+ done:
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (flags & VOL_FREE_BITMAP_WAIT) {
+       /* pair the cancel with the VCreateReservation_r above; without the
+        * flag no reservation was taken here, so there is nothing to drop */
+       VCancelReservation_r(vp);
+    }
+#endif
+    return; /* make the compiler happy for non-DAFS */
 }
 
+/**
+ * clear the allocation bit for a vnode in a vnode index bitmap.
+ *
+ * Takes VOL_LOCK around VFreeBitMapEntry_r and always passes
+ * VOL_FREE_BITMAP_WAIT.
+ *
+ * @see VFreeBitMapEntry_r
+ */
 void
-VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
+VFreeBitMapEntry(Error * ec, Volume *vp, struct vnodeIndex *index,
                 unsigned bitNumber)
 {
     VOL_LOCK;
-    VFreeBitMapEntry_r(ec, index, bitNumber);
+    VFreeBitMapEntry_r(ec, vp, index, bitNumber, VOL_FREE_BITMAP_WAIT);
     VOL_UNLOCK;
 }
 
@@ -4028,8 +6367,7 @@ static void
 VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
 {
     StreamHandle_t *file;
-    int nVnodes;
-    int size;
+    afs_sfsize_t nVnodes, size;
     struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
     struct vnodeIndex *vip = &vp->vnodeIndex[class];
     struct VnodeDiskObject *vnode;
@@ -4050,13 +6388,13 @@ VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
     VOL_UNLOCK;
 
     fdP = IH_OPEN(vip->handle);
-    assert(fdP != NULL);
+    osi_Assert(fdP != NULL);
     file = FDH_FDOPEN(fdP, "r");
-    assert(file != NULL);
+    osi_Assert(file != NULL);
     vnode = (VnodeDiskObject *) malloc(vcp->diskSize);
-    assert(vnode != NULL);
+    osi_Assert(vnode != NULL);
     size = OS_SIZE(fdP->fd_fd);
-    assert(size != -1);
+    osi_Assert(size != -1);
     nVnodes = (size <= vcp->diskSize ? 0 : size - vcp->diskSize)
        >> vcp->logSize;
     vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4;    /* The 10 is a little extra so
@@ -4066,13 +6404,13 @@ VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
                                                         * it that way */
 #ifdef BITMAP_LATER
     BitMap = (byte *) calloc(1, vip->bitmapSize);
-    assert(BitMap != NULL);
+    osi_Assert(BitMap != NULL);
 #else /* BITMAP_LATER */
     vip->bitmap = (byte *) calloc(1, vip->bitmapSize);
-    assert(vip->bitmap != NULL);
+    osi_Assert(vip->bitmap != NULL);
     vip->bitmapOffset = 0;
 #endif /* BITMAP_LATER */
-    if (STREAM_SEEK(file, vcp->diskSize, 0) != -1) {
+    if (STREAM_ASEEK(file, vcp->diskSize) != -1) {
        int bitNumber = 0;
        for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) {
            if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1)
@@ -4098,141 +6436,70 @@ VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
 #endif /* !AFS_PTHREAD_ENV */
        }
     }
-    if (vp->nextVnodeUnique < unique) {
-       Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
-       *ec = VSALVAGE;
-    }
-    /* Paranoia, partly justified--I think fclose after fdopen
-     * doesn't seem to close fd.  In any event, the documentation
-     * doesn't specify, so it's safer to close it twice.
-     */
-    STREAM_CLOSE(file);
-    FDH_CLOSE(fdP);
-    free(vnode);
-
-    VOL_LOCK;
-#ifdef BITMAP_LATER
-    /* There may have been a racing condition with some other thread, both
-     * creating the bitmaps for this volume. If the other thread was faster
-     * the pointer to bitmap should already be filled and we can free ours.
-     */
-    if (vip->bitmap == NULL) {
-       vip->bitmap = BitMap;
-       vip->bitmapOffset = 0;
-    } else
-       free((byte *) BitMap);
-#endif /* BITMAP_LATER */
-#ifdef AFS_DEMAND_ATTACH_FS
-    VChangeState_r(vp, state_save);
-#endif /* AFS_DEMAND_ATTACH_FS */
-}
-
-
-/***************************************************/
-/* demand attach fs state machine routines         */
-/***************************************************/
-
-#ifdef AFS_DEMAND_ATTACH_FS
-/* wait for the volume to change states */
-static void
-VWaitStateChange_r(Volume * vp)
-{
-    VolState state_save = V_attachState(vp);
-
-    assert(vp->nWaiters || vp->nUsers);
-    do {
-       assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0);
-    } while (V_attachState(vp) == state_save);
-    assert(V_attachState(vp) != VOL_STATE_FREED);
-}
-
-/* wait for blocking ops to end */
-static void
-VWaitExclusiveState_r(Volume * vp)
-{
-    assert(vp->nWaiters || vp->nUsers);
-    while (IsExclusiveState(V_attachState(vp))) {
-       assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0);
-    }
-    assert(V_attachState(vp) != VOL_STATE_FREED);
-}
-
-/* change state, and notify other threads,
- * return previous state to caller */
-VolState
-VChangeState_r(Volume * vp, VolState new_state)
-{
-    VolState old_state = V_attachState(vp);
-
-    /* XXX profiling need to make sure these counters
-     * don't kill performance... */
-    VStats.state_levels[old_state]--;
-    VStats.state_levels[new_state]++;
-
-    V_attachState(vp) = new_state;
-    assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0);
-    return old_state;
-}
-
-/* tells caller whether or not the current state requires
- * exclusive access without holding glock */
-static int
-IsExclusiveState(VolState state)
-{
-    switch (state) {
-    case VOL_STATE_UPDATING:
-    case VOL_STATE_ATTACHING:
-    case VOL_STATE_GET_BITMAP:
-    case VOL_STATE_HDR_LOADING:
-    case VOL_STATE_HDR_ATTACHING:
-    case VOL_STATE_OFFLINING:
-    case VOL_STATE_DETACHING:
-       return 1;
-    }
-    return 0;
-}
-
-/* tell caller whether V_attachState is an error condition */
-static int
-IsErrorState(VolState state)
-{
-    switch (state) {
-    case VOL_STATE_ERROR:
-    case VOL_STATE_SALVAGING:
-       return 1;
-    }
-    return 0;
-}
-
-/* tell caller whether V_attachState is valid */
-static int
-IsValidState(VolState state)
-{
-    if ((state >= 0) && 
-       (state < VOL_STATE_COUNT) &&
-       (state != VOL_STATE_FREED)) {
-       return 1;
-    }
-    return 0;
-}
+    if (vp->nextVnodeUnique < unique) {
+       Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
+       *ec = VSALVAGE;
+    }
+    /* Paranoia, partly justified--I think fclose after fdopen
+     * doesn't seem to close fd.  In any event, the documentation
+     * doesn't specify, so it's safer to close it twice.
+     */
+    STREAM_CLOSE(file);
+    FDH_CLOSE(fdP);
+    free(vnode);
+
+    VOL_LOCK;
+#ifdef BITMAP_LATER
+    /* There may have been a racing condition with some other thread, both
+     * creating the bitmaps for this volume. If the other thread was faster
+     * the pointer to bitmap should already be filled and we can free ours.
+     */
+    if (vip->bitmap == NULL) {
+       vip->bitmap = BitMap;
+       vip->bitmapOffset = 0;
+    } else
+       free((byte *) BitMap);
+#endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, state_save);
 #endif /* AFS_DEMAND_ATTACH_FS */
+}
 
 
 /***************************************************/
 /* Volume Path and Volume Number utility routines  */
 /***************************************************/
 
-static void
-GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
+/**
+ * find the first occurrence of a volume header file and return the path.
+ *
+ * @param[out] ec          outbound error code
+ * @param[in]  volumeId    volume id to find
+ * @param[out] partitionp  pointer to disk partition path string
+ * @param[out] namep       pointer to volume header file name string
+ *
+ * @post path to first occurrence of volume header is returned in partitionp
+ *       and namep, or ec is set accordingly.
+ *
+ * @warning this function is NOT re-entrant -- partitionp and namep point to
+ *          static data segments
+ *
+ * @note if a volume utility inadvertently leaves behind a stale volume header
+ *       on a vice partition, it is possible for callers to get the wrong one,
+ *       depending on the order of the disk partition linked list.
+ *
+ */
+void
+VGetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
 {
     static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
     char path[VMAXPATHLEN];
     int found = 0;
-    struct DiskPartition *dp;
+    struct DiskPartition64 *dp;
 
     *ec = 0;
-    name[0] = '/';
-    (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, volumeId);
+    name[0] = OS_DIRSEPC;
+    (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, afs_printable_uint32_lu(volumeId));
     for (dp = DiskPartitionList; dp; dp = dp->next) {
        struct afs_stat status;
        strcpy(path, VPartitionPath(dp));
@@ -4252,26 +6519,71 @@ GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
     }
 }
 
+/**
+ * extract a volume number from a volume header filename string.
+ *
+ * @param[in] name  volume header filename string
+ *
+ * @return volume number
+ *
+ * @note the string must be of the form VFORMAT.  the only permissible
+ *       deviation is a leading OS_DIRSEPC character.
+ *
+ * @see VFORMAT
+ */
 int
 VolumeNumber(char *name)
 {
-    if (*name == '/')
+    if (*name == OS_DIRSEPC)
        name++;
     return atoi(name + 1);
 }
 
+/**
+ * compute the volume header filename.
+ *
+ * @param[in] volumeId
+ *
+ * @return volume header filename
+ *
+ * @post volume header filename string is constructed
+ *
+ * @warning this function is NOT re-entrant -- the returned string is
+ *          stored in a static char array.  see VolumeExternalName_r
+ *          for a re-entrant equivalent.
+ *
+ * @see VolumeExternalName_r
+ *
+ * @deprecated due to the above re-entrancy warning, this interface should
+ *             be considered deprecated.  Please use VolumeExternalName_r
+ *             in its stead.
+ */
 char *
 VolumeExternalName(VolumeId volumeId)
 {
     static char name[VMAXPATHLEN];
-    (void)afs_snprintf(name, sizeof name, VFORMAT, volumeId);
+    (void)afs_snprintf(name, sizeof name, VFORMAT, afs_printable_uint32_lu(volumeId));
     return name;
 }
 
-static int
+/**
+ * compute the volume header filename.
+ *
+ * @param[in]     volumeId
+ * @param[inout]  name       array in which to store filename
+ * @param[in]     len        length of name array
+ *
+ * @return result code from afs_snprintf
+ *
+ * @see VolumeExternalName
+ * @see afs_snprintf
+ *
+ * @note re-entrant equivalent of VolumeExternalName
+ */
+int
 VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
 {
-    return afs_snprintf(name, len, VFORMAT, volumeId);
+    return afs_snprintf(name, len, VFORMAT, afs_printable_uint32_lu(volumeId));
 }
 
 
@@ -4285,7 +6597,45 @@ VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
 #define OneDay (24*60*60)      /* 24 hours */
 #endif /* OPENAFS_VOL_STATS */
 
-#define Midnight(date) ((date-TimeZoneCorrection)/OneDay*OneDay+TimeZoneCorrection)
+/**
+ * compute the start of the local day containing a timestamp.
+ *
+ * @param[in] t  timestamp to round down
+ *
+ * @return timestamp of local midnight on the day containing t.  if the
+ *         local time conversion fails, t rounded down to a multiple of
+ *         OneDay is returned instead.
+ */
+static time_t
+Midnight(time_t t) {
+    struct tm local, *l;
+    time_t midnight;
+
+#if defined(AFS_PTHREAD_ENV) && !defined(AFS_NT40_ENV)
+    l = localtime_r(&t, &local);
+#else
+    l = localtime(&t);
+#endif
+
+    if (l != NULL) {
+       /* the following is strictly speaking problematic on the
+          switching day to daylight saving time, after the switch,
+          as tm_isdst does not match.  Similarly, on the looong day when
+          switching back the OneDay check will not do what naively expected!
+          The effects are minor, though, and more a matter of interpreting
+          the numbers. */
+       if (l != &local) {
+           /* localtime() returned its own buffer; copy it so the edits
+            * below always operate on an initialized struct */
+           local = *l;
+       }
+       local.tm_hour = local.tm_min=local.tm_sec = 0;
+       midnight = mktime(&local);
+       if (midnight != (time_t) -1) return(midnight);
+    }
+    return( (t/OneDay)*OneDay );
+
+}
 
 /*------------------------------------------------------------------------
  * [export] VAdjustVolumeStatistics
@@ -4310,12 +6649,12 @@ VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
  *------------------------------------------------------------------------*/
 
 int
-VAdjustVolumeStatistics_r(register Volume * vp)
+VAdjustVolumeStatistics_r(Volume * vp)
 {
     unsigned int now = FT_ApproxTime();
 
     if (now - V_dayUseDate(vp) > OneDay) {
-       register int ndays, i;
+       int ndays, i;
 
        ndays = (now - V_dayUseDate(vp)) / OneDay;
        for (i = 6; i > ndays - 1; i--)
@@ -4332,7 +6671,7 @@ VAdjustVolumeStatistics_r(register Volume * vp)
         * All we need to do is bzero the entire VOL_STATS_BYTES of
         * the detailed volume statistics area.
         */
-       memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
+       memset((V_stat_area(vp)), 0, VOL_STATS_BYTES);
 #endif /* OPENAFS_VOL_STATS */
     }
 
@@ -4344,7 +6683,7 @@ VAdjustVolumeStatistics_r(register Volume * vp)
 }                              /*VAdjustVolumeStatistics */
 
 int
-VAdjustVolumeStatistics(register Volume * vp)
+VAdjustVolumeStatistics(Volume * vp)
 {
     int retVal;
     VOL_LOCK;
@@ -4354,9 +6693,10 @@ VAdjustVolumeStatistics(register Volume * vp)
 }
 
 void
-VBumpVolumeUsage_r(register Volume * vp)
+VBumpVolumeUsage_r(Volume * vp)
 {
     unsigned int now = FT_ApproxTime();
+    V_accessDate(vp) = now;
     if (now - V_dayUseDate(vp) > OneDay)
        VAdjustVolumeStatistics_r(vp);
     /*
@@ -4369,7 +6709,7 @@ VBumpVolumeUsage_r(register Volume * vp)
 }
 
 void
-VBumpVolumeUsage(register Volume * vp)
+VBumpVolumeUsage(Volume * vp)
 {
     VOL_LOCK;
     VBumpVolumeUsage_r(vp);
@@ -4388,7 +6728,7 @@ VSetDiskUsage_r(void)
         * initialization level indicates that all volumes are attached,
         * which implies that all partitions are initialized. */
 #ifdef AFS_PTHREAD_ENV
-       sleep(10);
+       VOL_CV_WAIT(&vol_vinit_cond);
 #else /* AFS_PTHREAD_ENV */
        IOMGR_Sleep(10);
 #endif /* AFS_PTHREAD_ENV */
@@ -4464,7 +6804,7 @@ VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
                                     sizeof(VolumeId) * updateSize);
        }
     }
-    assert(UpdateList != NULL);
+    osi_Assert(UpdateList != NULL);
     UpdateList[nUpdatedVolumes++] = V_id(vp);
 #endif /* !AFS_DEMAND_ATTACH_FS */
 }
@@ -4473,8 +6813,8 @@ VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
 static void
 VScanUpdateList(void)
 {
-    register int i, gap;
-    register Volume *vp;
+    int i, gap;
+    Volume *vp;
     Error error;
     afs_uint32 now = FT_ApproxTime();
     /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */
@@ -4519,7 +6859,7 @@ VScanUpdateList(void)
  * in order to speed up fileserver shutdown
  *
  * (1) by soft detach we mean a process very similar
- *     to VOffline, except the final state of the 
+ *     to VOffline, except the final state of the
  *     Volume will be VOL_STATE_PREATTACHED, instead
  *     of the usual VOL_STATE_UNATTACHED
  */
@@ -4541,50 +6881,79 @@ VScanUpdateList(void)
  * candidates for soft detachment. this queue is
  * unsorted
  */
-#define VLRU_GENERATIONS  3   /* number of generations in VLRU */
-#define VLRU_QUEUES       5   /* total number of VLRU queues */
+#define VLRU_GENERATIONS  3   /**< number of generations in VLRU */
+#define VLRU_QUEUES       5   /**< total number of VLRU queues */
+
+/**
+ * definition of a VLRU queue.
+ */
 struct VLRU_q {
     volatile struct rx_queue q;
     volatile int len;
     volatile int busy;
     pthread_cond_t cv;
 };
+
+/**
+ * main VLRU data structure.
+ */
 struct VLRU {
-    struct VLRU_q q[VLRU_QUEUES];
+    struct VLRU_q q[VLRU_QUEUES];   /**< VLRU queues */
 
     /* VLRU config */
-    afs_uint32 promotion_interval[VLRU_GENERATIONS-1];  /* interval between promotions */
-    afs_uint32 scan_interval[VLRU_GENERATIONS+1];       /* interval between scans for candidates */
+    /** time interval (in seconds) between promotion passes for
+     *  each young generation queue. */
+    afs_uint32 promotion_interval[VLRU_GENERATIONS-1];
 
-    /* state */
-    int next_idx;
-    afs_uint32 last_promotion[VLRU_GENERATIONS-1];      /* timestamp of last promotion scan */
-    afs_uint32 last_scan[VLRU_GENERATIONS+1];           /* timestamp of last detach scan */
+    /** time interval (in seconds) between soft detach candidate
+     *  scans for each generation queue.
+     *
+     *  scan_interval[VLRU_QUEUE_CANDIDATE] defines how frequently
+     *  we perform a soft detach pass. */
+    afs_uint32 scan_interval[VLRU_GENERATIONS+1];
+
+    /* scheduler state */
+    int next_idx;                                       /**< next queue to receive attention */
+    afs_uint32 last_promotion[VLRU_GENERATIONS-1];      /**< timestamp of last promotion scan */
+    afs_uint32 last_scan[VLRU_GENERATIONS+1];           /**< timestamp of last detach scan */
 
-    int scanner_state;                                  /* state of scanner thread */
-    pthread_cond_t cv;                                  /* state transition CV */
+    int scanner_state;                                  /**< state of scanner thread */
+    pthread_cond_t cv;                                  /**< state transition CV */
 };
 
+/** global VLRU state */
 static struct VLRU volume_LRU;
 
-/* valid scanner states */
-#define VLRU_SCANNER_STATE_OFFLINE        0
-#define VLRU_SCANNER_STATE_ONLINE         1
-#define VLRU_SCANNER_STATE_SHUTTING_DOWN  2
-#define VLRU_SCANNER_STATE_PAUSING        3
-#define VLRU_SCANNER_STATE_PAUSED         4
+/**
+ * defined states for VLRU scanner thread.
+ */
+typedef enum {
+    VLRU_SCANNER_STATE_OFFLINE        = 0,    /**< vlru scanner thread is offline */
+    VLRU_SCANNER_STATE_ONLINE         = 1,    /**< vlru scanner thread is online */
+    VLRU_SCANNER_STATE_SHUTTING_DOWN  = 2,    /**< vlru scanner thread is shutting down */
+    VLRU_SCANNER_STATE_PAUSING        = 3,    /**< vlru scanner thread is getting ready to pause */
+    VLRU_SCANNER_STATE_PAUSED         = 4     /**< vlru scanner thread is paused */
+} vlru_thread_state_t;
 
 /* vlru disk data header stuff */
-#define VLRU_DISK_MAGIC      0x7a8b9cad
-#define VLRU_DISK_VERSION    1
+#define VLRU_DISK_MAGIC      0x7a8b9cad        /**< vlru disk entry magic number */
+#define VLRU_DISK_VERSION    1                 /**< vlru disk entry version number */
 
-/* vlru default expiration time (for eventual fs state serialization of vlru data) */
+/** vlru default expiration time (for eventual fs state serialization of vlru data) */
 #define VLRU_DUMP_EXPIRATION_TIME   (60*60*24*7)  /* expire vlru data after 1 week */
 
 
+/** minimum volume inactivity (in seconds) before a volume becomes eligible for
+ *  soft detachment. */
 static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH;
+
+/** time interval (in seconds) between VLRU scanner thread soft detach passes. */
 static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL;
+
+/** maximum number of volumes to soft detach in a VLRU soft detach pass. */
 static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX;
+
+/** VLRU control flag.  non-zero value implies VLRU subsystem is activated. */
 static afs_uint32 VLRU_enabled = 1;
 
 /* queue synchronization routines */
@@ -4592,15 +6961,28 @@ static void VLRU_BeginExclusive_r(struct VLRU_q * q);
 static void VLRU_EndExclusive_r(struct VLRU_q * q);
 static void VLRU_Wait_r(struct VLRU_q * q);
 
-/* set the VLRU parameters 
+/**
+ * set VLRU subsystem tunable parameters.
+ *
+ * @param[in] option  tunable option to modify
+ * @param[in] val     new value for tunable parameter
+ *
+ * @pre @c VInitVolumePackage2 has not yet been called.
+ *
+ * @post tunable parameter is modified
+ *
+ * @note DAFS only
  *
- * valid options are:
- *  VLRU_SET_THRESH -- set the period of inactivity after
- *    which volumes are eligible for being detached
- *  VLRU_SET_INTERVAL -- the time interval between calls
- *    to the volume LRU "garbage collector"
- *  VLRU_SET_MAX -- the max number of volumes to deallocate
- *    in one GC pass
+ * @note valid option parameters are:
+ *    @arg @c VLRU_SET_THRESH
+ *         set the period of inactivity after which
+ *         volumes are eligible for soft detachment
+ *    @arg @c VLRU_SET_INTERVAL
+ *         set the time interval between calls
+ *         to the volume LRU "garbage collector"
+ *    @arg @c VLRU_SET_MAX
+ *         set the max number of volumes to deallocate
+ *         in one GC pass
  */
 void
 VLRU_SetOptions(int option, afs_uint32 val)
@@ -4617,7 +6999,18 @@ VLRU_SetOptions(int option, afs_uint32 val)
     VLRU_ComputeConstants();
 }
 
-/* compute the VLRU internal timing parameters based upon the user's inputs */
+/**
+ * compute VLRU internal timing parameters.
+ *
+ * @post VLRU scanner thread internal timing parameters are computed
+ *
+ * @note computes internal timing parameters based upon user-modifiable
+ *       tunable parameters.
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VLRU_ComputeConstants(void)
 {
@@ -4639,7 +7032,17 @@ VLRU_ComputeConstants(void)
     }
 }
 
-/* initialize VLRU */
+/**
+ * initialize VLRU subsystem.
+ *
+ * @pre this function has not yet been called
+ *
+ * @post VLRU subsystem is initialized and VLRU scanner thread is starting
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VInitVLRU(void)
 {
@@ -4657,7 +7060,7 @@ VInitVLRU(void)
        queue_Init(&volume_LRU.q[i]);
        volume_LRU.q[i].len = 0;
        volume_LRU.q[i].busy = 0;
-       assert(pthread_cond_init(&volume_LRU.q[i].cv, NULL) == 0);
+       CV_INIT(&volume_LRU.q[i].cv, "vol lru", CV_DEFAULT, 0);
     }
 
     /* setup the timing constants */
@@ -4675,32 +7078,71 @@ VInitVLRU(void)
     /* start up the VLRU scanner */
     volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
     if (programType == fileServer) {
-       assert(pthread_cond_init(&volume_LRU.cv, NULL) == 0);
-       assert(pthread_attr_init(&attrs) == 0);
-       assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
-       assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0);
+       CV_INIT(&volume_LRU.cv, "vol lru", CV_DEFAULT, 0);
+       osi_Assert(pthread_attr_init(&attrs) == 0);
+       osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+       osi_Assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0);
     }
 }
 
-/* initialize LRU support for a volume */
+/**
+ * initialize the VLRU-related fields of a newly allocated volume object.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre
+ *    @arg @c VOL_LOCK is held.
+ *    @arg volume object is not on a VLRU queue.
+ *
+ * @post VLRU fields are initialized to indicate that volume object is not
+ *       currently registered with the VLRU subsystem
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VLRU_Init_Node_r(volatile Volume * vp)
+VLRU_Init_Node_r(Volume * vp)
 {
     if (!VLRU_enabled)
        return;
 
-    assert(queue_IsNotOnQueue(&vp->vlru));
+    osi_Assert(queue_IsNotOnQueue(&vp->vlru));
     vp->vlru.idx = VLRU_QUEUE_INVALID;
 }
 
-/* add volume to VLRU 
- * now supports adding to queues other
- * than new for vlru state restore
- * caller MUST hold a ref count on vp */
+/**
+ * add a volume object to a VLRU queue.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre
+ *    @arg @c VOL_LOCK is held.
+ *    @arg caller MUST hold a lightweight ref on @p vp.
+ *    @arg caller MUST NOT hold exclusive ownership of the VLRU queue.
+ *
+ * @post the volume object is added to the appropriate VLRU queue
+ *
+ * @note if @c vp->vlru.idx contains the index of a valid VLRU queue,
+ *       then the volume is added to that queue.  Otherwise, the value
+ *       @c VLRU_QUEUE_NEW is stored into @c vp->vlru.idx and the
+ *       volume is added to the NEW generation queue.
+ *
+ * @note @c VOL_LOCK may be dropped internally
+ *
+ * @note Volume state is temporarily set to @c VOL_STATE_VLRU_ADD
+ *       during the add operation, and is restored to the previous
+ *       state prior to return.
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VLRU_Add_r(volatile Volume * vp)
+VLRU_Add_r(Volume * vp)
 {
     int idx;
+    VolState state_save;
 
     if (!VLRU_enabled)
        return;
@@ -4708,26 +7150,51 @@ VLRU_Add_r(volatile Volume * vp)
     if (queue_IsOnQueue(&vp->vlru))
        return;
 
-    VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+    state_save = VChangeState_r(vp, VOL_STATE_VLRU_ADD);
+
+    idx = vp->vlru.idx;
+    if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) {
+       idx = VLRU_QUEUE_NEW;
+    }
+
+    VLRU_Wait_r(&volume_LRU.q[idx]);
 
     /* repeat check since VLRU_Wait_r may have dropped
      * the glock */
     if (queue_IsNotOnQueue(&vp->vlru)) {
-       idx = vp->vlru.idx;
-       if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) {
-           idx = vp->vlru.idx = VLRU_QUEUE_NEW;
-       }
+       vp->vlru.idx = idx;
        queue_Prepend(&volume_LRU.q[idx], &vp->vlru);
        volume_LRU.q[idx].len++;
        V_attachFlags(vp) |= VOL_ON_VLRU;
        vp->stats.last_promote = FT_ApproxTime();
     }
+
+    VChangeState_r(vp, state_save);
 }
 
-/* delete volume from VLRU 
- * caller MUST hold a ref count on vp */
+/**
+ * delete a volume object from a VLRU queue.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre
+ *    @arg @c VOL_LOCK is held.
+ *    @arg caller MUST hold a lightweight ref on @p vp.
+ *    @arg caller MUST NOT hold exclusive ownership of the VLRU queue.
+ *
+ * @post volume object is removed from the VLRU queue
+ *
+ * @note @c VOL_LOCK may be dropped internally
+ *
+ * @note DAFS only
+ *
+ * @todo We should probably set volume state to something exclusive
+ *       (as @c VLRU_Add_r does) prior to dropping @c VOL_LOCK.
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VLRU_Delete_r(volatile Volume * vp)
+VLRU_Delete_r(Volume * vp)
 {
     int idx;
 
@@ -4745,7 +7212,7 @@ VLRU_Delete_r(volatile Volume * vp)
       VLRU_Wait_r(&volume_LRU.q[idx]);
     } while (idx != vp->vlru.idx);
 
-    /* now remove from the VLRU and update 
+    /* now remove from the VLRU and update
      * the appropriate counter */
     queue_Remove(&vp->vlru);
     volume_LRU.q[idx].len--;
@@ -4753,12 +7220,29 @@ VLRU_Delete_r(volatile Volume * vp)
     V_attachFlags(vp) &= ~(VOL_ON_VLRU);
 }
 
-/* signal that volume was just accessed.
- * caller MUST hold a ref count on vp */
+/**
+ * tell the VLRU subsystem that a volume was just accessed.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre
+ *    @arg @c VOL_LOCK is held
+ *    @arg caller MUST hold a lightweight ref on @p vp
+ *    @arg caller MUST NOT hold exclusive ownership of any VLRU queue
+ *
+ * @post volume VLRU access statistics are updated.  If the volume was on
+ *       the VLRU soft detach candidate queue, it is moved to the NEW
+ *       generation queue.
+ *
+ * @note @c VOL_LOCK may be dropped internally
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VLRU_UpdateAccess_r(volatile Volume * vp)
+VLRU_UpdateAccess_r(Volume * vp)
 {
-    afs_uint32 live_interval;
     Volume * rvp = NULL;
 
     if (!VLRU_enabled)
@@ -4767,7 +7251,7 @@ VLRU_UpdateAccess_r(volatile Volume * vp)
     if (queue_IsNotOnQueue(&vp->vlru))
        return;
 
-    assert(V_attachFlags(vp) & VOL_ON_VLRU);
+    osi_Assert(V_attachFlags(vp) & VOL_ON_VLRU);
 
     /* update the access timestamp */
     vp->stats.last_get = FT_ApproxTime();
@@ -4804,16 +7288,43 @@ VLRU_UpdateAccess_r(volatile Volume * vp)
     }
 }
 
-/* switch a volume between two VLRU queues */
+/**
+ * switch a volume between two VLRU queues.
+ *
+ * @param[in] vp       pointer to volume object
+ * @param[in] new_idx  index of VLRU queue onto which the volume will be moved
+ * @param[in] append   controls whether the volume will be appended or
+ *                     prepended to the queue.  A nonzero value means it will
+ *                     be appended; zero means it will be prepended.
+ *
+ * @pre The new (and old, if applicable) queue(s) must either be owned
+ *      exclusively by the calling thread for asynchronous manipulation,
+ *      or the queue(s) must be quiescent and VOL_LOCK must be held.
+ *      Please see VLRU_BeginExclusive_r, VLRU_EndExclusive_r and VLRU_Wait_r
+ *      for further details of the queue asynchronous processing mechanism.
+ *
+ * @post If the volume object was already on a VLRU queue, it is
+ *       removed from the queue.  Depending on the value of the append
+ *       parameter, the volume object is either appended or prepended
+ *       to the VLRU queue referenced by the new_idx parameter.
+ *
+ * @note DAFS only
+ *
+ * @see VLRU_BeginExclusive_r
+ * @see VLRU_EndExclusive_r
+ * @see VLRU_Wait_r
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append)
+VLRU_SwitchQueues(Volume * vp, int new_idx, int append)
 {
     if (queue_IsNotOnQueue(&vp->vlru))
        return;
 
     queue_Remove(&vp->vlru);
     volume_LRU.q[vp->vlru.idx].len--;
-    
+
     /* put the volume back on the correct generational queue */
     if (append) {
        queue_Append(&volume_LRU.q[new_idx], &vp->vlru);
@@ -4825,16 +7336,27 @@ VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append)
     vp->vlru.idx = new_idx;
 }
 
-/* VLRU GC thread */
+/**
+ * VLRU background thread.
+ *
+ * The VLRU Scanner Thread is responsible for periodically scanning through
+ * each VLRU queue looking for volumes which should be moved to another
+ * queue, or soft detached.
+ *
+ * @param[in] args  unused thread arguments parameter
+ *
+ * @return unused thread return value
+ *    @retval NULL always
+ *
+ * @internal volume package internal use only.
+ */
 static void *
 VLRU_ScannerThread(void * args)
 {
     afs_uint32 now, min_delay, delay;
-    afs_uint32 next_scan[VLRU_GENERATIONS];
-    afs_uint32 next_promotion[VLRU_GENERATIONS];
     int i, min_idx, min_op, overdue, state;
 
-    /* set t=0 for promotion cycle to be 
+    /* set t=0 for promotion cycle to be
      * fileserver startup */
     now = FT_ApproxTime();
     for (i=0; i < VLRU_GENERATIONS-1; i++) {
@@ -4842,7 +7364,7 @@ VLRU_ScannerThread(void * args)
     }
 
     /* don't start the scanner until VLRU_offline_thresh
-     * plus a small delay for VInitVolumePackage to finish
+     * plus a small delay for VInitVolumePackage2 to finish
      * has gone by */
 
     sleep(VLRU_offline_thresh + 60);
@@ -4862,9 +7384,9 @@ VLRU_ScannerThread(void * args)
        /* check to see if we've been asked to pause */
        if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) {
            volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED;
-           assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
+           CV_BROADCAST(&volume_LRU.cv);
            do {
-               assert(pthread_cond_wait(&volume_LRU.cv, &vol_glock_mutex) == 0);
+               VOL_CV_WAIT(&volume_LRU.cv);
            } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED);
        }
 
@@ -4897,7 +7419,6 @@ VLRU_ScannerThread(void * args)
                min_delay = 0;
                min_idx = i;
                overdue = 1;
-               break;
            }
        }
 
@@ -4939,19 +7460,43 @@ VLRU_ScannerThread(void * args)
 
     /* signal that scanner is down */
     volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
-    assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
+    CV_BROADCAST(&volume_LRU.cv);
     VOL_UNLOCK;
     return NULL;
 }
 
-/* run the promotions */
+/**
+ * promote volumes from one VLRU generation to the next.
+ *
+ * This routine scans a VLRU generation looking for volumes which are
+ * eligible to be promoted to the next generation.  All volumes which
+ * meet the eligibility requirement are promoted.
+ *
+ * Promotion eligibility is based upon meeting both of the following
+ * requirements:
+ *
+ *    @arg The volume has been accessed since the last promotion:
+ *         @c (vp->stats.last_get >= vp->stats.last_promote)
+ *    @arg The last promotion occurred at least
+ *         @c volume_LRU.promotion_interval[idx] seconds ago
+ *
+ * As a performance optimization, promotions are "globbed".  In other
+ * words, we promote arbitrarily large contiguous sublists of elements
+ * as one operation.
+ *
+ * @param[in] idx  VLRU queue index to scan
+ *
+ * @note DAFS only
+ *
+ * @internal VLRU internal use only.
+ */
 static void
 VLRU_Promote_r(int idx)
 {
     int len, chaining, promote;
     afs_uint32 now, thresh;
     struct rx_queue *qp, *nqp;
-    Volume * vp, *start, *end;
+    Volume * vp, *start = NULL, *end = NULL;
 
     /* get exclusive access to two chains, and drop the glock */
     VLRU_Wait_r(&volume_LRU.q[idx]);
@@ -5014,11 +7559,11 @@ VLRU_Demote_r(int idx)
     int len, chaining, demote;
     afs_uint32 now, thresh;
     struct rx_queue *qp, *nqp;
-    Volume * vp, *start, *end;
+    Volume * vp, *start = NULL, *end = NULL;
     Volume ** salv_flag_vec = NULL;
     int salv_vec_offset = 0;
 
-    assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD);
+    osi_Assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD);
 
     /* get exclusive access to two chains, and drop the glock */
     VLRU_Wait_r(&volume_LRU.q[idx-1]);
@@ -5045,7 +7590,7 @@ VLRU_Demote_r(int idx)
         * demotion passes */
        if (salv_flag_vec &&
            !(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
-           demote && 
+           demote &&
            (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
            (V_attachState(vp) == VOL_STATE_ATTACHED)) {
            salv_flag_vec[salv_vec_offset++] = vp;
@@ -5114,10 +7659,10 @@ VLRU_Scan_r(int idx)
 {
     afs_uint32 now, thresh;
     struct rx_queue *qp, *nqp;
-    volatile Volume * vp;
+    Volume * vp;
     int i, locked = 1;
 
-    assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE);
+    osi_Assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE);
 
     /* gain exclusive access to the idx VLRU */
     VLRU_Wait_r(&volume_LRU.q[idx]);
@@ -5183,7 +7728,7 @@ VLRU_Scan_r(int idx)
 /* check whether volume is safe to soft detach
  * caller MUST NOT hold a ref count on vp */
 static int
-VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh)
+VCheckSoftDetach(Volume * vp, afs_uint32 thresh)
 {
     int ret=0;
 
@@ -5197,10 +7742,10 @@ VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh)
     return ret;
 }
 
-/* check whether volume should be made a 
+/* check whether volume should be made a
  * soft detach candidate */
 static int
-VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh)
+VCheckSoftDetachCandidate(Volume * vp, afs_uint32 thresh)
 {
     int idx, ret = 0;
     if (vp->nUsers || vp->nWaiters)
@@ -5208,7 +7753,7 @@ VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh)
 
     idx = vp->vlru.idx;
 
-    assert(idx == VLRU_QUEUE_NEW);
+    osi_Assert(idx == VLRU_QUEUE_NEW);
 
     if (vp->stats.last_get <= thresh) {
        /* move to candidate pool */
@@ -5228,7 +7773,7 @@ VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh)
 static void
 VLRU_BeginExclusive_r(struct VLRU_q * q)
 {
-    assert(q->busy == 0);
+    osi_Assert(q->busy == 0);
     q->busy = 1;
 }
 
@@ -5236,9 +7781,9 @@ VLRU_BeginExclusive_r(struct VLRU_q * q)
 static void
 VLRU_EndExclusive_r(struct VLRU_q * q)
 {
-    assert(q->busy);
+    osi_Assert(q->busy);
     q->busy = 0;
-    assert(pthread_cond_broadcast(&q->cv) == 0);
+    CV_BROADCAST(&q->cv);
 }
 
 /* wait for another thread to end exclusive access on VLRU */
@@ -5246,7 +7791,7 @@ static void
 VLRU_Wait_r(struct VLRU_q * q)
 {
     while(q->busy) {
-       assert(pthread_cond_wait(&q->cv, &vol_glock_mutex) == 0);
+       VOL_CV_WAIT(&q->cv);
     }
 }
 
@@ -5255,12 +7800,12 @@ VLRU_Wait_r(struct VLRU_q * q)
  *
  * caller MUST NOT hold a ref count on vp */
 static int
-VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
+VSoftDetachVolume_r(Volume * vp, afs_uint32 thresh)
 {
     afs_uint32 ts_save;
     int ret = 0;
 
-    assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+    osi_Assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
 
     ts_save = vp->stats.last_get;
     if (ts_save > thresh)
@@ -5269,7 +7814,7 @@ VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
     if (vp->nUsers || vp->nWaiters)
        return 0;
 
-    if (IsExclusiveState(V_attachState(vp))) {
+    if (VIsExclusiveState(V_attachState(vp))) {
        return 0;
     }
 
@@ -5280,6 +7825,7 @@ VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
     case VOL_STATE_GOING_OFFLINE:
     case VOL_STATE_SHUTTING_DOWN:
     case VOL_STATE_SALVAGING:
+    case VOL_STATE_DELETED:
        volume_LRU.q[vp->vlru.idx].len--;
 
        /* create and cancel a reservation to
@@ -5291,6 +7837,8 @@ VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
        V_attachFlags(vp) &= ~(VOL_ON_VLRU);
        VCancelReservation_r(vp);
        return 0;
+    default:
+       break;
     }
 
     /* hold the volume and take it offline.
@@ -5300,9 +7848,9 @@ VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
        /* vhold drops the glock, so now we should
         * check to make sure we aren't racing against
         * other threads.  if we are racing, offlining vp
-        * would be wasteful, and block the scanner for a while 
+        * would be wasteful, and block the scanner for a while
         */
-       if (vp->nWaiters || 
+       if (vp->nWaiters ||
            (vp->nUsers > 1) ||
            (vp->shuttingDown) ||
            (vp->goingOffline) ||
@@ -5312,7 +7860,7 @@ VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
            vp = NULL;
        } else {
            /* pull it off the VLRU */
-           assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+           osi_Assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
            volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--;
            queue_Remove(&vp->vlru);
            vp->vlru.idx = VLRU_QUEUE_INVALID;
@@ -5343,38 +7891,79 @@ VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
 /* Volume Header Cache routines                    */
 /***************************************************/
 
+/**
+ * volume header cache.
+ */
 struct volume_hdr_LRU_t volume_hdr_LRU;
 
-/* Allocate a bunch of headers; string them together */
+/**
+ * initialize the volume header cache.
+ *
+ * @param[in] howMany  number of header cache entries to preallocate
+ *
+ * @pre VOL_LOCK held.  Function has never been called before.
+ *
+ * @post howMany cache entries are allocated, initialized, and added
+ *       to the LRU list.  Header cache statistics are initialized.
+ *
+ * @note only applicable to fileServer program type.  Should only be
+ *       called once during volume package initialization.
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VInitVolumeHeaderCache(afs_uint32 howMany)
 {
-    register struct volHeader *hp;
+    struct volHeader *hp;
     if (programType != fileServer)
        return;
     queue_Init(&volume_hdr_LRU);
-#ifdef AFS_DEMAND_ATTACH_FS
     volume_hdr_LRU.stats.free = 0;
     volume_hdr_LRU.stats.used = howMany;
     volume_hdr_LRU.stats.attached = 0;
-#endif
     hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));
+    osi_Assert(hp != NULL);
+
     while (howMany--)
+       /* We are using ReleaseVolumeHeader to initialize the values on the header list
+        * to ensure they have the right values
+        */
        ReleaseVolumeHeader(hp++);
 }
 
-#ifdef AFS_DEMAND_ATTACH_FS
-/* Get a volume header from the LRU list; update the old one if necessary */
-/* Returns 1 if there was already a header, which is removed from the LRU list */
-/* caller MUST has a ref count on vp */
+/**
+ * get a volume header and attach it to the volume object.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @return cache entry status
+ *    @retval 0  volume header was newly attached; cache data is invalid
+ *    @retval 1  volume header was previously attached; cache data is valid
+ *
+ * @pre VOL_LOCK held.  For DAFS, lightweight ref must be held on volume object.
+ *
+ * @post volume header attached to volume object.  if necessary, header cache
+ *       entry on LRU is synchronized to disk.  Header is removed from LRU list.
+ *
+ * @note VOL_LOCK may be dropped
+ *
+ * @warning this interface does not load header data from disk.  it merely
+ *          attaches a header object to the volume object, and may sync the old
+ *          header cache data out to disk in the process.
+ *
+ * @internal volume package internal use only.
+ */
 static int
-GetVolumeHeader(register Volume * vp)
+GetVolumeHeader(Volume * vp)
 {
     Error error;
-    register struct volHeader *hd;
+    struct volHeader *hd;
     int old;
     static int everLogged = 0;
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState vp_save = 0, back_save = 0;
+
     /* XXX debug 9/19/05 we've apparently got
      * a ref counting bug somewhere that's
      * breaking the nUsers == 0 => header on LRU
@@ -5383,6 +7972,7 @@ GetVolumeHeader(register Volume * vp)
        Log("nUsers == 0, but header not on LRU\n");
        return 1;
     }
+#endif
 
     old = (vp->header != NULL);        /* old == volume already has a header */
 
@@ -5390,102 +7980,12 @@ GetVolumeHeader(register Volume * vp)
        /* for volume utilities, we allocate volHeaders as needed */
        if (!vp->header) {
            hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
-           assert(hd != NULL);
-           vp->header = hd;
-           hd->back = vp;
-           V_attachFlags(vp) |= VOL_HDR_ATTACHED;
-       }
-    } else {
-       if (old) {
-           /* the header we previously dropped in the lru is
-            * still available. pull it off the lru and return */
-           hd = vp->header;
-           queue_Remove(hd);
-           assert(hd->back == vp);
-       } else {
-           /* we need to grab a new element off the LRU */
-           if (queue_IsNotEmpty(&volume_hdr_LRU)) {
-               /* grab an element and pull off of LRU */
-               hd = queue_First(&volume_hdr_LRU, volHeader);
-               queue_Remove(hd);
-           } else {
-               /* LRU is empty, so allocate a new volHeader 
-                * this is probably indicative of a leak, so let the user know */
-               hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
-               assert(hd != NULL);
-               if (!everLogged) {
-                   Log("****Allocated more volume headers, probably leak****\n");
-                   everLogged = 1;
-               }
-               volume_hdr_LRU.stats.free++;
-           }
-           if (hd->back) {
-               VolState vp_save, back_save;
-               /* this header used to belong to someone else. 
-                * we'll need to check if the header needs to
-                * be sync'd out to disk */
-
-               /* if hd->back were in an exclusive state, then
-                * its volHeader would not be on the LRU... */
-               assert(!IsExclusiveState(V_attachState(hd->back)));
-
-               if (hd->diskstuff.inUse) {
-                   /* volume was in use, so we'll need to sync
-                    * its header to disk */
-                   back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING);
-                   vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING);
-                   VCreateReservation_r(hd->back);
-                   VOL_UNLOCK;
-
-                   WriteVolumeHeader_r(&error, hd->back);
-                   /* Ignore errors; catch them later */
-
-                   VOL_LOCK;
-               }
-
-               V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU);
-               hd->back->header = NULL;
-
-               if (hd->diskstuff.inUse) {
-                   VChangeState_r(hd->back, back_save);
-                   VCancelReservation_r(hd->back);
-                   VChangeState_r(vp, vp_save);
-               }
-           } else {
-               volume_hdr_LRU.stats.attached++;
-           }
-           hd->back = vp;
-           vp->header = hd;
-           V_attachFlags(vp) |= VOL_HDR_ATTACHED;
-       }
-       volume_hdr_LRU.stats.free--;
-       volume_hdr_LRU.stats.used++;
-    }
-    IncUInt64(&VStats.hdr_gets);
-    IncUInt64(&vp->stats.hdr_gets);
-    vp->stats.last_hdr_get = FT_ApproxTime();
-    return old;
-}
-#else /* AFS_DEMAND_ATTACH_FS */
-/* Get a volume header from the LRU list; update the old one if necessary */
-/* Returns 1 if there was already a header, which is removed from the LRU list */
-static int
-GetVolumeHeader(register Volume * vp)
-{
-    Error error;
-    register struct volHeader *hd;
-    int old;
-    static int everLogged = 0;
-
-    old = (vp->header != NULL);        /* old == volume already has a header */
-
-    if (programType != fileServer) {
-       /* for volume utilities, we allocate volHeaders as needed */
-       if (!vp->header) {
-           hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
-           assert(hd != NULL);
+           osi_Assert(hd != NULL);
            vp->header = hd;
            hd->back = vp;
+#ifdef AFS_DEMAND_ATTACH_FS
+           V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+#endif
        }
     } else {
        /* for the fileserver, we keep a volume header cache */
@@ -5494,52 +7994,107 @@ GetVolumeHeader(register Volume * vp)
             * still available. pull it off the lru and return */
            hd = vp->header;
            queue_Remove(hd);
-           assert(hd->back == vp);
+           osi_Assert(hd->back == vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+            V_attachFlags(vp) &= ~(VOL_HDR_IN_LRU);
+#endif
        } else {
            /* we need to grab a new element off the LRU */
            if (queue_IsNotEmpty(&volume_hdr_LRU)) {
-               /* grab an element */
+               /* grab an element and pull off of LRU */
                hd = queue_First(&volume_hdr_LRU, volHeader);
                queue_Remove(hd);
            } else {
-               /* LRU is empty, so allocate a new volHeader 
+               /* LRU is empty, so allocate a new volHeader
                 * this is probably indicative of a leak, so let the user know */
                hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
-               assert(hd != NULL);
+               osi_Assert(hd != NULL);
                if (!everLogged) {
                    Log("****Allocated more volume headers, probably leak****\n");
                    everLogged = 1;
                }
+               volume_hdr_LRU.stats.free++;
            }
            if (hd->back) {
-               /* this header used to belong to someone else. 
+               /* this header used to belong to someone else.
                 * we'll need to check if the header needs to
                 * be sync'd out to disk */
 
+#ifdef AFS_DEMAND_ATTACH_FS
+               /* if hd->back were in an exclusive state, then
+                * its volHeader would not be on the LRU... */
+               osi_Assert(!VIsExclusiveState(V_attachState(hd->back)));
+#endif
+
                if (hd->diskstuff.inUse) {
+                   /* volume was in use, so we'll need to sync
+                    * its header to disk */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+                   back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING);
+                   vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING);
+                   VCreateReservation_r(hd->back);
+                   VOL_UNLOCK;
+#endif
+
                    WriteVolumeHeader_r(&error, hd->back);
                    /* Ignore errors; catch them later */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+                   VOL_LOCK;
+#endif
                }
+
                hd->back->header = NULL;
+#ifdef AFS_DEMAND_ATTACH_FS
+               V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU);
+
+               if (hd->diskstuff.inUse) {
+                   VChangeState_r(hd->back, back_save);
+                   VCancelReservation_r(hd->back);
+                   VChangeState_r(vp, vp_save);
+               }
+#endif
+           } else {
+               volume_hdr_LRU.stats.attached++;
            }
            hd->back = vp;
            vp->header = hd;
+#ifdef AFS_DEMAND_ATTACH_FS
+           V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+#endif
        }
+       volume_hdr_LRU.stats.free--;
+       volume_hdr_LRU.stats.used++;
     }
+    IncUInt64(&VStats.hdr_gets);
+#ifdef AFS_DEMAND_ATTACH_FS
+    IncUInt64(&vp->stats.hdr_gets);
+    vp->stats.last_hdr_get = FT_ApproxTime();
+#endif
     return old;
 }
-#endif /* AFS_DEMAND_ATTACH_FS */
 
 
-/* make sure a volume header is attached to
- * vp, and has the correct data loaded from
- * disk. */
-#ifdef AFS_DEMAND_ATTACH_FS
-/* caller MUST hold a ref count on vp */
+/**
+ * make sure volume header is attached and contains valid cache data.
+ *
+ * @param[out] ec  outbound error code
+ * @param[in]  vp  pointer to volume object
+ *
+ * @pre VOL_LOCK held.  For DAFS, lightweight ref held on vp.
+ *
+ * @post header cache entry attached, and loaded with valid data, or
+ *       *ec is nonzero, and the header is released back into the LRU.
+ *
+ * @internal volume package internal use only.
+ */
 static void
 LoadVolumeHeader(Error * ec, Volume * vp)
 {
+#ifdef AFS_DEMAND_ATTACH_FS
     VolState state_save;
+    afs_uint32 now;
     *ec = 0;
 
     if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
@@ -5551,21 +8106,16 @@ LoadVolumeHeader(Error * ec, Volume * vp)
                   sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
                   VOLUMEINFOVERSION);
        IncUInt64(&vp->stats.hdr_loads);
+       now = FT_ApproxTime();
 
        VOL_LOCK;
-       if (!*ec)
+       if (!*ec) {
            V_attachFlags(vp) |= VOL_HDR_LOADED;
+           vp->stats.last_hdr_load = now;
+       }
        VChangeState_r(vp, state_save);
     }
-    if (*ec) {
-       /* maintain (nUsers==0) => header in LRU invariant */
-       ReleaseVolumeHeader(vp->header);
-    }
-}
 #else /* AFS_DEMAND_ATTACH_FS */
-static void
-LoadVolumeHeader(Error * ec, Volume * vp)
-{
     *ec = 0;
     if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
        IncUInt64(&VStats.hdr_loads);
@@ -5574,16 +8124,31 @@ LoadVolumeHeader(Error * ec, Volume * vp)
                   sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
                   VOLUMEINFOVERSION);
     }
+#endif /* AFS_DEMAND_ATTACH_FS */
     if (*ec) {
        /* maintain (nUsers==0) => header in LRU invariant */
-       ReleaseVolumeHeader(vp->header);
+       FreeVolumeHeader(vp);
     }
 }
-#endif /* AFS_DEMAND_ATTACH_FS */
 
-/* Put it at the top of the LRU chain */
+/**
+ * release a header cache entry back into the LRU list.
+ *
+ * @param[in] hd  pointer to volume header cache object
+ *
+ * @pre VOL_LOCK held.
+ *
+ * @post header cache object appended onto end of LRU list.
+ *
+ * @note only applicable to fileServer program type.
+ *
+ * @note used to place a header cache entry back into the
+ *       LRU pool without invalidating it as a cache entry.
+ *
+ * @internal volume package internal use only.
+ */
 static void
-ReleaseVolumeHeader(register struct volHeader *hd)
+ReleaseVolumeHeader(struct volHeader *hd)
 {
     if (programType != fileServer)
        return;
@@ -5594,19 +8159,33 @@ ReleaseVolumeHeader(register struct volHeader *hd)
     if (hd->back) {
        V_attachFlags(hd->back) |= VOL_HDR_IN_LRU;
     }
+#endif
     volume_hdr_LRU.stats.free++;
     volume_hdr_LRU.stats.used--;
-#endif
 }
 
-/* for fileserver, return header to LRU, and
- * invalidate it as a cache entry.
+/**
+ * free/invalidate a volume header cache entry.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre VOL_LOCK is held.
  *
- * for volume utilities, free the heap space */
+ * @post For fileserver, header cache entry is returned to LRU, and it is
+ *       invalidated as a cache entry.  For volume utilities, the header
+ *       cache entry is freed.
+ *
+ * @note For fileserver, this should be utilized instead of ReleaseVolumeHeader
+ *       whenever it is necessary to invalidate the header cache entry.
+ *
+ * @see ReleaseVolumeHeader
+ *
+ * @internal volume package internal use only.
+ */
 static void
-FreeVolumeHeader(register Volume * vp)
+FreeVolumeHeader(Volume * vp)
 {
-    register struct volHeader *hd = vp->header;
+    struct volHeader *hd = vp->header;
     if (!hd)
        return;
     if (programType == fileServer) {
@@ -5617,8 +8196,8 @@ FreeVolumeHeader(register Volume * vp)
     }
 #ifdef AFS_DEMAND_ATTACH_FS
     V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED);
-    volume_hdr_LRU.stats.attached--;
 #endif
+    volume_hdr_LRU.stats.attached--;
     vp->header = NULL;
 }
 
@@ -5627,14 +8206,27 @@ FreeVolumeHeader(register Volume * vp)
 /* Volume Hash Table routines                      */
 /***************************************************/
 
-int 
+/**
+ * set size of volume object hash table.
+ *
+ * @param[in] logsize   base-2 logarithm of desired hash table size
+ *
+ * @return operation status
+ *    @retval 0 success
+ *    @retval -1 failure
+ *
+ * @pre MUST be called prior to VInitVolumePackage2
+ *
+ * @post Volume Hash Table will have 2^logsize buckets
+ */
+int
 VSetVolHashSize(int logsize)
 {
-    /* 64 to 16384 hash buckets seems like a reasonable range */
-    if ((logsize < 6 ) || (logsize > 14)) {
+    /* 64 to 268435456 hash buckets seems like a reasonable range */
+    if ((logsize < 6 ) || (logsize > 28)) {
         return -1;
     }
-    
+
     if (!VInit) {
         VolumeHashTable.Size = 1 << logsize;
         VolumeHashTable.Mask = VolumeHashTable.Size - 1;
@@ -5647,26 +8239,48 @@ VSetVolHashSize(int logsize)
     return 0;
 }
 
+/**
+ * initialize dynamic data structures for volume hash table.
+ *
+ * @post hash table is allocated, and fields are initialized.
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VInitVolumeHash(void)
 {
-    register int i;
+    int i;
 
-    VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size, 
+    VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size,
                                                           sizeof(VolumeHashChainHead));
-    assert(VolumeHashTable.Table != NULL);
-    
+    osi_Assert(VolumeHashTable.Table != NULL);
+
     for (i=0; i < VolumeHashTable.Size; i++) {
        queue_Init(&VolumeHashTable.Table[i]);
 #ifdef AFS_DEMAND_ATTACH_FS
-       assert(pthread_cond_init(&VolumeHashTable.Table[i].chain_busy_cv, NULL) == 0);
+       CV_INIT(&VolumeHashTable.Table[i].chain_busy_cv, "vhash busy", CV_DEFAULT, 0);
 #endif /* AFS_DEMAND_ATTACH_FS */
     }
 }
 
-/* for demand-attach, caller MUST hold a ref count on vp */
+/**
+ * add a volume object to the hash table.
+ *
+ * @param[in] vp      pointer to volume object
+ * @param[in] hashid  hash of volume id
+ *
+ * @pre VOL_LOCK is held.  For DAFS, caller must hold a lightweight
+ *      reference on vp.
+ *
+ * @post volume is added to hash chain.
+ *
+ * @internal volume package internal use only.
+ *
+ * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
+ *       asynchronous hash chain reordering to finish.
+ */
 static void
-AddVolumeToHashTable(register Volume * vp, int hashid)
+AddVolumeToHashTable(Volume * vp, int hashid)
 {
     VolumeHashChainHead * head;
 
@@ -5689,9 +8303,23 @@ AddVolumeToHashTable(register Volume * vp, int hashid)
     vp->vnodeHashOffset = VolumeHashOffset_r();
 }
 
-/* for demand-attach, caller MUST hold a ref count on vp */
+/**
+ * delete a volume object from the hash table.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre VOL_LOCK is held.  For DAFS, caller must hold a lightweight
+ *      reference on vp.
+ *
+ * @post volume is removed from hash chain.
+ *
+ * @internal volume package internal use only.
+ *
+ * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
+ *       asynchronous hash chain reordering to finish.
+ */
 static void
-DeleteVolumeFromHashTable(register Volume * vp)
+DeleteVolumeFromHashTable(Volume * vp)
 {
     VolumeHashChainHead * head;
 
@@ -5715,19 +8343,45 @@ DeleteVolumeFromHashTable(register Volume * vp)
      * after the volume is removed from the hash */
 }
 
-/* - look up a volume id in the hash table
- * - occasionally rebalance hash chains
- * - update lookup statistics accordingly
+/**
+ * lookup a volume object in the hash table given a volume id.
+ *
+ * @param[out] ec        error code return
+ * @param[in]  volumeId  volume id
+ * @param[in]  hint      volume object which we believe could be the correct
+ *                       mapping
+ *
+ * @return volume object pointer
+ *    @retval NULL  no such volume id is registered with the hash table.
+ *
+ * @pre VOL_LOCK is held.  For DAFS, caller must hold a lightweight
+ *      ref on hint.
+ *
+ * @post volume object with the given id is returned.  volume object and
+ *       hash chain access statistics are updated.  hash chain may have
+ *       been reordered.
+ *
+ * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
+ *       asynchronous hash chain reordering operation to finish, or
+ *       in order for us to perform an asynchronous chain reordering.
+ *
+ * @note Hash chain reorderings occur when the access count for the
+ *       volume object being looked up exceeds the sum of the previous
+ *       node's (the node ahead of it in the hash chain linked list)
+ *       access count plus the constant VOLUME_HASH_REORDER_THRESHOLD.
+ *
+ * @note For DAFS, the hint parameter allows us to short-circuit if the
+ *       cacheCheck fields match between the hash chain head and the
+ *       hint volume object.
  */
-/* the hint parameter allows us to short-circuit on
- * DEMAND_ATTACH_FS if the cacheChecks match between
- * the hash chain head and hint
- * caller MUST hold a refcount on hint */
 Volume *
 VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
 {
-    register int looks = 0;
-    Volume * vp, *np, *pp;
+    int looks = 0;
+    Volume * vp, *np;
+#ifdef AFS_DEMAND_ATTACH_FS
+    Volume *pp;
+#endif
     VolumeHashChainHead * head;
     *ec = 0;
 
@@ -5745,7 +8399,7 @@ VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
 #endif /* AFS_DEMAND_ATTACH_FS */
 
     /* someday we need to either do per-chain locks, RWlocks,
-     * or both for volhash access. 
+     * or both for volhash access.
      * (and move to a data structure with better cache locality) */
 
     /* search the chain for this volume id */
@@ -5788,7 +8442,7 @@ VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
        /* update the short-circuit cache check */
        vp->chainCacheCheck = head->cacheCheck;
     }
-#endif /* AFS_DEMAND_ATTACH_FS */    
+#endif /* AFS_DEMAND_ATTACH_FS */
 
     return vp;
 }
@@ -5849,29 +8503,88 @@ VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp)
 /* demand-attach fs volume hash
  * asynchronous exclusive operations */
 
-/* take exclusive control over the hash chain */
+/**
+ * begin an asynchronous exclusive operation on a volume hash chain.
+ *
+ * @param[in] head   pointer to volume hash chain head object
+ *
+ * @pre VOL_LOCK held.  hash chain is quiescent.
+ *
+ * @post hash chain marked busy.
+ *
+ * @note this interface is used in conjunction with VHashEndExclusive_r and
+ *       VHashWait_r to perform asynchronous (wrt VOL_LOCK) operations on a
+ *       volume hash chain.  Its main use case is hash chain reordering, which
+ *       has the potential to be a high-latency operation.
+ *
+ * @see VHashEndExclusive_r
+ * @see VHashWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VHashBeginExclusive_r(VolumeHashChainHead * head)
 {
-    assert(head->busy == 0);
+    osi_Assert(head->busy == 0);
     head->busy = 1;
 }
 
-/* relinquish exclusive control over the hash chain */
+/**
+ * relinquish exclusive ownership of a volume hash chain.
+ *
+ * @param[in] head   pointer to volume hash chain head object
+ *
+ * @pre VOL_LOCK held.  thread owns the hash chain exclusively.
+ *
+ * @post hash chain is marked quiescent.  threads awaiting use of
+ *       chain are awakened.
+ *
+ * @see VHashBeginExclusive_r
+ * @see VHashWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VHashEndExclusive_r(VolumeHashChainHead * head)
 {
-    assert(head->busy);
+    osi_Assert(head->busy);
     head->busy = 0;
-    assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0);
+    CV_BROADCAST(&head->chain_busy_cv);
 }
 
-/* wait for another thread to finish its exclusive ops */
+/**
+ * wait for all asynchronous operations on a hash chain to complete.
+ *
+ * @param[in] head   pointer to volume hash chain head object
+ *
+ * @pre VOL_LOCK held.
+ *
+ * @post hash chain object is quiescent.
+ *
+ * @see VHashBeginExclusive_r
+ * @see VHashEndExclusive_r
+ *
+ * @note DAFS only
+ *
+ * @note This interface should be called before any attempt to
+ *       traverse the hash chain.  It is permissible for a thread
+ *       to gain exclusive access to the chain, and then perform
+ *       high-latency operations on the chain asynchronously wrt the
+ *       VOL_LOCK.
+ *
+ * @warning if waiting is necessary, VOL_LOCK is dropped
+ *
+ * @internal volume package internal use only.
+ */
 static void
 VHashWait_r(VolumeHashChainHead * head)
 {
     while (head->busy) {
-       assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+       VOL_CV_WAIT(&head->chain_busy_cv);
     }
 }
 #endif /* AFS_DEMAND_ATTACH_FS */
@@ -5890,6 +8603,28 @@ VHashWait_r(VolumeHashChainHead * head)
  */
 
 #ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * add a volume to its disk partition VByPList.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre either the disk partition VByPList is owned exclusively
+ *      by the calling thread, or the list is quiescent and
+ *      VOL_LOCK is held.
+ *
+ * @post volume is added to disk partition VByPList
+ *
+ * @note DAFS only
+ *
+ * @warning it is the caller's responsibility to ensure list
+ *          quiescence.
+ *
+ * @see VVByPListWait_r
+ * @see VVByPListBeginExclusive_r
+ * @see VVByPListEndExclusive_r
+ *
+ * @internal volume package internal use only.
+ */
 static void
 AddVolumeToVByPList_r(Volume * vp)
 {
@@ -5900,6 +8635,28 @@ AddVolumeToVByPList_r(Volume * vp)
     }
 }
 
+/**
+ * delete a volume from its disk partition VByPList.
+ *
+ * @param[in] vp  pointer to volume object
+ *
+ * @pre either the disk partition VByPList is owned exclusively
+ *      by the calling thread, or the list is quiescent and
+ *      VOL_LOCK is held.
+ *
+ * @post volume is removed from the disk partition VByPList
+ *
+ * @note DAFS only
+ *
+ * @warning it is the caller's responsibility to ensure list
+ *          quiescence.
+ *
+ * @see VVByPListWait_r
+ * @see VVByPListBeginExclusive_r
+ * @see VVByPListEndExclusive_r
+ *
+ * @internal volume package internal use only.
+ */
 static void
 DeleteVolumeFromVByPList_r(Volume * vp)
 {
@@ -5910,29 +8667,88 @@ DeleteVolumeFromVByPList_r(Volume * vp)
     }
 }
 
+/**
+ * begin an asynchronous exclusive operation on a VByPList.
+ *
+ * @param[in] dp   pointer to disk partition object
+ *
+ * @pre VOL_LOCK held.  VByPList is quiescent.
+ *
+ * @post VByPList marked busy.
+ *
+ * @note this interface is used in conjunction with VVByPListEndExclusive_r and
+ *       VVByPListWait_r to perform asynchronous (wrt VOL_LOCK) operations on a
+ *       VByPList.
+ *
+ * @see VVByPListEndExclusive_r
+ * @see VVByPListWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 /* take exclusive control over the list */
 static void
-VVByPListBeginExclusive_r(struct DiskPartition * dp)
+VVByPListBeginExclusive_r(struct DiskPartition64 * dp)
 {
-    assert(dp->vol_list.busy == 0);
+    osi_Assert(dp->vol_list.busy == 0);   /* list must not already be marked busy */
     dp->vol_list.busy = 1;
 }
 
-/* relinquish exclusive control over the list */
+/**
+ * relinquish exclusive ownership of a VByPList.
+ *
+ * @param[in] dp   pointer to disk partition object
+ *
+ * @pre VOL_LOCK held.  thread owns the VByPList exclusively.
+ *
+ * @post VByPList is marked quiescent.  threads awaiting use of
+ *       the list are awakened.
+ *
+ * @see VVByPListBeginExclusive_r
+ * @see VVByPListWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VVByPListEndExclusive_r(struct DiskPartition * dp)
+VVByPListEndExclusive_r(struct DiskPartition64 * dp)
 {
-    assert(dp->vol_list.busy);
+    osi_Assert(dp->vol_list.busy);   /* only valid while the list is marked busy */
     dp->vol_list.busy = 0;
-    assert(pthread_cond_broadcast(&dp->vol_list.cv) == 0);
+    CV_BROADCAST(&dp->vol_list.cv);   /* wake threads blocked in VVByPListWait_r */
 }
 
-/* wait for another thread to finish its exclusive ops */
+/**
+ * wait for all asynchronous operations on a VByPList to complete.
+ *
+ * @param[in] dp  pointer to disk partition object
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post disk partition's VByP list is quiescent
+ *
+ * @note DAFS only
+ *
+ * @note This interface should be called before any attempt to
+ *       traverse the VByPList.  It is permissible for a thread
+ *       to gain exclusive access to the list, and then perform
+ *       latent operations on the list asynchronously wrt the
+ *       VOL_LOCK.
+ *
+ * @warning if waiting is necessary, VOL_LOCK is dropped
+ *
+ * @see VVByPListEndExclusive_r
+ * @see VVByPListBeginExclusive_r
+ *
+ * @internal volume package internal use only.
+ */
 static void
-VVByPListWait_r(struct DiskPartition * dp)
+VVByPListWait_r(struct DiskPartition64 * dp)
 {
     while (dp->vol_list.busy) {
-       assert(pthread_cond_wait(&dp->vol_list.cv, &vol_glock_mutex) == 0);
+       VOL_CV_WAIT(&dp->vol_list.cv);   /* busy is re-tested after every wakeup */
     }
 }
 #endif /* AFS_DEMAND_ATTACH_FS */
@@ -5945,7 +8761,7 @@ void
 VPrintCacheStats_r(void)
 {
     afs_uint32 get_hi, get_lo, load_hi, load_lo;
-    register struct VnodeClassInfo *vcp;
+    struct VnodeClassInfo *vcp;
     vcp = &VnodeClassInfo[vLarge];
     Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
     vcp = &VnodeClassInfo[vSmall];
@@ -5995,10 +8811,108 @@ DoubleToPrintable(double x, char * buf, int len)
     return buf;
 }
 
+struct VLRUExtStatsEntry {
+    VolumeId volid;   /* hashid of one volume found on a VLRU queue */
+};
+
+struct VLRUExtStats {
+    afs_uint32 len;    /* number of entries allocated in vec */
+    afs_uint32 used;   /* number of entries in vec that were filled in */
+    struct {
+       afs_uint32 start;   /* index in vec of this queue's first entry */
+       afs_uint32 len;     /* number of vec entries recorded for this queue */
+    } queue_info[VLRU_QUEUE_INVALID];   /* one slot per VLRU queue index */
+    struct VLRUExtStatsEntry * vec;     /* entries for all queues, back to back */
+};
+
+/**
+ * number of spare entries added to the VLRU stats vector, in case state
+ * changes out from under us while the statistics are being collected.
+ */
+#define VLRU_EXT_STATS_VEC_LEN_FUDGE   256
+
+/**
+ * collect extended statistics for the VLRU subsystem.
+ *
+ * @param[out] stats  pointer to stats structure to be populated
+ * @param[in] nvols   number of volumes currently known to exist
+ *
+ * @pre VOL_LOCK held
+ * @post on success stats->vec is allocated and populated; this function
+ *       never frees it, so the caller must pass it to free()
+ * @warning VOL_LOCK is dropped and reacquired while each queue is scanned
+ * @return operation status
+ *    @retval 0 success
+ *    @retval 1 failure (stats->vec could not be allocated)
+ */
+static int
+VVLRUExtStats_r(struct VLRUExtStats * stats, afs_uint32 nvols)
+{
+    afs_uint32 cur, idx, len;
+    struct rx_queue * qp, * nqp;
+    Volume * vp;
+    struct VLRUExtStatsEntry * vec;
+
+    len = nvols + VLRU_EXT_STATS_VEC_LEN_FUDGE;
+    vec = stats->vec = calloc(len,
+                             sizeof(struct VLRUExtStatsEntry));
+    if (vec == NULL) {
+       return 1;
+    }
+
+    cur = 0;   /* next free slot in vec; keeps counting across all queues */
+    for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
+       VLRU_Wait_r(&volume_LRU.q[idx]);
+       VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+       VOL_UNLOCK;
+       /* the scan below runs with VOL_LOCK released */
+       stats->queue_info[idx].start = cur;
+
+       for (queue_Scan(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+           if (cur == len) {
+               /* out of space in vec; remaining volumes are not recorded */
+               break;
+           }
+           vp = (Volume *)((char *)qp - offsetof(Volume, vlru));   /* qp is the vlru member of a Volume */
+           vec[cur].volid = vp->hashid;
+           cur++;
+       }
+
+       stats->queue_info[idx].len = cur - stats->queue_info[idx].start;
+
+       VOL_LOCK;
+       VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+    }
+
+    stats->len = len;
+    stats->used = cur;
+    return 0;
+}
+
+#define ENUMTOSTRING(en)  #en
+#define ENUMCASE(en) \
+    case en: return ENUMTOSTRING(en)
+/* name of the VLRU queue enumerator matching idx; the result is a read-only literal */
+static const char *
+vlru_idx_to_string(int idx)
+{
+    switch (idx) {
+       ENUMCASE(VLRU_QUEUE_NEW);
+       ENUMCASE(VLRU_QUEUE_MID);
+       ENUMCASE(VLRU_QUEUE_OLD);
+       ENUMCASE(VLRU_QUEUE_CANDIDATE);
+       ENUMCASE(VLRU_QUEUE_HELD);
+       ENUMCASE(VLRU_QUEUE_INVALID);
+    default:
+       return "**UNKNOWN**";   /* idx matches none of the cases above */
+    }
+}
+
 void
 VPrintExtendedCacheStats_r(int flags)
 {
-    int i, j;
+    int i;
+    afs_uint32 vol_sum = 0;
     struct stats {
        double min;
        double max;
@@ -6010,6 +8924,7 @@ VPrintExtendedCacheStats_r(int flags)
     char pr_buf[4][32];
     VolumeHashChainHead *head;
     Volume *vp, *np;
+    struct VLRUExtStats vlru_stats;
 
     /* zero out stats */
     memset(&looks, 0, sizeof(struct stats));
@@ -6037,7 +8952,8 @@ VPrintExtendedCacheStats_r(int flags)
            gets.sum     += ch_gets.sum;
            reorders.sum += ch_reorders.sum;
            len.sum      += (double)head->len;
-           
+           vol_sum      += head->len;
+
            if (i == 0) {
                len.min      = (double) head->len;
                len.max      = (double) head->len;
@@ -6116,7 +9032,7 @@ VPrintExtendedCacheStats_r(int flags)
 
            /* dump per-chain stats */
            Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n",
-               i, head->len, 
+               i, head->len,
                DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
                DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1])));
            Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n",
@@ -6137,7 +9053,7 @@ VPrintExtendedCacheStats_r(int flags)
        } else if (flags & VOL_STATS_PER_CHAIN) {
            /* dump simple per-chain stats */
            Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n",
-               i, head->len, 
+               i, head->len,
                DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
                DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])),
                DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2])));
@@ -6180,7 +9096,7 @@ VPrintExtendedCacheStats_r(int flags)
 
     /* print extended disk related statistics */
     {
-       struct DiskPartition * diskP;
+       struct DiskPartition64 * diskP;
        afs_uint32 vol_count[VOLMAXPARTS+1];
        byte part_exists[VOLMAXPARTS+1];
        Device id;
@@ -6200,9 +9116,11 @@ VPrintExtendedCacheStats_r(int flags)
        VOL_UNLOCK;
        for (i = 0; i <= VOLMAXPARTS; i++) {
            if (part_exists[i]) {
+               /* XXX while this is currently safe, it is a violation
+                *     of the VGetPartitionById_r interface contract. */
                diskP = VGetPartitionById_r(i, 0);
                if (diskP) {
-                   Log("Partition %s has %d online volumes\n", 
+                   Log("Partition %s has %d online volumes\n",
                        VPartitionPath(diskP), diskP->vol_list.len);
                }
            }
@@ -6210,6 +9128,44 @@ VPrintExtendedCacheStats_r(int flags)
        VOL_LOCK;
     }
 
+    /* print extended VLRU statistics: the volume ids found on each queue */
+    if (VVLRUExtStats_r(&vlru_stats, vol_sum) == 0) {
+       afs_uint32 idx, cur, end, lpos;
+       VolumeId line[5];
+
+       /* vlru_stats is a private snapshot, so log it with VOL_LOCK released */
+       VOL_UNLOCK;
+       Log("VLRU State Dump:\n\n");
+
+       for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
+           Log("\t%s:\n", vlru_idx_to_string(idx));
+           /* queue_info[].len is an entry count, not an end index into vec */
+           lpos = 0;
+           cur = vlru_stats.queue_info[idx].start;
+           end = cur + vlru_stats.queue_info[idx].len;
+           for (; cur < end; cur++) {
+               line[lpos++] = vlru_stats.vec[cur].volid;
+               if (lpos==5) {
+                   Log("\t\t%u, %u, %u, %u, %u,\n",
+                       line[0], line[1], line[2], line[3], line[4]);
+                   lpos = 0;
+               }
+           }
+
+           if (lpos) {   /* flush a partial final row, padded with zeros */
+               while (lpos < 5) {
+                   line[lpos++] = 0;
+               }
+               Log("\t\t%u, %u, %u, %u, %u\n",
+                   line[0], line[1], line[2], line[3], line[4]);
+           }
+           Log("\n");
+       }
+
+       free(vlru_stats.vec);
+
+       VOL_LOCK;
+    }
 }
 
 void
@@ -6220,3 +9176,27 @@ VPrintExtendedCacheStats(int flags)
     VOL_UNLOCK;
 }
 #endif /* AFS_DEMAND_ATTACH_FS */
+/* return the canScheduleSalvage setting held in vol_opts */
+afs_int32
+VCanScheduleSalvage(void)
+{
+    return vol_opts.canScheduleSalvage;
+}
+/* return the canUseFSSYNC setting held in vol_opts */
+afs_int32
+VCanUseFSSYNC(void)
+{
+    return vol_opts.canUseFSSYNC;
+}
+/* return the canUseSALVSYNC setting held in vol_opts */
+afs_int32
+VCanUseSALVSYNC(void)
+{
+    return vol_opts.canUseSALVSYNC;
+}
+/* return the unsafe_attach setting held in vol_opts */
+afs_int32
+VCanUnsafeAttach(void)
+{
+    return vol_opts.unsafe_attach;
+}