dafs-vol-offline-race-20090127

[openafs.git] / src / vol / volume.c
diff --git a/src/vol/volume.c b/src/vol/volume.c

index 7138dbc..b69de93 100644 (file)
--- a/src/vol/volume.c
+++ b/src/vol/volume.c
@@ -170,7 +170,7 @@ extern void *calloc(), *realloc();
 /* Forward declarations */
 static Volume *attach2(Error * ec, VolId vid, char *path,
                       register struct VolumeHeader *header,
-                      struct DiskPartition *partp, Volume * vp, 
+                      struct DiskPartition64 *partp, Volume * vp, 
                       int isbusy, int mode);
 static void ReallyFreeVolume(Volume * vp);
 #ifdef AFS_DEMAND_ATTACH_FS
@@ -188,8 +188,6 @@ static void DeleteVolumeFromHashTable(register Volume * vp);
 static int VHold(Volume * vp);
 static int VHold_r(Volume * vp);
 static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class);
-static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp,
-                         char **namep);
 static void VReleaseVolumeHandles_r(Volume * vp);
 static void VCloseVolumeHandles_r(Volume * vp);
 static void LoadVolumeHeader(Error * ec, Volume * vp);
@@ -278,7 +276,7 @@ ffs(x)
 #ifdef AFS_PTHREAD_ENV
 typedef struct diskpartition_queue_t {
     struct rx_queue queue;
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
 } diskpartition_queue_t;
 typedef struct vinitvolumepackage_thread_t {
     struct rx_queue queue;
@@ -288,7 +286,7 @@ typedef struct vinitvolumepackage_thread_t {
 static void * VInitVolumePackageThread(void * args);
 #endif /* AFS_PTHREAD_ENV */
 
-static int VAttachVolumesByPartition(struct DiskPartition *diskP, 
+static int VAttachVolumesByPartition(struct DiskPartition64 *diskP, 
                                     int * nAttached, int * nUnattached);
 
 
@@ -345,9 +343,9 @@ static int VCheckFree(Volume * vp);
 /* VByP List */
 static void AddVolumeToVByPList_r(Volume * vp);
 static void DeleteVolumeFromVByPList_r(Volume * vp);
-static void VVByPListBeginExclusive_r(struct DiskPartition * dp);
-static void VVByPListEndExclusive_r(struct DiskPartition * dp);
-static void VVByPListWait_r(struct DiskPartition * dp);
+static void VVByPListBeginExclusive_r(struct DiskPartition64 * dp);
+static void VVByPListEndExclusive_r(struct DiskPartition64 * dp);
+static void VVByPListWait_r(struct DiskPartition64 * dp);
 
 /* online salvager */
 static int VCheckSalvage(register Volume * vp);
@@ -362,8 +360,8 @@ static void VHashEndExclusive_r(VolumeHashChainHead * head);
 static void VHashWait_r(VolumeHashChainHead * head);
 
 /* shutdown */
-static int ShutdownVByPForPass_r(struct DiskPartition * dp, int pass);
-static int ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+static int ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass);
+static int ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
                                struct rx_queue ** idx);
 static void ShutdownController(vshutdown_thread_t * params);
 static void ShutdownCreateSchedule(vshutdown_thread_t * params);
@@ -385,6 +383,12 @@ static void VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append);
 static int VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh);
 static int VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh);
 static int VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh);
+
+
+pthread_key_t VThread_key;     /* key for per-thread VThreadOptions_t; created in VInitVolumePackage, read in VScheduleSalvage_r */
+VThreadOptions_t VThread_defaults = {  /* used when a thread has no value stored under VThread_key */
+    0                           /**< presumably disallow_salvsync (first field -- confirm): 0 = salvsync allowed */
+};
 #endif /* AFS_DEMAND_ATTACH_FS */
 
 
@@ -441,6 +445,7 @@ VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVno
     } else {
        VLRU_SetOptions(VLRU_SET_ENABLED, 0);
     }
+    assert(pthread_key_create(&VThread_key, NULL) == 0);
 #endif
 
 #ifdef AFS_PTHREAD_ENV
@@ -497,7 +502,7 @@ VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVno
        return -1;
 
     if (programType == fileServer) {
-       struct DiskPartition *diskP;
+       struct DiskPartition64 *diskP;
 #ifdef AFS_PTHREAD_ENV
        struct vinitvolumepackage_thread_t params;
        struct diskpartition_queue_t * dpq;
@@ -579,8 +584,8 @@ VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVno
 #ifdef FSSYNC_BUILD_CLIENT
     if (programType == volumeUtility && connect) {
        if (!VConnectFS()) {
-           Log("Unable to connect to file server; aborted\n");
-           exit(1);
+           Log("Unable to connect to file server; will retry at need\n");
+           /*exit(1);*/
        }
     }
 #ifdef AFS_DEMAND_ATTACH_FS
@@ -602,7 +607,7 @@ VInitVolumePackageThread(void * args) {
 
     DIR *dirp;
     struct dirent *dp;
-    struct DiskPartition *diskP;
+    struct DiskPartition64 *diskP;
     struct vinitvolumepackage_thread_t * params;
     struct diskpartition_queue_t * dpq;
 
@@ -636,7 +641,7 @@ VInitVolumePackageThread(void * args) {
  * attach all volumes on a given disk partition
  */
 static int
-VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nUnattached)
+VAttachVolumesByPartition(struct DiskPartition64 *diskP, int * nAttached, int * nUnattached)
 {
   DIR * dirp;
   struct dirent * dp;
@@ -747,7 +752,7 @@ VShutdown_r(void)
     register Volume *vp, *np;
     register afs_int32 code;
 #ifdef AFS_DEMAND_ATTACH_FS
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
     struct diskpartition_queue_t * dpq;
     vshutdown_thread_t params;
     pthread_t tid;
@@ -798,7 +803,7 @@ VShutdown_r(void)
            dpq->diskP = diskP;
            queue_Prepend(&params, dpq);
 
-           params.part_pass_head[diskP->device] = queue_First(&diskP->vol_list, rx_queue);
+           params.part_pass_head[diskP->index] = queue_First(&diskP->vol_list, rx_queue);
        }
 
        Log("VShutdown:  beginning parallel fileserver shutdown\n");
@@ -843,10 +848,10 @@ VShutdown_r(void)
            VVByPListEndExclusive_r(diskP);
            Log("VShutdown:  %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
                VPartitionPath(diskP),
-               params.stats[0][diskP->device],
-               params.stats[1][diskP->device],
-               params.stats[2][diskP->device],
-               params.stats[3][diskP->device]);
+               params.stats[0][diskP->index],
+               params.stats[1][diskP->index],
+               params.stats[2][diskP->index],
+               params.stats[3][diskP->index]);
        }
 
        Log("VShutdown:  shutdown finished using %d threads\n", params.n_threads);
@@ -898,7 +903,7 @@ static void
 ShutdownController(vshutdown_thread_t * params)
 {
     /* XXX debug */
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
     Device id;
     vshutdown_thread_t shadow;
 
@@ -917,7 +922,7 @@ ShutdownController(vshutdown_thread_t * params)
        Log("ShutdownController:  n_threads_complete=%d, n_parts_done_pass=%d\n",
            shadow.n_threads_complete, shadow.n_parts_done_pass);
        for (diskP = DiskPartitionList; diskP; diskP=diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            Log("ShutdownController:  part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n",
                id, 
                diskP->vol_list.len,
@@ -943,7 +948,7 @@ ShutdownController(vshutdown_thread_t * params)
 static void
 ShutdownCreateSchedule(vshutdown_thread_t * params)
 {
-    struct DiskPartition * diskP;
+    struct DiskPartition64 * diskP;
     int sum, thr_workload, thr_left;
     int part_residue[VOLMAXPARTS+1];
     Device id;
@@ -971,7 +976,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
     /* for fairness, give every partition with volumes remaining
      * at least one thread */
     for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
-       id = diskP->device;
+       id = diskP->index;
        if (diskP->vol_list.len) {
            params->part_thread_target[id] = 1;
            thr_left--;
@@ -985,7 +990,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
        int delta;
 
        for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            delta = (diskP->vol_list.len / thr_workload) -
                params->part_thread_target[id];
            if (delta < 0) {
@@ -1009,7 +1014,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
 
        /* compute the residues */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            part_residue[id] = diskP->vol_list.len - 
                (params->part_thread_target[id] * thr_workload);
        }
@@ -1019,7 +1024,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
        while (thr_left) {
            max_residue = 0;
            for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-               id = diskP->device;
+               id = diskP->index;
                if (part_residue[id] > max_residue) {
                    max_residue = part_residue[id];
                    max_id = id;
@@ -1042,7 +1047,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
        if (thr_left >= params->n_parts) {
            alloc = thr_left / params->n_parts;
            for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-               id = diskP->device;
+               id = diskP->index;
                params->part_thread_target[id] += alloc;
                thr_left -= alloc;
            }
@@ -1050,7 +1055,7 @@ ShutdownCreateSchedule(vshutdown_thread_t * params)
 
        /* finish off the last of the threads */
        for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            params->part_thread_target[id]++;
            thr_left--;
        }
@@ -1065,7 +1070,7 @@ VShutdownThread(void * args)
     Volume * vp;
     vshutdown_thread_t * params;
     int part, code, found, pass, schedule_version_save, count;
-    struct DiskPartition *diskP;
+    struct DiskPartition64 *diskP;
     struct diskpartition_queue_t * dpq;
     Device id;
 
@@ -1082,12 +1087,12 @@ VShutdownThread(void * args)
        assert(pthread_mutex_unlock(&params->lock) == 0);
        diskP = dpq->diskP;
        free(dpq);
-       id = diskP->device;
+       id = diskP->index;
 
        count = 0;
        while (ShutdownVolumeWalk_r(diskP, 0, &params->part_pass_head[id]))
            count++;
-       params->stats[0][diskP->device] = count;
+       params->stats[0][diskP->index] = count;
        assert(pthread_mutex_lock(&params->lock) == 0);
     }
 
@@ -1113,7 +1118,7 @@ VShutdownThread(void * args)
        found = 0;
        /* find a disk partition to work on */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-           id = diskP->device;
+           id = diskP->index;
            if (params->part_thread_target[id] && !params->part_done_pass[id]) {
                params->part_thread_target[id]--;
                found = 1;
@@ -1125,7 +1130,7 @@ VShutdownThread(void * args)
            /* hmm. for some reason the controller thread couldn't find anything for 
             * us to do. let's see if there's anything we can do */
            for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-               id = diskP->device;
+               id = diskP->index;
                if (diskP->vol_list.len && !params->part_done_pass[id]) {
                    found = 1;
                    break;
@@ -1176,7 +1181,7 @@ VShutdownThread(void * args)
                    params->n_parts_done_pass = 0;
                    params->pass++;
                    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
-                       id = diskP->device;
+                       id = diskP->index;
                        params->part_done_pass[id] = 0;
                        params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue);
                    }
@@ -1214,7 +1219,7 @@ VShutdownThread(void * args)
  * note that this function will not allow mp-fast
  * shutdown of a partition */
 int
-VShutdownByPartition_r(struct DiskPartition * dp)
+VShutdownByPartition_r(struct DiskPartition64 * dp)
 {
     int pass, retVal;
     int pass_stats[4];
@@ -1265,7 +1270,7 @@ VShutdownByPartition_r(struct DiskPartition * dp)
  * traversal
  */
 static int
-ShutdownVByPForPass_r(struct DiskPartition * dp, int pass)
+ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass)
 {
     struct rx_queue * q = queue_First(&dp->vol_list, rx_queue);
     register int i = 0;
@@ -1280,7 +1285,7 @@ ShutdownVByPForPass_r(struct DiskPartition * dp, int pass)
  * returns 1 if a volume was shutdown in this pass,
  * 0 otherwise */
 static int
-ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
                     struct rx_queue ** idx)
 {
     struct rx_queue *qp, *nqp;
@@ -1587,7 +1592,7 @@ VPreAttachVolumeById_r(Error * ec,
                       VolId volumeId)
 {
     Volume *vp;
-    struct DiskPartition *partp;
+    struct DiskPartition64 *partp;
 
     *ec = 0;
 
@@ -1635,7 +1640,7 @@ VPreAttachVolumeById_r(Error * ec,
  */
 Volume * 
 VPreAttachVolumeByVp_r(Error * ec, 
-                      struct DiskPartition * partp, 
+                      struct DiskPartition64 * partp, 
                       Volume * vp,
                       VolId vid)
 {
@@ -1646,17 +1651,16 @@ VPreAttachVolumeByVp_r(Error * ec,
     /* check to see if pre-attach already happened */
     if (vp && 
        (V_attachState(vp) != VOL_STATE_UNATTACHED) && 
-       !VIsErrorState(V_attachState(vp)) &&
-       ((V_attachState(vp) != VOL_STATE_PREATTACHED) ||
-        vp->pending_vol_op == NULL)) {
+       (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
+       !VIsErrorState(V_attachState(vp))) {
        /*
         * pre-attach is a no-op in all but the following cases:
         *
         *   - volume is unattached
         *   - volume is in an error state
-        *   - volume is pre-attached with a pending volume operation
-        *     (e.g. vos move between two partitions on same server)
+        *   - volume is pre-attached
         */
+       Log("VPreattachVolumeByVp_r: volume %u not in quiescent state\n", vid);
        goto done;
     } else if (vp) {
        /* we're re-attaching a volume; clear out some old state */
@@ -1684,7 +1688,9 @@ VPreAttachVolumeByVp_r(Error * ec,
     /* link the volume with its associated vice partition */
     vp->device = partp->device;
     vp->partition = partp;
+
     vp->hashid = vid;
+    vp->specialStatus = 0;
 
     /* if we dropped the lock, reacquire the lock,
      * check for pre-attach races, and then add
@@ -1747,7 +1753,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
     struct afs_stat status;
     struct VolumeDiskHeader diskHeader;
     struct VolumeHeader iheader;
-    struct DiskPartition *partp;
+    struct DiskPartition64 *partp;
     char path[64];
     int isbusy = 0;
     VolId volumeId;
@@ -1793,12 +1799,12 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
            VWaitExclusiveState_r(vp);
 
            /* at this point state must be one of:
-            *   UNATTACHED,
-            *   ATTACHED,
-            *   SHUTTING_DOWN,
-            *   GOING_OFFLINE,
-            *   SALVAGING,
-            *   ERROR
+            *   - UNATTACHED
+            *   - ATTACHED
+            *   - SHUTTING_DOWN
+            *   - GOING_OFFLINE
+            *   - SALVAGING
+            *   - ERROR
             */
 
            if (vp->specialStatus == VBUSY)
@@ -1807,7 +1813,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
            /* if it's already attached, see if we can return it */
            if (V_attachState(vp) == VOL_STATE_ATTACHED) {
                VGetVolumeByVp_r(ec, vp);
-               if (V_inUse(vp)) {
+               if (V_inUse(vp) == fileServer) {
                    VCancelReservation_r(vp);
                    return vp;
                }
@@ -1877,7 +1883,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
 #else /* AFS_DEMAND_ATTACH_FS */
        vp = VGetVolume_r(ec, volumeId);
        if (vp) {
-           if (V_inUse(vp))
+           if (V_inUse(vp) == fileServer)
                return vp;
            if (vp->specialStatus == VBUSY)
                isbusy = 1;
@@ -1951,6 +1957,11 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
     vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
 
     if (programType == volumeUtility && vp) {
+       if ((mode == V_VOLUPD) || (VolumeWriteable(vp) && (mode == V_CLONE))) {
+           /* mark volume header as in use so that volser crashes lead to a
+            * salvage attempt */
+           VUpdateVolume_r(ec, vp, 0);
+       }
 #ifdef AFS_DEMAND_ATTACH_FS
        /* for dafs, we should tell the fileserver, except for V_PEEK
          * where we know it is not necessary */
@@ -2070,7 +2081,7 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
     struct afs_stat status;
     struct VolumeDiskHeader diskHeader;
     struct VolumeHeader iheader;
-    struct DiskPartition *partp;
+    struct DiskPartition64 *partp;
     char path[64];
     int isbusy = 0;
     VolId volumeId;
@@ -2094,7 +2105,7 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
     /* if it's already attached, see if we can return it */
     if (V_attachState(vp) == VOL_STATE_ATTACHED) {
        VGetVolumeByVp_r(ec, vp);
-       if (V_inUse(vp)) {
+       if (V_inUse(vp) == fileServer) {
            return vp;
        } else {
            if (vp->specialStatus == VBUSY)
@@ -2237,7 +2248,7 @@ VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
  */
 private Volume * 
 attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * header,
-       struct DiskPartition * partp, register Volume * vp, int isbusy, int mode)
+       struct DiskPartition64 * partp, register Volume * vp, int isbusy, int mode)
 {
     vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
     IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent,
@@ -2280,7 +2291,7 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
        res.payload.buf = &vp->header->diskstuff;
 
        if (FSYNC_VolOp(volumeId,
-                       VPartitionPath(partp),
+                       partp->name,
                        FSYNC_VOL_QUERY_HDR,
                        FSYNC_WHATEVER,
                        &res) == SYNC_OK) {
@@ -2311,7 +2322,19 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
        /* check for pending volume operations */
        if (vp->pending_vol_op) {
            /* see if the pending volume op requires exclusive access */
-           if (!VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+           switch (vp->pending_vol_op->vol_op_state) {
+           case FSSYNC_VolOpPending:
+               /* this should never happen */
+               assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpPending);
+               break;
+
+           case FSSYNC_VolOpRunningUnknown:
+               vp->pending_vol_op->vol_op_state = 
+                   (VVolOpLeaveOnline_r(vp, vp->pending_vol_op) ? 
+                    FSSYNC_VolOpRunningOnline : FSSYNC_VolOpRunningOffline);
+               /* fall through */
+
+           case FSSYNC_VolOpRunningOffline:
                /* mark the volume down */
                *ec = VOFFLINE;
                VChangeState_r(vp, VOL_STATE_UNATTACHED);
@@ -2488,17 +2511,21 @@ attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * h
        if (vp->specialStatus)
            vp->specialStatus = 0;
        if (V_blessed(vp) && V_inService(vp) && !V_needsSalvaged(vp)) {
-           V_inUse(vp) = 1;
+           V_inUse(vp) = fileServer;
            V_offlineMessage(vp)[0] = '\0';
        }
+    } else {
+       if ((mode != V_PEEK) && (mode != V_SECRETLY))
+           V_inUse(vp) = programType;
+       V_checkoutMode(vp) = mode;
     }
 
     AddVolumeToHashTable(vp, V_id(vp));
 #ifdef AFS_DEMAND_ATTACH_FS
-    AddVolumeToVByPList_r(vp);
-    VLRU_Add_r(vp);
     if ((programType != fileServer) ||
-       V_inUse(vp)) {
+       (V_inUse(vp) == fileServer)) {
+       AddVolumeToVByPList_r(vp);
+       VLRU_Add_r(vp);
        VChangeState_r(vp, VOL_STATE_ATTACHED);
     } else {
        VChangeState_r(vp, VOL_STATE_UNATTACHED);
@@ -2526,7 +2553,7 @@ Volume *
 VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
 {
     char *part, *name;
-    GetVolumePath(ec, volumeId, &part, &name);
+    VGetVolumePath(ec, volumeId, &part, &name);
     if (*ec) {
        register Volume *vp;
        Error error;
@@ -2687,6 +2714,20 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
     Volume *avp, * rvp = hint;
 #endif
 
+    /* 
+     * if VInit is zero, the volume package dynamic
+     * data structures have not been initialized yet,
+     * and we must immediately return an error
+     */
+    if (VInit == 0) {
+       vp = NULL;
+       *ec = VOFFLINE;
+       if (client_ec) {
+           *client_ec = VOFFLINE;
+       }
+       goto not_inited;
+    }
+
 #ifdef AFS_DEMAND_ATTACH_FS
     if (rvp) {
        VCreateReservation_r(rvp);
@@ -2744,11 +2785,12 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
 
        /* short circuit with VNOVOL in the following circumstances:
         *
-        *   VOL_STATE_ERROR
-        *   VOL_STATE_SHUTTING_DOWN
+        *   - VOL_STATE_ERROR
+        *   - VOL_STATE_SHUTTING_DOWN
         */
        if ((V_attachState(vp) == VOL_STATE_ERROR) ||
-           (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN)) {
+           (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN) ||
+           (V_attachState(vp) == VOL_STATE_GOING_OFFLINE)) {
            *ec = VNOVOL;
            vp = NULL;
            break;
@@ -2757,20 +2799,22 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
        /*
         * short circuit with VOFFLINE in the following circumstances:
         *
-        *   VOL_STATE_UNATTACHED
+        *   - VOL_STATE_UNATTACHED
         */
        if (V_attachState(vp) == VOL_STATE_UNATTACHED) {
-           *ec = VOFFLINE;
+          if (vp->specialStatus) {
+              *ec = vp->specialStatus;
+          } else {
+              *ec = VOFFLINE;
+          }
            vp = NULL;
            break;
        }
 
        /* allowable states:
-        *   UNATTACHED
-        *   PREATTACHED
-        *   ATTACHED
-        *   GOING_OFFLINE
-        *   SALVAGING
+        *   - PREATTACHED
+        *   - ATTACHED
+        *   - SALVAGING
         */
 
        if (vp->salvage.requested) {
@@ -2857,21 +2901,41 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
        /*
         * this test MUST happen after the volume header is loaded
         */
-       if (vp->pending_vol_op && !VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
-           if (client_ec) {
-               /* see CheckVnode() in afsfileprocs.c for an explanation
-                * of this error code logic */
-               afs_uint32 now = FT_ApproxTime();
-               if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
-                   *client_ec = VBUSY;
-               } else {
-                   *client_ec = VRESTARTING;
-               }
-           }
-           *ec = VOFFLINE;
-           ReleaseVolumeHeader(vp->header);
-           vp = NULL;
-           break;
+        
+         /* a vol op in state FSSYNC_VolOpRunningUnknown is only valid before/during demand attachment */
+         assert(!vp->pending_vol_op || vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown);
+        
+         /* deny getvolume due to running mutually exclusive vol op */
+         if (vp->pending_vol_op && vp->pending_vol_op->vol_op_state==FSSYNC_VolOpRunningOffline) {
+          /* 
+           * volume cannot remain online during this volume operation.
+           * notify client. 
+           */
+          if (vp->specialStatus) {
+              /*
+               * special status codes outrank normal VOFFLINE code
+               */
+              *ec = vp->specialStatus;
+              if (client_ec) {
+                  *client_ec = vp->specialStatus;
+              }
+          } else {
+              if (client_ec) {
+                  /* see CheckVnode() in afsfileprocs.c for an explanation
+                   * of this error code logic */
+                  afs_uint32 now = FT_ApproxTime();
+                  if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
+                      *client_ec = VBUSY;
+                  } else {
+                      *client_ec = VRESTARTING;
+                  }
+              }
+              *ec = VOFFLINE;
+          }
+          VChangeState_r(vp, VOL_STATE_UNATTACHED);
+          FreeVolumeHeader(vp);
+          vp = NULL;
+          break;
        }
 #endif /* AFS_DEMAND_ATTACH_FS */
        
@@ -2939,6 +3003,7 @@ GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flag
     }
 #endif /* AFS_DEMAND_ATTACH_FS */
 
+ not_inited:
     assert(vp || *ec);
     return vp;
 }
@@ -3105,6 +3170,67 @@ VOffline_r(Volume * vp, char *message)
 #endif /* AFS_DEMAND_ATTACH_FS */
 }
 
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * Take a volume offline in order to perform a volume operation.
+ *
+ * @param[out]   ec       address in which to store error code
+ * @param[in]    vp       volume object pointer
+ * @param[in]    message  offline status message; stored only if none is set yet
+ *
+ * @pre
+ *    - VOL_LOCK is held
+ *    - caller MUST hold a heavyweight ref on vp
+ *
+ * @post
+ *    - volume is taken offline
+ *    - if possible, volume operation is promoted to running state
+ *    - *ec is set to 0 on success, and to nonzero (1) on failure
+ *
+ * @note Although this function does not return any value, it may
+ *       still fail to promote our pending volume operation to
+ *       a running state.  Any caller MUST check the value of *ec,
+ *       and MUST NOT blindly assume success.
+ *
+ * @warning if the caller does not hold a lightweight ref on vp,
+ *          then it MUST NOT reference vp after this function
+ *          returns to the caller.
+ *
+ * @internal volume package internal use only
+ */
+void
+VOfflineForVolOp_r(Error *ec, Volume *vp, char *message)
+{
+    assert(vp->pending_vol_op);
+    if (!V_inUse(vp)) { /* header not marked in use: VPutVolume_r() and fail via *ec */
+       VPutVolume_r(vp);
+        *ec = 1;
+       return;
+    }
+    if (V_offlineMessage(vp)[0] == '\0') /* keep an already-set offline message */
+       strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
+    V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0'; /* strncpy may not NUL-terminate */
+
+    vp->goingOffline = 1;
+    VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
+    VCreateReservation_r(vp); /* cancelled at the error: label; presumably keeps vp valid past the put below -- confirm */
+    VPutVolume_r(vp);
+
+    /* Loop on VWaitStateChange_r() until the attach state is an offline state */
+    while (!VIsOfflineState(V_attachState(vp))) {
+        /* salvage requested: fail unless the pending vol op comes from the salvageserver */
+        if (vp->salvage.requested && vp->pending_vol_op->com.programType != salvageServer) {
+           *ec = 1; /* do not give corrupted volumes to the volserver */
+          goto error;
+        }
+       VWaitStateChange_r(vp);
+    }
+    *ec = 0; /* loop exited because an offline state was reached */
+ error:
+    VCancelReservation_r(vp);
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
 void
 VOffline(Volume * vp, char *message)
 {
@@ -3124,8 +3250,9 @@ void
 VDetachVolume_r(Error * ec, Volume * vp)
 {
     VolumeId volume;
-    struct DiskPartition *tpartp;
-    int notifyServer, useDone = FSYNC_VOL_ON;
+    struct DiskPartition64 *tpartp;
+    int notifyServer = 0;
+    int  useDone = FSYNC_VOL_ON;
 
     *ec = 0;                   /* always "succeeds" */
     if (programType == volumeUtility) {
@@ -3145,6 +3272,9 @@ VDetachVolume_r(Error * ec, Volume * vp)
     DeleteVolumeFromVByPList_r(vp);
     VLRU_Delete_r(vp);
     VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN);
+#else
+    if (programType != fileServer) 
+       V_inUse(vp) = 0;
 #endif /* AFS_DEMAND_ATTACH_FS */
     VPutVolume_r(vp);
     /* Will be detached sometime in the future--this is OK since volume is offline */
@@ -3430,12 +3560,25 @@ static int
 VCheckDetach(register Volume * vp)
 {
     int ret = 0;
+    Error ec = 0;
 
     if (vp->nUsers || vp->nWaiters)
        return ret;
 
     if (vp->shuttingDown) {
        ret = 1;
+       if ((programType != fileServer) &&
+           (V_inUse(vp) == programType) &&
+           ((V_checkoutMode(vp) == V_VOLUPD) ||
+            ((V_checkoutMode(vp) == V_CLONE) &&
+             (VolumeWriteable(vp))))) {
+           V_inUse(vp) = 0;
+           VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
+           if (ec) {
+               Log("VCheckDetach: volume header update for volume %u "
+                   "failed with errno %d\n", vp->hashid, errno);
+           }
+       }
        VReleaseVolumeHandles_r(vp);
        VCheckSalvage(vp);
        ReallyFreeVolume(vp);
@@ -3450,12 +3593,25 @@ static int
 VCheckDetach(register Volume * vp)
 {
     int ret = 0;
+    Error ec = 0;
 
     if (vp->nUsers)
        return ret;
 
     if (vp->shuttingDown) {
        ret = 1;
+       if ((programType != fileServer) &&
+           (V_inUse(vp) == programType) &&
+           ((V_checkoutMode(vp) == V_VOLUPD) ||
+            ((V_checkoutMode(vp) == V_CLONE) &&
+             (VolumeWriteable(vp))))) {
+           V_inUse(vp) = 0;
+           VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
+           if (ec) {
+               Log("VCheckDetach: volume header update for volume %u failed with errno %d\n",
+                   vp->hashid, errno);
+           }
+       }
        VReleaseVolumeHandles_r(vp);
        ReallyFreeVolume(vp);
        if (programType == fileServer) {
@@ -3718,11 +3874,12 @@ VDeregisterVolOp_r(Volume * vp)
 int
 VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 {
-    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+    return (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline ||
+           (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
            (vopinfo->com.reason == V_READONLY ||
             (!VolumeWriteable(vp) &&
              (vopinfo->com.reason == V_CLONE ||
-              vopinfo->com.reason == V_DUMP))));
+              vopinfo->com.reason == V_DUMP)))));
 }
 
 /**
@@ -3742,9 +3899,11 @@ VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 int
 VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
 {
-    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+    return ((vopinfo->com.command == FSYNC_VOL_OFF &&
+           vopinfo->com.reason == FSYNC_SALVAGE) ||
+           (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
            (vopinfo->com.reason == V_CLONE ||
-            vopinfo->com.reason == V_DUMP));
+            vopinfo->com.reason == V_DUMP)));
 }
 
 
@@ -3848,8 +4007,13 @@ VRequestSalvage_r(Error * ec, Volume * vp, int reason, int flags)
        vp->salvage.reason = reason;
        vp->stats.last_salvage = FT_ApproxTime();
        if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
-           /* XXX this should likely be changed to FreeVolumeHeader() */
-           ReleaseVolumeHeader(vp->header);
+           /* Instead of ReleaseVolumeHeader, we do FreeVolumeHeader() 
+               so that the next VAttachVolumeByVp_r() invocation 
+               of attach2() will pull in a cached header 
+               entry and fail, then load a fresh one from disk and attach 
+               it to the volume.             
+           */
+           FreeVolumeHeader(vp);
        }
        if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
            VChangeState_r(vp, VOL_STATE_SALVAGING);
@@ -3946,6 +4110,7 @@ VScheduleSalvage_r(Volume * vp)
     int code, ret=0;
 #ifdef SALVSYNC_BUILD_CLIENT
     VolState state_save;
+    VThreadOptions_t * thread_opts;
     char partName[16];
 
     if (vp->nWaiters || vp->nUsers) {
@@ -3956,6 +4121,21 @@ VScheduleSalvage_r(Volume * vp)
     if (vp->stats.salvages >= SALVAGE_COUNT_MAX)
        return 1;
 
+    /*
+     * don't perform salvsync ops on certain threads
+     */
+    thread_opts = pthread_getspecific(VThread_key);
+    if (thread_opts == NULL) {
+       thread_opts = &VThread_defaults;
+    }
+    if (thread_opts->disallow_salvsync) {
+       return 1;
+    }
+
+    /*
+     * XXX the scheduling process should really be done asynchronously
+     *     to avoid fssync deadlocks
+     */
     if (!vp->salvage.scheduled) {
        /* if we haven't previously scheduled a salvage, do so now 
         *
@@ -3967,7 +4147,6 @@ VScheduleSalvage_r(Volume * vp)
         */
        strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName));
        state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
-       V_attachFlags(vp) |= VOL_IS_BUSY;
        VOL_UNLOCK;
 
        /* can't use V_id() since there's no guarantee
@@ -3980,7 +4159,6 @@ VScheduleSalvage_r(Volume * vp)
                                      NULL);
        VOL_LOCK;
        VChangeState_r(vp, state_save);
-       V_attachFlags(vp) &= ~(VOL_IS_BUSY);
 
        if (code == SYNC_OK) {
            vp->salvage.scheduled = 1;
@@ -4018,9 +4196,8 @@ VScheduleSalvage_r(Volume * vp)
  *
  * @pre VOL_LOCK is held.
  *
- * @post salvageserver is sent a request to cancel the volume salvage
- *
- * @todo should set exclusive state and drop glock around salvsync call
+ * @post salvageserver is sent a request to cancel the volume salvage.
+ *       volume is transitioned to a hard error state.
  *
  * @internal volume package internal use only.
  */
@@ -4031,14 +4208,24 @@ VCancelSalvage_r(Volume * vp, int reason)
 
 #ifdef SALVSYNC_BUILD_CLIENT
     if (vp->salvage.scheduled) {
+       VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
+       VOL_UNLOCK;
+
+       /* can't use V_id() since there's no guarantee
+        * we have the disk data header at this point */
        code = SALVSYNC_SalvageVolume(vp->hashid,
                                      VPartitionPath(vp->partition),
                                      SALVSYNC_CANCEL,
                                      reason,
                                      0,
                                      NULL);
+
+       VOL_LOCK;
+       VChangeState_r(vp, VOL_STATE_ERROR);
+
        if (code == SYNC_OK) {
            vp->salvage.scheduled = 0;
+           vp->salvage.requested = 0;
        } else {
            ret = 1;
        }
@@ -4675,15 +4862,14 @@ VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
  *       on a vice partition, it is possible for callers to get the wrong one,
  *       depending on the order of the disk partition linked list.
  *
- * @internal volume package internal use only.
  */
-static void
-GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
+void
+VGetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
 {
     static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
     char path[VMAXPATHLEN];
     int found = 0;
-    struct DiskPartition *dp;
+    struct DiskPartition64 *dp;
 
     *ec = 0;
     name[0] = '/';
@@ -5584,7 +5770,6 @@ VLRU_ScannerThread(void * args)
                min_delay = 0;
                min_idx = i;
                overdue = 1;
-               break;
            }
        }
 
@@ -6085,7 +6270,12 @@ VInitVolumeHeaderCache(afs_uint32 howMany)
     volume_hdr_LRU.stats.used = howMany;
     volume_hdr_LRU.stats.attached = 0;
     hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));
+    assert(hp != NULL);
+
     while (howMany--)
+       /* We are using ReleaseVolumeHeader to initialize the values on the header list
+        * to ensure they have the right values
+        */
        ReleaseVolumeHeader(hp++);
 }
 
@@ -6282,7 +6472,7 @@ LoadVolumeHeader(Error * ec, Volume * vp)
 #endif /* AFS_DEMAND_ATTACH_FS */
     if (*ec) {
        /* maintain (nUsers==0) => header in LRU invariant */
-       ReleaseVolumeHeader(vp->header);
+       FreeVolumeHeader(vp);
     }
 }
 
@@ -6841,7 +7031,7 @@ DeleteVolumeFromVByPList_r(Volume * vp)
  */
 /* take exclusive control over the list */
 static void
-VVByPListBeginExclusive_r(struct DiskPartition * dp)
+VVByPListBeginExclusive_r(struct DiskPartition64 * dp)
 {
     assert(dp->vol_list.busy == 0);
     dp->vol_list.busy = 1;
@@ -6865,7 +7055,7 @@ VVByPListBeginExclusive_r(struct DiskPartition * dp)
  * @internal volume package internal use only.
  */
 static void
-VVByPListEndExclusive_r(struct DiskPartition * dp)
+VVByPListEndExclusive_r(struct DiskPartition64 * dp)
 {
     assert(dp->vol_list.busy);
     dp->vol_list.busy = 0;
@@ -6897,7 +7087,7 @@ VVByPListEndExclusive_r(struct DiskPartition * dp)
  * @internal volume package internal use only.
  */
 static void
-VVByPListWait_r(struct DiskPartition * dp)
+VVByPListWait_r(struct DiskPartition64 * dp)
 {
     while (dp->vol_list.busy) {
        VOL_CV_WAIT(&dp->vol_list.cv);
@@ -6963,10 +7153,110 @@ DoubleToPrintable(double x, char * buf, int len)
     return buf;
 }
 
+struct VLRUExtStatsEntry {
+    VolumeId volid;    /* hashid of one volume found on a VLRU queue */
+};
+
+struct VLRUExtStats {
+    afs_uint32 len;    /* number of entries allocated in vec */
+    afs_uint32 used;   /* number of entries of vec actually filled in */
+    struct {
+       afs_uint32 start;       /* index in vec of this queue's first entry */
+       afs_uint32 len;         /* number of entries recorded for this queue */
+    } queue_info[VLRU_QUEUE_INVALID];  /* one (start, len) slice per VLRU queue */
+    struct VLRUExtStatsEntry * vec;    /* calloc'd by VVLRUExtStats_r(); caller frees */
+};
+
+/** 
+ * add a 256-entry fudge factor onto the vector in case state changes
+ * out from under us.
+ */
+#define VLRU_EXT_STATS_VEC_LEN_FUDGE   256
+
+/**
+ * collect extended statistics for the VLRU subsystem.
+ *
+ * @param[out] stats  pointer to stats structure to be populated
+ * @param[in] nvols   number of volumes currently known to exist
+ *
+ * @pre VOL_LOCK held
+ *
+ * @post stats->vec allocated and populated
+ *
+ * @return operation status
+ *    @retval 0 success
+ *    @retval 1 failure
+ */
+static int
+VVLRUExtStats_r(struct VLRUExtStats * stats, afs_uint32 nvols)
+{
+    afs_uint32 cur, idx, len;
+    struct rx_queue * qp, * nqp;
+    Volume * vp;
+    struct VLRUExtStatsEntry * vec;
+
+    len = nvols + VLRU_EXT_STATS_VEC_LEN_FUDGE;
+    vec = stats->vec = calloc(len,
+                             sizeof(struct VLRUExtStatsEntry));
+    if (vec == NULL) {
+       return 1;       /* stats->vec is NULL here; nothing for the caller to free */
+    }
+
+    cur = 0;   /* next free slot in vec; carried over from one queue to the next */
+    for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
+       VLRU_Wait_r(&volume_LRU.q[idx]);
+       VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+       VOL_UNLOCK;     /* the queue walk below runs with VOL_LOCK dropped */
+
+       stats->queue_info[idx].start = cur;
+
+       for (queue_Scan(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+           if (cur == len) {
+               /* out of space in vec; the remaining volumes are not recorded */
+               break;
+           }
+           vp = (Volume *)((char *)qp - offsetof(Volume, vlru));       /* Volume containing qp */
+           vec[cur].volid = vp->hashid;
+           cur++;
+       }
+
+       stats->queue_info[idx].len = cur - stats->queue_info[idx].start;
+
+       VOL_LOCK;       /* reacquire before ending exclusive access to the queue */
+       VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+    }
+
+    stats->len = len;  /* capacity of vec */
+    stats->used = cur; /* entries filled across all queues */
+    return 0;
+}
+
+#define ENUMTOSTRING(en)  #en
+/* switch case that returns the enumerator's own name; no break needed after return */
+#define ENUMCASE(en) \
+    case en: return ENUMTOSTRING(en)
+
+/* name of a VLRU queue index for log output; "**UNKNOWN**" for any other value */
+static char *
+vlru_idx_to_string(int idx)
+{
+    switch (idx) {
+       ENUMCASE(VLRU_QUEUE_NEW);
+       ENUMCASE(VLRU_QUEUE_MID);
+       ENUMCASE(VLRU_QUEUE_OLD);
+       ENUMCASE(VLRU_QUEUE_CANDIDATE);
+       ENUMCASE(VLRU_QUEUE_HELD);
+       ENUMCASE(VLRU_QUEUE_INVALID);
+    default:
+       return "**UNKNOWN**";
+    }
+}
+
 void
 VPrintExtendedCacheStats_r(int flags)
 {
     int i, j;
+    afs_uint32 vol_sum = 0;
     struct stats {
        double min;
        double max;
@@ -6978,6 +7268,7 @@ VPrintExtendedCacheStats_r(int flags)
     char pr_buf[4][32];
     VolumeHashChainHead *head;
     Volume *vp, *np;
+    struct VLRUExtStats vlru_stats;
 
     /* zero out stats */
     memset(&looks, 0, sizeof(struct stats));
@@ -7005,6 +7296,7 @@ VPrintExtendedCacheStats_r(int flags)
            gets.sum     += ch_gets.sum;
            reorders.sum += ch_reorders.sum;
            len.sum      += (double)head->len;
+           vol_sum      += head->len;
            
            if (i == 0) {
                len.min      = (double) head->len;
@@ -7148,7 +7440,7 @@ VPrintExtendedCacheStats_r(int flags)
 
     /* print extended disk related statistics */
     {
-       struct DiskPartition * diskP;
+       struct DiskPartition64 * diskP;
        afs_uint32 vol_count[VOLMAXPARTS+1];
        byte part_exists[VOLMAXPARTS+1];
        Device id;
@@ -7168,6 +7460,8 @@ VPrintExtendedCacheStats_r(int flags)
        VOL_UNLOCK;
        for (i = 0; i <= VOLMAXPARTS; i++) {
            if (part_exists[i]) {
+               /* XXX while this is currently safe, it is a violation
+                *     of the VGetPartitionById_r interface contract. */
                diskP = VGetPartitionById_r(i, 0);
                if (diskP) {
                    Log("Partition %s has %d online volumes\n", 
@@ -7178,6 +7472,43 @@ VPrintExtendedCacheStats_r(int flags)
        VOL_LOCK;
     }
 
+    /* print extended VLRU statistics: dump the volume ids found on each VLRU queue */
+    if (VVLRUExtStats_r(&vlru_stats, vol_sum) == 0) {
+       afs_uint32 idx, cur, end, lpos;
+       VolumeId line[5];
+
+       /* vlru_stats is a local snapshot, so the logging is done without VOL_LOCK */
+       VOL_UNLOCK;
+       Log("VLRU State Dump:\n\n");
+
+       for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
+           Log("\t%s:\n", vlru_idx_to_string(idx));
+           /* queue_info[idx] is a (start, len) slice of vec, so the scan
+            * must stop at start + len rather than at len */
+           lpos = 0;
+           cur = vlru_stats.queue_info[idx].start;
+           end = cur + vlru_stats.queue_info[idx].len;
+           for (; cur < end; cur++) {
+               line[lpos++] = vlru_stats.vec[cur].volid;
+               if (lpos==5) {
+                   Log("\t\t%u, %u, %u, %u, %u,\n",
+                       line[0], line[1], line[2], line[3], line[4]);
+                   lpos = 0;
+               }
+           }
+           /* flush a partial final row, padding the unused slots with 0 */
+           if (lpos) {
+               while (lpos < 5) {
+                   line[lpos++] = 0;
+               }
+               Log("\t\t%u, %u, %u, %u, %u\n",
+                   line[0], line[1], line[2], line[3], line[4]);
+           }
+           Log("\n");
+       }
+       free(vlru_stats.vec);
+       VOL_LOCK;
+    }
 }
 
 void