windows-volume-status-20080418
authorJeffrey Altman <jaltman@secure-endpoints.com>
Fri, 18 Apr 2008 17:43:19 +0000 (17:43 +0000)
committerJeffrey Altman <jaltman@secure-endpoints.com>
Fri, 18 Apr 2008 17:43:19 +0000 (17:43 +0000)
LICENSE MIT

A problem was discovered with cm_Analyze when serverRef instances are
marked offline.  cm_CheckOfflineVolume() was not resetting the serverRef
state.  cm_Analyze was also waiting in sleep calls when it was it is
clear that waiting would not result in a change of state.

cm_CheckOfflineVolume() was updated to always reset the serverRef state
and indicate that the volume should be considered online for the purpose
of retrying when the serverRef state is reset to srv_not_busy.

The problems identified in the MIT dev.mit.edu stress test environment
are resolved by these changes.

src/WINNT/afsd/cm_conn.c
src/WINNT/afsd/cm_volume.c

index 6f08197..130deda 100644 (file)
@@ -127,7 +127,9 @@ static long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp,
     
     *serversppp = cm_GetVolServers(volp, fidp->volume);
 
+    lock_ObtainRead(&cm_volumeLock);
     cm_PutVolume(volp);
+    lock_ReleaseRead(&cm_volumeLock);
     return 0;
 }
 
@@ -261,7 +263,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     else if (errorCode == CM_ERROR_ALLDOWN) {
        osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLDOWN.");
        /* Servers marked DOWN will be restored by the background daemon
-        * thread as they become available.
+        * thread as they become available.  The volume status is 
+         * updated as the server state changes.
         */
     }
 
@@ -270,43 +273,45 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         /* Volume instances marked offline will be restored by the 
          * background daemon thread as they become available 
          */
-        if (timeLeft > 7 && fidp) {
-            thrd_Sleep(5000);
-
+        if (fidp) {
             code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
-                                    CM_GETVOL_FLAG_NO_LRU_UPDATE, 
-                                    &volp);
+                                      CM_GETVOL_FLAG_NO_LRU_UPDATE, 
+                                      &volp);
             if (code == 0) {
-                statep = cm_VolumeStateByID(volp, fidp->volume);
-                if (statep->state != vl_offline && statep->state != vl_unknown) {
-                    retry = 1;
-                } else {
-                    if (cm_CheckOfflineVolume(volp, statep->ID))
+                if (timeLeft > 7) {
+                    thrd_Sleep(5000);
+
+                    /* cm_CheckOfflineVolume() resets the serverRef state */
+                    if (cm_CheckOfflineVolume(volp, fidp->volume))
                         retry = 1;
+                } else {
+                    cm_UpdateVolumeStatus(volp, fidp->volume);
                 }
-            
+                lock_ObtainRead(&cm_volumeLock);
                 cm_PutVolume(volp);
+                lock_ReleaseRead(&cm_volumeLock);
+                volp = NULL;
             }
-        }
+        } 
     }
     else if (errorCode == CM_ERROR_ALLBUSY) {
         /* Volumes that are busy cannot be determined to be non-busy 
          * without actually attempting to access them.
          */
        osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY.");
-        if (timeLeft > 7) {
 
-            thrd_Sleep(5000);
-
-            if (fidp) { /* File Server query */
-                code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
-                                        CM_GETVOL_FLAG_NO_LRU_UPDATE, 
-                                        &volp);
-                if (code == 0) {
+        if (fidp) { /* File Server query */
+            code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
+                                     CM_GETVOL_FLAG_NO_LRU_UPDATE, 
+                                     &volp);
+            if (code == 0) {
+                if (timeLeft > 7) {
+                    thrd_Sleep(5000);
+                    
                     statep = cm_VolumeStateByID(volp, fidp->volume);
                     if (statep->state != vl_offline && 
-                        statep->state != vl_busy &&
-                        statep->state != vl_unknown) {
+                         statep->state != vl_busy &&
+                         statep->state != vl_unknown) {
                         retry = 1;
                     } else {
                         if (!serversp) {
@@ -320,7 +325,7 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
                             if (tsrp->status == srv_busy) {
                                 tsrp->status = srv_not_busy;
-                            }
+                            }       
                         }
                         lock_ReleaseWrite(&cm_serverLock);
                         if (free_svr_list) {
@@ -328,13 +333,22 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                             *serverspp = serversp;
                         }
 
-                        cm_UpdateVolumeStatus(volp, statep->ID);
+                        cm_UpdateVolumeStatus(volp, fidp->volume);
                         retry = 1;
                     }
-            
-                    cm_PutVolume(volp);
+                } else {
+                    cm_UpdateVolumeStatus(volp, fidp->volume);
                 }
-            } else {    /* VL Server query */
+
+                lock_ObtainRead(&cm_volumeLock);
+                cm_PutVolume(volp);
+                lock_ReleaseRead(&cm_volumeLock);
+                volp = NULL;
+            }
+        } else {    /* VL Server query */
+            if (timeLeft > 7) {
+                thrd_Sleep(5000);
+
                 if (serversp) {
                     lock_ObtainWrite(&cm_serverLock);
                     for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
@@ -376,7 +390,10 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         
         if (statep) {
             cm_UpdateVolumeStatus(volp, statep->ID);
+            lock_ObtainRead(&cm_volumeLock);
             cm_PutVolume(volp);
+            lock_ReleaseRead(&cm_volumeLock);
+            volp = NULL;
         }
 
         if (free_svr_list) {
@@ -467,7 +484,10 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
 
         if (statep) {
             cm_UpdateVolumeStatus(volp, statep->ID);
+            lock_ObtainRead(&cm_volumeLock);
             cm_PutVolume(volp);
+            lock_ReleaseRead(&cm_volumeLock);
+            volp = NULL;
         }
 
         if (free_svr_list) {
index 8be20ec..ffe8ef1 100644 (file)
@@ -694,9 +694,11 @@ long cm_FindVolumeByID(cm_cell_t *cellp, afs_uint32 volumeID, cm_user_t *userp,
                 cm_AdjustVolumeLRU(volp);
                 lock_ReleaseWrite(&cm_volumeLock);
             }
-        } else
+        } else {
+            lock_ObtainRead(&cm_volumeLock);
             cm_PutVolume(volp);
-
+            lock_ReleaseRead(&cm_volumeLock);
+        }
         return code;
     }
         
@@ -859,9 +861,11 @@ long cm_FindVolumeByName(struct cm_cell *cellp, char *volumeNamep,
             cm_AdjustVolumeLRU(volp);
             lock_ReleaseWrite(&cm_volumeLock);
         }
-    } else
+    } else {
+        lock_ObtainRead(&cm_volumeLock);
         cm_PutVolume(volp);
-
+        lock_ReleaseRead(&cm_volumeLock);
+    }
     return code;
 }      
 
@@ -924,6 +928,9 @@ void cm_ForceUpdateVolume(cm_fid_t *fidp, cm_user_t *userp, cm_req_t *reqp)
 
     lock_ReleaseRead(&cm_volumeLock);
 
+    if (!volp)
+        return;
+
     /* update it */
     cm_data.mountRootGen = time(NULL);
     lock_ObtainWrite(&volp->rw);
@@ -945,7 +952,9 @@ void cm_ForceUpdateVolume(cm_fid_t *fidp, cm_user_t *userp, cm_req_t *reqp)
 #endif
     lock_ReleaseWrite(&volp->rw);
 
+    lock_ObtainRead(&cm_volumeLock);
     cm_PutVolume(volp);
+    lock_ReleaseRead(&cm_volumeLock);
 }
 
 /* find the appropriate servers from a volume */
@@ -1075,110 +1084,122 @@ cm_CheckOfflineVolume(cm_volume_t *volp, afs_uint32 volID)
     }
 
     if (volp->vol[RWVOL].ID != 0 && (!volID || volID == volp->vol[RWVOL].ID) &&
-         volp->vol[RWVOL].serversp &&
-         (volp->vol[RWVOL].state == vl_busy || volp->vol[RWVOL].state == vl_offline || volp->vol[RWVOL].state == vl_unknown)) {
-        cm_InitReq(&req);
-
+         volp->vol[RWVOL].serversp) {
+       
         for (serversp = volp->vol[RWVOL].serversp; serversp; serversp = serversp->next) {
-            if (serversp->status == srv_busy || serversp->status == srv_offline)
+            if (serversp->status == srv_busy || serversp->status == srv_offline) {
                 serversp->status = srv_not_busy;
+                online = 1;
+            }
         }
 
-        lock_ReleaseWrite(&volp->rw);
-        do {
-            code = cm_ConnFromVolume(volp, volp->vol[RWVOL].ID, cm_rootUserp, &req, &connp);
-            if (code) 
-                continue;
+        if (volp->vol[RWVOL].state == vl_busy || volp->vol[RWVOL].state == vl_offline || volp->vol[RWVOL].state == vl_unknown) {
+            cm_InitReq(&req);
 
-            callp = cm_GetRxConn(connp);
-            code = RXAFS_GetVolumeStatus(callp, volp->vol[RWVOL].ID,
-                                          &volStat, &Name, &OfflineMsg, &MOTD);
-            rx_PutConnection(callp);        
+            lock_ReleaseWrite(&volp->rw);
+            do {
+                code = cm_ConnFromVolume(volp, volp->vol[RWVOL].ID, cm_rootUserp, &req, &connp);
+                if (code) 
+                    continue;
 
-        } while (cm_Analyze(connp, cm_rootUserp, &req, NULL, NULL, NULL, NULL, code));
-        code = cm_MapRPCError(code, &req);
+                callp = cm_GetRxConn(connp);
+                code = RXAFS_GetVolumeStatus(callp, volp->vol[RWVOL].ID,
+                                             &volStat, &Name, &OfflineMsg, &MOTD);
+                rx_PutConnection(callp);            
 
-        lock_ObtainWrite(&volp->rw);
-        if (code == 0 && volStat.Online) {
-            cm_VolumeStatusNotification(volp, volp->vol[RWVOL].ID, volp->vol[RWVOL].state, vl_online);
-            volp->vol[RWVOL].state = vl_online;
-            online = 1;
-        } else if (code == CM_ERROR_NOACCESS) {
-            cm_VolumeStatusNotification(volp, volp->vol[RWVOL].ID, volp->vol[RWVOL].state, vl_unknown);
-            volp->vol[RWVOL].state = vl_unknown;
-            online = 1;
+            } while (cm_Analyze(connp, cm_rootUserp, &req, NULL, NULL, NULL, NULL, code));
+            code = cm_MapRPCError(code, &req);
+
+            lock_ObtainWrite(&volp->rw);
+            if (code == 0 && volStat.Online) {
+                cm_VolumeStatusNotification(volp, volp->vol[RWVOL].ID, volp->vol[RWVOL].state, vl_online);
+                volp->vol[RWVOL].state = vl_online;
+                online = 1;
+            } else if (code == CM_ERROR_NOACCESS) {
+                cm_VolumeStatusNotification(volp, volp->vol[RWVOL].ID, volp->vol[RWVOL].state, vl_unknown);
+                volp->vol[RWVOL].state = vl_unknown;
+                online = 1;
+            }
         }
     }
 
     if (volp->vol[ROVOL].ID != 0 && (!volID || volID == volp->vol[ROVOL].ID) &&
-         volp->vol[ROVOL].serversp &&
-         (volp->vol[ROVOL].state == vl_busy || volp->vol[ROVOL].state == vl_offline || volp->vol[ROVOL].state == vl_unknown)) {
-        cm_InitReq(&req);
+         volp->vol[ROVOL].serversp) {
 
         for (serversp = volp->vol[ROVOL].serversp; serversp; serversp = serversp->next) {
-            if (serversp->status == srv_busy || serversp->status == srv_offline)
+            if (serversp->status == srv_busy || serversp->status == srv_offline) {
                 serversp->status = srv_not_busy;
+                online = 1;
+            }
         }
 
-        lock_ReleaseWrite(&volp->rw);
-        do {
-            code = cm_ConnFromVolume(volp, volp->vol[ROVOL].ID, cm_rootUserp, &req, &connp);
-            if (code) 
-                continue;
+        if (volp->vol[ROVOL].state == vl_busy || volp->vol[ROVOL].state == vl_offline || volp->vol[ROVOL].state == vl_unknown) {
+            cm_InitReq(&req);
 
-            callp = cm_GetRxConn(connp);
-            code = RXAFS_GetVolumeStatus(callp, volp->vol[ROVOL].ID,
-                                          &volStat, &Name, &OfflineMsg, &MOTD);
-            rx_PutConnection(callp);        
+            lock_ReleaseWrite(&volp->rw);
+            do {
+                code = cm_ConnFromVolume(volp, volp->vol[ROVOL].ID, cm_rootUserp, &req, &connp);
+                if (code) 
+                    continue;
 
-        } while (cm_Analyze(connp, cm_rootUserp, &req, NULL, NULL, NULL, NULL, code));
-        code = cm_MapRPCError(code, &req);
+                callp = cm_GetRxConn(connp);
+                code = RXAFS_GetVolumeStatus(callp, volp->vol[ROVOL].ID,
+                                              &volStat, &Name, &OfflineMsg, &MOTD);
+                rx_PutConnection(callp);        
 
-        lock_ObtainWrite(&volp->rw);
-        if (code == 0 && volStat.Online) {
-            cm_VolumeStatusNotification(volp, volp->vol[ROVOL].ID, volp->vol[ROVOL].state, vl_online);
-            volp->vol[ROVOL].state = vl_online;
-            online = 1;
-        } else if (code == CM_ERROR_NOACCESS) {
-            cm_VolumeStatusNotification(volp, volp->vol[ROVOL].ID, volp->vol[ROVOL].state, vl_unknown);
-            volp->vol[ROVOL].state = vl_unknown;
-            online = 1;
+            } while (cm_Analyze(connp, cm_rootUserp, &req, NULL, NULL, NULL, NULL, code));
+            code = cm_MapRPCError(code, &req);
+
+            lock_ObtainWrite(&volp->rw);
+            if (code == 0 && volStat.Online) {
+                cm_VolumeStatusNotification(volp, volp->vol[ROVOL].ID, volp->vol[ROVOL].state, vl_online);
+                volp->vol[ROVOL].state = vl_online;
+                online = 1;
+            } else if (code == CM_ERROR_NOACCESS) {
+                cm_VolumeStatusNotification(volp, volp->vol[ROVOL].ID, volp->vol[ROVOL].state, vl_unknown);
+                volp->vol[ROVOL].state = vl_unknown;
+                online = 1;
+            }
         }
     }
 
     if (volp->vol[BACKVOL].ID != 0 && (!volID || volID == volp->vol[BACKVOL].ID) &&
-         volp->vol[BACKVOL].serversp &&
-         (volp->vol[BACKVOL].state == vl_busy || volp->vol[BACKVOL].state == vl_offline || volp->vol[BACKVOL].state == vl_unknown)) {
-        cm_InitReq(&req);
-
+         volp->vol[BACKVOL].serversp) {
+        
         for (serversp = volp->vol[BACKVOL].serversp; serversp; serversp = serversp->next) {
-            if (serversp->status == srv_busy || serversp->status == srv_offline)
+            if (serversp->status == srv_busy || serversp->status == srv_offline) {
                 serversp->status = srv_not_busy;
+                online = 1;
+            }
         }
 
-        lock_ReleaseWrite(&volp->rw);
-        do {
-            code = cm_ConnFromVolume(volp, volp->vol[BACKVOL].ID, cm_rootUserp, &req, &connp);
-            if (code) 
-                continue;
+        if (volp->vol[BACKVOL].state == vl_busy || volp->vol[BACKVOL].state == vl_offline || volp->vol[BACKVOL].state == vl_unknown) {
+            cm_InitReq(&req);
 
-            callp = cm_GetRxConn(connp);
-            code = RXAFS_GetVolumeStatus(callp, volp->vol[BACKVOL].ID,
-                                          &volStat, &Name, &OfflineMsg, &MOTD);
-            rx_PutConnection(callp);        
+            lock_ReleaseWrite(&volp->rw);
+            do {
+                code = cm_ConnFromVolume(volp, volp->vol[BACKVOL].ID, cm_rootUserp, &req, &connp);
+                if (code) 
+                    continue;
 
-        } while (cm_Analyze(connp, cm_rootUserp, &req, NULL, NULL, NULL, NULL, code));
-        code = cm_MapRPCError(code, &req);
+                callp = cm_GetRxConn(connp);
+                code = RXAFS_GetVolumeStatus(callp, volp->vol[BACKVOL].ID,
+                                              &volStat, &Name, &OfflineMsg, &MOTD);
+                rx_PutConnection(callp);        
 
-        lock_ObtainWrite(&volp->rw);
-        if (code == 0 && volStat.Online) {
-            cm_VolumeStatusNotification(volp, volp->vol[BACKVOL].ID, volp->vol[BACKVOL].state, vl_online);
-            volp->vol[BACKVOL].state = vl_online;
-            online = 1;
-        } else if (code == CM_ERROR_NOACCESS) {
-            cm_VolumeStatusNotification(volp, volp->vol[BACKVOL].ID, volp->vol[BACKVOL].state, vl_unknown);
-            volp->vol[BACKVOL].state = vl_unknown;
-            online = 1;
+            } while (cm_Analyze(connp, cm_rootUserp, &req, NULL, NULL, NULL, NULL, code));
+            code = cm_MapRPCError(code, &req);
+
+            lock_ObtainWrite(&volp->rw);
+            if (code == 0 && volStat.Online) {
+                cm_VolumeStatusNotification(volp, volp->vol[BACKVOL].ID, volp->vol[BACKVOL].state, vl_online);
+                volp->vol[BACKVOL].state = vl_online;
+                online = 1;
+            } else if (code == CM_ERROR_NOACCESS) {
+                cm_VolumeStatusNotification(volp, volp->vol[BACKVOL].ID, volp->vol[BACKVOL].state, vl_unknown);
+                volp->vol[BACKVOL].state = vl_unknown;
+                online = 1;
+            }
         }
     }