windows-vmoved-vnovol-failover-20090212
authorJeffrey Altman <jaltman@secure-endpoints.com>
Thu, 12 Feb 2009 21:30:03 +0000 (21:30 +0000)
committerJeffrey Altman <jaltman@secure-endpoints.com>
Thu, 12 Feb 2009 21:30:03 +0000 (21:30 +0000)
LICENSE MIT
FIXES 124276

A readonly volume with multiple instances was being marked alldown
when one of the instances resulted in VNOVOL because the vldb and
the indicated file server were out of sync.  The cache manager would
then attempt to re-obtain the volume location info from the vldb
for the entire timeout period.

This patch:

adds trace logging to cm_Analyze indicating which server references
are being deleted and which remain in response to a VNOVOL/VMOVED error.

adds trace logging to cm_UpdateVolumeStatusInt showing how the
new volume status value is determined.

corrects cm_Analyze() so that it doesn't corrupt the server reference
list during cm_FreeServerList() calls; doesn't orphan the server reference
list by setting *serverpp to NULL; and only re-obtains the volume info
list once per request.

src/WINNT/afsd/cm_conn.c
src/WINNT/afsd/cm_conn.h
src/WINNT/afsd/cm_volume.c

index 1d4e0fd..693b7fe 100644 (file)
@@ -345,8 +345,9 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                         }
                         lock_ReleaseWrite(&cm_serverLock);
                         if (free_svr_list) {
-                            cm_FreeServerList(&serversp, 0);
-                            *serverspp = serversp = NULL;
+                            cm_FreeServerList(serverspp, 0);
+                            serverspp = NULL;
+                            serversp = NULL;
                             free_svr_list = 0;
                         }
 
@@ -420,8 +421,9 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         }
 
         if (free_svr_list) {
-            cm_FreeServerList(&serversp, 0);
-            *serverspp = serversp = NULL;
+            cm_FreeServerList(serverspp, 0);
+            serverspp = NULL;
+            serversp = NULL;
             free_svr_list = 0;
         }
         retry = 1;
@@ -434,6 +436,10 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         char addr[16];
         char *format;
        DWORD msgID;
+
+        /* In case of timeout */
+        reqp->volumeError = errorCode;
+
         switch ( errorCode ) {
         case VNOVOL:
            msgID = MSG_SERVER_REPORTS_VNOVOL;
@@ -473,11 +479,14 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
            LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume);
         }
 
-        /* Mark server offline for this volume */
+        /* 
+         * Mark server offline for this volume or delete the volume
+         * from the server list if it was moved or is not present.
+         */
         if (!serversp && fidp) {
             code = cm_GetServerList(fidp, userp, reqp, &serverspp);
             if (code == 0) {
-                serversp = *serverspp = NULL;
+                serversp = *serverspp;
                 free_svr_list = 1;
             }
         }
@@ -486,23 +495,39 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
             if (tsrp->status == srv_deleted)
                 continue;
+
+            sprintf(addr, "%d.%d.%d.%d",
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff)),
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff00)>> 8),
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+
             if (tsrp->server == serverp) {
                 /* REDIRECT */
                 if (errorCode == VMOVED || errorCode == VNOVOL) {
+                    osi_Log2(afsd_logp, "volume %d not present on server %s", 
+                             fidp->volume, osi_LogSaveString(afsd_logp,addr));
                     tsrp->status = srv_deleted;
                     if (fidp)
                         cm_RemoveVolumeFromServer(serverp, fidp->volume);
                 } else {
+                    osi_Log2(afsd_logp, "volume %d instance on server %s marked offline", 
+                             fidp->volume, osi_LogSaveString(afsd_logp,addr));
                     tsrp->status = srv_offline;
                 }
+                /* break; */
+            } else {
+                osi_Log3(afsd_logp, "volume %d exists on server %s with status %u", 
+                         fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
             }
         }   
         lock_ReleaseWrite(&cm_serverLock);
 
         /* Free the server list before cm_ForceUpdateVolume is called */
         if (free_svr_list) {
-            cm_FreeServerList(&serversp, 0);
-            *serverspp = serversp = NULL;
+            cm_FreeServerList(serverspp, 0);
+            serverspp = NULL;
+            serversp = NULL;
             free_svr_list = 0;
         }
 
@@ -513,10 +538,14 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
             if (code == 0)
                 statep = cm_VolumeStateByID(volp, fidp->volume);
 
-            if (errorCode == VMOVED || errorCode == VNOVOL) {
+            if ((errorCode == VMOVED || errorCode == VNOVOL) &&
+                !(reqp->flags & CM_REQ_VOLUME_UPDATED)) 
+            {
                 code = cm_ForceUpdateVolume(fidp, userp, reqp);
                 if (code) 
                     timeLeft = 0;   /* prevent a retry on failure */
+                else
+                    reqp->flags |= CM_REQ_VOLUME_UPDATED;
                 osi_Log3(afsd_logp, "cm_Analyze called cm_ForceUpdateVolume cell %u vol %u code 0x%x",
                         fidp->cell, fidp->volume, code);
             }
@@ -799,6 +828,16 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     if (connp)
         cm_PutConn(connp);
 
+    /* 
+     * clear the volume updated flag if we succeed.
+     * this way the flag will not prevent a subsequent volume 
+     * from being updated if necessary.
+     */
+    if (errorCode == 0)
+    {
+        reqp->flags &= ~CM_REQ_VOLUME_UPDATED;
+    }
+
     /* retry until we fail to find a connection */
     return retry;
 }
index 68b464c..d5f41e2 100644 (file)
@@ -55,6 +55,7 @@ typedef struct cm_req {
 #define        CM_REQ_NORETRY          0x1
 #define CM_REQ_NEW_CONN_FORCED  0x2
 #define CM_REQ_SOURCE_SMB       0x4
+#define CM_REQ_VOLUME_UPDATED   0x8
 
 /*
  * Vice2 error codes
index 5f8fe84..e107e16 100644 (file)
@@ -1283,6 +1283,7 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep)
     cm_serverRef_t *tsrp;
     cm_server_t *tsp;
     int someBusy = 0, someOffline = 0, allOffline = 1, allBusy = 1, allDown = 1;
+    char addr[16];
 
     if (!volp || !statep) {
 #ifdef DEBUG
@@ -1293,29 +1294,50 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep)
 
     lock_ObtainWrite(&cm_serverLock);
     for (tsrp = statep->serversp; tsrp; tsrp=tsrp->next) {
-        if (tsrp->status == srv_deleted)
-            continue;
         tsp = tsrp->server;
+        sprintf(addr, "%d.%d.%d.%d", 
+                 ((tsp->addr.sin_addr.s_addr & 0xff)),
+                 ((tsp->addr.sin_addr.s_addr & 0xff00)>> 8),
+                 ((tsp->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                 ((tsp->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+
+        if (tsrp->status == srv_deleted) {
+            osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s deleted", 
+                     statep->ID, osi_LogSaveString(afsd_logp,addr));
+            continue;
+        }
         if (tsp) {
             cm_GetServerNoLock(tsp);
             if (!(tsp->flags & CM_SERVERFLAG_DOWN)) {
                 allDown = 0;
                 if (tsrp->status == srv_busy) {
+                    osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s busy", 
+                              statep->ID, osi_LogSaveString(afsd_logp,addr));
                     allOffline = 0;
                     someBusy = 1;
                 } else if (tsrp->status == srv_offline) {
+                    osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s offline", 
+                              statep->ID, osi_LogSaveString(afsd_logp,addr));
                     allBusy = 0;
                     someOffline = 1;
                 } else {
+                    osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s online", 
+                              statep->ID, osi_LogSaveString(afsd_logp,addr));
                     allOffline = 0;
                     allBusy = 0;
                 }
+            } else {
+                osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s online", 
+                          statep->ID, osi_LogSaveString(afsd_logp,addr));
             }
             cm_PutServerNoLock(tsp);
         }
     }   
     lock_ReleaseWrite(&cm_serverLock);
 
+    osi_Log5(afsd_logp, "cm_UpdateVolumeStatusInt allDown %d allBusy %d someBusy %d someOffline %d allOffline %d", 
+             allDown, allBusy, someBusy, someOffline, allOffline);
+
     if (allDown)
        newStatus = vl_alldown;
     else if (allBusy || (someBusy && someOffline))