Windows: Make CM resilient to transient VNOVOL
authorJeffrey Altman <jaltman@your-file-system.com>
Sun, 6 May 2012 00:46:08 +0000 (20:46 -0400)
committerJeffrey Altman <jaltman@secure-endpoints.com>
Sun, 6 May 2012 15:42:18 +0000 (08:42 -0700)
The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which
are not indicative of the volume not being present.  For example,
VNOVOL can be sent during a transition to a VBUSY state prior to
salvaging or when cloning a .backup volume instance.  As a result
the cache manager must attempt at least one retry when a VNOVOL is
received but there are no changes to the volume location information.

This patchset records the VNOVOL error in the cm_req_t structure.
If the volume is replicated, the volume's server reference is put into a busy state.
If the volume is not replicated, the thread is paused for two seconds.
In both cases, the request is retried.  If the VNOVOL error is received
a second time from the same server, the volume server reference is
deleted as before.  This is done to prevent repeated requests to the
VLDB server and the file server that are expected to fail.  The server
reference will be restored to the volume on the next volume location
update.

Change-Id: Ica51f853683f80cb17c804cdc216f7a113cca60a
Reviewed-on: http://gerrit.openafs.org/7353
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Tested-by: Jeffrey Altman <jaltman@secure-endpoints.com>
Reviewed-by: Jeffrey Altman <jaltman@secure-endpoints.com>

src/WINNT/afsd/cm_conn.c
src/WINNT/afsd/cm_conn.h

index ab95902..15f0f50 100644 (file)
@@ -688,25 +688,53 @@ cm_Analyze(cm_conn_t *connp,
 
             if (cm_ServerEqual(tsrp->server, serverp)) {
                 /* REDIRECT */
-                if (errorCode == VMOVED || errorCode == VNOVOL) {
-                    osi_Log2(afsd_logp, "volume %d not present on server %s",
+                switch (errorCode) {
+                case VMOVED:
+                    osi_Log2(afsd_logp, "volume %u moved from server %s",
                              fidp->volume, osi_LogSaveString(afsd_logp,addr));
                     tsrp->status = srv_deleted;
                     if (fidp)
                         cm_RemoveVolumeFromServer(serverp, fidp->volume);
-                } else {
-                    osi_Log2(afsd_logp, "volume %d instance on server %s marked offline",
-                             fidp->volume, osi_LogSaveString(afsd_logp,addr));
-                    tsrp->status = srv_offline;
+                    break;
+                case VNOVOL:
+                    /*
+                     * The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which
+                     * are not indicative of the volume not being present.  For example,
+                     * VNOVOL can be sent during a transition to a VBUSY state prior to
+                     * salvaging or when cloning a .backup volume instance.  As a result
+                     * the cache manager must attempt at least one retry when a VNOVOL is
+                     * received but there are no changes to the volume location information.
+                     */
+                    if (reqp->vnovolError > 0 && cm_ServerEqual(reqp->errorServp, serverp)) {
+                        osi_Log2(afsd_logp, "volume %u not present on server %s",
+                                  fidp->volume, osi_LogSaveString(afsd_logp,addr));
+                        tsrp->status = srv_deleted;
+                        if (fidp)
+                            cm_RemoveVolumeFromServer(serverp, fidp->volume);
+                    } else {
+                        osi_Log2(afsd_logp, "VNOVOL received for volume %u from server %s",
+                                 fidp->volume, osi_LogSaveString(afsd_logp,addr));
+                        if (replicated) {
+                            cm_SetServerBusyStatus(serversp, serverp);
+                        } else {
+                            Sleep(2000);
+                        }
+                    }
+                    break;
+                default:
+                    osi_Log3(afsd_logp, "volume %u exists on server %s with status %u",
+                             fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
                 }
-                /* break; */
-            } else {
-                osi_Log3(afsd_logp, "volume %d exists on server %s with status %u",
-                         fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
             }
         }
         lock_ReleaseWrite(&cm_serverLock);
 
+        /* Remember that the VNOVOL error occurred */
+        if (errorCode == VNOVOL) {
+            reqp->errorServp = serverp;
+            reqp->vnovolError++;
+        }
+
         /* Free the server list before cm_ForceUpdateVolume is called */
         if (free_svr_list) {
             cm_FreeServerList(serverspp, 0);
@@ -779,7 +807,7 @@ cm_Analyze(cm_conn_t *connp,
             LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr);
             osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]",
                      osi_LogSaveString(afsd_logp,addr));
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->idleError++;
         }
 
@@ -947,7 +975,7 @@ cm_Analyze(cm_conn_t *connp,
         }
 
         if (replicated && serverp) {
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->tokenError = errorCode;
 
             if (timeLeft > 2)
@@ -1023,7 +1051,7 @@ cm_Analyze(cm_conn_t *connp,
 
         if (serverp) {
             if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
-                reqp->tokenIdleErrorServp = serverp;
+                reqp->errorServp = serverp;
                 reqp->tokenError = errorCode;
             } else {
                 reqp->flags |= CM_REQ_NEW_CONN_FORCED;
@@ -1071,7 +1099,7 @@ cm_Analyze(cm_conn_t *connp,
                   errorCode, s);
 
         if (serverp) {
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->tokenError = errorCode;
             retry = 1;
         }
@@ -1084,7 +1112,7 @@ cm_Analyze(cm_conn_t *connp,
          * and force the use of another server.
          */
         if (serverp) {
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->tokenError = errorCode;
             retry = 1;
         }
@@ -1280,15 +1308,15 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, afs_uint32 replicated, cm_user_
             continue;
 
         tsp = tsrp->server;
-        if (reqp->tokenIdleErrorServp) {
+        if (reqp->errorServp) {
             /*
              * search the list until we find the server
              * that failed last time.  When we find it
              * clear the error, skip it and try the next one
              * in the list.
              */
-            if (tsp == reqp->tokenIdleErrorServp)
-                reqp->tokenIdleErrorServp = NULL;
+            if (tsp == reqp->errorServp)
+                reqp->errorServp = NULL;
             continue;
         }
         if (tsp) {
index 2c7bdfc..affae47 100644 (file)
@@ -62,9 +62,10 @@ typedef struct cm_req {
     int rpcError;              /* RPC error code */
     int volumeError;           /* volume error code */
     int accessError;           /* access error code */
-    struct cm_server * tokenIdleErrorServp;  /* server that reported a token/idle error other than expired */
+    struct cm_server * errorServp;  /* server that reported a token/idle/VNOVOL error other than expired */
     int tokenError;
     int idleError;
+    int vnovolError;
     afs_uint32 flags;
     clientchar_t * tidPathp;
     clientchar_t * relPathp;