Windows: Do not mark server down due to Store / Fetch Data protocol error
authorJeffrey Altman <jaltman@secure-endpoints.com>
Sat, 19 Sep 2009 17:52:32 +0000 (13:52 -0400)
committerJeffrey Altman <jaltman|account-1000011@unknown>
Sun, 20 Sep 2009 12:47:25 +0000 (05:47 -0700)
When performing a StoreData or FetchData operation there are
several data validation checks performed to ensure that the
lengths of data obtained with rx_Write and rx_Read are consistent
with the RXAFS_FetchData and RXAFS_StoreData protocol operations.
When an inconsistency is detected the cache manager terminates the
call and returns an error to the caller which is passed to cm_Analyze().
The cache manager was returning -1 as the error code which is
equivalent to RX_CALL_DEAD which in turn will result in the server
being marked down.

This commit makes the following changes:

 . add trace logging to permit monitoring this case

 . instead of returning -1 return either RX_PROTOCOL_ERROR or RX_EOF
   depending on the situation

 . in cm_Analyze do not mark a server as down for rx errors other
   than RX_CALL_DEAD.  Instead, force a new connection and retry
   until the request timeout limit is reached.

LICENSE MIT

Reviewed-on: http://gerrit.openafs.org/470
Reviewed-by: Derrick Brashear <shadow@dementia.org>
Reviewed-by: Jeffrey Altman <jaltman@openafs.org>
Tested-by: Jeffrey Altman <jaltman@openafs.org>

src/WINNT/afsd/cm_conn.c
src/WINNT/afsd/cm_dcache.c

index 5c6b764..2939ba1 100644 (file)
@@ -469,7 +469,6 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     else if (errorCode == VNOVOL || errorCode == VMOVED || errorCode == VOFFLINE ||
              errorCode == VSALVAGE || errorCode == VNOSERVICE || errorCode == VIO) 
     {       
-        char addr[16];
         char *format;
        DWORD msgID;
 
@@ -676,11 +675,11 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     }
     else if (errorCode >= -64 && errorCode < 0) {
         /* mark server as down */
-        sprintf(addr, "%d.%d.%d.%d", 
+        sprintf(addr, "%d.%d.%d.%d",
                 ((serverp->addr.sin_addr.s_addr & 0xff)),
                 ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
                 ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
-                ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+                ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24));
 
         if (errorCode == RX_CALL_DEAD)
             osi_Log2(afsd_logp, "cm_Analyze: Rx Call Dead addr[%s] forcedNew[%s]",
@@ -693,15 +692,20 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                      (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no"));
 
         lock_ObtainMutex(&serverp->mx);
-       if (errorCode != RX_CALL_DEAD || 
+       if (errorCode == RX_CALL_DEAD &&
             (reqp->flags & CM_REQ_NEW_CONN_FORCED)) {
             if (!(serverp->flags & CM_SERVERFLAG_DOWN)) {
                 serverp->flags |= CM_SERVERFLAG_DOWN;
                 serverp->downTime = time(NULL);
             }
         } else {
-           reqp->flags |= CM_REQ_NEW_CONN_FORCED;
-           forcing_new = 1;
+            if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
+                reqp->tokenIdleErrorServp = serverp;
+                reqp->tokenError = errorCode;
+            } else {
+                reqp->flags |= CM_REQ_NEW_CONN_FORCED;
+                forcing_new = 1;
+            }
        }
         lock_ReleaseMutex(&serverp->mx);
        cm_ForceNewConnections(serverp);
index 2b22338..ab5411e 100644 (file)
@@ -232,7 +232,7 @@ long cm_BufWrite(void *vscp, osi_hyper_t *offsetp, long length, long flags,
                 temp = rx_Write(rxcallp, bufferp, wbytes);
                 if (temp != wbytes) {
                     osi_Log3(afsd_logp, "rx_Write failed bp 0x%p, %d != %d",bufp,temp,wbytes);
-                    code = -1;
+                    code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR;
                     break;
                 } else {
                     osi_Log2(afsd_logp, "rx_Write succeeded bp 0x%p, %d",bufp,temp);
@@ -1609,10 +1609,13 @@ long cm_GetBuffer(cm_scache_t *scp, cm_buf_t *bufp, int *cpffp, cm_user_t *userp
             if (temp == sizeof(afs_int32)) {
                 nbytes = ntohl(nbytes);
                 FillInt64(length_found, nbytes_hi, nbytes);
-                if (length_found > biod.length) 
-                    code = (rxcallp->error < 0) ? rxcallp->error : -1;
+                if (length_found > biod.length) {
+                    osi_Log0(afsd_logp, "cm_GetBuffer length_found > biod.length");
+                    code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR;
+                }
             } else {
-                code = (rxcallp->error < 0) ? rxcallp->error : -1;
+                osi_Log1(afsd_logp, "cm_GetBuffer rx_Read32 returns %d != 4", temp);
+                code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR;
             }
         }
         /* for the moment, nbytes_hi will always be 0 if code == 0
@@ -1629,11 +1632,15 @@ long cm_GetBuffer(cm_scache_t *scp, cm_buf_t *bufp, int *cpffp, cm_user_t *userp
             temp  = rx_Read32(rxcallp, &nbytes);
             if (temp == sizeof(afs_int32)) {
                 nbytes = ntohl(nbytes);
-                if (nbytes > biod.length) 
-                    code = (rxcallp->error < 0) ? rxcallp->error : -1;
+                if (nbytes > biod.length) {
+                    osi_Log0(afsd_logp, "cm_GetBuffer length_found > biod.length");
+                    code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR;
+                }
+            }
+            else {
+                osi_Log1(afsd_logp, "cm_GetBuffer rx_Read32 returns %d != 4", temp);
+                code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR;
             }
-            else 
-                code = (rxcallp->error < 0) ? rxcallp->error : -1;
         }
 #endif
 
@@ -1663,7 +1670,7 @@ long cm_GetBuffer(cm_scache_t *scp, cm_buf_t *bufp, int *cpffp, cm_user_t *userp
                 rbytes = (nbytes > cm_data.buf_blockSize? cm_data.buf_blockSize : nbytes);
                 temp = rx_Read(rxcallp, bufferp, rbytes);
                 if (temp < rbytes) {
-                    code = (rxcallp->error < 0) ? rxcallp->error : -1;
+                    code = (rxcallp->error < 0) ? rxcallp->error : RX_EOF;
                     break;
                 }