afs: discard cached state when we are unsure of validity
[openafs.git] / src / afs / afs_analyze.c
index 05d39a2..0023d5b 100644 (file)
 #include "afs/afs_util.h"
 #include "afs/unified_afs.h"
 
-#if    defined(AFS_SUN56_ENV)
+#if    defined(AFS_SUN5_ENV)
 #include <inet/led.h>
 #include <inet/common.h>
-#if     defined(AFS_SUN58_ENV)
 #include <netinet/ip6.h>
-#endif
 #include <inet/ip.h>
 #endif
 
@@ -174,13 +172,14 @@ VLDB_Same(struct VenusFid *afid, struct vrequest *areq)
        for (i = 0; i < NMAXNSERVERS && tvp->serverHost[i]; i++) {
            oldhosts[i] = tvp->serverHost[i];
        }
+       ReleaseWriteLock(&tvp->lock);
 
        if (type == 2) {
-           InstallUVolumeEntry(tvp, &v->utve, afid->Cell, tcell, &treq);
+           LockAndInstallUVolumeEntry(tvp, &v->utve, afid->Cell, tcell, &treq);
        } else if (type == 1) {
-           InstallNVolumeEntry(tvp, &v->ntve, afid->Cell);
+           LockAndInstallNVolumeEntry(tvp, &v->ntve, afid->Cell);
        } else {
-           InstallVolumeEntry(tvp, &v->tve, afid->Cell);
+           LockAndInstallVolumeEntry(tvp, &v->tve, afid->Cell);
        }
 
        if (i < NMAXNSERVERS && tvp->serverHost[i]) {
@@ -259,19 +258,74 @@ afs_BlackListOnce(struct vrequest *areq, struct VenusFid *afid,
                    areq->skipserver[i] = 1;
                }
            }
-           afs_PutVolume(tvp, READ_LOCK);
            for (i = 0; i < AFS_MAXHOSTS; i++) {
                if (tvp->serverHost[i] && areq->skipserver[i] == 0) {
                    serversleft = 1;
                    break;
                }
            }
+           afs_PutVolume(tvp, READ_LOCK);
            return serversleft;
        }
     }
     return serversleft;
 }
 
+/*------------------------------------------------------------------------
+ * afs_ClearStatus
+ *
+ * Description:
+ *     Discard cached status for the file involved in a failed write RPC,
+ *     since we cannot be sure the cached state is still valid.
+ *
+ * Arguments:
+ *     afid  : The FID of the file involved in the action.  This argument
+ *             may be null if none was involved.
+ *      op    : which RPC we are analyzing.
+ *      avp   : A pointer to the struct volume, if we already have one.
+ *
+ * Returns:
+ *     Non-zero if op is not a write RPC (nothing was discarded); zero if
+ *     it is, in which case the operation must not be retried.
+ *
+ * Environment:
+ *     Called after a network error; discards cached state only if the
+ *     operation was a data-mutating operation.
+ *------------------------------------------------------------------------*/
+static int
+afs_ClearStatus(struct VenusFid *afid, int op, struct volume *avp)
+{
+    struct volume *tvp = NULL;
+
+    /* if it's not a write op, we have nothing to veto and shouldn't clear. */
+    if (!AFS_STATS_FS_RPCIDXES_ISWRITE(op)) {
+       return 1;
+    }
+
+    if (avp)
+       tvp = avp;
+    else if (afid)
+       tvp = afs_FindVolume(afid, READ_LOCK);
+
+    /* don't assume just discarding will fix if no cached volume */
+    if (tvp) {
+       struct vcache *tvc;
+       ObtainReadLock(&afs_xvcache);
+       if ((tvc = afs_FindVCache(afid, 0, 0))) {
+           ReleaseReadLock(&afs_xvcache);
+           tvc->f.states &= ~(CStatd | CUnique);
+           afs_PutVCache(tvc);
+       } else {
+           ReleaseReadLock(&afs_xvcache);
+       }
+       /* release only a volume we looked up ourselves; tvp is non-NULL here */
+       if (!avp)
+           afs_PutVolume(tvp, READ_LOCK);
+    }
+
+    /* not retriable: we may have raced ourselves */
+    return 0;
+}
 
 /*------------------------------------------------------------------------
  * EXPORTED afs_Analyze
@@ -317,7 +371,6 @@ afs_Analyze(struct afs_conn *aconn, struct rx_connection *rxconn,
     afs_int32 shouldRetry = 0;
     afs_int32 serversleft = 1;
     struct afs_stats_RPCErrors *aerrP;
-    afs_int32 markeddown;
     afs_uint32 address;
 
     if (AFS_IS_DISCONNECTED && !AFS_IN_SYNC) {
@@ -391,34 +444,86 @@ afs_Analyze(struct afs_conn *aconn, struct rx_connection *rxconn,
                ((afid && afs_IsPrimaryCellNum(afid->Cell))
                 || (cellp && afs_IsPrimaryCell(cellp)))) {
                if (!afid) {
-                   afs_warnuser
-                       ("afs: hard-mount waiting for a vlserver to return to service\n");
+                   static int afs_vl_hm = 0;
+                   int warn = 0;
+                   if (!afs_vl_hm) {
+                       afs_vl_hm = warn = 1;
+                   }
+                   if (warn) {
+                       afs_warnuser
+                           ("afs: hard-mount waiting for a vlserver to return to service\n");
+                   }
                    VSleep(hm_retry_int);
                    afs_CheckServers(1, cellp);
                    shouldRetry = 1;
+
+                   if (warn) {
+                       afs_vl_hm = 0;
+                   }
                } else {
+                   static int afs_unknown_vhm = 0;
+                   int warn = 0, vp_vhm = 0;
+
                    tvp = afs_FindVolume(afid, READ_LOCK);
                    if (!tvp || (tvp->states & VRO)) {
                        shouldRetry = hm_retry_RO;
                    } else {
                        shouldRetry = hm_retry_RW;
                    }
+
+                   /* Set 'warn' if we should afs_warnuser. Only let one
+                    * caller call afs_warnuser per hm_retry_int interval per
+                    * volume. */
+                   if (shouldRetry) {
+                       if (tvp) {
+                           if (!(tvp->states & VHardMount)) {
+                               tvp->states |= VHardMount;
+                               warn = vp_vhm = 1;
+                           }
+                       } else {
+                           if (!afs_unknown_vhm) {
+                               afs_unknown_vhm = 1;
+                               warn = 1;
+                           }
+                       }
+                   }
+
                    if (tvp)
                        afs_PutVolume(tvp, READ_LOCK);
+
                    if (shouldRetry) {
-                       afs_warnuser
-                           ("afs: hard-mount waiting for volume %u\n",
-                            afid->Fid.Volume);
+                       if (warn) {
+                           afs_warnuser
+                               ("afs: hard-mount waiting for volume %u\n",
+                                afid->Fid.Volume);
+                       }
+
                        VSleep(hm_retry_int);
                        afs_CheckServers(1, cellp);
+                       /* clear the black listed servers on this request. */
+                       memset(areq->skipserver, 0, sizeof(areq->skipserver));
+
+                       if (vp_vhm) {
+                           tvp = afs_FindVolume(afid, READ_LOCK);
+                           if (tvp) {
+                               tvp->states &= ~VHardMount;
+                               afs_PutVolume(tvp, READ_LOCK);
+                           }
+                       } else if (warn) {
+                           afs_unknown_vhm = 0;
+                       }
                    }
                }
            } /* if (hm_retry_int ... */
            else {
                if (acode == RX_MSGSIZE)
                    shouldRetry = 1;
-               else
+               else {
                    areq->networkError = 1;
+                   /* do not promote to shouldRetry if not already */
+                   if (afs_ClearStatus(afid, op, NULL) == 0)
+                       shouldRetry = 0;
+               }
            }
        }
        return shouldRetry;
@@ -468,14 +573,17 @@ afs_Analyze(struct afs_conn *aconn, struct rx_connection *rxconn,
            serversleft = afs_BlackListOnce(areq, afid, tsp);
            if (afid)
                tvp = afs_FindVolume(afid, READ_LOCK);
-           if (!afid || !tvp || (tvp->states & VRO))
-               areq->idleError++;
            if ((serversleft == 0) && tvp &&
                ((tvp->states & VRO) || (tvp->states & VBackup))) {
                shouldRetry = 0;
            } else {
                shouldRetry = 1;
            }
+           if (!afid || !tvp || (tvp->states & VRO))
+               areq->idleError++;
+           else if (afs_ClearStatus(afid, op, tvp) == 0)
+               shouldRetry = 0;
+
            if (tvp)
                afs_PutVolume(tvp, READ_LOCK);
            /* By doing this, we avoid ever marking a server down
@@ -486,17 +594,10 @@ afs_Analyze(struct afs_conn *aconn, struct rx_connection *rxconn,
             */
            goto out;
        }
-       markeddown = afs_ServerDown(sa);
+       afs_ServerDown(sa, acode);
        ForceNewConnections(sa); /**multi homed clients lock:afs_xsrvAddr? */
        if (aerrP)
            (aerrP->err_Server)++;
-#if 0
-       /* retry *once* when the server is timed out in case of NAT */
-       if (markeddown && acode == RX_CALL_DEAD) {
-           aconn->forceConnectFS = 1;
-           shouldRetry = 1;
-       }
-#endif
     }
 
     if (acode == VBUSY || acode == VRESTARTING) {
@@ -613,7 +714,7 @@ afs_Analyze(struct afs_conn *aconn, struct rx_connection *rxconn,
     }
     /* check for ubik errors; treat them like crashed servers */
     else if (acode >= ERROR_TABLE_BASE_U && acode < ERROR_TABLE_BASE_U + 255) {
-       afs_ServerDown(sa);
+       afs_ServerDown(sa, acode);
        if (aerrP)
            (aerrP->err_Server)++;
        shouldRetry = 1;        /* retryable (maybe one is working) */