#endif /* vlserver error base define */
-int afs_BusyWaitPeriod = 15; /* poll every 15 seconds */
+int afs_BusyWaitPeriod = 15; /**< poll period, in seconds */
-afs_int32 hm_retry_RO = 0; /* don't wait */
-afs_int32 hm_retry_RW = 0; /* don't wait */
-afs_int32 hm_retry_int = 0; /* don't wait */
+afs_int32 hm_retry_RO = 0; /**< enable read-only hard-mount retry */
+afs_int32 hm_retry_RW = 0; /**< enable read-write hard-mount retry */
+afs_int32 hm_retry_int = 0; /**< hard-mount retry interval, in seconds */
#define VSleep(at) afs_osi_Wait((at)*1000, 0, 0)
int lastcode;
-/* returns:
- * 0 if the vldb record for a specific volume is different from what
- * we have cached -- perhaps the volume has moved.
- * 1 if the vldb record is the same
- * 2 if we can't tell if it's the same or not.
- *
- * If 0, the caller will probably start over at the beginning of our
- * list of servers for this volume and try to find one that is up. If
- * not 0, we will probably just keep plugging with what we have
- * cached. If we fail to contact the VL server, we should just keep
- * trying with the information we have, rather than failing. */
#define DIFFERENT 0
#define SAME 1
#define DUNNO 2
+/*!
+ * \brief
+ * Request the vldb record to determine whether it has changed.
+ *
+ * \retval 0 if the vldb record for a specific volume is different from what
+ * we have cached -- perhaps the volume has moved.
+ * \retval 1 if the vldb record is the same
+ * \retval 2 if we can't tell if it's the same or not.
+ *
+ * \note
+ * If 0 returned, the caller will probably start over at the beginning of our
+ * list of servers for this volume and try to find one that is up. If
+ * not 0, we will probably just keep plugging with what we have
+ * cached. If we fail to contact the VL server, we should just keep
+ * trying with the information we have, rather than failing.
+ */
static int
VLDB_Same(struct VenusFid *afid, struct vrequest *areq)
{
VSleep(2); /* Better safe than sorry. */
tconn =
afs_ConnByMHosts(tcell->cellHosts, tcell->vlport, tcell->cellNum,
- &treq, SHARED_LOCK, &rxconn);
+ &treq, SHARED_LOCK, 0, &rxconn);
if (tconn) {
if ( tconn->parent->srvr->server->flags & SNO_LHOSTS) {
type = 0;
for (i = 0; i < NMAXNSERVERS && tvp->serverHost[i]; i++) {
oldhosts[i] = tvp->serverHost[i];
}
+ ReleaseWriteLock(&tvp->lock);
if (type == 2) {
- InstallUVolumeEntry(tvp, &v->utve, afid->Cell, tcell, &treq);
+ LockAndInstallUVolumeEntry(tvp, &v->utve, afid->Cell, tcell, &treq);
} else if (type == 1) {
- InstallNVolumeEntry(tvp, &v->ntve, afid->Cell);
+ LockAndInstallNVolumeEntry(tvp, &v->ntve, afid->Cell);
} else {
- InstallVolumeEntry(tvp, &v->tve, afid->Cell);
+ LockAndInstallVolumeEntry(tvp, &v->tve, afid->Cell);
}
if (i < NMAXNSERVERS && tvp->serverHost[i]) {
return (changed ? DIFFERENT : SAME);
} /*VLDB_Same */
-/*------------------------------------------------------------------------
- * afs_BlackListOnce
- *
- * Description:
+/*!
+ * \brief
* Mark a server as invalid for further attempts of this request only.
*
- * Arguments:
- * areq : The request record associated with this operation.
- * afid : The FID of the file involved in the action. This argument
- * may be null if none was involved.
- * tsp : pointer to a server struct for the server we wish to
- * blacklist.
+ * \param[in,out] areq The request record associated with this operation.
+ * \param[in] afid The FID of the file involved in the action. This argument
+ * may be null if none was involved.
+ * \param[in,out] tsp pointer to a server struct for the server we wish to
+ * blacklist.
*
- * Returns:
+ * \returns
* Non-zero value if further servers are available to try,
* zero otherwise.
*
- * Environment:
+ * \note
* This routine is typically called in situations where we believe
- * one server out of a pool may have an error condition.
+ * one server out of a pool may have an error condition.
*
- * Side Effects:
- * As advertised.
- *
- * NOTE:
+ * \note
* The afs_Conn* routines use the list of invalidated servers to
* avoid reusing a server marked as invalid for this request.
- *------------------------------------------------------------------------*/
+ */
static afs_int32
afs_BlackListOnce(struct vrequest *areq, struct VenusFid *afid,
struct server *tsp)
areq->skipserver[i] = 1;
}
}
- afs_PutVolume(tvp, READ_LOCK);
for (i = 0; i < AFS_MAXHOSTS; i++) {
if (tvp->serverHost[i] && areq->skipserver[i] == 0) {
serversleft = 1;
break;
}
}
+ afs_PutVolume(tvp, READ_LOCK);
return serversleft;
}
}
return serversleft;
}
-
-/*------------------------------------------------------------------------
- * EXPORTED afs_Analyze
+/*!
+ * \brief
+ * Discard cached status for a file after a failed data-mutating RPC,
+ * and report whether that RPC may be retried.
+ *
+ * \param[in] afid The FID of the file involved in the action. This argument
+ * may be null if none was involved; no cached status is
+ * cleared in that case.
+ * \param[in] op which RPC we are analyzing.
+ * \param[in] avp A pointer to the struct volume, if we already have one.
+ * When supplied, the caller keeps its own reference; this
+ * routine only releases a volume it looked up itself.
*
- * Description:
+ * \returns
+ * Non-zero value if the related RPC operation can be retried
+ * (always the case for non-write operations), zero otherwise.
+ *
+ * \note
+ * This routine is called when we got a network error,
+ * and discards state if the operation was a data-mutating
+ * operation.
+ */
+static int
+afs_ClearStatus(struct VenusFid *afid, int op, struct volume *avp)
+{
+    struct volume *tvp = NULL;
+
+    /* if it's not a write op, we have nothing to veto and shouldn't clear. */
+    if (!AFS_STATS_FS_RPCIDXES_ISWRITE(op)) {
+        return 1;
+    }
+
+    /* Prefer the caller's volume; otherwise look one up from the fid. */
+    if (avp)
+        tvp = avp;
+    else if (afid)
+        tvp = afs_FindVolume(afid, READ_LOCK);
+
+    /* don't assume just discarding will fix if no cached volume */
+    if (tvp) {
+        /*
+         * afid may be null even when the caller handed us a volume;
+         * without a fid there is no vcache to look up, so skip the
+         * lookup rather than pass a null fid to afs_FindVCache.
+         */
+        if (afid) {
+            struct vcache *tvc;
+
+            ObtainReadLock(&afs_xvcache);
+            tvc = afs_FindVCache(afid, 0, 0);
+            ReleaseReadLock(&afs_xvcache);
+            if (tvc) {
+                /* Clear CStatd and CUnique on the cached entry. */
+                tvc->f.states &= ~(CStatd | CUnique);
+                afs_PutVCache(tvc);
+            }
+        }
+        /* Only drop the volume reference we took ourselves. */
+        if (!avp)
+            afs_PutVolume(tvp, READ_LOCK);
+    }
+
+    if (AFS_STATS_FS_RPCIDXES_WRITE_RETRIABLE(op))
+        return 1;
+
+    /* not retriable: we may have raced ourselves */
+    return 0;
+}
+
+/*!
+ * \brief
* Analyze the outcome of an RPC operation, taking whatever support
* actions are necessary.
*
- * Arguments:
- * aconn : Ptr to the relevant connection on which the call was made.
- * acode : The return code experienced by the RPC.
- * afid : The FID of the file involved in the action. This argument
- * may be null if none was involved.
- * areq : The request record associated with this operation.
- * op : which RPC we are analyzing.
- * cellp : pointer to a cell struct. Must provide either fid or cell.
+ * \param[in] aconn Ptr to the relevant connection on which the call was made.
+ * \param[in] acode The return code experienced by the RPC.
+ * \param[in] afid The FID of the file involved in the action. This argument
+ * may be null if none was involved.
+ * \param[in,out] areq The request record associated with this operation.
+ * \param[in] op which RPC we are analyzing.
+ * \param[in] cellp pointer to a cell struct. Must provide either afid or cellp.
*
- * Returns:
+ * \returns
* Non-zero value if the related RPC operation should be retried,
* zero otherwise.
*
- * Environment:
+ * \note
* This routine is typically called in a do-while loop, causing the
* embedded RPC operation to be called repeatedly if appropriate
* until whatever error condition (if any) is intolerable.
*
- * Side Effects:
- * As advertised.
- *
- * NOTE:
+ * \note
* The retry return value is used by afs_StoreAllSegments to determine
* if this is a temporary or permanent error.
- *------------------------------------------------------------------------*/
+ */
int
afs_Analyze(struct afs_conn *aconn, struct rx_connection *rxconn,
afs_int32 acode, struct VenusFid *afid, struct vrequest *areq,
((afid && afs_IsPrimaryCellNum(afid->Cell))
|| (cellp && afs_IsPrimaryCell(cellp)))) {
if (!afid) {
- afs_warnuser
- ("afs: hard-mount waiting for a vlserver to return to service\n");
+ static int afs_vl_hm = 0;
+ int warn = 0;
+ if (!afs_vl_hm) {
+ afs_vl_hm = warn = 1;
+ }
+ if (warn) {
+ afs_warnuser
+ ("afs: hard-mount waiting for a vlserver to return to service\n");
+ }
VSleep(hm_retry_int);
afs_CheckServers(1, cellp);
shouldRetry = 1;
+
+ if (warn) {
+ afs_vl_hm = 0;
+ }
} else {
+ static int afs_unknown_vhm = 0;
+ int warn = 0, vp_vhm = 0;
+
tvp = afs_FindVolume(afid, READ_LOCK);
if (!tvp || (tvp->states & VRO)) {
shouldRetry = hm_retry_RO;
} else {
shouldRetry = hm_retry_RW;
}
+
+ /* Set 'warn' if we should afs_warnuser. Only let one
+ * caller call afs_warnuser per hm_retry_int interval per
+ * volume. */
+ if (shouldRetry) {
+ if (tvp) {
+ if (!(tvp->states & VHardMount)) {
+ tvp->states |= VHardMount;
+ warn = vp_vhm = 1;
+ }
+ } else {
+ if (!afs_unknown_vhm) {
+ afs_unknown_vhm = 1;
+ warn = 1;
+ }
+ }
+ }
+
if (tvp)
afs_PutVolume(tvp, READ_LOCK);
+
if (shouldRetry) {
- afs_warnuser
- ("afs: hard-mount waiting for volume %u\n",
- afid->Fid.Volume);
+ if (warn) {
+ afs_warnuser
+ ("afs: hard-mount waiting for volume %u\n",
+ afid->Fid.Volume);
+ }
+
VSleep(hm_retry_int);
afs_CheckServers(1, cellp);
+ /* clear the black listed servers on this request. */
+ memset(areq->skipserver, 0, sizeof(areq->skipserver));
+
+ if (vp_vhm) {
+ tvp = afs_FindVolume(afid, READ_LOCK);
+ if (tvp) {
+ tvp->states &= ~VHardMount;
+ afs_PutVolume(tvp, READ_LOCK);
+ }
+ } else if (warn) {
+ afs_unknown_vhm = 0;
+ }
}
}
} /* if (hm_retry_int ... */
else {
if (acode == RX_MSGSIZE)
shouldRetry = 1;
- else
+ else {
areq->networkError = 1;
+ /* do not promote to shouldRetry if not already */
+ if (afs_ClearStatus(afid, op, NULL) == 0)
+ shouldRetry = 0;
+ }
}
}
return shouldRetry;
acode = 455;
#endif /* AFS_64BIT_CLIENT */
if ((acode < 0) && (acode != VRESTARTING)) {
- if (acode == RX_MSGSIZE) {
+ if (acode == RX_MSGSIZE || acode == RX_CALL_BUSY) {
shouldRetry = 1;
goto out;
}
- if (acode == RX_CALL_TIMEOUT) {
+ if (acode == RX_CALL_TIMEOUT || acode == RX_CALL_IDLE) {
serversleft = afs_BlackListOnce(areq, afid, tsp);
if (afid)
tvp = afs_FindVolume(afid, READ_LOCK);
- if (!afid || !tvp || (tvp->states & VRO))
- areq->idleError++;
if ((serversleft == 0) && tvp &&
((tvp->states & VRO) || (tvp->states & VBackup))) {
shouldRetry = 0;
} else {
shouldRetry = 1;
}
+ if (!afid || !tvp || (tvp->states & VRO))
+ areq->idleError++;
+ else if (afs_ClearStatus(afid, op, tvp) == 0)
+ shouldRetry = 0;
+
if (tvp)
afs_PutVolume(tvp, READ_LOCK);
/* By doing this, we avoid ever marking a server down
*/
goto out;
}
- afs_ServerDown(sa);
- ForceNewConnections(sa); /**multi homed clients lock:afs_xsrvAddr? */
+ afs_ServerDown(sa, acode);
+ ForceNewConnections(sa); /* multi homed clients lock:afs_xsrvAddr? */
if (aerrP)
(aerrP->err_Server)++;
}
}
/* check for ubik errors; treat them like crashed servers */
else if (acode >= ERROR_TABLE_BASE_U && acode < ERROR_TABLE_BASE_U + 255) {
- afs_ServerDown(sa);
+ afs_ServerDown(sa, acode);
if (aerrP)
(aerrP->err_Server)++;
shouldRetry = 1; /* retryable (maybe one is working) */
* retry in case there is another server. However, if we find
* no connection (aconn == 0) we set the networkError flag.
*/
- afs_MarkServerUpOrDown(sa, SRVR_ISDOWN);
+ afs_ServerDown(sa, acode);
if (aerrP)
(aerrP->err_Server)++;
VSleep(1); /* Just a hack for desperate times. */