}
}
- if (errorCode == CM_ERROR_TIMEDOUT) {
+ if (errorCode == 0) {
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+ }
+ else if (errorCode == CM_ERROR_TIMEDOUT) {
osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_TIMEDOUT");
if ( timeLeft > 5 ) {
thrd_Sleep(3000);
thrd_Sleep(1000);
retry = 1;
}
+
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
}
/* if there is nosuchvolume, then we have a situation in which a
if (code == 0)
free_svr_list = 1;
}
- cm_ResetServerBusyStatus(volServerspp);
+ if (volServerspp)
+ cm_ResetServerBusyStatus(volServerspp);
if (free_svr_list) {
cm_FreeServerList(volServerspp, 0);
free_svr_list = 0;
* Do not perform a cm_CheckOfflineVolume() if cm_Analyze()
* was called by cm_CheckOfflineVolumeState().
*/
- if (!(reqp->flags & CM_REQ_OFFLINE_VOL_CHK) && timeLeft > 7) {
+ if (!(reqp->flags & (CM_REQ_OFFLINE_VOL_CHK|CM_REQ_NORETRY)) &&
+ timeLeft > 7)
+ {
thrd_Sleep(5000);
/* cm_CheckOfflineVolume() resets the serverRef state */
if (code == 0)
free_svr_list = 1;
}
- cm_ResetServerBusyStatus(volServerspp);
+ if (volServerspp)
+ cm_ResetServerBusyStatus(volServerspp);
if (free_svr_list) {
cm_FreeServerList(volServerspp, 0);
free_svr_list = 0;
volServerspp = NULL;
}
- if (timeLeft > 7) {
- thrd_Sleep(5000);
- statep = cm_VolumeStateByID(volp, fidp->volume);
+ /*
+ * Retry all replicas for up to 5 minutes (at most 20
+ * attempts), waiting 15 seconds between attempts.
+ */
+ if (timeLeft > 20 && !(reqp->flags & CM_REQ_NORETRY) &&
+ reqp->volbusyCount++ < 20)
+ {
+ thrd_Sleep(15000);
retry = 1;
}
cm_UpdateVolumeStatus(volp, fidp->volume);
} else { /* VL Server query */
osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY (VL Server).");
- if (timeLeft > 7) {
+ if (timeLeft > 7 && !(reqp->flags & CM_REQ_NORETRY) && vlServerspp)
+ {
thrd_Sleep(5000);
- if (vlServerspp) {
- cm_ResetServerBusyStatus(vlServerspp);
- retry = 1;
- }
+ cm_ResetServerBusyStatus(vlServerspp);
+ retry = 1;
}
}
}
/* special codes: VBUSY and VRESTARTING */
else if (errorCode == VBUSY || errorCode == VRESTARTING) {
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
if (fidp) {
code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp,
CM_GETVOL_FLAG_NO_LRU_UPDATE,
osi_Log3(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume, cellp->name);
LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume, cellp->name);
- cm_SetServerBusyStatus(volServerspp, serverp);
+ if (volServerspp)
+ cm_SetServerBusyStatus(volServerspp, serverp);
}
if (free_svr_list) {
else if (errorCode == VNOVOL || errorCode == VMOVED || errorCode == VOFFLINE ||
errorCode == VSALVAGE || errorCode == VIO)
{
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
/* In case of timeout */
reqp->volumeError = errorCode;
reqp->vnovolError++;
}
+ /* Remember that the VIO error occurred */
+ if (errorCode == VIO) {
+ reqp->errorServp = serverp;
+ reqp->vioCount++;
+ }
+
/* Free the server list before cm_ForceUpdateVolume is called */
if (free_svr_list) {
cm_FreeServerList(volServerspp, 0);
free_svr_list = 0;
}
- if ( timeLeft > 2 )
+ if ( timeLeft > 2 && reqp->vioCount < 100)
retry = 1;
} else if ( errorCode == VNOVNODE ) {
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
if ( fidp ) {
osi_Log4(afsd_logp, "cm_Analyze passed VNOVNODE cell %u vol %u vn %u uniq %u.",
fidp->cell, fidp->volume, fidp->vnode, fidp->unique);
pscp = cm_FindSCacheParent(scp);
lock_ObtainWrite(&scp->rw);
- scp->flags |= CM_SCACHEFLAG_DELETED;
+ _InterlockedOr(&scp->flags, CM_SCACHEFLAG_DELETED);
lock_ObtainWrite(&cm_scacheLock);
cm_AdjustScacheLRU(scp);
- cm_RemoveSCacheFromHashTable(scp);
lock_ReleaseWrite(&cm_scacheLock);
cm_LockMarkSCacheLost(scp);
lock_ReleaseWrite(&scp->rw);
if (!fidp) { /* vldb */
retry = 1;
} else { /* file */
- cm_volume_t *volp = cm_GetVolumeByFID(fidp);
+ cm_volume_t *volp = cm_FindVolumeByFID(fidp, userp, reqp);
if (volp) {
if (fidp->volume == cm_GetROVolumeID(volp))
retry = 1;
osi_Log1(afsd_logp, "cm_Analyze: Path MTU may have been exceeded addr[%s]",
osi_LogSaveString(afsd_logp,addr));
- retry = 1;
- }
- else if (errorCode == RX_CALL_BUSY) {
- /*
- * RPC failed because the selected call channel
- * is currently busy on the server. Unconditionally
- * retry the request so an alternate call channel can be used.
- */
- if (serverp)
- sprintf(addr, "%d.%d.%d.%d",
- ((serverp->addr.sin_addr.s_addr & 0xff)),
- ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
- ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
- ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24));
-
- LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_BUSY_CALL_CHANNEL, addr);
- osi_Log1(afsd_logp, "cm_Analyze: Retry RPC due to busy call channel addr[%s]",
- osi_LogSaveString(afsd_logp,addr));
- retry = 1;
+ retry = 2;
}
else if (errorCode == VNOSERVICE) {
/*
* The RPC was not serviced so it can be retried and any
* existing status information is still valid.
*/
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
if (fidp) {
if (serverp)
sprintf(addr, "%d.%d.%d.%d",
osi_LogSaveString(afsd_logp,addr), fidp->volume, cellp->name);
}
- if (timeLeft > 2)
- retry = 1;
- }
- else if (errorCode == RX_CALL_IDLE) {
- /*
- * RPC failed because the server failed to respond with data
- * within the idle dead timeout period. This could be for a variety
- * of reasons:
- * 1. The server could have a bad partition such as a failed
- * disk or iSCSI target and all I/O to that partition is
- * blocking on the server and will never complete.
- *
- * 2. The server vnode may be locked by another client request
- * that is taking a very long time.
- *
- * 3. The server may have a very long queue of requests
- * pending and is unable to process this request.
- *
- * 4. The server could be malicious and is performing a denial
- * of service attack against the client.
- *
- * If this is a request against a .readonly with alternate sites
- * the server should be marked down for this request and the
- * client should fail over to another server. If this is a
- * request against a single source, the client may retry once.
- */
- if (serverp)
- sprintf(addr, "%d.%d.%d.%d",
- ((serverp->addr.sin_addr.s_addr & 0xff)),
- ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
- ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
- ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24));
-
- if (fidp) {
- code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp,
- CM_GETVOL_FLAG_NO_LRU_UPDATE,
- &volp);
- if (code == 0) {
- statep = cm_VolumeStateByID(volp, fidp->volume);
-
- if (statep)
- replicated = (statep->flags & CM_VOL_STATE_FLAG_REPLICATED);
-
- lock_ObtainRead(&cm_volumeLock);
- cm_PutVolume(volp);
- lock_ReleaseRead(&cm_volumeLock);
- volp = NULL;
- }
-
- if (storeOp)
- scp = cm_FindSCache(fidp);
- if (scp) {
- if (cm_HaveCallback(scp)) {
- lock_ObtainWrite(&scp->rw);
- cm_DiscardSCache(scp);
- lock_ReleaseWrite(&scp->rw);
-
- /*
- * We really should notify the redirector that we discarded
- * the status information but doing so in this case is not
- * safe as it can result in a deadlock with extent release
- * processing.
- */
- }
- cm_ReleaseSCache(scp);
- }
- }
-
- if (replicated && serverp) {
- reqp->errorServp = serverp;
- reqp->tokenError = errorCode;
-
- if (timeLeft > 2)
- retry = 1;
- }
-
- LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_IDLE_DEAD_TIMEOUT, addr, retry);
- osi_Log2(afsd_logp, "cm_Analyze: RPC failed due to idle dead timeout addr[%s] retry=%u",
- osi_LogSaveString(afsd_logp,addr), retry);
+ retry = 2;
}
else if (errorCode == RX_CALL_DEAD) {
/* mark server as down */
(reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no"));
if (serverp) {
- if ((reqp->flags & CM_REQ_NEW_CONN_FORCED)) {
+ if ((connp->flags & CM_CONN_FLAG_NEW) ||
+ (reqp->flags & CM_REQ_NEW_CONN_FORCED)) {
lock_ObtainMutex(&serverp->mx);
if (!(serverp->flags & CM_SERVERFLAG_DOWN)) {
_InterlockedOr(&serverp->flags, CM_SERVERFLAG_DOWN);
(reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no"));
if (serverp) {
- if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
+ if ((connp->flags & CM_CONN_FLAG_NEW) ||
+ (reqp->flags & CM_REQ_NEW_CONN_FORCED)) {
reqp->errorServp = serverp;
reqp->tokenError = errorCode;
} else {
_InterlockedAnd(&ucellp->flags, ~CM_UCELLFLAG_RXKAD);
ucellp->gen++;
lock_ReleaseMutex(&userp->mx);
- if ( timeLeft > 2 )
- retry = 1;
+
+ reqp->flags |= CM_REQ_NEW_CONN_FORCED;
+ forcing_new = 1;
+ cm_ForceNewConnections(serverp);
+
+ if ( timeLeft > 2 )
+ retry = 2;
}
} else if (errorCode >= ERROR_TABLE_BASE_RXK && errorCode < ERROR_TABLE_BASE_RXK + 256) {
char * s = "unknown error";
osi_Log2(afsd_logp, "cm_Analyze: rxkad error code 0x%x (%s)",
errorCode, s);
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
if (serverp) {
reqp->errorServp = serverp;
reqp->tokenError = errorCode;
* to answer our query. Therefore, we will retry the request
* and force the use of another server.
*/
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
if (serverp) {
reqp->errorServp = serverp;
reqp->tokenError = errorCode;
retry = 1;
}
} else if (errorCode == VICECONNBAD || errorCode == VICETOKENDEAD) {
- cm_ForceNewConnections(serverp);
- if ( timeLeft > 2 )
- retry = 1;
+ reqp->flags |= CM_REQ_NEW_CONN_FORCED;
+ forcing_new = 1;
+ cm_ForceNewConnections(serverp);
+ retry = 2;
} else {
+ if (connp)
+ _InterlockedAnd(&connp->flags, ~CM_CONN_FLAG_NEW);
+
if (errorCode) {
char * s = "unknown error";
switch ( errorCode ) {
/* If not allowed to retry, don't */
if (dead_session ||
- !forcing_new && (reqp->flags & CM_REQ_NORETRY) &&
+ !forcing_new && (retry < 2) && (reqp->flags & CM_REQ_NORETRY) &&
!(errorCode > -64 && errorCode <= RX_INVALID_OPERATION))
retry = 0;
cm_PutConn(connp);
/*
+
* clear the volume updated flag if we succeed.
* this way the flag will not prevent a subsequent volume
* from being updated if necessary.
rx_SetConnHardDeadTime(tcp->rxconnp, HardDeadtimeout);
/*
- * Setting idle dead timeout to a non-zero value activates RX_CALL_IDLE errors
+ * Setting idle dead timeout to a non-zero value activates RX_CALL_TIMEOUT
+ * errors if the call is idle for a certain amount of time.
*/
if (replicated) {
- tcp->flags &= CM_CONN_FLAG_REPLICATION;
+ _InterlockedOr(&tcp->flags, CM_CONN_FLAG_REPLICATION);
rx_SetConnIdleDeadTime(tcp->rxconnp, ReplicaIdleDeadtimeout);
} else {
rx_SetConnIdleDeadTime(tcp->rxconnp, IdleDeadtimeout);
tcp->ucgen = ucellp->gen;
if (secObjp)
rxs_Release(secObjp); /* Decrement the initial refCount */
+
+ _InterlockedAnd(&tcp->flags, ~CM_CONN_FLAG_FORCE_NEW);
+ _InterlockedOr(&tcp->flags, CM_CONN_FLAG_NEW);
}
long cm_ConnByServer(cm_server_t *serverp, cm_user_t *userp, afs_uint32 replicated, cm_conn_t **connpp)
userp = cm_rootUserp;
lock_ObtainMutex(&userp->mx);
+ /* find ucell structure */
+ ucellp = cm_GetUCell(userp, serverp->cellp);
+
lock_ObtainRead(&cm_connLock);
for (tcp = serverp->connsp; tcp; tcp=tcp->nextp) {
if (tcp->userp == userp &&
break;
}
- /* find ucell structure */
- ucellp = cm_GetUCell(userp, serverp->cellp);
if (!tcp) {
lock_ConvertRToW(&cm_connLock);
for (tcp = serverp->connsp; tcp; tcp=tcp->nextp) {
- if (tcp->userp == userp)
- break;
+ if (tcp->userp == userp &&
+ (replicated && (tcp->flags & CM_CONN_FLAG_REPLICATION) ||
+ !replicated && !(tcp->flags & CM_CONN_FLAG_REPLICATION)))
+ break;
}
if (tcp) {
InterlockedIncrement(&tcp->refCount);
osi_Log0(afsd_logp, "cm_ConnByServer replace connection due to token update");
else
osi_Log0(afsd_logp, "cm_ConnByServer replace connection due to crypt change");
- tcp->flags &= ~CM_CONN_FLAG_FORCE_NEW;
rx_SetConnSecondsUntilNatPing(tcp->rxconnp, 0);
rx_DestroyConnection(tcp->rxconnp);
cm_NewRXConnection(tcp, ucellp, serverp, replicated);
{
cm_conn_t *tcp;
+ if (serverp == NULL)
+ return;
+
lock_ObtainWrite(&cm_connLock);
for (tcp = serverp->connsp; tcp; tcp=tcp->nextp) {
lock_ObtainMutex(&tcp->mx);
- tcp->flags |= CM_CONN_FLAG_FORCE_NEW;
+ _InterlockedOr(&tcp->flags, CM_CONN_FLAG_FORCE_NEW);
lock_ReleaseMutex(&tcp->mx);
}
lock_ReleaseWrite(&cm_connLock);