From dbb62996df6ad4efb2d2e3711e4ac59841b75027 Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Fri, 19 Jun 2009 04:38:50 +0000 Subject: [PATCH] DEVEL15-windows-conn-analyze-20090615 LICENSE MIT When processing RX_CALL_TIMEOUT, if there is no fid specified, it's a vldb rpc and we should retry. If there is a fid and the volume is a readonly volume then we should also retry. Only fail if the request has no alternate server to look at. When processing RX_CALL_DEAD, log the fact that the call is dead. When processing any other RX error between -2 and -64, do not force a new connection to the same server. Mark the server down and retry with a new server if possible. (cherry picked from commit 82d9807e2246997ac73930c91d1ad4312084cc57) --- src/WINNT/afsd/cm_conn.c | 82 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 27 deletions(-) diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index cc70fb9..39b8a20 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -470,13 +470,13 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, if (serverp && fidp) { /* Log server being offline for this volume */ sprintf(addr, "%d.%d.%d.%d", - ((serverp->addr.sin_addr.s_addr & 0xff)), - ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), - ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), - ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + ((serverp->addr.sin_addr.s_addr & 0xff)), + ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), + ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); - osi_Log2(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume); - LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume); + osi_Log2(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume); + LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume); } /* @@ -612,25 +612,53 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, */ if 
(serverp) { - /* Log server being offline for this volume */ sprintf(addr, "%d.%d.%d.%d", - ((serverp->addr.sin_addr.s_addr & 0xff)), - ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), - ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), - ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + ((serverp->addr.sin_addr.s_addr & 0xff)), + ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), + ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr); - - osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadTime exceeded addr[%s]", + osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]", osi_LogSaveString(afsd_logp,addr)); reqp->tokenIdleErrorServp = serverp; reqp->idleError++; + + if (timeLeft > 2) { + if (!fidp) { /* vldb */ + retry = 1; + } else { /* file */ + cm_volume_t *volp = cm_GetVolumeByFID(fidp); + if (volp) { + if (fidp->volume == cm_GetROVolumeID(volp)) + retry = 1; + cm_PutVolume(volp); + } + } + } } } else if (errorCode >= -64 && errorCode < 0) { /* mark server as down */ + sprintf(addr, "%d.%d.%d.%d", + ((serverp->addr.sin_addr.s_addr & 0xff)), + ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), + ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + + if (errorCode == RX_CALL_DEAD) + osi_Log2(afsd_logp, "cm_Analyze: Rx Call Dead addr[%s] forcedNew[%s]", + osi_LogSaveString(afsd_logp,addr), + (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no")); + else + osi_Log3(afsd_logp, "cm_Analyze: Rx Misc Error[%d] addr[%s] forcedNew[%s]", + errorCode, + osi_LogSaveString(afsd_logp,addr), + (reqp->flags & CM_REQ_NEW_CONN_FORCED ? 
"yes" : "no")); + lock_ObtainMutex(&serverp->mx); - if (reqp->flags & CM_REQ_NEW_CONN_FORCED) { + if (errorCode != RX_CALL_DEAD || + (reqp->flags & CM_REQ_NEW_CONN_FORCED)) { if (!(serverp->flags & CM_SERVERFLAG_DOWN)) { serverp->flags |= CM_SERVERFLAG_DOWN; serverp->downTime = time(NULL); @@ -660,23 +688,23 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, } } else if (errorCode >= ERROR_TABLE_BASE_RXK && errorCode < ERROR_TABLE_BASE_RXK + 256) { if (serverp) { + reqp->tokenIdleErrorServp = serverp; + reqp->tokenError = errorCode; + retry = 1; + } + } else if (errorCode >= ERROR_TABLE_BASE_U && errorCode < ERROR_TABLE_BASE_U + 256) { + /* + * We received a ubik error. its possible that the server we are + * communicating with has a corrupted database or is partitioned + * from the rest of the servers and another server might be able + * to answer our query. Therefore, we will retry the request + * and force the use of another server. + */ + if (serverp) { reqp->tokenIdleErrorServp = serverp; reqp->tokenError = errorCode; retry = 1; } - } else if (errorCode >= ERROR_TABLE_BASE_U && errorCode < ERROR_TABLE_BASE_U + 256) { - /* - * We received a ubik error. its possible that the server we are - * communicating with has a corrupted database or is partitioned - * from the rest of the servers and another server might be able - * to answer our query. Therefore, we will retry the request - * and force the use of another server. - */ - if (serverp) { - reqp->tokenIdleErrorServp = serverp; - reqp->tokenError = errorCode; - retry = 1; - } } else if (errorCode == VICECONNBAD || errorCode == VICETOKENDEAD) { cm_ForceNewConnections(serverp); if ( timeLeft > 2 ) -- 1.9.4