From 9056d09887c84a480e0a9ee3457a9469fbb97064 Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Tue, 17 Jan 2012 19:46:30 -0500 Subject: [PATCH] Windows: failover and retry for VBUSY When a file server returns the VBUSY error for an RPC, the cache manager records the 'srv_busy' state in the cm_serverRef_t structure binding that file server to the active cm_volume_t object. The 'srv_busy' state was never cleared, which prevented the volume from being accessed. Clear the 'srv_busy' state whenever cm_Analyze() receives a CM_ERROR_ALLBUSY error, which means that all replicas have been tried, or whenever the error is not VBUSY or VRESTARTING. FIXES 130537 Change-Id: I5020198e4f0ded1df0f64e228e699852f9de7c4d Reviewed-on: http://gerrit.openafs.org/6563 Reviewed-by: Derrick Brashear Reviewed-by: Jeffrey Altman Tested-by: Jeffrey Altman --- src/WINNT/afsd/cm_conn.c | 155 +++++++++++++++++++++++++++-------------------- 1 file changed, 88 insertions(+), 67 deletions(-) diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index 8310f0b..6f2bdf8 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -241,6 +241,39 @@ static long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp, return (*serversppp ? 
0 : CM_ERROR_NOSUCHVOLUME); } +void +cm_SetServerBusyStatus(cm_serverRef_t *serversp, cm_server_t *serverp) +{ + cm_serverRef_t *tsrp; + + lock_ObtainWrite(&cm_serverLock); + for (tsrp = serversp; tsrp; tsrp=tsrp->next) { + if (tsrp->status == srv_deleted) + continue; + if (tsrp->server == serverp && tsrp->status == srv_not_busy) { + tsrp->status = srv_busy; + break; + } + } + lock_ReleaseWrite(&cm_serverLock); +} + +void +cm_ResetServerBusyStatus(cm_serverRef_t *serversp) +{ + cm_serverRef_t *tsrp; + + lock_ObtainWrite(&cm_serverLock); + for (tsrp = serversp; tsrp; tsrp=tsrp->next) { + if (tsrp->status == srv_deleted) + continue; + if (tsrp->status == srv_busy) { + tsrp->status = srv_not_busy; + } + } + lock_ReleaseWrite(&cm_serverLock); +} + /* * Analyze the error return from an RPC. Determine whether or not to retry, * and if we're going to retry, determine whether failover is appropriate, @@ -398,6 +431,20 @@ cm_Analyze(cm_conn_t *connp, format = "All servers are offline when accessing cell %s volume %d."; LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume); + if (!serversp) { + code = cm_GetServerList(fidp, userp, reqp, &replicated, &serverspp); + if (code == 0) { + serversp = *serverspp; + free_svr_list = 1; + } + } + cm_ResetServerBusyStatus(serversp); + if (free_svr_list) { + cm_FreeServerList(serverspp, 0); + free_svr_list = 0; + serversp = NULL; + } + code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, CM_GETVOL_FLAG_NO_LRU_UPDATE, &volp); @@ -435,47 +482,30 @@ cm_Analyze(cm_conn_t *connp, format = "All servers are busy when accessing cell %s volume %d."; LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume); + if (!serversp) { + code = cm_GetServerList(fidp, userp, reqp, &replicated, &serverspp); + if (code == 0) { + serversp = *serverspp; + free_svr_list = 1; + } + } + cm_ResetServerBusyStatus(serversp); + if (free_svr_list) { + cm_FreeServerList(serverspp, 0); + free_svr_list = 0; + serversp = NULL; + } + code = 
cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, CM_GETVOL_FLAG_NO_LRU_UPDATE, &volp); if (code == 0) { if (timeLeft > 7) { thrd_Sleep(5000); - statep = cm_VolumeStateByID(volp, fidp->volume); - if (statep->state != vl_offline && - statep->state != vl_busy && - statep->state != vl_unknown) { - retry = 1; - } else { - if (!serversp) { - code = cm_GetServerList(fidp, userp, reqp, &replicated, &serverspp); - if (code == 0) { - serversp = *serverspp; - free_svr_list = 1; - } - } - lock_ObtainWrite(&cm_serverLock); - for (tsrp = serversp; tsrp; tsrp=tsrp->next) { - if (tsrp->status == srv_deleted) - continue; - if (tsrp->status == srv_busy) { - tsrp->status = srv_not_busy; - } - } - lock_ReleaseWrite(&cm_serverLock); - if (free_svr_list) { - cm_FreeServerList(serverspp, 0); - serversp = NULL; - free_svr_list = 0; - } - - cm_UpdateVolumeStatus(volp, fidp->volume); - retry = 1; - } - } else { - cm_UpdateVolumeStatus(volp, fidp->volume); + retry = 1; } + cm_UpdateVolumeStatus(volp, fidp->volume); lock_ObtainRead(&cm_volumeLock); cm_PutVolume(volp); @@ -489,15 +519,7 @@ cm_Analyze(cm_conn_t *connp, thrd_Sleep(5000); if (serversp) { - lock_ObtainWrite(&cm_serverLock); - for (tsrp = serversp; tsrp; tsrp=tsrp->next) { - if (tsrp->status == srv_deleted) - continue; - if (tsrp->status == srv_busy) { - tsrp->status = srv_not_busy; - } - } - lock_ReleaseWrite(&cm_serverLock); + cm_ResetServerBusyStatus(serversp); retry = 1; } } @@ -537,32 +559,23 @@ cm_Analyze(cm_conn_t *connp, LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume, cellp->name); } - lock_ObtainWrite(&cm_serverLock); - for (tsrp = serversp; tsrp; tsrp=tsrp->next) { - if (tsrp->status == srv_deleted) - continue; - if (tsrp->server == serverp && tsrp->status == srv_not_busy) { - tsrp->status = srv_busy; - if (fidp) { /* File Server query */ - lock_ReleaseWrite(&cm_serverLock); - code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, - CM_GETVOL_FLAG_NO_LRU_UPDATE, - &volp); - if (code == 0) - statep = 
cm_VolumeStateByID(volp, fidp->volume); - lock_ObtainWrite(&cm_serverLock); - } - break; - } - } - lock_ReleaseWrite(&cm_serverLock); + cm_SetServerBusyStatus(serversp, serverp); - if (statep) { - cm_UpdateVolumeStatus(volp, statep->ID); - lock_ObtainRead(&cm_volumeLock); - cm_PutVolume(volp); - lock_ReleaseRead(&cm_volumeLock); - volp = NULL; + if (fidp) { /* File Server query */ + code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, + CM_GETVOL_FLAG_NO_LRU_UPDATE, + &volp); + if (code == 0) { + statep = cm_VolumeStateByID(volp, fidp->volume); + + if (statep) + cm_UpdateVolumeStatus(volp, statep->ID); + + lock_ObtainRead(&cm_volumeLock); + cm_PutVolume(volp); + lock_ReleaseRead(&cm_volumeLock); + volp = NULL; + } } if (free_svr_list) { @@ -1199,6 +1212,14 @@ cm_Analyze(cm_conn_t *connp, reqp->flags &= ~CM_REQ_VOLUME_UPDATED; } + if ( serversp && + errorCode != VBUSY && + errorCode != VRESTARTING && + errorCode != CM_ERROR_ALLBUSY) + { + cm_ResetServerBusyStatus(serversp); + } + /* retry until we fail to find a connection */ return retry; } -- 1.9.4