reqp->startTime = GetTickCount();
}
-static long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp,
+long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp,
struct cm_req *reqp, afs_uint32 *replicated, cm_serverRef_t ***serversppp)
{
long code;
for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
if (tsrp->status == srv_deleted)
continue;
- if (tsrp->server == serverp && tsrp->status == srv_not_busy) {
+ if (cm_ServerEqual(tsrp->server, serverp) && tsrp->status == srv_not_busy) {
tsrp->status = srv_busy;
break;
}
if ((errorCode == VMOVED || errorCode == VNOVOL || errorCode == VOFFLINE) &&
!(reqp->flags & CM_REQ_VOLUME_UPDATED))
{
+ LONG_PTR oldSum, newSum;
+
+ oldSum = cm_ChecksumVolumeServerList(fidp, userp, reqp);
+
code = cm_ForceUpdateVolume(fidp, userp, reqp);
- if (code == 0)
+ if (code == 0) {
location_updated = 1;
+ newSum = cm_ChecksumVolumeServerList(fidp, userp, reqp);
+ }
- /* Even if the update fails, there might still be another replica */
+ /*
+ * Even if the update fails, there might still be another replica.
+ * If the volume location list changed, permit another update on
+ * a subsequent error.
+ */
+ if (code || oldSum == newSum)
+ reqp->flags |= CM_REQ_VOLUME_UPDATED;
- reqp->flags |= CM_REQ_VOLUME_UPDATED;
osi_Log3(afsd_logp, "cm_Analyze called cm_ForceUpdateVolume cell %u vol %u code 0x%x",
- fidp->cell, fidp->volume, code);
+ fidp->cell, fidp->volume, code);
}
if (statep) {
if (cm_ServerEqual(tsrp->server, serverp)) {
/* REDIRECT */
- if (errorCode == VMOVED || errorCode == VNOVOL) {
- osi_Log2(afsd_logp, "volume %d not present on server %s",
+ switch (errorCode) {
+ case VMOVED:
+ osi_Log2(afsd_logp, "volume %u moved from server %s",
fidp->volume, osi_LogSaveString(afsd_logp,addr));
tsrp->status = srv_deleted;
if (fidp)
cm_RemoveVolumeFromServer(serverp, fidp->volume);
- } else {
- osi_Log2(afsd_logp, "volume %d instance on server %s marked offline",
- fidp->volume, osi_LogSaveString(afsd_logp,addr));
- tsrp->status = srv_offline;
+ break;
+ case VNOVOL:
+ /*
+ * The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which
+ * are not indicative of the volume not being present. For example,
+ * VNOVOL can be sent during a transition to a VBUSY state prior to
+ * salvaging or when cloning a .backup volume instance. As a result
+ * the cache manager must attempt at least one retry when a VNOVOL is
+ * received but there are no changes to the volume location information.
+ */
+ if (reqp->vnovolError > 0 && cm_ServerEqual(reqp->errorServp, serverp)) {
+ osi_Log2(afsd_logp, "volume %u not present on server %s",
+ fidp->volume, osi_LogSaveString(afsd_logp,addr));
+ tsrp->status = srv_deleted;
+ if (fidp)
+ cm_RemoveVolumeFromServer(serverp, fidp->volume);
+ } else {
+ osi_Log2(afsd_logp, "VNOVOL received for volume %u from server %s",
+ fidp->volume, osi_LogSaveString(afsd_logp,addr));
+ if (replicated) {
+ cm_SetServerBusyStatus(serversp, serverp);
+ } else {
+ Sleep(2000);
+ }
+ }
+ break;
+ default:
+ osi_Log3(afsd_logp, "volume %u exists on server %s with status %u",
+ fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
}
- /* break; */
- } else {
- osi_Log3(afsd_logp, "volume %d exists on server %s with status %u",
- fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
}
}
lock_ReleaseWrite(&cm_serverLock);
+ /* Remember that the VNOVOL error occurred */
+ if (errorCode == VNOVOL) {
+ reqp->errorServp = serverp;
+ reqp->vnovolError++;
+ }
+
/* Free the server list before cm_ForceUpdateVolume is called */
if (free_svr_list) {
cm_FreeServerList(serverspp, 0);
LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr);
osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]",
osi_LogSaveString(afsd_logp,addr));
- reqp->tokenIdleErrorServp = serverp;
+ reqp->errorServp = serverp;
reqp->idleError++;
}
LogEvent(EVENTLOG_WARNING_TYPE, MSG_SERVER_REPORTS_VNOSERVICE,
addr, fidp->volume, cellp->name);
- osi_Log3(afsd_logp, "Server %s reported volume %d in cell %s as not in service.",
+ osi_Log3(afsd_logp, "Server %s reported rpc to volume %d in cell %s as not serviced.",
osi_LogSaveString(afsd_logp,addr), fidp->volume, cellp->name);
}
}
if (replicated && serverp) {
- reqp->tokenIdleErrorServp = serverp;
+ reqp->errorServp = serverp;
reqp->tokenError = errorCode;
if (timeLeft > 2)
if (serverp) {
if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
- reqp->tokenIdleErrorServp = serverp;
+ reqp->errorServp = serverp;
reqp->tokenError = errorCode;
} else {
reqp->flags |= CM_REQ_NEW_CONN_FORCED;
errorCode, s);
if (serverp) {
- reqp->tokenIdleErrorServp = serverp;
+ reqp->errorServp = serverp;
reqp->tokenError = errorCode;
retry = 1;
}
* and force the use of another server.
*/
if (serverp) {
- reqp->tokenIdleErrorServp = serverp;
+ reqp->errorServp = serverp;
reqp->tokenError = errorCode;
retry = 1;
}
continue;
tsp = tsrp->server;
- if (reqp->tokenIdleErrorServp) {
+ if (reqp->errorServp) {
/*
* search the list until we find the server
* that failed last time. When we find it
* clear the error, skip it and try the next one
* in the list.
*/
- if (tsp == reqp->tokenIdleErrorServp)
- reqp->tokenIdleErrorServp = NULL;
+ if (tsp == reqp->errorServp)
+ reqp->errorServp = NULL;
continue;
}
if (tsp) {