Windows: Apply cm_GetVolServerList() to cm_Analyze()
[openafs.git] / src / WINNT / afsd / cm_conn.c
index ab95902..3043d5d 100644 (file)
@@ -210,12 +210,18 @@ void cm_InitReq(cm_req_t *reqp)
        reqp->startTime = GetTickCount();
 }
 
-static long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp,
+long cm_GetVolServerList(cm_volume_t *volp, afs_uint32 volid, struct cm_user *userp,
+       struct cm_req *reqp, afs_uint32 *replicated, cm_serverRef_t ***serversppp)
+{   /* Thin wrapper: stores the result of cm_GetVolServers() in *serversppp and maps it to a status code. */
+    *serversppp = cm_GetVolServers(volp, volid, userp, reqp, replicated);
+    return (*serversppp ? 0 : CM_ERROR_NOSUCHVOLUME);   /* 0 if a list pointer was returned, else CM_ERROR_NOSUCHVOLUME */
+}
+
+long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp,
        struct cm_req *reqp, afs_uint32 *replicated, cm_serverRef_t ***serversppp)
 {
     long code;
     cm_volume_t *volp = NULL;
-    cm_vol_state_t *volstatep = NULL;
     cm_cell_t *cellp = NULL;
 
     if (!fidp) {
@@ -231,9 +237,7 @@ static long cm_GetServerList(struct cm_fid *fidp, struct cm_user *userp,
     if (code)
         return code;
 
-    volstatep = cm_VolumeStateByID(volp, fidp->volume);
-    *replicated = (volstatep->flags & CM_VOL_STATE_FLAG_REPLICATED);
-    *serversppp = cm_GetVolServers(volp, fidp->volume, userp, reqp);
+    *serversppp = cm_GetVolServers(volp, fidp->volume, userp, reqp, replicated);
 
     lock_ObtainRead(&cm_volumeLock);
     cm_PutVolume(volp);
@@ -250,7 +254,7 @@ cm_SetServerBusyStatus(cm_serverRef_t *serversp, cm_server_t *serverp)
     for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
         if (tsrp->status == srv_deleted)
             continue;
-        if (tsrp->server == serverp && tsrp->status == srv_not_busy) {
+        if (cm_ServerEqual(tsrp->server, serverp) && tsrp->status == srv_not_busy) {
             tsrp->status = srv_busy;
             break;
         }
@@ -286,7 +290,7 @@ cm_ResetServerBusyStatus(cm_serverRef_t *serversp)
  *
  * If the error code is from cm_ConnFromFID() or friends, connp will be NULL.
  *
- * For VLDB calls, fidp will be NULL.
+ * For VLDB calls, fidp will be NULL and cellp will not be.
  *
  * volSyncp and/or cbrp may also be NULL.
  */
@@ -295,6 +299,7 @@ cm_Analyze(cm_conn_t *connp,
            cm_user_t *userp,
            cm_req_t *reqp,
            struct cm_fid *fidp,
+           cm_cell_t *cellp,
            afs_uint32 storeOp,
            AFSVolSync *volSyncp,
            cm_serverRef_t * serversp,
@@ -304,7 +309,6 @@ cm_Analyze(cm_conn_t *connp,
     cm_server_t *serverp = NULL;
     cm_serverRef_t **serverspp = NULL;
     cm_serverRef_t *tsrp;
-    cm_cell_t  *cellp = NULL;
     cm_ucell_t *ucellp;
     cm_volume_t * volp = NULL;
     cm_vol_state_t *statep = NULL;
@@ -404,6 +408,38 @@ cm_Analyze(cm_conn_t *connp,
          */
     }
 
+    else if (errorCode == CM_ERROR_EMPTY) {
+        /*
+         * The server list is empty (or all entries have been deleted).
+         * If fidp is NULL, this was a vlServer list and we can attempt
+         * to force a cell lookup.  If fidp is not NULL, we can attempt
+         * to refresh the volume location list.
+         */
+        if (fidp) {
+            code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp,
+                                     CM_GETVOL_FLAG_NO_LRU_UPDATE,
+                                     &volp);
+            if (code == 0) {
+                if (cm_UpdateVolumeLocation(cellp, userp, reqp, volp) == 0) {
+                    code = cm_GetVolServerList(volp, fidp->volume, userp, reqp, &replicated, &serverspp);
+                    if (code == 0) {
+                        if (!cm_IsServerListEmpty(*serverspp))
+                            retry = 1;
+                        cm_FreeServerList(serverspp, 0);
+                    }
+                }
+
+                lock_ObtainRead(&cm_volumeLock);
+                cm_PutVolume(volp);
+                lock_ReleaseRead(&cm_volumeLock);
+                volp = NULL;
+            }
+        } else {
+            cm_cell_t * newCellp = cm_UpdateCell( cellp, 0);
+            if (newCellp)
+                retry = 1;
+        }
+    }
     else if (errorCode == CM_ERROR_ALLDOWN) {
        /* Servers marked DOWN will be restored by the background daemon
         * thread as they become available.  The volume status is
@@ -419,7 +455,6 @@ cm_Analyze(cm_conn_t *connp,
             osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLDOWN (VL Server)");
         }
     }
-
     else if (errorCode == CM_ERROR_ALLOFFLINE) {
         /* Volume instances marked offline will be restored by the
          * background daemon thread as they become available
@@ -431,24 +466,24 @@ cm_Analyze(cm_conn_t *connp,
             format = "All servers are offline when accessing cell %s volume %d.";
            LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume);
 
-            if (!serversp) {
-                code = cm_GetServerList(fidp, userp, reqp, &replicated, &serverspp);
-                if (code == 0) {
-                    serversp = *serverspp;
-                    free_svr_list = 1;
-                }
-            }
-            cm_ResetServerBusyStatus(serversp);
-            if (free_svr_list) {
-                cm_FreeServerList(serverspp, 0);
-                free_svr_list = 0;
-                serversp = NULL;
-            }
-
             code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp,
                                       CM_GETVOL_FLAG_NO_LRU_UPDATE,
                                       &volp);
             if (code == 0) {
+                if (!serversp) {
+                    code = cm_GetVolServerList(volp, fidp->volume, userp, reqp, &replicated, &serverspp);
+                    if (code == 0) {
+                        serversp = *serverspp;
+                        free_svr_list = 1;
+                    }
+                }
+                cm_ResetServerBusyStatus(serversp);
+                if (free_svr_list) {
+                    cm_FreeServerList(serverspp, 0);
+                    free_svr_list = 0;
+                    serversp = NULL;
+                }
+
                 /*
                  * Do not perform a cm_CheckOfflineVolume() if cm_Analyze()
                  * was called by cm_CheckOfflineVolumeState().
@@ -482,24 +517,24 @@ cm_Analyze(cm_conn_t *connp,
             format = "All servers are busy when accessing cell %s volume %d.";
            LogEvent(EVENTLOG_WARNING_TYPE, msgID, cellp->name, fidp->volume);
 
-            if (!serversp) {
-                code = cm_GetServerList(fidp, userp, reqp, &replicated, &serverspp);
-                if (code == 0) {
-                    serversp = *serverspp;
-                    free_svr_list = 1;
-                }
-            }
-            cm_ResetServerBusyStatus(serversp);
-            if (free_svr_list) {
-                cm_FreeServerList(serverspp, 0);
-                free_svr_list = 0;
-                serversp = NULL;
-            }
-
             code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp,
                                      CM_GETVOL_FLAG_NO_LRU_UPDATE,
                                      &volp);
             if (code == 0) {
+                if (!serversp) {
+                    code = cm_GetVolServerList(volp, fidp->volume, userp, reqp, &replicated, &serverspp);
+                    if (code == 0) {
+                        serversp = *serverspp;
+                        free_svr_list = 1;
+                    }
+                }
+                cm_ResetServerBusyStatus(serversp);
+                if (free_svr_list) {
+                    cm_FreeServerList(serverspp, 0);
+                    free_svr_list = 0;
+                    serversp = NULL;
+                }
+
                 if (timeLeft > 7) {
                     thrd_Sleep(5000);
                     statep = cm_VolumeStateByID(volp, fidp->volume);
@@ -528,44 +563,19 @@ cm_Analyze(cm_conn_t *connp,
 
     /* special codes:  VBUSY and VRESTARTING */
     else if (errorCode == VBUSY || errorCode == VRESTARTING) {
-        if (!serversp && fidp) {
-            code = cm_GetServerList(fidp, userp, reqp, &replicated, &serverspp);
-            if (code == 0) {
-                serversp = *serverspp;
-                free_svr_list = 1;
-            }
-        }
-
-        switch ( errorCode ) {
-        case VBUSY:
-           msgID = MSG_SERVER_REPORTS_VBUSY;
-            format = "Server %s reported busy when accessing volume %d in cell %s.";
-            break;
-        case VRESTARTING:
-           msgID = MSG_SERVER_REPORTS_VRESTARTING;
-            format = "Server %s reported restarting when accessing volume %d in cell %s.";
-            break;
-        }
-
-        if (serverp && fidp) {
-            /* Log server being offline for this volume */
-            sprintf(addr, "%d.%d.%d.%d",
-                   ((serverp->addr.sin_addr.s_addr & 0xff)),
-                   ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
-                   ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
-                   ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24));
-
-           osi_Log3(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume, cellp->name);
-           LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume, cellp->name);
-        }
-
-        cm_SetServerBusyStatus(serversp, serverp);
-
-        if (fidp) { /* File Server query */
+        if (fidp) {
             code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp,
                                       CM_GETVOL_FLAG_NO_LRU_UPDATE,
                                       &volp);
             if (code == 0) {
+                if (!serversp) {
+                    code = cm_GetVolServerList(volp, fidp->volume, userp, reqp, &replicated, &serverspp);
+                    if (code == 0) {
+                        serversp = *serverspp;
+                        free_svr_list = 1;
+                    }
+                }
+
                 statep = cm_VolumeStateByID(volp, fidp->volume);
 
                 if (statep)
@@ -578,6 +588,31 @@ cm_Analyze(cm_conn_t *connp,
             }
         }
 
+        if (serverp) {
+            /* Log server being offline for this volume */
+            sprintf(addr, "%d.%d.%d.%d",
+                    ((serverp->addr.sin_addr.s_addr & 0xff)),
+                    ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8),
+                    ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                     ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24));
+
+            switch ( errorCode ) {
+            case VBUSY:
+                msgID = MSG_SERVER_REPORTS_VBUSY;
+                format = "Server %s reported busy when accessing volume %d in cell %s.";
+                break;
+            case VRESTARTING:
+                msgID = MSG_SERVER_REPORTS_VRESTARTING;
+                format = "Server %s reported restarting when accessing volume %d in cell %s.";
+                break;
+            }
+
+            osi_Log3(afsd_logp, format, osi_LogSaveString(afsd_logp,addr), fidp->volume, cellp->name);
+            LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume, cellp->name);
+
+            cm_SetServerBusyStatus(serversp, serverp);
+        }
+
         if (free_svr_list) {
             cm_FreeServerList(serverspp, 0);
             serversp = NULL;
@@ -638,15 +673,26 @@ cm_Analyze(cm_conn_t *connp,
             if ((errorCode == VMOVED || errorCode == VNOVOL || errorCode == VOFFLINE) &&
                 !(reqp->flags & CM_REQ_VOLUME_UPDATED))
             {
+                LONG_PTR oldSum, newSum;
+
+                oldSum = cm_ChecksumVolumeServerList(fidp, userp, reqp);
+
                 code = cm_ForceUpdateVolume(fidp, userp, reqp);
-                if (code == 0)
+                if (code == 0) {
                     location_updated = 1;
+                    newSum = cm_ChecksumVolumeServerList(fidp, userp, reqp);
+                }
 
-                /* Even if the update fails, there might still be another replica */
+                /*
+                 * Even if the update fails, there might still be another replica.
+                 * If the volume location list changed, permit another update on
+                 * a subsequent error.
+                 */
+                if (code || oldSum == newSum)
+                    reqp->flags |= CM_REQ_VOLUME_UPDATED;
 
-                reqp->flags |= CM_REQ_VOLUME_UPDATED;
                 osi_Log3(afsd_logp, "cm_Analyze called cm_ForceUpdateVolume cell %u vol %u code 0x%x",
-                        fidp->cell, fidp->volume, code);
+                         fidp->cell, fidp->volume, code);
             }
 
             if (statep) {
@@ -688,25 +734,54 @@ cm_Analyze(cm_conn_t *connp,
 
             if (cm_ServerEqual(tsrp->server, serverp)) {
                 /* REDIRECT */
-                if (errorCode == VMOVED || errorCode == VNOVOL) {
-                    osi_Log2(afsd_logp, "volume %d not present on server %s",
+                switch (errorCode) {
+                case VMOVED:
+                    osi_Log2(afsd_logp, "volume %u moved from server %s",
                              fidp->volume, osi_LogSaveString(afsd_logp,addr));
                     tsrp->status = srv_deleted;
                     if (fidp)
                         cm_RemoveVolumeFromServer(serverp, fidp->volume);
-                } else {
-                    osi_Log2(afsd_logp, "volume %d instance on server %s marked offline",
-                             fidp->volume, osi_LogSaveString(afsd_logp,addr));
-                    tsrp->status = srv_offline;
+                    break;
+                case VNOVOL:
+                    /*
+                     * The 1.6.0 and 1.6.1 file servers send transient VNOVOL errors which
+                     * are not indicative of the volume not being present.  For example,
+                     * VNOVOL can be sent during a transition to a VBUSY state prior to
+                     * salvaging or when cloning a .backup volume instance.  As a result
+                     * the cache manager must attempt at least one retry when a VNOVOL is
+                     * received but there are no changes to the volume location information.
+                     */
+                    if (reqp->vnovolError > 0 && cm_ServerEqual(reqp->errorServp, serverp)) {
+                        osi_Log2(afsd_logp, "volume %u not present on server %s",
+                                  fidp->volume, osi_LogSaveString(afsd_logp,addr));
+                        tsrp->status = srv_deleted;
+                        if (fidp)
+                            cm_RemoveVolumeFromServer(serverp, fidp->volume);
+                    } else {
+                        osi_Log2(afsd_logp, "VNOVOL received for volume %u from server %s",
+                                 fidp->volume, osi_LogSaveString(afsd_logp,addr));
+                        if (replicated) {
+                            if (tsrp->status == srv_not_busy)
+                                tsrp->status = srv_busy;
+                        } else {
+                            Sleep(2000);
+                        }
+                    }
+                    break;
+                default:
+                    osi_Log3(afsd_logp, "volume %u exists on server %s with status %u",
+                             fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
                 }
-                /* break; */
-            } else {
-                osi_Log3(afsd_logp, "volume %d exists on server %s with status %u",
-                         fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
             }
         }
         lock_ReleaseWrite(&cm_serverLock);
 
+        /* Remember that the VNOVOL error occurred */
+        if (errorCode == VNOVOL) {
+            reqp->errorServp = serverp;
+            reqp->vnovolError++;
+        }
+
         /* Free the server list before cm_ForceUpdateVolume is called */
         if (free_svr_list) {
             cm_FreeServerList(serverspp, 0);
@@ -779,7 +854,7 @@ cm_Analyze(cm_conn_t *connp,
             LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr);
             osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]",
                      osi_LogSaveString(afsd_logp,addr));
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->idleError++;
         }
 
@@ -947,7 +1022,7 @@ cm_Analyze(cm_conn_t *connp,
         }
 
         if (replicated && serverp) {
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->tokenError = errorCode;
 
             if (timeLeft > 2)
@@ -1023,7 +1098,7 @@ cm_Analyze(cm_conn_t *connp,
 
         if (serverp) {
             if (reqp->flags & CM_REQ_NEW_CONN_FORCED) {
-                reqp->tokenIdleErrorServp = serverp;
+                reqp->errorServp = serverp;
                 reqp->tokenError = errorCode;
             } else {
                 reqp->flags |= CM_REQ_NEW_CONN_FORCED;
@@ -1039,7 +1114,7 @@ cm_Analyze(cm_conn_t *connp,
                  errorCode);
         if (!dead_session) {
             lock_ObtainMutex(&userp->mx);
-            ucellp = cm_GetUCell(userp, serverp->cellp);
+            ucellp = cm_GetUCell(userp, cellp);
             if (ucellp->ticketp) {
                 free(ucellp->ticketp);
                 ucellp->ticketp = NULL;
@@ -1071,7 +1146,7 @@ cm_Analyze(cm_conn_t *connp,
                   errorCode, s);
 
         if (serverp) {
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->tokenError = errorCode;
             retry = 1;
         }
@@ -1084,7 +1159,7 @@ cm_Analyze(cm_conn_t *connp,
          * and force the use of another server.
          */
         if (serverp) {
-            reqp->tokenIdleErrorServp = serverp;
+            reqp->errorServp = serverp;
             reqp->tokenError = errorCode;
             retry = 1;
         }
@@ -1255,15 +1330,15 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, afs_uint32 replicated, cm_user_
     cm_serverRef_t *tsrp;
     cm_server_t *tsp;
     long firstError = 0;
-    int someBusy = 0, someOffline = 0, allOffline = 1, allBusy = 1, allDown = 1;
+    int someBusy = 0, someOffline = 0, allOffline = 1, allBusy = 1, allDown = 1, allDeleted = 1;
 #ifdef SET_RX_TIMEOUTS_TO_TIMELEFT
     long timeUsed, timeLeft, hardTimeLeft;
 #endif
     *connpp = NULL;
 
     if (serversp == NULL) {
-       osi_Log1(afsd_logp, "cm_ConnByMServers returning 0x%x", CM_ERROR_ALLDOWN);
-       return CM_ERROR_ALLDOWN;
+       osi_Log1(afsd_logp, "cm_ConnByMServers returning 0x%x", CM_ERROR_EMPTY);
+       return CM_ERROR_EMPTY;
     }
 
 #ifdef SET_RX_TIMEOUTS_TO_TIMELEFT
@@ -1279,16 +1354,18 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, afs_uint32 replicated, cm_user_
         if (tsrp->status == srv_deleted)
             continue;
 
+        allDeleted = 0;
+
         tsp = tsrp->server;
-        if (reqp->tokenIdleErrorServp) {
+        if (reqp->errorServp) {
             /*
              * search the list until we find the server
              * that failed last time.  When we find it
              * clear the error, skip it and try the next one
              * in the list.
              */
-            if (tsp == reqp->tokenIdleErrorServp)
-                reqp->tokenIdleErrorServp = NULL;
+            if (tsp == reqp->errorServp)
+                reqp->errorServp = NULL;
             continue;
         }
         if (tsp) {
@@ -1336,7 +1413,9 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, afs_uint32 replicated, cm_user_
     lock_ReleaseRead(&cm_serverLock);
 
     if (firstError == 0) {
-        if (allDown) {
+        if (allDeleted) {
+            firstError = CM_ERROR_EMPTY;
+        } else if (allDown) {
             firstError = (reqp->tokenError ? reqp->tokenError :
                           (reqp->idleError ? RX_CALL_TIMEOUT : CM_ERROR_ALLDOWN));
             /*
@@ -1509,21 +1588,21 @@ long cm_ConnByServer(cm_server_t *serverp, cm_user_t *userp, afs_uint32 replicat
         cm_GetServer(serverp);
         tcp = malloc(sizeof(*tcp));
         memset(tcp, 0, sizeof(*tcp));
-        tcp->nextp = serverp->connsp;
-        serverp->connsp = tcp;
         cm_HoldUser(userp);
         tcp->userp = userp;
         lock_InitializeMutex(&tcp->mx, "cm_conn_t mutex", LOCK_HIERARCHY_CONN);
-        lock_ObtainMutex(&tcp->mx);
         tcp->serverp = serverp;
         tcp->cryptlevel = rxkad_clear;
         cm_NewRXConnection(tcp, ucellp, serverp, replicated);
         tcp->refCount = 1;
-        lock_ReleaseMutex(&tcp->mx);
+        tcp->nextp = serverp->connsp;
+        serverp->connsp = tcp;
         lock_ReleaseWrite(&cm_connLock);
+        lock_ReleaseMutex(&userp->mx);
     } else {
         lock_ReleaseRead(&cm_connLock);
       haveconn:
+        lock_ReleaseMutex(&userp->mx);
         InterlockedIncrement(&tcp->refCount);
 
         lock_ObtainMutex(&tcp->mx);
@@ -1535,14 +1614,13 @@ long cm_ConnByServer(cm_server_t *serverp, cm_user_t *userp, afs_uint32 replicat
                 osi_Log0(afsd_logp, "cm_ConnByServer replace connection due to token update");
             else
                 osi_Log0(afsd_logp, "cm_ConnByServer replace connection due to crypt change");
-           tcp->flags &= ~CM_CONN_FLAG_FORCE_NEW;
+            tcp->flags &= ~CM_CONN_FLAG_FORCE_NEW;
             rx_SetConnSecondsUntilNatPing(tcp->rxconnp, 0);
             rx_DestroyConnection(tcp->rxconnp);
             cm_NewRXConnection(tcp, ucellp, serverp, replicated);
         }
         lock_ReleaseMutex(&tcp->mx);
     }
-    lock_ReleaseMutex(&userp->mx);
 
     /* return this pointer to our caller */
     osi_Log1(afsd_logp, "cm_ConnByServer returning conn 0x%p", tcp);
@@ -1628,13 +1706,10 @@ long cm_ConnFromVolume(struct cm_volume *volp, unsigned long volid, struct cm_us
     long code;
     cm_serverRef_t **serverspp;
     afs_uint32 replicated;
-    cm_vol_state_t * volstatep;
 
     *connpp = NULL;
 
-    volstatep = cm_VolumeStateByID(volp, volid);
-    replicated = (volstatep->flags & CM_VOL_STATE_FLAG_REPLICATED);
-    serverspp = cm_GetVolServers(volp, volid, userp, reqp);
+    serverspp = cm_GetVolServers(volp, volid, userp, reqp, &replicated);
 
     code = cm_ConnByMServers(*serverspp, replicated, userp, reqp, connpp);
     cm_FreeServerList(serverspp, 0);