windows-vnovol-20080912
authorJeffrey Altman <jaltman@secure-endpoints.com>
Sat, 13 Sep 2008 05:20:02 +0000 (05:20 +0000)
committerJeffrey Altman <jaltman@secure-endpoints.com>
Sat, 13 Sep 2008 05:20:02 +0000 (05:20 +0000)
LICENSE MIT

The cm_serverRef_t list reference counts were undercounting and
prematurely freeing the server lists for volumes that experienced
VNOVOL and VMOVED errors.  cm_Analyze() must release the server
list before forcibly updating the volume location info.  Otherwise,
the list that gets freed is the old one concatenated with the new
one.

Add more trace messages.

src/WINNT/afsd/cm_callback.c
src/WINNT/afsd/cm_conn.c
src/WINNT/afsd/cm_ioctl.c
src/WINNT/afsd/cm_server.c
src/WINNT/afsd/cm_volume.c

index 899c9b1..1ce5b00 100644 (file)
@@ -1867,6 +1867,8 @@ long cm_CBServersUp(cm_scache_t *scp, time_t * downTime)
         return 1;
 
     for (found = 0,tsrp = statep->serversp; tsrp; tsrp=tsrp->next) {
+        if (tsrp->status == srv_deleted)
+            continue;
         if (tsrp->server == scp->cbServerp)
             found = 1;
         if (tsrp->server->downTime > *downTime)
index c919ebb..79e3294 100644 (file)
@@ -232,6 +232,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         if (cellp == NULL && serversp) {
             struct cm_serverRef * refp;
             for ( refp=serversp ; cellp == NULL && refp != NULL; refp=refp->next) {
+                if (refp->status == srv_deleted)
+                    continue;
                 if ( refp->server )
                     cellp = refp->server->cellp;
             }
@@ -335,6 +337,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                         }
                         lock_ObtainWrite(&cm_serverLock);
                         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+                            if (tsrp->status == srv_deleted)
+                                continue;
                             if (tsrp->status == srv_busy) {
                                 tsrp->status = srv_not_busy;
                             }       
@@ -342,7 +346,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                         lock_ReleaseWrite(&cm_serverLock);
                         if (free_svr_list) {
                             cm_FreeServerList(&serversp, 0);
-                            *serverspp = serversp;
+                            *serverspp = serversp = NULL;
+                            free_svr_list = 0;
                         }
 
                         cm_UpdateVolumeStatus(volp, fidp->volume);
@@ -364,6 +369,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                 if (serversp) {
                     lock_ObtainWrite(&cm_serverLock);
                     for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+                        if (tsrp->status == srv_deleted)
+                            continue;
                         if (tsrp->status == srv_busy) {
                             tsrp->status = srv_not_busy;
                         }
@@ -386,6 +393,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         }
         lock_ObtainWrite(&cm_serverLock);
         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+            if (tsrp->status == srv_deleted)
+                continue;
             if (tsrp->server == serverp && tsrp->status == srv_not_busy) {
                 tsrp->status = srv_busy;
                 if (fidp) { /* File Server query */
@@ -412,7 +421,8 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
 
         if (free_svr_list) {
             cm_FreeServerList(&serversp, 0);
-            *serverspp = serversp;
+            *serverspp = serversp = NULL;
+            free_svr_list = 0;
         }
         retry = 1;
     }
@@ -467,13 +477,15 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         if (!serversp && fidp) {
             code = cm_GetServerList(fidp, userp, reqp, &serverspp);
             if (code == 0) {
-                serversp = *serverspp;
+                serversp = *serverspp = NULL;
                 free_svr_list = 1;
             }
         }
 
         lock_ObtainWrite(&cm_serverLock);
         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+            if (tsrp->status == srv_deleted)
+                continue;
             if (tsrp->server == serverp) {
                 /* REDIRECT */
                 if (errorCode == VMOVED || errorCode == VNOVOL) {
@@ -483,38 +495,46 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                 } else {
                     tsrp->status = srv_offline;
                 }
-
-                if (fidp) { /* File Server query */
-                    lock_ReleaseWrite(&cm_serverLock);
-                    code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
-                                             CM_GETVOL_FLAG_NO_LRU_UPDATE, 
-                                             &volp);
-                    if (code == 0)
-                        cm_VolumeStateByID(volp, fidp->volume);
-                    lock_ObtainWrite(&cm_serverLock);
-                }   
             }
         }   
         lock_ReleaseWrite(&cm_serverLock);
 
-        if (fidp && (errorCode == VMOVED || errorCode == VNOVOL)) {
-            code = cm_ForceUpdateVolume(fidp, userp, reqp);
-            if (code) 
-                timeLeft = 0;   /* prevent a retry on failure */
+        /* Free the server list before cm_ForceUpdateVolume is called */
+        if (free_svr_list) {
+            cm_FreeServerList(&serversp, 0);
+            *serverspp = serversp = NULL;
+            free_svr_list = 0;
         }
 
-        if (statep) {
-            cm_UpdateVolumeStatus(volp, statep->ID);
-            lock_ObtainRead(&cm_volumeLock);
-            cm_PutVolume(volp);
-            lock_ReleaseRead(&cm_volumeLock);
-            volp = NULL;
-        }
+        if (fidp) { /* File Server query */
+            code = cm_FindVolumeByID(cellp, fidp->volume, userp, reqp, 
+                                      CM_GETVOL_FLAG_NO_LRU_UPDATE, 
+                                      &volp);
+            if (code == 0)
+                statep = cm_VolumeStateByID(volp, fidp->volume);
+
+            if (errorCode == VMOVED || errorCode == VNOVOL) {
+                code = cm_ForceUpdateVolume(fidp, userp, reqp);
+                if (code) 
+                    timeLeft = 0;   /* prevent a retry on failure */
+                osi_Log3(afsd_logp, "cm_Analyze called cm_ForceUpdateVolume cell %u vol %u code 0x%x",
+                        fidp->cell, fidp->volume, code);
+            }
 
-        if (free_svr_list) {
-            cm_FreeServerList(&serversp, 0);
-            *serverspp = serversp;
+            if (statep) {
+                cm_UpdateVolumeStatus(volp, statep->ID);
+                osi_Log3(afsd_logp, "cm_Analyze NewVolState cell %u vol %u state %u", 
+                         fidp->cell, fidp->volume, statep->state);
+            }
+
+            if (volp) {
+                lock_ObtainRead(&cm_volumeLock);
+                cm_PutVolume(volp);
+                lock_ReleaseRead(&cm_volumeLock);
+                volp = NULL;
+            }
         }
+
         if ( timeLeft > 2 )
             retry = 1;
     } else if ( errorCode == VNOVNODE ) {
@@ -798,6 +818,9 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp,
 
     lock_ObtainRead(&cm_serverLock);
     for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+        if (tsrp->status == srv_deleted)
+            continue;
+
         tsp = tsrp->server;
         if (reqp->tokenIdleErrorServp) {
             /* 
@@ -813,7 +836,7 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp,
         if (tsp) {
             cm_GetServerNoLock(tsp);
             lock_ReleaseRead(&cm_serverLock);
-            if ((tsrp->status != srv_deleted) && !(tsp->flags & CM_SERVERFLAG_DOWN)) {
+            if (!(tsp->flags & CM_SERVERFLAG_DOWN)) {
                 allDown = 0;
                 if (tsrp->status == srv_busy) {
                     allOffline = 0;
@@ -1042,8 +1065,10 @@ long cm_ServerAvailable(struct cm_fid *fidp, struct cm_user *userp)
 
     lock_ObtainRead(&cm_serverLock);
     for (tsrp = *serverspp; tsrp; tsrp=tsrp->next) {
+        if (tsrp->status == srv_deleted)
+            continue;
         tsp = tsrp->server;
-        if ((tsrp->status != srv_deleted) && !(tsp->flags & CM_SERVERFLAG_DOWN)) {
+        if (!(tsp->flags & CM_SERVERFLAG_DOWN)) {
            allDown = 0;
             if (tsrp->status == srv_busy) {
                allOffline = 0;
index 9f5bee8..3086bb3 100644 (file)
@@ -3048,10 +3048,12 @@ cm_CheckServersStatus(cm_serverRef_t *serversp)
 
     lock_ObtainRead(&cm_serverLock);
     for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+        if (tsrp->status == srv_deleted)
+            continue;
         if (tsp = tsrp->server) {
             cm_GetServerNoLock(tsp);
             lock_ReleaseRead(&cm_serverLock);
-            if ((tsrp->status == srv_deleted) && !(tsp->flags & CM_SERVERFLAG_DOWN)) {
+            if (!(tsp->flags & CM_SERVERFLAG_DOWN)) {
                 allDown = 0;
                 if (tsrp->status == srv_busy) {
                     allOffline = 0;
index eba1fd2..a2748d5 100644 (file)
@@ -1034,6 +1034,8 @@ LONG_PTR cm_ChecksumServerList(cm_serverRef_t *serversp)
 
     lock_ObtainRead(&cm_serverLock);
     for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
+        if (tsrp->status == srv_deleted)
+            continue;
         if (first)
             first = 0;
         else
@@ -1259,6 +1261,9 @@ void cm_FreeServerList(cm_serverRef_t** list, afs_uint32 flags)
     cm_serverRef_t  **nextp = 0;
     cm_serverRef_t  * next = 0;
 
+       if (*list == NULL)
+               return;
+
     lock_ObtainWrite(&cm_serverLock);
 
     while (*current)
index 5af1993..d0dd1d6 100644 (file)
@@ -227,7 +227,9 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
             cm_UpdateCell(cellp, 0);
 
         /* now we have volume structure locked and held; make RPC to fill it */
-       osi_Log2(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s", volp->cellp->name, volp->namep);
+       osi_Log2(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s", 
+                  osi_LogSaveString(afsd_logp,volp->cellp->name), 
+                  osi_LogSaveString(afsd_logp,volp->namep));
         do {
             struct rx_connection * rxconnp;
 
@@ -254,10 +256,12 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
         code = cm_MapVLRPCError(code, reqp);
        if ( code )
            osi_Log3(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s FAILURE, code 0x%x", 
-                     volp->cellp->name, volp->namep, code);
+                     osi_LogSaveString(afsd_logp,volp->cellp->name), 
+                      osi_LogSaveString(afsd_logp,volp->namep), code);
        else
            osi_Log2(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s SUCCESS", 
-                     volp->cellp->name, volp->namep);
+                     osi_LogSaveString(afsd_logp,volp->cellp->name), 
+                      osi_LogSaveString(afsd_logp,volp->namep));
     }
 
     /* We can end up here with code == CM_ERROR_NOSUCHVOLUME if the base volume name
@@ -271,7 +275,8 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
         snprintf(name, VL_MAXNAMELEN, "%s.readonly", volp->namep);
                 
         /* now we have volume structure locked and held; make RPC to fill it */
-       osi_Log2(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s", volp->cellp->name, 
+       osi_Log2(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s", 
+                 osi_LogSaveString(afsd_logp,volp->cellp->name),
                  osi_LogSaveString(afsd_logp,name));
         do {
             struct rx_connection * rxconnp;
@@ -299,10 +304,12 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
         code = cm_MapVLRPCError(code, reqp);
        if ( code )
            osi_Log3(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s FAILURE, code 0x%x", 
-                     volp->cellp->name, osi_LogSaveString(afsd_logp,name), code);
+                    osi_LogSaveString(afsd_logp,volp->cellp->name), 
+                     osi_LogSaveString(afsd_logp,name), code);
        else
            osi_Log2(afsd_logp, "CALL VL_GetEntryByName{UNO} name %s:%s SUCCESS", 
-                     volp->cellp->name, osi_LogSaveString(afsd_logp,name));
+                    osi_LogSaveString(afsd_logp,volp->cellp->name), 
+                     osi_LogSaveString(afsd_logp,name));
     }
     
     lock_ObtainWrite(&volp->rw);
@@ -427,7 +434,8 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
                 name[len - 9] = '\0';
             }
             
-            osi_Log2(afsd_logp, "cm_UpdateVolume name %s -> %s", volp->namep, name);
+            osi_Log2(afsd_logp, "cm_UpdateVolume name %s -> %s", 
+                     osi_LogSaveString(afsd_logp,volp->namep), osi_LogSaveString(afsd_logp,name));
 
             if (volp->flags & CM_VOLUMEFLAG_IN_HASH)
                 cm_RemoveVolumeFromNameHashTable(volp);
@@ -558,6 +566,7 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
             cm_RandomizeServer(&volp->vol[ROVOL].serversp);
         }
 
+
         rwNewstate = rwServers_alldown ? vl_alldown : vl_online;
         roNewstate = roServers_alldown ? vl_alldown : vl_online;
         bkNewstate = bkServers_alldown ? vl_alldown : vl_online;
@@ -604,7 +613,8 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
 
     volp->flags &= ~CM_VOLUMEFLAG_UPDATING_VL;
     osi_Log4(afsd_logp, "cm_UpdateVolumeLocation done, waking others name %s:%s flags 0x%x code 0x%x", 
-             volp->cellp->name, volp->namep, volp->flags, code);
+             osi_LogSaveString(afsd_logp,volp->cellp->name), 
+             osi_LogSaveString(afsd_logp,volp->namep), volp->flags, code);
     osi_Wakeup((LONG_PTR) &volp->flags);
 
     return code;
@@ -846,6 +856,7 @@ long cm_FindVolumeByName(struct cm_cell *cellp, char *volumeNamep,
                     cm_VolumeStatusNotification(volp, volp->vol[volType].ID, volp->vol[volType].state, vl_unknown);
                 volp->vol[volType].ID = 0;
                 cm_SetFid(&volp->vol[volType].dotdotFid, 0, 0, 0, 0);
+                cm_FreeServerList(&volp->vol[volType].serversp, CM_FREESERVERLIST_DELETE);
             }
        } else {
            volp = &cm_data.volumeBaseAddress[cm_data.currentVolumes++];
@@ -999,7 +1010,7 @@ long cm_ForceUpdateVolume(cm_fid_t *fidp, cm_user_t *userp, cm_req_t *reqp)
 cm_serverRef_t **cm_GetVolServers(cm_volume_t *volp, afs_uint32 volume)
 {
     cm_serverRef_t **serverspp;
-    cm_serverRef_t *current;;
+    cm_serverRef_t *current;
 
     lock_ObtainWrite(&cm_serverLock);
 
@@ -1012,7 +1023,11 @@ cm_serverRef_t **cm_GetVolServers(cm_volume_t *volp, afs_uint32 volume)
     else 
         osi_panic("bad volume ID in cm_GetVolServers", __FILE__, __LINE__);
         
-    for (current = *serverspp; current; current = current->next)
+    /* 
+     * Increment the refCount on deleted items as well.
+     * They will be freed by cm_FreeServerList when they get to zero 
+     */
+    for (current = *serverspp; current; current = current->next) 
         current->refCount++;
 
     lock_ReleaseWrite(&cm_serverLock);
@@ -1120,12 +1135,14 @@ cm_CheckOfflineVolumeState(cm_volume_t *volp, cm_vol_state_t *statep, afs_uint32
             alldown = 1;
             alldeleted = 1;
             for (serversp = statep->serversp; serversp; serversp = serversp->next) {
-                if (serversp->status != srv_deleted) {
-                    alldeleted = 0;
-                    *onlinep = 1;
-                    alldown = 0;
-                }
-                if (serversp->status == srv_busy || serversp->status == srv_offline) 
+                if (serversp->status == srv_deleted)
+                    continue;
+
+                alldeleted = 0;
+                *onlinep = 1;
+                alldown = 0;
+                
+                if (serversp->status == srv_busy || serversp->status == srv_offline)
                     serversp->status = srv_not_busy;
             }
 
@@ -1240,6 +1257,8 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep)
 
     lock_ObtainWrite(&cm_serverLock);
     for (tsrp = statep->serversp; tsrp; tsrp=tsrp->next) {
+        if (tsrp->status == srv_deleted)
+            continue;
         tsp = tsrp->server;
         if (tsp) {
             cm_GetServerNoLock(tsp);