viced-cleanup-old-addresses-as-they-become-invalid-20080225
[openafs.git] / src / viced / host.c
index aea2c08..969c4b2 100644 (file)
@@ -384,7 +384,10 @@ hpr_GetHostCPS(afs_int32 host, prlist *CPS)
 
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code) 
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     over = 0;
@@ -415,7 +418,10 @@ hpr_NameToId(namelist *names, idlist *ids)
 
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code)
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     for (i = 0; i < names->namelist_len; i++)
@@ -437,7 +443,10 @@ hpr_IdToName(idlist *ids, namelist *names)
     
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code)
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     code = ubik_PR_IDToName(uclient, 0, ids, names);
@@ -458,7 +467,10 @@ hpr_GetCPS(afs_int32 id, prlist *CPS)
 
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code)
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     over = 0;
@@ -730,11 +742,11 @@ h_gethostcps_r(register struct host *host, register afs_int32 now)
 void
 h_flushhostcps(register afs_uint32 hostaddr, register afs_uint16 hport)
 {
-    register struct host *host;
+    struct host *host;
     int held = 0;
 
     H_LOCK;
-    host = h_Lookup_r(hostaddr, hport, &held);
+    h_Lookup_r(hostaddr, hport, &held, &host);
     if (host) {
        host->hcpsfailed = 1;
        if (!held)
@@ -829,8 +841,8 @@ h_SetupCallbackConn_r(struct host * host)
 /* hostaddr and hport are in network order */
 /* Note: host should be released by caller if 0 == *heldp and non-null */
 /* hostaddr and hport are in network order */
-struct host *
-h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp)
+int
+h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp, struct host **hostp)
 {
     afs_int32 now;
     struct host *host = NULL;
@@ -844,6 +856,11 @@ h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp)
        assert(host);
        if (!(host->hostFlags & HOSTDELETED) && chain->addr == haddr
            && chain->port == hport) {
+           if ((host->hostFlags & HWHO_INPROGRESS) && 
+               h_threadquota(host->lock.num_waiting)) {
+               *hostp = 0;
+               return VBUSY;
+           }
            *heldp = h_Held_r(host);
            if (!*heldp)
                h_Hold_r(host);
@@ -871,8 +888,8 @@ h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp)
        }
        host = NULL;
     }
-    return host;
-
+    *hostp = host;
+    return 0;
 }                              /*h_Lookup */
 
 /* Lookup a host given its UUID. */
@@ -1075,7 +1092,10 @@ h_Enumerate(int (*proc) (), char *param)
        if (!(held[count] = h_Held_r(host)))
            h_Hold_r(host);
     }
-    assert(count == hostCount);
+    if (count != hostCount) {
+       ViceLog(0, ("h_Enumerate found %d of %d hosts\n", count, hostCount));
+    }
+    assert(count <= hostCount);
     H_UNLOCK;
     for (i = 0; i < count; i++) {
        held[i] = (*proc) (list[i], held[i], param);
@@ -1421,19 +1441,25 @@ h_GetHost_r(struct rx_connection *tcon)
     char hoststr[16], hoststr2[16];
     Capabilities caps;
     struct rx_connection *cb_conn = NULL;
+    struct rx_connection *cb_in = NULL;
 
     caps.Capabilities_val = NULL;
 
     haddr = rxr_HostOf(tcon);
     hport = rxr_PortOf(tcon);
   retry:
+    if (cb_in) {
+        rx_DestroyConnection(cb_in);
+        cb_in = NULL;
+    }
     if (caps.Capabilities_val)
        free(caps.Capabilities_val);
     caps.Capabilities_val = NULL;
     caps.Capabilities_len = 0;
 
     code = 0;
-    host = h_Lookup_r(haddr, hport, &held);
+    if (h_Lookup_r(haddr, hport, &held, &host))
+       return 0;
     identP = (struct Identity *)rx_GetSpecific(tcon, rxcon_ident_key);
     if (host && !identP && !(host->Console & 1)) {
        /* This is a new connection, and we already have a host
@@ -1441,11 +1467,13 @@ h_GetHost_r(struct rx_connection *tcon)
         * of the caller matches the identity in the host structure.
         */
        if ((host->hostFlags & HWHO_INPROGRESS) && 
-           h_threadquota(host->lock.num_waiting))
+           h_threadquota(host->lock.num_waiting)) {
+           if (!held)
+               h_Release_r(host);
            return 0;
+       }
        h_Lock_r(host);
        if (!(host->hostFlags & ALTADDR)) {
-           host->hostFlags &= ~HWHO_INPROGRESS;
            /* Another thread is doing initialization */
            h_Unlock_r(host);
            if (!held)
@@ -1456,19 +1484,52 @@ h_GetHost_r(struct rx_connection *tcon)
                     ntohs(host->port)));
            goto retry;
        }
+       host->hostFlags |= HWHO_INPROGRESS;
        host->hostFlags &= ~ALTADDR;
+
+        /* We received a new connection from an IP address/port
+         * that is associated with 'host' but the address/port of
+         * the callback connection does not have to match it.
+         * If there is a match, we can use the existing callback
+         * connection to verify the UUID.  If they do not match
+         * we need to use a new callback connection to verify the
+         * UUID of the incoming caller and perhaps use the old 
+         * callback connection to verify that the old address/port
+         * is still valid.
+         */
+       
        cb_conn = host->callback_rxcon;
        rx_GetConnection(cb_conn);
        H_UNLOCK;
-       code =
-           RXAFSCB_TellMeAboutYourself(cb_conn, &interf, &caps);
-       if (code == RXGEN_OPCODE)
-           code = RXAFSCB_WhoAreYou(cb_conn, &interf);
+        if (haddr == host->host && hport == host->port) {
+            /* The existing callback connection matches the 
+             * incoming connection so just use it.
+             */
+           code =
+               RXAFSCB_TellMeAboutYourself(cb_conn, &interf, &caps);
+           if (code == RXGEN_OPCODE)
+               code = RXAFSCB_WhoAreYou(cb_conn, &interf);
+       } else {
+            /* We do not have a match.  Create a new connection
+             * for the new addr/port and use multi_Rx to probe
+             * both of them simultaneously.
+             */
+           if (!sc)
+                sc = rxnull_NewClientSecurityObject();
+            cb_in = rx_NewConnection(haddr, hport, 1, sc, 0);
+            rx_SetConnDeadTime(cb_in, 50);
+            rx_SetConnHardDeadTime(cb_in, AFS_HARDDEADTIME);
+           
+            code =
+                RXAFSCB_TellMeAboutYourself(cb_in, &interf, &caps);
+           if (code == RXGEN_OPCODE)
+                code = RXAFSCB_WhoAreYou(cb_in, &interf);
+       }
        rx_PutConnection(cb_conn);
        cb_conn=NULL;
        H_LOCK;
        if ((code == RXGEN_OPCODE) || 
-           (afs_uuid_equal(&interf.uuid, &nulluuid))) {
+           ((code == 0) && (afs_uuid_equal(&interf.uuid, &nulluuid)))) {
            identP = (struct Identity *)malloc(sizeof(struct Identity));
            if (!identP) {
                ViceLog(0, ("Failed malloc in h_GetHost_r\n"));
@@ -1476,23 +1537,39 @@ h_GetHost_r(struct rx_connection *tcon)
            }
            identP->valid = 0;
            rx_SetSpecific(tcon, rxcon_ident_key, identP);
-           /* The host on this connection was unable to respond to 
-            * the WhoAreYou. We will treat this as a new connection
-            * from the existing host. The worst that can happen is
-            * that we maintain some extra callback state information */
-           if (host->interface) {
-               ViceLog(0,
-                       ("Host %x (%s:%d) used to support WhoAreYou, deleting.\n",
-                        host, 
-                         afs_inet_ntoa_r(host->host, hoststr),
-                        ntohs(host->port)));
-               host->hostFlags |= HOSTDELETED;
-               host->hostFlags &= ~HWHO_INPROGRESS;
-               h_Unlock_r(host);
+           if (cb_in == NULL) {
+               /* The host on this connection was unable to respond to 
+                * the WhoAreYou. We will treat this as a new connection
+                * from the existing host. The worst that can happen is
+                * that we maintain some extra callback state information */
+               if (host->interface) {
+                   ViceLog(0,
+                           ("Host %x (%s:%d) used to support WhoAreYou, deleting.\n",
+                            host, 
+                            afs_inet_ntoa_r(host->host, hoststr),
+                            ntohs(host->port)));
+                   host->hostFlags |= HOSTDELETED;
+                   host->hostFlags &= ~HWHO_INPROGRESS;
+                   h_Unlock_r(host);
+                   if (!held)
+                       h_Release_r(host);
+                   host = NULL;
+                   goto retry;
+               }
+           } else {
+               /* The incoming connection does not support WhoAreYou but
+                * the original one might have.  Use removeAddress_r() to
+                 * remove this addr/port from the host that was found.
+                 * If there are no more addresses left for the host it 
+                 * will be deleted.  Then we retry.
+                 */
+                removeAddress_r(host, haddr, hport);
+                host->hostFlags &= ~HWHO_INPROGRESS;
+                h_Unlock_r(host);
                if (!held)
-                   h_Release_r(host);
-               host = NULL;
-               goto retry;
+                    h_Release_r(host);
+                host = NULL;
+                goto retry;
            }
        } else if (code == 0) {
            interfValid = 1;
@@ -1509,24 +1586,102 @@ h_GetHost_r(struct rx_connection *tcon)
             * then this is not the same host as before. */
            if (!host->interface
                || !afs_uuid_equal(&interf.uuid, &host->interface->uuid)) {
-                ViceLog(25,
-                         ("Uuid doesn't match host %x (%s:%d).\n",
-                           host, afs_inet_ntoa_r(host->host, hoststr), ntohs(host->port)));
-
-                removeAddress_r(host, host->host, host->port);
+               if (cb_in) {
+                    ViceLog(25,
+                           ("Uuid doesn't match connection (%s:%d).\n",
+                            afs_inet_ntoa_r(haddr, hoststr), ntohs(hport)));
+                   
+                    removeAddress_r(host, haddr, hport);
+               } else {
+                   ViceLog(25,
+                           ("Uuid doesn't match host %x (%s:%d).\n",
+                            host, afs_inet_ntoa_r(host->host, hoststr), ntohs(host->port)));
+                   
+                   removeAddress_r(host, host->host, host->port);
+               }
                host->hostFlags &= ~HWHO_INPROGRESS;
                h_Unlock_r(host);
                if (!held)
                    h_Release_r(host);
                host = NULL;
                goto retry;
+           } else if (cb_in) {
+               /* the UUID matched the client at the incoming addr/port 
+                 * but this is not the address of the active callback 
+                 * connection.  Try that connection and see if the client
+                 * is still there and if the reported UUID is the same.
+                 */
+                int code2;
+                afsUUID uuid = host->interface->uuid;
+                cb_conn = host->callback_rxcon;
+                rx_GetConnection(cb_conn);
+                rx_SetConnDeadTime(cb_conn, 2);
+                rx_SetConnHardDeadTime(cb_conn, AFS_HARDDEADTIME);
+                H_UNLOCK;
+                code2 = RXAFSCB_ProbeUuid(cb_conn, &uuid);
+                H_LOCK;
+                rx_SetConnDeadTime(cb_conn, 50);
+                rx_SetConnHardDeadTime(cb_conn, AFS_HARDDEADTIME);
+                rx_PutConnection(cb_conn);
+                cb_conn=NULL;
+                if (code2) {
+                    /* The primary address is either not responding or
+                     * is not the client we are looking for.  Need to
+                     * remove the primary address and add swap in the new 
+                     * callback connection, and destroy the old one.
+                     */
+                    struct rx_connection *rxconn;
+                    ViceLog(0,("CB: ProbeUuid for host %x (%s:%d) failed %d\n",
+                              host, 
+                              afs_inet_ntoa_r(host->host, hoststr),
+                              ntohs(host->port),code2));
+                   
+                    removeInterfaceAddr_r(host, host->host, host->port);
+                    addInterfaceAddr_r(host, haddr, hport);
+                    host->host = haddr;
+                    host->port = hport;
+                    rxconn = host->callback_rxcon;
+                    host->callback_rxcon = cb_in;
+                    cb_in = NULL;
+                   
+                    if (rxconn) {
+                        struct client *client;
+                        /*
+                         * If rx_DestroyConnection calls h_FreeConnection we will
+                        * deadlock on the host_glock_mutex. Work around the problem
+                         * by unhooking the client from the connection before
+                         * destroying the connection.
+                         */
+                        client = rx_GetSpecific(rxconn, rxcon_client_key);
+                        rx_SetSpecific(rxconn, rxcon_client_key, (void *)0);
+                        rx_DestroyConnection(rxconn);
+                   }
+               }
            }
        } else {
-           ViceLog(0,
-                   ("CB: WhoAreYou failed for host %x (%s:%d), error %d\n",
-                     host, afs_inet_ntoa_r(host->host, hoststr),
-                    ntohs(host->port), code));
-           host->hostFlags |= VENUSDOWN;
+            if (cb_in) {
+                /* A callback to the incoming connection address is failing.  
+                 * Assume that the addr/port is no longer associated with the host
+                 * returned by h_Lookup_r.
+                 */
+                ViceLog(0,
+                       ("CB: WhoAreYou failed for connection (%s:%d) , error %d\n",
+                        afs_inet_ntoa_r(haddr, hoststr), ntohs(hport), code));
+                removeAddress_r(host, haddr, hport);
+                host->hostFlags &= ~HWHO_INPROGRESS;
+                h_Unlock_r(host);
+                if (!held)
+                    h_Release_r(host);
+                host = NULL;
+                rx_DestroyConnection(cb_in);
+                return 0;
+           } else {
+               ViceLog(0,
+                       ("CB: WhoAreYou failed for host %x (%s:%d), error %d\n",
+                        host, afs_inet_ntoa_r(host->host, hoststr),
+                        ntohs(host->port), code));
+               host->hostFlags |= VENUSDOWN;
+           }
        }
        if (caps.Capabilities_val
            && (caps.Capabilities_val[0] & CLIENT_CAPABILITY_ERRORTRANS))
@@ -1544,7 +1699,6 @@ h_GetHost_r(struct rx_connection *tcon)
                     host, afs_inet_ntoa_r(host->host, hoststr),
                     ntohs(host->port)));
            h_Lock_r(host);
-           host->hostFlags &= ~HWHO_INPROGRESS;
            h_Unlock_r(host);
            if (!held)
                h_Release_r(host);
@@ -1576,8 +1730,8 @@ h_GetHost_r(struct rx_connection *tcon)
                     host->interface ? uuid2 : "no_uuid"));
 
            /* The host in the cache is not the host for this connection */
+            h_Lock_r(host);
            host->hostFlags |= HOSTDELETED;
-           host->hostFlags &= ~HWHO_INPROGRESS;
            h_Unlock_r(host);
            if (!held)
                h_Release_r(host);
@@ -1590,6 +1744,7 @@ h_GetHost_r(struct rx_connection *tcon)
            int pident = 0;
            cb_conn = host->callback_rxcon;
            rx_GetConnection(cb_conn);
+           host->hostFlags |= HWHO_INPROGRESS;
            H_UNLOCK;
            code =
                RXAFSCB_TellMeAboutYourself(cb_conn, &interf, &caps);
@@ -1599,7 +1754,7 @@ h_GetHost_r(struct rx_connection *tcon)
            cb_conn=NULL;
            H_LOCK;
            if ((code == RXGEN_OPCODE) || 
-               afs_uuid_equal(&interf.uuid, &nulluuid)) {
+               ((code == 0) && (afs_uuid_equal(&interf.uuid, &nulluuid)))) {
                if (!identP)
                    identP =
                        (struct Identity *)malloc(sizeof(struct Identity));
@@ -1768,6 +1923,7 @@ h_GetHost_r(struct rx_connection *tcon)
                                                   &FS_HostUUID);
                    rx_PutConnection(cb_conn);
                    cb_conn=NULL;
+                   H_LOCK;
                    if (code == 0) {
                        ViceLog(25,
                                ("InitCallBackState3 success on host %x (%s:%d)\n",
@@ -3326,6 +3482,16 @@ CheckHost_r(register struct host *host, int held, char *dummy)
     struct rx_connection *cb_conn = NULL;
     int code;
 
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* kill the checkhost lwp ASAP during shutdown */
+    FS_STATE_RDLOCK;
+    if (fs_state.mode == FS_MODE_SHUTDOWN) {
+       FS_STATE_UNLOCK;
+       return H_ENUMERATE_BAIL(held);
+    }
+    FS_STATE_UNLOCK;
+#endif
+
     /* Host is held by h_Enumerate_r */
     for (client = host->FirstClient; client; client = client->next) {
        if (client->refCount == 0 && client->LastCall < clientdeletetime) {