uuid-corrected-duplicate-check-20080501
[openafs.git] / src / viced / host.c
index 889998b..28ba0e1 100644 (file)
@@ -17,6 +17,7 @@ RCSID
 
 #include <stdio.h>
 #include <errno.h>
+#include <string.h>
 #ifdef AFS_NT40_ENV
 #include <fcntl.h>
 #include <winsock2.h>
@@ -26,14 +27,6 @@ RCSID
 #include <netinet/in.h>
 #endif
 
-#ifdef HAVE_STRING_H
-#include <string.h>
-#else
-#ifdef HAVE_STRINGS_H
-#include <strings.h>
-#endif
-#endif
-
 #include <afs/stds.h>
 #include <rx/xdr.h>
 #include <afs/assert.h>
@@ -391,7 +384,10 @@ hpr_GetHostCPS(afs_int32 host, prlist *CPS)
 
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code) 
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     over = 0;
@@ -422,7 +418,10 @@ hpr_NameToId(namelist *names, idlist *ids)
 
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code)
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     for (i = 0; i < names->namelist_len; i++)
@@ -444,7 +443,10 @@ hpr_IdToName(idlist *ids, namelist *names)
     
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code)
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     code = ubik_PR_IDToName(uclient, 0, ids, names);
@@ -465,7 +467,10 @@ hpr_GetCPS(afs_int32 id, prlist *CPS)
 
     if (!uclient) {
         code = hpr_Initialize(&uclient);
-        assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       if (!code)
+           assert(pthread_setspecific(viced_uclient_key, (void *)uclient) == 0);
+       else
+           return code;
     }
 
     over = 0;
@@ -737,11 +742,11 @@ h_gethostcps_r(register struct host *host, register afs_int32 now)
 void
 h_flushhostcps(register afs_uint32 hostaddr, register afs_uint16 hport)
 {
-    register struct host *host;
+    struct host *host;
     int held = 0;
 
     H_LOCK;
-    host = h_Lookup_r(hostaddr, hport, &held);
+    h_Lookup_r(hostaddr, hport, &held, &host);
     if (host) {
        host->hcpsfailed = 1;
        if (!held)
@@ -836,8 +841,8 @@ h_SetupCallbackConn_r(struct host * host)
 /* hostaddr and hport are in network order */
 /* Note: host should be released by caller if 0 == *heldp and non-null */
 /* hostaddr and hport are in network order */
-struct host *
-h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp)
+int
+h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp, struct host **hostp)
 {
     afs_int32 now;
     struct host *host = NULL;
@@ -851,6 +856,11 @@ h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp)
        assert(host);
        if (!(host->hostFlags & HOSTDELETED) && chain->addr == haddr
            && chain->port == hport) {
+           if ((host->hostFlags & HWHO_INPROGRESS) && 
+               h_threadquota(host->lock.num_waiting)) {
+               *hostp = 0;
+               return VBUSY;
+           }
            *heldp = h_Held_r(host);
            if (!*heldp)
                h_Hold_r(host);
@@ -878,8 +888,8 @@ h_Lookup_r(afs_uint32 haddr, afs_uint16 hport, int *heldp)
        }
        host = NULL;
     }
-    return host;
-
+    *hostp = host;
+    return 0;
 }                              /*h_Lookup */
 
 /* Lookup a host given its UUID. */
@@ -1082,7 +1092,10 @@ h_Enumerate(int (*proc) (), char *param)
        if (!(held[count] = h_Held_r(host)))
            h_Hold_r(host);
     }
-    assert(count == hostCount);
+    if (count != hostCount) {
+       ViceLog(0, ("h_Enumerate found %d of %d hosts\n", count, hostCount));
+    }
+    assert(count <= hostCount);
     H_UNLOCK;
     for (i = 0; i < count; i++) {
        held[i] = (*proc) (list[i], held[i], param);
@@ -1144,7 +1157,8 @@ h_AddHostToUuidHashTable_r(struct afsUUID *uuid, struct host *host)
 
     /* don't add the same entry multiple times */
     for (chain = hostUuidHashTable[index]; chain; chain = chain->next) {
-       if (host->interface && afs_uuid_equal(&host->interface->uuid, uuid))
+       if (chain->hostPtr->interface && 
+           afs_uuid_equal(&chain->hostPtr->interface->uuid, uuid))
            return;
     }
 
@@ -1428,19 +1442,25 @@ h_GetHost_r(struct rx_connection *tcon)
     char hoststr[16], hoststr2[16];
     Capabilities caps;
     struct rx_connection *cb_conn = NULL;
+    struct rx_connection *cb_in = NULL;
 
     caps.Capabilities_val = NULL;
 
     haddr = rxr_HostOf(tcon);
     hport = rxr_PortOf(tcon);
   retry:
+    if (cb_in) {
+        rx_DestroyConnection(cb_in);
+        cb_in = NULL;
+    }
     if (caps.Capabilities_val)
        free(caps.Capabilities_val);
     caps.Capabilities_val = NULL;
     caps.Capabilities_len = 0;
 
     code = 0;
-    host = h_Lookup_r(haddr, hport, &held);
+    if (h_Lookup_r(haddr, hport, &held, &host))
+       return 0;
     identP = (struct Identity *)rx_GetSpecific(tcon, rxcon_ident_key);
     if (host && !identP && !(host->Console & 1)) {
        /* This is a new connection, and we already have a host
@@ -1448,11 +1468,13 @@ h_GetHost_r(struct rx_connection *tcon)
         * of the caller matches the identity in the host structure.
         */
        if ((host->hostFlags & HWHO_INPROGRESS) && 
-           h_threadquota(host->lock.num_waiting))
+           h_threadquota(host->lock.num_waiting)) {
+           if (!held)
+               h_Release_r(host);
            return 0;
+       }
        h_Lock_r(host);
        if (!(host->hostFlags & ALTADDR)) {
-           host->hostFlags &= ~HWHO_INPROGRESS;
            /* Another thread is doing initialization */
            h_Unlock_r(host);
            if (!held)
@@ -1463,19 +1485,52 @@ h_GetHost_r(struct rx_connection *tcon)
                     ntohs(host->port)));
            goto retry;
        }
+       host->hostFlags |= HWHO_INPROGRESS;
        host->hostFlags &= ~ALTADDR;
+
+        /* We received a new connection from an IP address/port
+         * that is associated with 'host' but the address/port of
+         * the callback connection does not have to match it.
+         * If there is a match, we can use the existing callback
+         * connection to verify the UUID.  If they do not match
+         * we need to use a new callback connection to verify the
+         * UUID of the incoming caller and perhaps use the old 
+         * callback connection to verify that the old address/port
+         * is still valid.
+         */
+       
        cb_conn = host->callback_rxcon;
        rx_GetConnection(cb_conn);
        H_UNLOCK;
-       code =
-           RXAFSCB_TellMeAboutYourself(cb_conn, &interf, &caps);
-       if (code == RXGEN_OPCODE)
-           code = RXAFSCB_WhoAreYou(cb_conn, &interf);
+        if (haddr == host->host && hport == host->port) {
+            /* The existing callback connection matches the 
+             * incoming connection so just use it.
+             */
+           code =
+               RXAFSCB_TellMeAboutYourself(cb_conn, &interf, &caps);
+           if (code == RXGEN_OPCODE)
+               code = RXAFSCB_WhoAreYou(cb_conn, &interf);
+       } else {
+            /* We do not have a match.  Create a new connection
+             * for the new addr/port and probe it on its own; the old
+             * callback connection is checked later (ProbeUuid) if the UUID matches.
+             */
+           if (!sc)
+                sc = rxnull_NewClientSecurityObject();
+            cb_in = rx_NewConnection(haddr, hport, 1, sc, 0);
+            rx_SetConnDeadTime(cb_in, 50);
+            rx_SetConnHardDeadTime(cb_in, AFS_HARDDEADTIME);
+           
+            code =
+                RXAFSCB_TellMeAboutYourself(cb_in, &interf, &caps);
+           if (code == RXGEN_OPCODE)
+                code = RXAFSCB_WhoAreYou(cb_in, &interf);
+       }
        rx_PutConnection(cb_conn);
        cb_conn=NULL;
        H_LOCK;
        if ((code == RXGEN_OPCODE) || 
-           (afs_uuid_equal(&interf.uuid, &nulluuid))) {
+           ((code == 0) && (afs_uuid_equal(&interf.uuid, &nulluuid)))) {
            identP = (struct Identity *)malloc(sizeof(struct Identity));
            if (!identP) {
                ViceLog(0, ("Failed malloc in h_GetHost_r\n"));
@@ -1483,23 +1538,39 @@ h_GetHost_r(struct rx_connection *tcon)
            }
            identP->valid = 0;
            rx_SetSpecific(tcon, rxcon_ident_key, identP);
-           /* The host on this connection was unable to respond to 
-            * the WhoAreYou. We will treat this as a new connection
-            * from the existing host. The worst that can happen is
-            * that we maintain some extra callback state information */
-           if (host->interface) {
-               ViceLog(0,
-                       ("Host %x (%s:%d) used to support WhoAreYou, deleting.\n",
-                        host, 
-                         afs_inet_ntoa_r(host->host, hoststr),
-                        ntohs(host->port)));
-               host->hostFlags |= HOSTDELETED;
-               host->hostFlags &= ~HWHO_INPROGRESS;
-               h_Unlock_r(host);
+           if (cb_in == NULL) {
+               /* The host on this connection was unable to respond to 
+                * the WhoAreYou. We will treat this as a new connection
+                * from the existing host. The worst that can happen is
+                * that we maintain some extra callback state information */
+               if (host->interface) {
+                   ViceLog(0,
+                           ("Host %x (%s:%d) used to support WhoAreYou, deleting.\n",
+                            host, 
+                            afs_inet_ntoa_r(host->host, hoststr),
+                            ntohs(host->port)));
+                   host->hostFlags |= HOSTDELETED;
+                   host->hostFlags &= ~HWHO_INPROGRESS;
+                   h_Unlock_r(host);
+                   if (!held)
+                       h_Release_r(host);
+                   host = NULL;
+                   goto retry;
+               }
+           } else {
+               /* The incoming connection does not support WhoAreYou but
+                * the original one might have.  Use removeAddress_r() to
+                 * remove this addr/port from the host that was found.
+                 * If there are no more addresses left for the host it 
+                 * will be deleted.  Then we retry.
+                 */
+                removeAddress_r(host, haddr, hport);
+                host->hostFlags &= ~HWHO_INPROGRESS;
+                h_Unlock_r(host);
                if (!held)
-                   h_Release_r(host);
-               host = NULL;
-               goto retry;
+                    h_Release_r(host);
+                host = NULL;
+                goto retry;
            }
        } else if (code == 0) {
            interfValid = 1;
@@ -1516,24 +1587,102 @@ h_GetHost_r(struct rx_connection *tcon)
             * then this is not the same host as before. */
            if (!host->interface
                || !afs_uuid_equal(&interf.uuid, &host->interface->uuid)) {
-                ViceLog(25,
-                         ("Uuid doesn't match host %x (%s:%d).\n",
-                           host, afs_inet_ntoa_r(host->host, hoststr), ntohs(host->port)));
-
-                removeAddress_r(host, host->host, host->port);
+               if (cb_in) {
+                    ViceLog(25,
+                           ("Uuid doesn't match connection (%s:%d).\n",
+                            afs_inet_ntoa_r(haddr, hoststr), ntohs(hport)));
+                   
+                    removeAddress_r(host, haddr, hport);
+               } else {
+                   ViceLog(25,
+                           ("Uuid doesn't match host %x (%s:%d).\n",
+                            host, afs_inet_ntoa_r(host->host, hoststr), ntohs(host->port)));
+                   
+                   removeAddress_r(host, host->host, host->port);
+               }
                host->hostFlags &= ~HWHO_INPROGRESS;
                h_Unlock_r(host);
                if (!held)
                    h_Release_r(host);
                host = NULL;
                goto retry;
+           } else if (cb_in) {
+               /* the UUID matched the client at the incoming addr/port 
+                 * but this is not the address of the active callback 
+                 * connection.  Try that connection and see if the client
+                 * is still there and if the reported UUID is the same.
+                 */
+                int code2;
+                afsUUID uuid = host->interface->uuid;
+                cb_conn = host->callback_rxcon;
+                rx_GetConnection(cb_conn);
+                rx_SetConnDeadTime(cb_conn, 2);
+                rx_SetConnHardDeadTime(cb_conn, AFS_HARDDEADTIME);
+                H_UNLOCK;
+                code2 = RXAFSCB_ProbeUuid(cb_conn, &uuid);
+                H_LOCK;
+                rx_SetConnDeadTime(cb_conn, 50);
+                rx_SetConnHardDeadTime(cb_conn, AFS_HARDDEADTIME);
+                rx_PutConnection(cb_conn);
+                cb_conn=NULL;
+                if (code2) {
+                    /* The primary address is either not responding or
+                     * is not the client we are looking for.  Need to
+                     * remove the primary address, swap in the new 
+                     * callback connection, and destroy the old one.
+                     */
+                    struct rx_connection *rxconn;
+                    ViceLog(0,("CB: ProbeUuid for host %x (%s:%d) failed %d\n",
+                              host, 
+                              afs_inet_ntoa_r(host->host, hoststr),
+                              ntohs(host->port),code2));
+                   
+                    removeInterfaceAddr_r(host, host->host, host->port);
+                    addInterfaceAddr_r(host, haddr, hport);
+                    host->host = haddr;
+                    host->port = hport;
+                    rxconn = host->callback_rxcon;
+                    host->callback_rxcon = cb_in;
+                    cb_in = NULL;
+                   
+                    if (rxconn) {
+                        struct client *client;
+                        /*
+                         * If rx_DestroyConnection calls h_FreeConnection we will
+                        * deadlock on the host_glock_mutex. Work around the problem
+                         * by unhooking the client from the connection before
+                         * destroying the connection.
+                         */
+                        client = rx_GetSpecific(rxconn, rxcon_client_key);
+                        rx_SetSpecific(rxconn, rxcon_client_key, (void *)0);
+                        rx_DestroyConnection(rxconn);
+                   }
+               }
            }
        } else {
-           ViceLog(0,
-                   ("CB: WhoAreYou failed for host %x (%s:%d), error %d\n",
-                     host, afs_inet_ntoa_r(host->host, hoststr),
-                    ntohs(host->port), code));
-           host->hostFlags |= VENUSDOWN;
+            if (cb_in) {
+                /* A callback to the incoming connection address is failing.  
+                 * Assume that the addr/port is no longer associated with the host
+                 * returned by h_Lookup_r.
+                 */
+                ViceLog(0,
+                       ("CB: WhoAreYou failed for connection (%s:%d) , error %d\n",
+                        afs_inet_ntoa_r(haddr, hoststr), ntohs(hport), code));
+                removeAddress_r(host, haddr, hport);
+                host->hostFlags &= ~HWHO_INPROGRESS;
+                h_Unlock_r(host);
+                if (!held)
+                    h_Release_r(host);
+                host = NULL;
+                rx_DestroyConnection(cb_in);
+                return 0;
+           } else {
+               ViceLog(0,
+                       ("CB: WhoAreYou failed for host %x (%s:%d), error %d\n",
+                        host, afs_inet_ntoa_r(host->host, hoststr),
+                        ntohs(host->port), code));
+               host->hostFlags |= VENUSDOWN;
+           }
        }
        if (caps.Capabilities_val
            && (caps.Capabilities_val[0] & CLIENT_CAPABILITY_ERRORTRANS))
@@ -1551,7 +1700,6 @@ h_GetHost_r(struct rx_connection *tcon)
                     host, afs_inet_ntoa_r(host->host, hoststr),
                     ntohs(host->port)));
            h_Lock_r(host);
-           host->hostFlags &= ~HWHO_INPROGRESS;
            h_Unlock_r(host);
            if (!held)
                h_Release_r(host);
@@ -1583,8 +1731,8 @@ h_GetHost_r(struct rx_connection *tcon)
                     host->interface ? uuid2 : "no_uuid"));
 
            /* The host in the cache is not the host for this connection */
+            h_Lock_r(host);
            host->hostFlags |= HOSTDELETED;
-           host->hostFlags &= ~HWHO_INPROGRESS;
            h_Unlock_r(host);
            if (!held)
                h_Release_r(host);
@@ -1597,6 +1745,7 @@ h_GetHost_r(struct rx_connection *tcon)
            int pident = 0;
            cb_conn = host->callback_rxcon;
            rx_GetConnection(cb_conn);
+           host->hostFlags |= HWHO_INPROGRESS;
            H_UNLOCK;
            code =
                RXAFSCB_TellMeAboutYourself(cb_conn, &interf, &caps);
@@ -1606,7 +1755,7 @@ h_GetHost_r(struct rx_connection *tcon)
            cb_conn=NULL;
            H_LOCK;
            if ((code == RXGEN_OPCODE) || 
-               afs_uuid_equal(&interf.uuid, &nulluuid)) {
+               ((code == 0) && (afs_uuid_equal(&interf.uuid, &nulluuid)))) {
                if (!identP)
                    identP =
                        (struct Identity *)malloc(sizeof(struct Identity));
@@ -1775,6 +1924,7 @@ h_GetHost_r(struct rx_connection *tcon)
                                                   &FS_HostUUID);
                    rx_PutConnection(cb_conn);
                    cb_conn=NULL;
+                   H_LOCK;
                    if (code == 0) {
                        ViceLog(25,
                                ("InitCallBackState3 success on host %x (%s:%d)\n",
@@ -3326,6 +3476,107 @@ CheckHost(register struct host *host, int held)
 
 }                              /*CheckHost */
 
+int                            /* per-host callback that h_CheckHosts hands to h_Enumerate_r; returns 'held' */
+CheckHost_r(register struct host *host, int held, char *dummy)  /* 'dummy' is never read in this body */
+{
+    register struct client *client;
+    struct rx_connection *cb_conn = NULL;      /* referenced copy of host->callback_rxcon used for the RPCs below */
+    int code;                  /* result of the most recent callback RPC */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* kill the checkhost lwp ASAP during shutdown: return H_ENUMERATE_BAIL(held) instead of 'held' */
+    FS_STATE_RDLOCK;
+    if (fs_state.mode == FS_MODE_SHUTDOWN) {
+       FS_STATE_UNLOCK;
+       return H_ENUMERATE_BAIL(held);
+    }
+    FS_STATE_UNLOCK;
+#endif
+
+    /* Host is held by h_Enumerate_r.  NOTE(review): appears to expect H_LOCK held on entry (h_CheckHosts takes it) -- confirm */
+    for (client = host->FirstClient; client; client = client->next) {
+       if (client->refCount == 0 && client->LastCall < clientdeletetime) {     /* unreferenced and idle past the delete cutoff */
+           client->deleted = 1;
+           host->hostFlags |= CLIENTDELETED;
+       }
+    }
+    if (host->LastCall < checktime) {  /* host has made no call since the check cutoff */
+       h_Lock_r(host);
+       if (!(host->hostFlags & HOSTDELETED)) {
+           cb_conn = host->callback_rxcon;
+           rx_GetConnection(cb_conn);  /* paired with rx_PutConnection() near the end of this branch */
+           if (host->LastCall < clientdeletetime) {    /* idle past the delete cutoff: delete the host */
+               host->hostFlags |= HOSTDELETED;
+               if (!(host->hostFlags & VENUSDOWN)) {
+                   host->hostFlags &= ~ALTADDR;        /* alternate address invalid */
+                   if (host->interface) {
+                       H_UNLOCK;       /* H_LOCK is dropped for the duration of the RPC */
+                       code =
+                           RXAFSCB_InitCallBackState3(cb_conn,
+                                                      &FS_HostUUID);
+                       H_LOCK;
+                   } else {
+                       H_UNLOCK;       /* H_LOCK is dropped for the duration of the RPC */
+                       code =
+                           RXAFSCB_InitCallBackState(cb_conn);
+                       H_LOCK;
+                   }
+                   host->hostFlags |= ALTADDR; /* alternate addresses valid */
+                   if (code) { /* the InitCallBackState RPC failed: log it and mark the client down */
+                       char hoststr[16];
+                       (void)afs_inet_ntoa_r(host->host, hoststr);
+                       ViceLog(0,
+                               ("CB: RCallBackConnectBack (host.c) failed for host %s:%d\n",
+                                hoststr, ntohs(host->port)));
+                       host->hostFlags |= VENUSDOWN;
+                   }
+                   /* Note:  it's safe to delete hosts even if they have call
+                    * back state, because break delayed callbacks (called when a
+                    * message is received from the workstation) will always send a
+                    * break all call backs to the workstation if there is no
+                    * callback.
+                    */
+               }
+           } else {    /* idle past the check cutoff only: probe the client instead of deleting it */
+               if (!(host->hostFlags & VENUSDOWN) && host->cblist) {   /* only probe hosts not already down that have a cblist */
+                   char hoststr[16];
+                   (void)afs_inet_ntoa_r(host->host, hoststr);
+                   if (host->interface) {
+                       afsUUID uuid = host->interface->uuid;   /* local copy taken before H_LOCK is dropped */
+                       H_UNLOCK;
+                       code = RXAFSCB_ProbeUuid(cb_conn, &uuid);
+                       H_LOCK;
+                       if (code) {
+                           if (MultiProbeAlternateAddress_r(host)) {   /* non-zero is treated as every interface failing */
+                               ViceLog(0,("CheckHost_r: Probing all interfaces of host %s:%d failed, code %d\n",
+                                           hoststr, ntohs(host->port), code));
+                               host->hostFlags |= VENUSDOWN;
+                           }
+                       }
+                   } else {    /* no interface record: use the plain Probe RPC */
+                       H_UNLOCK;
+                       code = RXAFSCB_Probe(cb_conn);
+                       H_LOCK;
+                       if (code) {
+                           ViceLog(0,
+                                   ("CheckHost_r: Probe failed for host %s:%d, code %d\n", 
+                                    hoststr, ntohs(host->port), code));
+                           host->hostFlags |= VENUSDOWN;
+                       }
+                   }
+               }
+           }
+           H_UNLOCK;   /* the connection reference is released without H_LOCK held */
+           rx_PutConnection(cb_conn);
+           cb_conn=NULL;
+           H_LOCK;
+       }
+       h_Unlock_r(host);
+    }
+    return held;       /* the hold state passed in is returned unchanged */
+
+}                              /*CheckHost_r */
+
 
 /*
  * Set VenusDown for any hosts that have not had a call in 15 minutes and
@@ -3348,8 +3599,10 @@ h_CheckHosts(void)
      */
     checktime = now - 15 * 60;
     clientdeletetime = now - 120 * 60; /* 2 hours ago */
-    h_Enumerate(CheckHost, NULL);
-
+    
+    H_LOCK;
+    h_Enumerate_r(CheckHost_r, hostList, NULL);
+    H_UNLOCK;
 }                              /*h_CheckHosts */
 
 /*