ubik: Rename flags to dbFlags
[openafs.git] / src / ubik / beacon.c
index eb2d526..305a118 100644 (file)
 
 #include <roken.h>
 
+#include <afs/opr.h>
+#ifdef AFS_PTHREAD_ENV
+# include <opr/lock.h>
+#else
+# include <opr/lockstub.h>
+#endif
+
 #include <lock.h>
-#include <rx/xdr.h>
 #include <rx/rx.h>
 #include <rx/rxkad.h>
 #include <rx/rx_multi.h>
 #include <afs/cellconfig.h>
-#ifndef AFS_NT40_ENV
 #include <afs/afsutil.h>
-#include <afs/netutils.h>
-#endif
 
 #define UBIK_INTERNALS
 #include "ubik.h"
@@ -96,7 +99,8 @@ ubeacon_Debug(struct ubik_debug *aparm)
 }
 
 static int
-amSyncSite(void) {
+amSyncSite(void)
+{
     afs_int32 now;
     afs_int32 rcode;
 
@@ -111,15 +115,16 @@ amSyncSite(void) {
        now = FT_ApproxTime();
        if (beacon_globals.syncSiteUntil <= now) {      /* if my votes have expired, say so */
            if (beacon_globals.ubik_amSyncSite)
-               ubik_dprint("Ubik: I am no longer the sync site\n");
+               ViceLog(0, ("Ubik: I am no longer the sync site - my votes expired\n"));
            beacon_globals.ubik_amSyncSite = 0;
+           beacon_globals.ubik_syncSiteAdvertised = 0;
            rcode = 0;
        } else {
            rcode = 1;          /* otherwise still have the required votes */
        }
     }
     UBIK_BEACON_UNLOCK;
-    ubik_dprint("beacon: amSyncSite is %d\n", rcode);
+    ViceLog(5, ("beacon: amSyncSite is %d\n", rcode));
     return rcode;
 }
 
@@ -153,6 +158,32 @@ ubeacon_AmSyncSite(void)
 }
 
 /*!
+ * \brief Determine whether at least a quorum is aware that we have a sync-site.
+ *
+ * Called from higher-level modules.
+ *
+ * There is a gap between the time when a new sync-site is elected and the time
+ * when the remotes become aware of it. Any write transaction attempted during
+ * this gap will fail, which forces a re-election that might be time
+ * consuming. This procedure determines whether the remotes (a quorum) are aware
+ * that we have a sync-site.
+ *
+ * \return 1 if remotes are aware we have a sync-site
+ * \return 0 if remotes are not aware we have a sync-site
+ */
+int
+ubeacon_SyncSiteAdvertised(void)
+{
+    afs_int32 advertised;
+
+    /* Sample the advertised flag while holding the beacon lock. */
+    UBIK_BEACON_LOCK;
+    advertised = beacon_globals.ubik_syncSiteAdvertised;
+    UBIK_BEACON_UNLOCK;
+    return advertised;
+}
+
+/*!
  * \see ubeacon_InitServerListCommon()
  */
 int
@@ -293,8 +324,7 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
        for (i = 0; i < info->numServers; i++) {
            if (i == me)
                continue;
-           ts = (struct ubik_server *)malloc(sizeof(struct ubik_server));
-           memset(ts, 0, sizeof(struct ubik_server));
+           ts = calloc(1, sizeof(struct ubik_server));
            ts->next = ubik_servers;
            ubik_servers = ts;
            ts->addr[0] = info->hostAddr[i].sin_addr.s_addr;
@@ -326,8 +356,7 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
        while ((servAddr = *aservers++)) {
            if (i >= MAXSERVERS)
                return UNHOSTS; /* too many hosts */
-           ts = (struct ubik_server *)malloc(sizeof(struct ubik_server));
-           memset(ts, 0, sizeof(struct ubik_server));
+           ts = calloc(1, sizeof(struct ubik_server));
            ts->next = ubik_servers;
            ubik_servers = ts;
            ts->addr[0] = servAddr;     /* primary address in  net byte order */
@@ -367,6 +396,12 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
        if (nServers == 1 && !amIClone) {
            beacon_globals.ubik_amSyncSite = 1; /* let's start as sync site */
            beacon_globals.syncSiteUntil = 0x7fffffff;  /* and be it quite a while */
+           beacon_globals.ubik_syncSiteAdvertised = 1;
+           DBHOLD(ubik_dbase);
+           UBIK_VERSION_LOCK;
+           version_globals.ubik_epochTime = FT_ApproxTime();
+           UBIK_VERSION_UNLOCK;
+           DBRELE(ubik_dbase);
        }
     } else {
        if (nServers == 1)      /* special case 1 server */
@@ -374,10 +409,17 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
     }
 
     if (ubik_singleServer) {
-       if (!beacon_globals.ubik_amSyncSite)
-           ubik_dprint("Ubik: I am the sync site - 1 server\n");
+       if (!beacon_globals.ubik_amSyncSite) {
+           ViceLog(0, ("Ubik: I am the sync site - 1 server\n"));
+           DBHOLD(ubik_dbase);
+           UBIK_VERSION_LOCK;
+           version_globals.ubik_epochTime = FT_ApproxTime();
+           UBIK_VERSION_UNLOCK;
+           DBRELE(ubik_dbase);
+       }
        beacon_globals.ubik_amSyncSite = 1;
        beacon_globals.syncSiteUntil = 0x7fffffff;      /* quite a while */
+       beacon_globals.ubik_syncSiteAdvertised = 1;
     }
     return 0;
 }
@@ -398,10 +440,13 @@ ubeacon_Interact(void *dummy)
     afs_int32 i;
     struct ubik_server *ts;
     afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
+    int becameSyncSite;
     struct ubik_tid ttid;
     struct ubik_version tversion;
     afs_int32 startTime;
 
+    opr_threadname_set("beacon");
+
     /* loop forever getting votes */
     lastWakeupTime = 0;                /* keep track of time we last started a vote collection */
     while (1) {
@@ -415,12 +460,11 @@ ubeacon_Interact(void *dummy)
            tt.tv_sec = temp;
            tt.tv_usec = 0;
 #ifdef AFS_PTHREAD_ENV
-           code = select(0, 0, 0, 0, &tt);
+           select(0, 0, 0, 0, &tt);
 #else
-           code = IOMGR_Select(0, 0, 0, 0, &tt);
+           IOMGR_Select(0, 0, 0, 0, &tt);
 #endif
-       } else
-           code = 0;
+       }
 
        lastWakeupTime = FT_ApproxTime();       /* started a new collection phase */
 
@@ -454,7 +498,7 @@ ubeacon_Interact(void *dummy)
 
        UBIK_VERSION_LOCK;
        ttid.epoch = version_globals.ubik_epochTime;
-       if (ubik_dbase->flags & DBWRITING) {
+       if (ubik_dbase->dbFlags & DBWRITING) {
            /*
             * if a write is in progress, we have to send the writeTidCounter
             * which holds the tid counter of the write transaction , and not
@@ -492,13 +536,35 @@ ubeacon_Interact(void *dummy)
                UBIK_BEACON_LOCK;
                ts->lastBeaconSent = temp;
                code = multi_error;
+
+               if (code > 0 && ((code < temp && code < temp - 3600) ||
+                                (code > temp && code > temp + 3600))) {
+                   /* if we reached here, supposedly the remote host voted
+                    * for us based on a computation from over an hour ago in
+                    * the past, or over an hour in the future. this is
+                    * unlikely; what actually probably happened is that the
+                    * call generated some error and was aborted. this can
+                    * happen due to errors with the rx security class in play
+                    * (rxkad, rxgk, etc). treat the host as if we got a
+                    * timeout, since this is not a valid vote. */
+                   ViceLog(0, ("assuming distant vote time %d from %s is an error; marking host down\n",
+                              (int)code, afs_inet_ntoa_r(ts->addr[0], hoststr)));
+                   code = -1;
+               }
+               if (code > 0 && rx_ConnError(connections[multi_i])) {
+                   ViceLog(0, ("assuming vote from %s is invalid due to conn error %d; marking host down\n",
+                              afs_inet_ntoa_r(ts->addr[0], hoststr),
+                              (int)rx_ConnError(connections[multi_i])));
+                   code = -1;
+               }
+
                /* note that the vote time (the return code) represents the time
                 * the vote was computed, *not* the time the vote expires.  We compute
                 * the latter down below if we got enough votes to go with */
                if (code > 0) {
                    if ((code & ~0xff) == ERROR_TABLE_BASE_RXK) {
-                       ubik_dprint("token error %d from host %s\n",
-                                   code, afs_inet_ntoa_r(ts->addr[0], hoststr));
+                       ViceLog(0, ("Server %s is marked down due to token error %d\n",
+                                   afs_inet_ntoa_r(ts->addr[0], hoststr), code));
                        ts->up = 0;
                        ts->beaconSinceDown = 0;
                        urecovery_LostServer(ts);
@@ -513,21 +579,21 @@ ubeacon_Interact(void *dummy)
                            yesVotes++; /* the extra epsilon */
                        ts->up = 1;     /* server is up (not really necessary: recovery does this for real) */
                        ts->beaconSinceDown = 1;
-                       ubik_dprint("yes vote from host %s\n",
-                                   afs_inet_ntoa_r(ts->addr[0], hoststr));
+                       ViceLog(5, ("yes vote from host %s\n",
+                                   afs_inet_ntoa_r(ts->addr[0], hoststr)));
                    }
                } else if (code == 0) {
                    ts->lastVoteTime = temp;
                    ts->lastVote = 0;
                    ts->beaconSinceDown = 1;
-                   ubik_dprint("no vote from %s\n",
-                               afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   ViceLog(5, ("no vote from %s\n",
+                               afs_inet_ntoa_r(ts->addr[0], hoststr)));
                } else if (code < 0) {
                    ts->up = 0;
                    ts->beaconSinceDown = 0;
                    urecovery_LostServer(ts);
-                   ubik_dprint("time out from %s\n",
-                               afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   ViceLog(0, ("Server %s is marked down due to VOTE_Beacon time out (%d)\n",
+                               afs_inet_ntoa_r(ts->addr[0], hoststr), code));
                }
                UBIK_BEACON_UNLOCK;
            }
@@ -548,31 +614,54 @@ ubeacon_Interact(void *dummy)
 
        /* now decide if we have enough votes to become sync site.
         * Note that we can still get enough votes even if we didn't for ourself. */
+       becameSyncSite = 0;
        if (yesVotes > nServers) {      /* yesVotes is bumped by 2 or 3 for each site */
            UBIK_BEACON_LOCK;
-           if (!beacon_globals.ubik_amSyncSite)
-               ubik_dprint("Ubik: I am the sync site\n");
-           beacon_globals.ubik_amSyncSite = 1;
-           beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
-#ifndef AFS_PTHREAD_ENV
-               /* I did not find a corresponding LWP_WaitProcess(&ubik_amSyncSite) --
-                  this may be a spurious signal call -- sjenkins */
-               LWP_NoYieldSignal(&beacon_globals.ubik_amSyncSite);
-#endif
+           if (!beacon_globals.ubik_amSyncSite) {
+               ViceLog(0, ("Ubik: I am the sync site\n"));
+               /* Defer actually changing any variables until we can take the
+                * DB lock (which is before the beacon lock in the lock order). */
+               becameSyncSite = 1;
+           } else {
+               beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
+               /* at this point, we have the guarantee that at least quorum
+                * received a beacon packet informing we have a sync-site. */
+               beacon_globals.ubik_syncSiteAdvertised = 1;
+           }
            UBIK_BEACON_UNLOCK;
        } else {
            UBIK_BEACON_LOCK;
            if (beacon_globals.ubik_amSyncSite)
-               ubik_dprint("Ubik: I am no longer the sync site\n");
+               ViceLog(0, ("Ubik: I am no longer the sync site - I lost the election\n"));
            beacon_globals.ubik_amSyncSite = 0;
+           beacon_globals.ubik_syncSiteAdvertised = 0;
            UBIK_BEACON_UNLOCK;
            DBHOLD(ubik_dbase);
            urecovery_ResetState();     /* tell recovery we're no longer the sync site */
            DBRELE(ubik_dbase);
        }
+       /* We cannot take the DB lock around the entire preceding conditional,
+        * because if we are currently the sync site and this election serves
+        * to confirm that status, the DB lock may already be held for a long-running
+        * write transaction.  In such a case, attempting to acquire the DB lock
+        * would cause the beacon thread to block and disrupt election processing.
+        * However, if we are transitioning from not-sync-site to sync-site, there
+        * can be no outstanding transactions and acquiring the DB lock should be
+        * safe without extended blocking. */
+       if (becameSyncSite) {
+           DBHOLD(ubik_dbase);
+           UBIK_BEACON_LOCK;
+           UBIK_VERSION_LOCK;
+           version_globals.ubik_epochTime = FT_ApproxTime();
+           beacon_globals.ubik_amSyncSite = 1;
+           beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
+           UBIK_VERSION_UNLOCK;
+           UBIK_BEACON_UNLOCK;
+           DBRELE(ubik_dbase);
+       }
 
     }                          /* while loop */
-    return NULL;
+    AFS_UNREACHED(return(NULL));
 }
 
 /*!
@@ -615,14 +704,14 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
         * host as returned by rx_getAllAddr (in NBO)
         */
        char reason[1024];
-       count =
-           parseNetFiles(myAddr, NULL, NULL, UBIK_MAX_INTERFACE_ADDR, reason,
-                         AFSDIR_SERVER_NETINFO_FILEPATH,
-                         AFSDIR_SERVER_NETRESTRICT_FILEPATH);
+       count = afsconf_ParseNetFiles(myAddr, NULL, NULL,
+                                     UBIK_MAX_INTERFACE_ADDR, reason,
+                                     AFSDIR_SERVER_NETINFO_FILEPATH,
+                                     AFSDIR_SERVER_NETRESTRICT_FILEPATH);
        if (count < 0) {
-           ubik_print("ubik: Can't register any valid addresses:%s\n",
-                      reason);
-           ubik_print("Aborting..\n");
+           ViceLog(0, ("ubik: Can't register any valid addresses:%s\n",
+                      reason));
+           ViceLog(0, ("Aborting..\n"));
            return UBADHOST;
        }
        usednetfiles++;
@@ -632,7 +721,7 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
     }
 
     if (count <= 0) {          /* no address found */
-       ubik_print("ubik: No network addresses found, aborting..");
+       ViceLog(0, ("ubik: No network addresses found, aborting..\n"));
        return UBADHOST;
     }
 
@@ -645,8 +734,8 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
     }
 
     if (!found) {
-       ubik_print("ubik: primary address %s does not exist\n",
-                  afs_inet_ntoa_r(*ame, hoststr));
+       ViceLog(0, ("ubik: primary address %s does not exist\n",
+                  afs_inet_ntoa_r(*ame, hoststr)));
        /* if we had the result of rx_getAllAddr already, avoid subverting
         * the "is gethostbyname(gethostname()) us" check. If we're
         * using NetInfo/NetRestrict, we assume they have enough clue
@@ -656,7 +745,7 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
            *ame = myAddr[0];
            tcount = rx_getAllAddr(myAddr2, UBIK_MAX_INTERFACE_ADDR);
            if (tcount <= 0) {  /* no address found */
-               ubik_print("ubik: No network addresses found, aborting..");
+               ViceLog(0, ("ubik: No network addresses found, aborting..\n"));
                return UBADHOST;
            }
 
@@ -691,7 +780,7 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
        }
     }
     if (found)
-       ubik_print("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr));
+       ViceLog(0, ("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr)));
 
     if (!info) {
        /* get rid of servers which were purged because all
@@ -773,9 +862,9 @@ ubeacon_updateUbikNetworkAddress(afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR])
                if (ts->addr[0] != htonl(outAddr.hostAddr[0])) {
                    code = UBADHOST;
                    strcpy(buffer, afs_inet_ntoa_r(ts->addr[0], hoststr));
-                   ubik_print("ubik:Two primary addresses for same server \
+                   ViceLog(0, ("ubik:Two primary addresses for same server \
                     %s %s\n", buffer,
-                   afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr));
+                   afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr)));
                } else {
                    for (j = 1; j < UBIK_MAX_INTERFACE_ADDR; j++)
                        ts->addr[j] = htonl(outAddr.hostAddr[j]);
@@ -783,22 +872,23 @@ ubeacon_updateUbikNetworkAddress(afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR])
                UBIK_ADDR_UNLOCK;
            } else if (multi_error == RXGEN_OPCODE) {   /* pre 3.5 remote server */
                UBIK_ADDR_LOCK;
-               ubik_print
-                   ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
-                    afs_inet_ntoa_r(ts->addr[0], hoststr));
+               ViceLog(0, ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
+                    afs_inet_ntoa_r(ts->addr[0], hoststr)));
                UBIK_ADDR_UNLOCK;
            } else if (multi_error == UBADHOST) {
                code = UBADHOST;        /* remote CellServDB inconsistency */
-               ubik_print("Inconsistent Cell Info on server: ");
+               ViceLog(0, ("Inconsistent Cell Info on server:\n"));
                UBIK_ADDR_LOCK;
                for (j = 0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
-                   ubik_print("%s ", afs_inet_ntoa_r(ts->addr[j], hoststr));
+                   ViceLog(0, ("... %s\n", afs_inet_ntoa_r(ts->addr[j], hoststr)));
                UBIK_ADDR_UNLOCK;
-               ubik_print("\n");
            } else {
                UBIK_BEACON_LOCK;
                ts->up = 0;     /* mark the remote server as down */
                UBIK_BEACON_UNLOCK;
+               ViceLog(0, ("Server %s is marked down due to DISK_UpdateInterfaceAddr code %d\n",
+                           afs_inet_ntoa_r(ts->addr[0], hoststr), multi_error));
+
            }
        }
        multi_End;