ubik: Rename flags to dbFlags
[openafs.git] / src / ubik / beacon.c
index bf32432..305a118 100644 (file)
@@ -24,9 +24,7 @@
 #include <rx/rxkad.h>
 #include <rx/rx_multi.h>
 #include <afs/cellconfig.h>
-#ifndef AFS_NT40_ENV
 #include <afs/afsutil.h>
-#endif
 
 #define UBIK_INTERNALS
 #include "ubik.h"
@@ -117,7 +115,7 @@ amSyncSite(void)
        now = FT_ApproxTime();
        if (beacon_globals.syncSiteUntil <= now) {      /* if my votes have expired, say so */
            if (beacon_globals.ubik_amSyncSite)
-               ubik_dprint("Ubik: I am no longer the sync site\n");
+               ViceLog(0, ("Ubik: I am no longer the sync site - my votes expired\n"));
            beacon_globals.ubik_amSyncSite = 0;
            beacon_globals.ubik_syncSiteAdvertised = 0;
            rcode = 0;
@@ -126,7 +124,7 @@ amSyncSite(void)
        }
     }
     UBIK_BEACON_UNLOCK;
-    ubik_dprint("beacon: amSyncSite is %d\n", rcode);
+    ViceLog(5, ("beacon: amSyncSite is %d\n", rcode));
     return rcode;
 }
 
@@ -399,6 +397,11 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
            beacon_globals.ubik_amSyncSite = 1; /* let's start as sync site */
            beacon_globals.syncSiteUntil = 0x7fffffff;  /* and be it quite a while */
            beacon_globals.ubik_syncSiteAdvertised = 1;
+           DBHOLD(ubik_dbase);
+           UBIK_VERSION_LOCK;
+           version_globals.ubik_epochTime = FT_ApproxTime();
+           UBIK_VERSION_UNLOCK;
+           DBRELE(ubik_dbase);
        }
     } else {
        if (nServers == 1)      /* special case 1 server */
@@ -406,8 +409,14 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
     }
 
     if (ubik_singleServer) {
-       if (!beacon_globals.ubik_amSyncSite)
-           ubik_dprint("Ubik: I am the sync site - 1 server\n");
+       if (!beacon_globals.ubik_amSyncSite) {
+           ViceLog(0, ("Ubik: I am the sync site - 1 server\n"));
+           DBHOLD(ubik_dbase);
+           UBIK_VERSION_LOCK;
+           version_globals.ubik_epochTime = FT_ApproxTime();
+           UBIK_VERSION_UNLOCK;
+           DBRELE(ubik_dbase);
+       }
        beacon_globals.ubik_amSyncSite = 1;
        beacon_globals.syncSiteUntil = 0x7fffffff;      /* quite a while */
        beacon_globals.ubik_syncSiteAdvertised = 1;
@@ -431,11 +440,12 @@ ubeacon_Interact(void *dummy)
     afs_int32 i;
     struct ubik_server *ts;
     afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
+    int becameSyncSite;
     struct ubik_tid ttid;
     struct ubik_version tversion;
     afs_int32 startTime;
 
-    afs_pthread_setname_self("beacon");
+    opr_threadname_set("beacon");
 
     /* loop forever getting votes */
     lastWakeupTime = 0;                /* keep track of time we last started a vote collection */
@@ -488,7 +498,7 @@ ubeacon_Interact(void *dummy)
 
        UBIK_VERSION_LOCK;
        ttid.epoch = version_globals.ubik_epochTime;
-       if (ubik_dbase->flags & DBWRITING) {
+       if (ubik_dbase->dbFlags & DBWRITING) {
            /*
             * if a write is in progress, we have to send the writeTidCounter
             * which holds the tid counter of the write transaction , and not
@@ -537,14 +547,14 @@ ubeacon_Interact(void *dummy)
                     * happen due to errors with the rx security class in play
                     * (rxkad, rxgk, etc). treat the host as if we got a
                     * timeout, since this is not a valid vote. */
-                   ubik_print("assuming distant vote time %d from %s is an error; marking host down\n",
-                              (int)code, afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   ViceLog(0, ("assuming distant vote time %d from %s is an error; marking host down\n",
+                              (int)code, afs_inet_ntoa_r(ts->addr[0], hoststr)));
                    code = -1;
                }
                if (code > 0 && rx_ConnError(connections[multi_i])) {
-                   ubik_print("assuming vote from %s is invalid due to conn error %d; marking host down\n",
+                   ViceLog(0, ("assuming vote from %s is invalid due to conn error %d; marking host down\n",
                               afs_inet_ntoa_r(ts->addr[0], hoststr),
-                              (int)rx_ConnError(connections[multi_i]));
+                              (int)rx_ConnError(connections[multi_i])));
                    code = -1;
                }
 
@@ -553,8 +563,8 @@ ubeacon_Interact(void *dummy)
                 * the latter down below if we got enough votes to go with */
                if (code > 0) {
                    if ((code & ~0xff) == ERROR_TABLE_BASE_RXK) {
-                       ubik_dprint("token error %d from host %s\n",
-                                   code, afs_inet_ntoa_r(ts->addr[0], hoststr));
+                       ViceLog(0, ("Server %s is marked down due to token error %d\n",
+                                   afs_inet_ntoa_r(ts->addr[0], hoststr), code));
                        ts->up = 0;
                        ts->beaconSinceDown = 0;
                        urecovery_LostServer(ts);
@@ -569,21 +579,21 @@ ubeacon_Interact(void *dummy)
                            yesVotes++; /* the extra epsilon */
                        ts->up = 1;     /* server is up (not really necessary: recovery does this for real) */
                        ts->beaconSinceDown = 1;
-                       ubik_dprint("yes vote from host %s\n",
-                                   afs_inet_ntoa_r(ts->addr[0], hoststr));
+                       ViceLog(5, ("yes vote from host %s\n",
+                                   afs_inet_ntoa_r(ts->addr[0], hoststr)));
                    }
                } else if (code == 0) {
                    ts->lastVoteTime = temp;
                    ts->lastVote = 0;
                    ts->beaconSinceDown = 1;
-                   ubik_dprint("no vote from %s\n",
-                               afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   ViceLog(5, ("no vote from %s\n",
+                               afs_inet_ntoa_r(ts->addr[0], hoststr)));
                } else if (code < 0) {
                    ts->up = 0;
                    ts->beaconSinceDown = 0;
                    urecovery_LostServer(ts);
-                   ubik_dprint("time out from %s\n",
-                               afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   ViceLog(0, ("Server %s is marked down due to VOTE_Beacon time out (%d)\n",
+                               afs_inet_ntoa_r(ts->addr[0], hoststr), code));
                }
                UBIK_BEACON_UNLOCK;
            }
@@ -604,27 +614,25 @@ ubeacon_Interact(void *dummy)
 
        /* now decide if we have enough votes to become sync site.
         * Note that we can still get enough votes even if we didn't for ourself. */
+       becameSyncSite = 0;
        if (yesVotes > nServers) {      /* yesVotes is bumped by 2 or 3 for each site */
            UBIK_BEACON_LOCK;
-           if (!beacon_globals.ubik_amSyncSite)
-               ubik_dprint("Ubik: I am the sync site\n");
-           else {
+           if (!beacon_globals.ubik_amSyncSite) {
+               ViceLog(0, ("Ubik: I am the sync site\n"));
+               /* Defer actually changing any variables until we can take the
+                * DB lock (which is before the beacon lock in the lock order). */
+               becameSyncSite = 1;
+           } else {
+               beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
                /* at this point, we have the guarantee that at least quorum
                 * received a beacon packet informing we have a sync-site. */
                beacon_globals.ubik_syncSiteAdvertised = 1;
            }
-           beacon_globals.ubik_amSyncSite = 1;
-           beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
-#ifndef AFS_PTHREAD_ENV
-               /* I did not find a corresponding LWP_WaitProcess(&ubik_amSyncSite) --
-                  this may be a spurious signal call -- sjenkins */
-               LWP_NoYieldSignal(&beacon_globals.ubik_amSyncSite);
-#endif
            UBIK_BEACON_UNLOCK;
        } else {
            UBIK_BEACON_LOCK;
            if (beacon_globals.ubik_amSyncSite)
-               ubik_dprint("Ubik: I am no longer the sync site\n");
+               ViceLog(0, ("Ubik: I am no longer the sync site - I lost the election\n"));
            beacon_globals.ubik_amSyncSite = 0;
            beacon_globals.ubik_syncSiteAdvertised = 0;
            UBIK_BEACON_UNLOCK;
@@ -632,9 +640,28 @@ ubeacon_Interact(void *dummy)
            urecovery_ResetState();     /* tell recovery we're no longer the sync site */
            DBRELE(ubik_dbase);
        }
+       /* We cannot take the DB lock around the entire preceding conditional,
+        * because if we are currently the sync site and this election serves
+        * to confirm that status, the DB lock may already be held for a long-running
+        * write transaction.  In such a case, attempting to acquire the DB lock
+        * would cause the beacon thread to block and disrupt election processing.
+        * However, if we are transitioning from not-sync-site to sync-site, there
+        * can be no outstanding transactions and acquiring the DB lock should be
+        * safe without extended blocking. */
+       if (becameSyncSite) {
+           DBHOLD(ubik_dbase);
+           UBIK_BEACON_LOCK;
+           UBIK_VERSION_LOCK;
+           version_globals.ubik_epochTime = FT_ApproxTime();
+           beacon_globals.ubik_amSyncSite = 1;
+           beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
+           UBIK_VERSION_UNLOCK;
+           UBIK_BEACON_UNLOCK;
+           DBRELE(ubik_dbase);
+       }
 
     }                          /* while loop */
-    return NULL;
+    AFS_UNREACHED(return(NULL));
 }
 
 /*!
@@ -682,9 +709,9 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
                                      AFSDIR_SERVER_NETINFO_FILEPATH,
                                      AFSDIR_SERVER_NETRESTRICT_FILEPATH);
        if (count < 0) {
-           ubik_print("ubik: Can't register any valid addresses:%s\n",
-                      reason);
-           ubik_print("Aborting..\n");
+           ViceLog(0, ("ubik: Can't register any valid addresses:%s\n",
+                      reason));
+           ViceLog(0, ("Aborting..\n"));
            return UBADHOST;
        }
        usednetfiles++;
@@ -694,7 +721,7 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
     }
 
     if (count <= 0) {          /* no address found */
-       ubik_print("ubik: No network addresses found, aborting..\n");
+       ViceLog(0, ("ubik: No network addresses found, aborting..\n"));
        return UBADHOST;
     }
 
@@ -707,8 +734,8 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
     }
 
     if (!found) {
-       ubik_print("ubik: primary address %s does not exist\n",
-                  afs_inet_ntoa_r(*ame, hoststr));
+       ViceLog(0, ("ubik: primary address %s does not exist\n",
+                  afs_inet_ntoa_r(*ame, hoststr)));
        /* if we had the result of rx_getAllAddr already, avoid subverting
         * the "is gethostbyname(gethostname()) us" check. If we're
         * using NetInfo/NetRestrict, we assume they have enough clue
@@ -718,7 +745,7 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
            *ame = myAddr[0];
            tcount = rx_getAllAddr(myAddr2, UBIK_MAX_INTERFACE_ADDR);
            if (tcount <= 0) {  /* no address found */
-               ubik_print("ubik: No network addresses found, aborting..\n");
+               ViceLog(0, ("ubik: No network addresses found, aborting..\n"));
                return UBADHOST;
            }
 
@@ -753,7 +780,7 @@ verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
        }
     }
     if (found)
-       ubik_print("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr));
+       ViceLog(0, ("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr)));
 
     if (!info) {
        /* get rid of servers which were purged because all
@@ -835,9 +862,9 @@ ubeacon_updateUbikNetworkAddress(afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR])
                if (ts->addr[0] != htonl(outAddr.hostAddr[0])) {
                    code = UBADHOST;
                    strcpy(buffer, afs_inet_ntoa_r(ts->addr[0], hoststr));
-                   ubik_print("ubik:Two primary addresses for same server \
+                   ViceLog(0, ("ubik:Two primary addresses for same server \
                     %s %s\n", buffer,
-                   afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr));
+                   afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr)));
                } else {
                    for (j = 1; j < UBIK_MAX_INTERFACE_ADDR; j++)
                        ts->addr[j] = htonl(outAddr.hostAddr[j]);
@@ -845,21 +872,23 @@ ubeacon_updateUbikNetworkAddress(afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR])
                UBIK_ADDR_UNLOCK;
            } else if (multi_error == RXGEN_OPCODE) {   /* pre 3.5 remote server */
                UBIK_ADDR_LOCK;
-               ubik_print
-                   ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
-                    afs_inet_ntoa_r(ts->addr[0], hoststr));
+               ViceLog(0, ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
+                    afs_inet_ntoa_r(ts->addr[0], hoststr)));
                UBIK_ADDR_UNLOCK;
            } else if (multi_error == UBADHOST) {
                code = UBADHOST;        /* remote CellServDB inconsistency */
-               ubik_print("Inconsistent Cell Info on server:\n");
+               ViceLog(0, ("Inconsistent Cell Info on server:\n"));
                UBIK_ADDR_LOCK;
                for (j = 0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
-                   ubik_print("... %s\n", afs_inet_ntoa_r(ts->addr[j], hoststr));
+                   ViceLog(0, ("... %s\n", afs_inet_ntoa_r(ts->addr[j], hoststr)));
                UBIK_ADDR_UNLOCK;
            } else {
                UBIK_BEACON_LOCK;
                ts->up = 0;     /* mark the remote server as down */
                UBIK_BEACON_UNLOCK;
+               ViceLog(0, ("Server %s is marked down due to DISK_UpdateInterfaceAddr code %d\n",
+                           afs_inet_ntoa_r(ts->addr[0], hoststr), multi_error));
+
            }
        }
        multi_End;