#include <rx/rxkad.h>
#include <rx/rx_multi.h>
#include <afs/cellconfig.h>
-#ifndef AFS_NT40_ENV
#include <afs/afsutil.h>
-#endif
#define UBIK_INTERNALS
#include "ubik.h"
now = FT_ApproxTime();
if (beacon_globals.syncSiteUntil <= now) { /* if my votes have expired, say so */
if (beacon_globals.ubik_amSyncSite)
- ubik_dprint("Ubik: I am no longer the sync site\n");
+ ViceLog(0, ("Ubik: I am no longer the sync site - my votes expired\n"));
beacon_globals.ubik_amSyncSite = 0;
beacon_globals.ubik_syncSiteAdvertised = 0;
rcode = 0;
}
}
UBIK_BEACON_UNLOCK;
- ubik_dprint("beacon: amSyncSite is %d\n", rcode);
+ ViceLog(5, ("beacon: amSyncSite is %d\n", rcode));
return rcode;
}
beacon_globals.ubik_amSyncSite = 1; /* let's start as sync site */
beacon_globals.syncSiteUntil = 0x7fffffff; /* and be it quite a while */
beacon_globals.ubik_syncSiteAdvertised = 1;
+ DBHOLD(ubik_dbase);
+ UBIK_VERSION_LOCK;
+ version_globals.ubik_epochTime = FT_ApproxTime();
+ UBIK_VERSION_UNLOCK;
+ DBRELE(ubik_dbase);
}
} else {
if (nServers == 1) /* special case 1 server */
}
if (ubik_singleServer) {
- if (!beacon_globals.ubik_amSyncSite)
- ubik_dprint("Ubik: I am the sync site - 1 server\n");
+ if (!beacon_globals.ubik_amSyncSite) {
+ ViceLog(0, ("Ubik: I am the sync site - 1 server\n"));
+ DBHOLD(ubik_dbase);
+ UBIK_VERSION_LOCK;
+ version_globals.ubik_epochTime = FT_ApproxTime();
+ UBIK_VERSION_UNLOCK;
+ DBRELE(ubik_dbase);
+ }
beacon_globals.ubik_amSyncSite = 1;
beacon_globals.syncSiteUntil = 0x7fffffff; /* quite a while */
beacon_globals.ubik_syncSiteAdvertised = 1;
afs_int32 i;
struct ubik_server *ts;
afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
+ int becameSyncSite;
struct ubik_tid ttid;
struct ubik_version tversion;
afs_int32 startTime;
- afs_pthread_setname_self("beacon");
+ opr_threadname_set("beacon");
/* loop forever getting votes */
lastWakeupTime = 0; /* keep track of time we last started a vote collection */
UBIK_VERSION_LOCK;
ttid.epoch = version_globals.ubik_epochTime;
- if (ubik_dbase->flags & DBWRITING) {
+ if (ubik_dbase->dbFlags & DBWRITING) {
/*
* if a write is in progress, we have to send the writeTidCounter
* which holds the tid counter of the write transaction , and not
* happen due to errors with the rx security class in play
* (rxkad, rxgk, etc). treat the host as if we got a
* timeout, since this is not a valid vote. */
- ubik_print("assuming distant vote time %d from %s is an error; marking host down\n",
- (int)code, afs_inet_ntoa_r(ts->addr[0], hoststr));
+ ViceLog(0, ("assuming distant vote time %d from %s is an error; marking host down\n",
+ (int)code, afs_inet_ntoa_r(ts->addr[0], hoststr)));
code = -1;
}
if (code > 0 && rx_ConnError(connections[multi_i])) {
- ubik_print("assuming vote from %s is invalid due to conn error %d; marking host down\n",
+ ViceLog(0, ("assuming vote from %s is invalid due to conn error %d; marking host down\n",
afs_inet_ntoa_r(ts->addr[0], hoststr),
- (int)rx_ConnError(connections[multi_i]));
+ (int)rx_ConnError(connections[multi_i])));
code = -1;
}
* the latter down below if we got enough votes to go with */
if (code > 0) {
if ((code & ~0xff) == ERROR_TABLE_BASE_RXK) {
- ubik_dprint("token error %d from host %s\n",
- code, afs_inet_ntoa_r(ts->addr[0], hoststr));
+ ViceLog(0, ("Server %s is marked down due to token error %d\n",
+ afs_inet_ntoa_r(ts->addr[0], hoststr), code));
ts->up = 0;
ts->beaconSinceDown = 0;
urecovery_LostServer(ts);
yesVotes++; /* the extra epsilon */
ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
ts->beaconSinceDown = 1;
- ubik_dprint("yes vote from host %s\n",
- afs_inet_ntoa_r(ts->addr[0], hoststr));
+ ViceLog(5, ("yes vote from host %s\n",
+ afs_inet_ntoa_r(ts->addr[0], hoststr)));
}
} else if (code == 0) {
ts->lastVoteTime = temp;
ts->lastVote = 0;
ts->beaconSinceDown = 1;
- ubik_dprint("no vote from %s\n",
- afs_inet_ntoa_r(ts->addr[0], hoststr));
+ ViceLog(5, ("no vote from %s\n",
+ afs_inet_ntoa_r(ts->addr[0], hoststr)));
} else if (code < 0) {
ts->up = 0;
ts->beaconSinceDown = 0;
urecovery_LostServer(ts);
- ubik_dprint("time out from %s\n",
- afs_inet_ntoa_r(ts->addr[0], hoststr));
+ ViceLog(0, ("Server %s is marked down due to VOTE_Beacon time out (%d)\n",
+ afs_inet_ntoa_r(ts->addr[0], hoststr), code));
}
UBIK_BEACON_UNLOCK;
}
/* now decide if we have enough votes to become sync site.
* Note that we can still get enough votes even if we didn't for ourself. */
+ becameSyncSite = 0;
if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
UBIK_BEACON_LOCK;
- if (!beacon_globals.ubik_amSyncSite)
- ubik_dprint("Ubik: I am the sync site\n");
- else {
+ if (!beacon_globals.ubik_amSyncSite) {
+ ViceLog(0, ("Ubik: I am the sync site\n"));
+ /* Defer actually changing any variables until we can take the
+ * DB lock (which is before the beacon lock in the lock order). */
+ becameSyncSite = 1;
+ } else {
+ beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
/* at this point, we have the guarantee that at least quorum
* received a beacon packet informing we have a sync-site. */
beacon_globals.ubik_syncSiteAdvertised = 1;
}
- beacon_globals.ubik_amSyncSite = 1;
- beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
-#ifndef AFS_PTHREAD_ENV
- /* I did not find a corresponding LWP_WaitProcess(&ubik_amSyncSite) --
- this may be a spurious signal call -- sjenkins */
- LWP_NoYieldSignal(&beacon_globals.ubik_amSyncSite);
-#endif
UBIK_BEACON_UNLOCK;
} else {
UBIK_BEACON_LOCK;
if (beacon_globals.ubik_amSyncSite)
- ubik_dprint("Ubik: I am no longer the sync site\n");
+ ViceLog(0, ("Ubik: I am no longer the sync site - I lost the election\n"));
beacon_globals.ubik_amSyncSite = 0;
beacon_globals.ubik_syncSiteAdvertised = 0;
UBIK_BEACON_UNLOCK;
urecovery_ResetState(); /* tell recovery we're no longer the sync site */
DBRELE(ubik_dbase);
}
+ /* We cannot take the DB lock around the entire preceding conditional,
+ * because if we are currently the sync site and this election serves
+ * to confirm that status, the DB lock may already be held for a long-running
+ * write transaction. In such a case, attempting to acquire the DB lock
+ * would cause the beacon thread to block and disrupt election processing.
+ * However, if we are transitioning from not-sync-site to sync-site, there
+ * can be no outstanding transactions and acquiring the DB lock should be
+ * safe without extended blocking. */
+ if (becameSyncSite) {
+ DBHOLD(ubik_dbase);
+ UBIK_BEACON_LOCK;
+ UBIK_VERSION_LOCK;
+ version_globals.ubik_epochTime = FT_ApproxTime();
+ beacon_globals.ubik_amSyncSite = 1;
+ beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
+ UBIK_VERSION_UNLOCK;
+ UBIK_BEACON_UNLOCK;
+ DBRELE(ubik_dbase);
+ }
} /* while loop */
- return NULL;
+ AFS_UNREACHED(return(NULL));
}
/*!
AFSDIR_SERVER_NETINFO_FILEPATH,
AFSDIR_SERVER_NETRESTRICT_FILEPATH);
if (count < 0) {
- ubik_print("ubik: Can't register any valid addresses:%s\n",
- reason);
- ubik_print("Aborting..\n");
+ ViceLog(0, ("ubik: Can't register any valid addresses:%s\n",
+ reason));
+ ViceLog(0, ("Aborting..\n"));
return UBADHOST;
}
usednetfiles++;
}
if (count <= 0) { /* no address found */
- ubik_print("ubik: No network addresses found, aborting..\n");
+ ViceLog(0, ("ubik: No network addresses found, aborting..\n"));
return UBADHOST;
}
}
if (!found) {
- ubik_print("ubik: primary address %s does not exist\n",
- afs_inet_ntoa_r(*ame, hoststr));
+ ViceLog(0, ("ubik: primary address %s does not exist\n",
+ afs_inet_ntoa_r(*ame, hoststr)));
/* if we had the result of rx_getAllAddr already, avoid subverting
* the "is gethostbyname(gethostname()) us" check. If we're
* using NetInfo/NetRestrict, we assume they have enough clue
*ame = myAddr[0];
tcount = rx_getAllAddr(myAddr2, UBIK_MAX_INTERFACE_ADDR);
if (tcount <= 0) { /* no address found */
- ubik_print("ubik: No network addresses found, aborting..\n");
+ ViceLog(0, ("ubik: No network addresses found, aborting..\n"));
return UBADHOST;
}
}
}
if (found)
- ubik_print("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr));
+ ViceLog(0, ("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr)));
if (!info) {
/* get rid of servers which were purged because all
if (ts->addr[0] != htonl(outAddr.hostAddr[0])) {
code = UBADHOST;
strcpy(buffer, afs_inet_ntoa_r(ts->addr[0], hoststr));
- ubik_print("ubik:Two primary addresses for same server \
+ ViceLog(0, ("ubik:Two primary addresses for same server \
%s %s\n", buffer,
- afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr));
+ afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr)));
} else {
for (j = 1; j < UBIK_MAX_INTERFACE_ADDR; j++)
ts->addr[j] = htonl(outAddr.hostAddr[j]);
UBIK_ADDR_UNLOCK;
} else if (multi_error == RXGEN_OPCODE) { /* pre 3.5 remote server */
UBIK_ADDR_LOCK;
- ubik_print
- ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
- afs_inet_ntoa_r(ts->addr[0], hoststr));
+ ViceLog(0, ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
+ afs_inet_ntoa_r(ts->addr[0], hoststr)));
UBIK_ADDR_UNLOCK;
} else if (multi_error == UBADHOST) {
code = UBADHOST; /* remote CellServDB inconsistency */
- ubik_print("Inconsistent Cell Info on server:\n");
+ ViceLog(0, ("Inconsistent Cell Info on server:\n"));
UBIK_ADDR_LOCK;
for (j = 0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
- ubik_print("... %s\n", afs_inet_ntoa_r(ts->addr[j], hoststr));
+ ViceLog(0, ("... %s\n", afs_inet_ntoa_r(ts->addr[j], hoststr)));
UBIK_ADDR_UNLOCK;
} else {
UBIK_BEACON_LOCK;
ts->up = 0; /* mark the remote server as down */
UBIK_BEACON_UNLOCK;
+ ViceLog(0, ("Server %s is marked down due to DISK_UpdateInterfaceAddr code %d\n",
+ afs_inet_ntoa_r(ts->addr[0], hoststr), multi_error));
+
}
}
multi_End;