2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
18 #include <afs/afsutil.h>
20 #define UBIK_INTERNALS
26 * The goal is to provide reliable operation among N servers, such that any
27 * server can crash with the remaining servers continuing operation within a
28 * short period of time. While a \b short outage is acceptable, this time
29 * should be on the order of 3 minutes or less.
31 * Theory of operation:
33 * Note: #SMALLTIME and #BIGTIME are essentially the same time value, separated
34 * only by the clock skew, #MAXSKEW. In general, if you are making guarantees
35 * for someone else, promise them no more than #SMALLTIME seconds of whatever
36 * invariant you provide. If you are waiting to be sure some invariant is now
37 * \b false, wait at least #BIGTIME seconds to be sure that #SMALLTIME seconds
38 * has passed at the other site.
40 * Now, back to the design:
41 * One site in the collection is a special site, designated the \b sync site.
42 * The sync site sends periodic messages, which can be thought of as
43 * keep-alive messages. When a non-sync site hears from the sync site, it
44 * knows that it is getting updates for the next #SMALLTIME seconds from that
47 * If a server does not hear from the sync site in #SMALLTIME seconds, it
48 * determines that it no longer is getting updates, and thus refuses to give
49 * out potentially out-of-date data. If a sync site can not muster a majority
50 * of servers to agree that it is the sync site, then there is a possibility
51 * that a network partition has occurred, allowing another server to claim to
52 * be the sync site. Thus, any time that the sync site has not heard from a
53 * majority of the servers in the last #SMALLTIME seconds, it voluntarily
54 * relinquishes its role as sync site.
56 * While attempting to nominate a new sync site, certain rules apply. First,
57 * a server can not reply "ok" (return 1 from ServBeacon) to two different
58 * hosts in less than #BIGTIME seconds; this allows a server that has heard
59 * affirmative replies from a majority of the servers to know that no other
60 * server in the network has heard enough affirmative replies in the last
61 * #BIGTIME seconds to become sync site, too. The variables #ubik_lastYesTime
62 * and #lastYesHost are used by all servers to keep track of which host they
63 * have last replied affirmatively to, when queried by a potential new sync
66 * Once a sync site has become a sync site, it periodically sends beacon
67 * messages with a parameter of 1, indicating that it already has determined
68 * it is supposed to be the sync site. The servers treat such a message as a
69 * guarantee that no other site will become sync site for the next #SMALLTIME
70 * seconds. In the interim, these servers can answer a query concerning which
71 * site is the sync site without any communication with any server. The
72 * variables #lastBeaconArrival and #lastBeaconHost are used by all servers to
73 * keep track of which sync site has last contacted them.
75 * One complication occurs while nominating a new sync site: each site may be
76 * trying to nominate a different site (based on the value of #lastYesHost),
77 * yet we must nominate the smallest host (under some order), to prevent this
78 * process from looping. The process could loop by having each server give
79 * one vote to another server, but with no server getting a majority of the
80 * votes. To avoid this, we try to withhold our votes for the server with the
81 * lowest internet address (an easy-to-generate order). To this effect, we
82 * keep track (in #lowestTime and #lowestHost) of the lowest server trying to
83 * become a sync site. We wait for this server unless there is already a sync
84 * site (indicated by ServBeacon's parameter being 1).
87 afs_int32 ubik_debugFlag = 0; /*!< print out debugging messages? */
89 struct vote_data vote_globals;
93 * \brief Decide if we should try to become sync site.
95 * The basic rule is that we
96 * don't run if there is a valid sync site and it ain't us (we have to run if
97 * it is us, in order to keep our votes). If there is no sync site, then we
98 * want to run if we're the lowest numbered host running, otherwise we defer to
99 * the lowest host. However, if the lowest host hasn't been heard from for a
100 * while, then we start running again, in case he crashed.
102 * \return true if we should run, and false otherwise.
105 uvote_ShouldIRun(void)
108 int code = 1; /* default to yes */
111 now = FT_ApproxTime();
112 if (BIGTIME + vote_globals.ubik_lastYesTime < now)
114 if (vote_globals.lastYesState && vote_globals.lastYesHost != ubik_host[0]) {
115 code = 0; /* other guy is sync site, leave him alone */
118 if (ntohl((afs_uint32)vote_globals.lastYesHost) < ntohl((afs_uint32)ubik_host[0])) {
119 code = 0; /* if someone is valid and better than us, don't run */
129 * \brief Return the current synchronization site, if any.
131 * Simple approach: if the
132 * last guy we voted yes for claims to be the sync site, then we're happy to
133 * use that guy for a sync site until the time his mandate expires. If the guy
134 * does not claim to be sync site, then, of course, there's none.
136 * In addition, if we lost the sync, we set #urecovery_syncSite to an invalid
137 * value, indicating that we no longer know which version of the dbase is the
138 * one we should have. We'll get a new one when we next hear from the sync
141 * \return 0 or currently valid sync site. It can return our own
142 * address, if we're the sync site.
145 uvote_GetSyncSite(void)
151 if (!vote_globals.lastYesState)
154 now = FT_ApproxTime();
155 if (SMALLTIME + vote_globals.lastYesClaim < now)
156 code = 0; /* last guy timed out */
158 code = vote_globals.lastYesHost;
165 * \brief called by the sync site to handle vote beacons; if rxcall is null, this is a
168 * \returns 0 or time when the vote was sent. It returns 0 if we are
169 * not voting for this sync site, or the time we actually voted yes, if
173 SVOTE_Beacon(struct rx_call * rxcall, afs_int32 astate,
174 afs_int32 astart, struct ubik_version * avers,
175 struct ubik_tid * atid)
180 struct rx_connection *aconn;
182 struct ubik_server *ts;
186 if (rxcall) { /* caller's host */
187 aconn = rx_ConnectionOf(rxcall);
188 rxp = rx_PeerOf(aconn);
189 otherHost = rx_HostOf(rxp);
191 /* get the primary interface address for this host. */
192 /* This is the identifier that ubik uses. */
193 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
195 ubik_dprint("Received beacon from unknown host %s\n",
196 afs_inet_ntoa_r(rx_HostOf(rxp), hoststr));
197 return 0; /* I don't know about you: vote no */
199 for (ts = ubik_servers; ts; ts = ts->next) {
200 if (ts->addr[0] == otherHost)
204 ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
205 if (ts && ts->isClone)
208 otherHost = ubik_host[0]; /* this host */
212 ubik_dprint("Received beacon type %d from host %s\n", astate,
213 afs_inet_ntoa_r(otherHost, hoststr));
215 /* compute the lowest server we've heard from. We'll try to only vote for
216 * this dude if we don't already have a synchronization site. Also, don't
217 * let a very old lowestHost confuse things forever. We pick a new
218 * lowestHost after BIGTIME seconds to limit the damage if this host
219 * actually crashes. Finally, we also count in this computation: don't
220 * pick someone else if we're even better!
222 * Note that the test below must be <=, not <, so that we keep refreshing
223 * lowestTime. Otherwise it will look like we haven't heard from
224 * lowestHost in a while and another host could slip in. */
227 /* First compute the lowest host we've heard from, whether we want them
228 * for a sync site or not. If we haven't heard from a site in BIGTIME
229 * seconds, we ignore its presence in lowestHost: it may have crashed.
230 * Note that we don't ever let anyone appear in our lowestHost if we're
231 * lower than them, 'cause we know we're up. */
232 /* But do not consider clones for lowestHost since they never may become
235 now = FT_ApproxTime(); /* close to current time */
237 && (ntohl((afs_uint32)otherHost) <= ntohl((afs_uint32)vote_globals.lowestHost)
238 || vote_globals.lowestTime + BIGTIME < now)) {
239 vote_globals.lowestTime = now;
240 vote_globals.lowestHost = otherHost;
242 /* why do we need this next check? Consider the case where each of two
243 * servers decides the other is lowestHost. Each stops sending beacons
244 * 'cause the other is there. Not obvious that this process terminates:
245 * i.e. each guy could restart procedure and again think other side is
246 * lowest. Need to prove: if one guy in the system is lowest and knows
247 * he's lowest, these loops don't occur. because if someone knows he's
248 * lowest, he will send out beacons telling others to vote for him. */
250 && (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32)vote_globals.lowestHost)
251 || vote_globals.lowestTime + BIGTIME < now)) {
252 vote_globals.lowestTime = now;
253 vote_globals.lowestHost = ubik_host[0];
256 /* tell if we've heard from a sync site recently (even if we're not voting
257 * for this dude yet). After a while, time the guy out. */
258 if (astate) { /* this guy is a sync site */
259 vote_globals.syncHost = otherHost;
260 vote_globals.syncTime = now;
261 } else if (vote_globals.syncTime + BIGTIME < now) {
262 if (vote_globals.syncHost) {
264 ("Ubik: Lost contact with sync-site %s (NOT in quorum)\n",
265 afs_inet_ntoa_r(vote_globals.syncHost, hoststr));
267 vote_globals.syncHost = 0;
270 /* decide how to vote */
271 vote = 0; /* start off voting no */
273 /* if this guy isn't a sync site, we don't really have to vote for him.
274 * We get to apply some heuristics to try to avoid weird oscillation states
275 * in the voting procedure. */
277 /* in here only if this guy doesn't claim to be a sync site */
279 /* if lowestHost is also trying for our votes, then just say no. */
280 if (ntohl(vote_globals.lowestHost) != ntohl(otherHost)) {
284 /* someone else *is* a sync site, just say no */
285 if (vote_globals.syncHost && vote_globals.syncHost != otherHost)
287 } else if (vote_globals.lastYesHost == 0xffffffff && otherHost == ubik_host[0]) {
288 /* fast startup if this is the only non-clone */
290 for (ts = ubik_servers; ts; ts = ts->next) {
291 if (ts->addr[0] == otherHost)
297 vote_globals.lastYesHost = otherHost;
302 goto done_zero; /* clone never can become sync site */
304 /* Don't promise sync site support to more than one host every BIGTIME
305 * seconds. This is the heart of our invariants in this system. */
306 if (vote_globals.ubik_lastYesTime + BIGTIME < now || otherHost == vote_globals.lastYesHost) {
307 if ((vote_globals.ubik_lastYesTime + BIGTIME < now) || (otherHost != vote_globals.lastYesHost)
308 || (vote_globals.lastYesState != astate)) {
309 /* A new vote or a change in the vote or changed quorum */
310 ubik_dprint("Ubik: vote 'yes' for %s %s\n",
311 afs_inet_ntoa_r(otherHost, hoststr),
312 (astate ? "(in quorum)" : "(NOT in quorum)"));
315 vote = now; /* vote yes */
316 vote_globals.ubik_lastYesTime = now; /* remember when we voted yes */
317 vote_globals.lastYesClaim = astart; /* remember for computing when sync site expires */
318 vote_globals.lastYesHost = otherHost; /* and who for */
319 vote_globals.lastYesState = astate; /* remember if site is a sync site */
320 vote_globals.ubik_dbVersion = *avers; /* resync value */
321 vote_globals.ubik_dbTid = *atid; /* transaction id, if any, of active trans */
324 urecovery_CheckTid(atid, 0); /* check if current write trans needs aborted */
336 * \brief Handle per-server debug command, where 0 is the first server.
338 * Basic network debugging hooks.
341 SVOTE_SDebug(struct rx_call * rxcall, afs_int32 awhich,
342 struct ubik_sdebug * aparm)
344 afs_int32 code, isClone;
345 code = SVOTE_XSDebug(rxcall, awhich, aparm, &isClone);
350 SVOTE_XSDebug(struct rx_call * rxcall, afs_int32 awhich,
351 struct ubik_sdebug * aparm, afs_int32 * isclone)
353 struct ubik_server *ts;
355 for (ts = ubik_servers; ts; ts = ts->next) {
358 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
359 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR - 1; i++)
360 aparm->altAddr[i] = ntohl(ts->addr[i + 1]);
361 aparm->lastVoteTime = ts->lastVoteTime;
362 aparm->lastBeaconSent = ts->lastBeaconSent;
363 memcpy(&aparm->remoteVersion, &ts->version,
364 sizeof(struct ubik_version));
365 aparm->lastVote = ts->lastVote;
367 aparm->beaconSinceDown = ts->beaconSinceDown;
368 aparm->currentDB = ts->currentDB;
369 *isclone = ts->isClone;
377 SVOTE_XDebug(struct rx_call * rxcall, struct ubik_debug * aparm,
382 code = SVOTE_Debug(rxcall, aparm);
388 * \brief Handle basic network debug command. This is the global state dumper.
391 SVOTE_Debug(struct rx_call * rxcall, struct ubik_debug * aparm)
394 /* fill in the basic debug structure. Note that the RPC protocol transfers
395 * integers in host order. */
397 aparm->now = FT_ApproxTime();
398 aparm->lastYesTime = vote_globals.ubik_lastYesTime;
399 aparm->lastYesHost = ntohl(vote_globals.lastYesHost);
400 aparm->lastYesState = vote_globals.lastYesState;
401 aparm->lastYesClaim = vote_globals.lastYesClaim;
402 aparm->lowestHost = ntohl(vote_globals.lowestHost);
403 aparm->lowestTime = vote_globals.lowestTime;
404 aparm->syncHost = ntohl(vote_globals.syncHost);
405 aparm->syncTime = vote_globals.syncTime;
406 memcpy(&aparm->syncVersion, &vote_globals.ubik_dbVersion, sizeof(struct ubik_version));
407 memcpy(&aparm->syncTid, &vote_globals.ubik_dbTid, sizeof(struct ubik_tid));
409 /* fill in all interface addresses of myself in hostbyte order */
410 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR; i++)
411 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
413 aparm->amSyncSite = beacon_globals.ubik_amSyncSite;
414 ubeacon_Debug(aparm);
420 /* Get the recovery state. The label of the database may not have
421 * been written yet but set the flag so udebug behavior remains.
424 aparm->recoveryState = urecovery_state;
425 if ((urecovery_state & UBIK_RECSYNCSITE)
426 && (urecovery_state & UBIK_RECFOUNDDB)
427 && (urecovery_state & UBIK_RECHAVEDB)) {
428 aparm->recoveryState |= UBIK_RECLABELDB;
430 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
431 aparm->tidCounter = ubik_dbase->tidCounter;
433 if (ubik_currentTrans) {
434 aparm->currentTrans = 1;
435 if (ubik_currentTrans->type == UBIK_WRITETRANS)
436 aparm->writeTrans = 1;
438 aparm->writeTrans = 0;
440 aparm->currentTrans = 0;
443 aparm->epochTime = version_globals.ubik_epochTime;
449 SVOTE_SDebugOld(struct rx_call * rxcall, afs_int32 awhich,
450 struct ubik_sdebug_old * aparm)
452 struct ubik_server *ts;
454 for (ts = ubik_servers; ts; ts = ts->next) {
457 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
458 aparm->lastVoteTime = ts->lastVoteTime;
459 aparm->lastBeaconSent = ts->lastBeaconSent;
460 memcpy(&aparm->remoteVersion, &ts->version,
461 sizeof(struct ubik_version));
462 aparm->lastVote = ts->lastVote;
464 aparm->beaconSinceDown = ts->beaconSinceDown;
465 aparm->currentDB = ts->currentDB;
474 * \brief Handle basic network debug command. This is the global state dumper.
477 SVOTE_DebugOld(struct rx_call * rxcall,
478 struct ubik_debug_old * aparm)
481 /* fill in the basic debug structure. Note that the RPC protocol transfers
482 * integers in host order. */
484 aparm->now = FT_ApproxTime();
485 aparm->lastYesTime = vote_globals.ubik_lastYesTime;
486 aparm->lastYesHost = ntohl(vote_globals.lastYesHost);
487 aparm->lastYesState = vote_globals.lastYesState;
488 aparm->lastYesClaim = vote_globals.lastYesClaim;
489 aparm->lowestHost = ntohl(vote_globals.lowestHost);
490 aparm->lowestTime = vote_globals.lowestTime;
491 aparm->syncHost = ntohl(vote_globals.syncHost);
492 aparm->syncTime = vote_globals.syncTime;
493 memcpy(&aparm->syncVersion, &vote_globals.ubik_dbVersion, sizeof(struct ubik_version));
494 memcpy(&aparm->syncTid, &vote_globals.ubik_dbTid, sizeof(struct ubik_tid));
496 aparm->amSyncSite = beacon_globals.ubik_amSyncSite;
497 ubeacon_Debug((ubik_debug *)aparm);
499 udisk_Debug((ubik_debug *)aparm);
501 ulock_Debug((ubik_debug *)aparm);
503 /* Get the recovery state. The label of the database may not have
504 * been written yet but set the flag so udebug behavior remains.
507 aparm->recoveryState = urecovery_state;
508 if ((urecovery_state & UBIK_RECSYNCSITE)
509 && (urecovery_state & UBIK_RECFOUNDDB)
510 && (urecovery_state & UBIK_RECHAVEDB)) {
511 aparm->recoveryState |= UBIK_RECLABELDB;
513 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
514 aparm->tidCounter = ubik_dbase->tidCounter;
516 if (ubik_currentTrans) {
517 aparm->currentTrans = 1;
518 if (ubik_currentTrans->type == UBIK_WRITETRANS)
519 aparm->writeTrans = 1;
521 aparm->writeTrans = 0;
523 aparm->currentTrans = 0;
526 aparm->epochTime = version_globals.ubik_epochTime;
533 * \brief Get the sync site; called by remote servers to find where they should go.
536 SVOTE_GetSyncSite(struct rx_call * rxcall,
541 temp = uvote_GetSyncSite();
542 *ahost = ntohl(temp);
547 ubik_dprint_25(const char *format, ...)
551 va_start(ap, format);
552 vViceLog(25, (format, ap));
557 ubik_dprint(const char *format, ...)
561 va_start(ap, format);
562 vViceLog(5, (format, ap));
567 ubik_vprint(const char *format, va_list ap)
569 vViceLog(0, (format, ap));
573 ubik_print(const char *format, ...)
577 va_start(ap, format);
578 ubik_vprint(format, ap);
583 * \brief Called once/run to init the vote module
589 /* pretend we just voted for someone else, since we just restarted; the BIGTIME spacing test on ubik_lastYesTime in SVOTE_Beacon reads this stamp */
590 vote_globals.ubik_lastYesTime = FT_ApproxTime();
592 /* Initialize the other globals. ubik_lastYesTime is deliberately NOT zeroed here: doing so would discard the restart stamp set just above. */
594 vote_globals.lastYesHost = 0xffffffff;
595 vote_globals.lastYesClaim = 0;
596 vote_globals.lastYesState = 0;
597 vote_globals.lowestTime = 0;
598 vote_globals.lowestHost = 0xffffffff;
599 vote_globals.syncTime = 0;
600 vote_globals.syncHost = 0;
607 uvote_set_dbVersion(struct ubik_version version) {
609 vote_globals.ubik_dbVersion = version;
613 /* Compare given version to current DB version. Return true if equal. */
615 uvote_eq_dbVersion(struct ubik_version version) {
619 if (vote_globals.ubik_dbVersion.epoch == version.epoch && vote_globals.ubik_dbVersion.counter == version.counter) {