2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
16 #include <sys/types.h>
21 #include <netinet/in.h>
27 #include <afs/afsutil.h>
31 #define UBIK_INTERNALS
37 * The goal is to provide reliable operation among N servers, such that any
38 * server can crash with the remaining servers continuing operation within a
39 * short period of time. While a \b short outage is acceptable, this time
40 * should be order of 3 minutes or less.
42 * Theory of operation:
44 * Note: #SMALLTIME and #BIGTIME are essentially the same time value, separated
45 * only by the clock skew, #MAXSKEW. In general, if you are making guarantees
46 * for someone else, promise them no more than #SMALLTIME seconds of whatever
47 * invariant you provide. If you are waiting to be sure some invariant is now
48 * \b false, wait at least #BIGTIME seconds to be sure that #SMALLTIME seconds
49 * has passed at the other site.
51 * Now, back to the design:
52 * One site in the collection is a special site, designated the \b sync site.
53 * The sync site sends periodic messages, which can be thought of as
54 * keep-alive messages. When a non-sync site hears from the sync site, it
55 * knows that it is getting updates for the next #SMALLTIME seconds from that
58 * If a server does not hear from the sync site in #SMALLTIME seconds, it
59 * determines that it no longer is getting updates, and thus refuses to give
60 * out potentially out-of-date data. If a sync site can not muster a majority
61 * of servers to agree that it is the sync site, then there is a possibility
62 * that a network partition has occurred, allowing another server to claim to
63 * be the sync site. Thus, any time that the sync site has not heard from a
64 * majority of the servers in the last #SMALLTIME seconds, it voluntarily
65 * relinquishes its role as sync site.
67 * While attempting to nominate a new sync site, certain rules apply. First,
68 * a server can not reply "ok" (return 1 from ServBeacon) to two different
69 * hosts in less than #BIGTIME seconds; this allows a server that has heard
70 * affirmative replies from a majority of the servers to know that no other
71 * server in the network has heard enough affirmative replies in the last
72 * #BIGTIME seconds to become sync site, too. The variables #ubik_lastYesTime
73 * and #lastYesHost are used by all servers to keep track of which host they
74 * have last replied affirmatively to, when queried by a potential new sync
77 * Once a sync site has become a sync site, it periodically sends beacon
78 * messages with a parameter of 1, indicating that it already has determined
79 * it is supposed to be the sync site. The servers treat such a message as a
80 * guarantee that no other site will become sync site for the next #SMALLTIME
81 * seconds. In the interim, these servers can answer a query concerning which
82 * site is the sync site without any communication with any server. The
83 * variables #lastBeaconArrival and #lastBeaconHost are used by all servers to
84 * keep track of which sync site has last contacted them.
86 * One complication occurs while nominating a new sync site: each site may be
87 * trying to nominate a different site (based on the value of #lastYesHost),
88 * yet we must nominate the smallest host (under some order), to prevent this
89 * process from looping. The process could loop by having each server give
90 * one vote to another server, but with no server getting a majority of the
91 * votes. To avoid this, we try to withhold our votes for the server with the
92 * lowest internet address (an easy-to-generate order). To this effect, we
93 * keep track (in #lowestTime and #lowestHost) of the lowest server trying to
94 * become a sync site. We wait for this server unless there is already a sync
95 * site (indicated by ServBeacon's parameter being 1).
98 afs_int32 ubik_debugFlag = 0; /*!< print out debugging messages? */
100 /*! \name these statics are used by all sites in nominating new sync sites */
101 afs_int32 ubik_lastYesTime = 0; /*!< time we sent the last \b yes vote */
102 static afs_uint32 lastYesHost = 0xffffffff; /*!< host to which we sent \b yes vote */
104 /*! \name Next is time sync site began this vote: guarantees sync site until this + SMALLTIME */
105 static afs_int32 lastYesClaim = 0;
106 static int lastYesState = 0; /*!< did last site we voted for claim to be sync site? */
109 /*! \name used to guarantee that nomination process doesn't loop */
110 static afs_int32 lowestTime = 0;
111 static afs_uint32 lowestHost = 0xffffffff;
112 static afs_int32 syncTime = 0;
113 static afs_int32 syncHost = 0;
116 /*! \name used to remember which dbase version is the one at the sync site (for non-sync sites) */
117 struct ubik_version ubik_dbVersion; /*!< sync site's dbase version */
118 struct ubik_tid ubik_dbTid; /*!< sync site's tid, or 0 if none */
122 * \brief Decide if we should try to become sync site.
124 * The basic rule is that we
125 * don't run if there is a valid sync site and it ain't us (we have to run if
126 * it is us, in order to keep our votes). If there is no sync site, then we
127 * want to run if we're the lowest numbered host running, otherwise we defer to
128 * the lowest host. However, if the lowest host hasn't been heard from for a
129 * while, then we start running again, in case he crashed.
131 * \return true if we should run, and false otherwise.
134 uvote_ShouldIRun(void)
136 register afs_int32 now;
138 now = FT_ApproxTime();
139 if (BIGTIME + ubik_lastYesTime < now)
140 return 1; /* no valid guy even trying */
141 if (lastYesState && lastYesHost != ubik_host[0])
142 return 0; /* other guy is sync site, leave him alone */
143 if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0]))
144 return 0; /* if someone is valid and better than us, don't run */
145 /* otherwise we should run */
150 * \brief Return the current synchronization site, if any.
152 * Simple approach: if the
153 * last guy we voted yes for claims to be the sync site, then we we're happy to
154 * use that guy for a sync site until the time his mandate expires. If the guy
155 * does not claim to be sync site, then, of course, there's none.
157 * In addition, if we lost the sync, we set #urecovery_syncSite to an invalid
158 * value, indicating that we no longer know which version of the dbase is the
159 * one we should have. We'll get a new one when we next hear from the sync
162 * \return 0 or currently valid sync site. It can return our own
163 * address, if we're the sync site.
166 uvote_GetSyncSite(void)
168 register afs_int32 now;
169 register afs_int32 code;
174 now = FT_ApproxTime();
175 if (SMALLTIME + lastYesClaim < now)
176 code = 0; /* last guy timed out */
184 * \brief called by the sync site to handle vote beacons; if aconn is null, this is a
187 * \returns 0 or time when the vote was sent. It returns 0 if we are
188 * not voting for this sync site, or the time we actually voted yes, if
192 SVOTE_Beacon(register struct rx_call * rxcall, afs_int32 astate,
193 afs_int32 astart, struct ubik_version * avers,
194 struct ubik_tid * atid)
196 register afs_int32 otherHost;
197 register afs_int32 now;
199 struct rx_connection *aconn;
201 struct ubik_server *ts;
204 now = FT_ApproxTime(); /* close to current time */
205 if (rxcall) { /* caller's host */
206 aconn = rx_ConnectionOf(rxcall);
207 rxp = rx_PeerOf(aconn);
208 otherHost = rx_HostOf(rxp);
210 /* get the primary interface address for this host. */
211 /* This is the identifier that ubik uses. */
212 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
214 ubik_dprint("Received beacon from unknown host %s\n",
215 afs_inet_ntoa(rx_HostOf(rxp)));
216 return 0; /* I don't know about you: vote no */
218 for (ts = ubik_servers; ts; ts = ts->next) {
219 if (ts->addr[0] == otherHost)
223 ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
224 if (ts && ts->isClone)
227 otherHost = ubik_host[0]; /* this host */
231 ubik_dprint("Received beacon type %d from host %s\n", astate,
232 afs_inet_ntoa(otherHost));
234 /* compute the lowest server we've heard from. We'll try to only vote for
235 * this dude if we don't already have a synchronization site. Also, don't
236 * let a very old lowestHost confusing things forever. We pick a new
237 * lowestHost after BIGTIME seconds to limit the damage if this host
238 * actually crashes. Finally, we also count in this computation: don't
239 * pick someone else if we're even better!
241 * Note that the test below must be <=, not <, so that we keep refreshing
242 * lowestTime. Otherwise it will look like we haven't heard from
243 * lowestHost in a while and another host could slip in. */
246 /* First compute the lowest host we've heard from, whether we want them
247 * for a sync site or not. If we haven't heard from a site in BIGTIME
248 * seconds, we ignore its presence in lowestHost: it may have crashed.
249 * Note that we don't ever let anyone appear in our lowestHost if we're
250 * lower than them, 'cause we know we're up. */
251 /* But do not consider clones for lowesHost since they never may become
254 && (ntohl((afs_uint32) otherHost) <= ntohl((afs_uint32) lowestHost)
255 || lowestTime + BIGTIME < now)) {
257 lowestHost = otherHost;
259 /* why do we need this next check? Consider the case where each of two
260 * servers decides the other is lowestHost. Each stops sending beacons
261 * 'cause the other is there. Not obvious that this process terminates:
262 * i.e. each guy could restart procedure and again think other side is
263 * lowest. Need to prove: if one guy in the system is lowest and knows
264 * he's lowest, these loops don't occur. because if someone knows he's
265 * lowest, he will send out beacons telling others to vote for him. */
267 && (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32) lowestHost)
268 || lowestTime + BIGTIME < now)) {
270 lowestHost = ubik_host[0];
273 /* tell if we've heard from a sync site recently (even if we're not voting
274 * for this dude yet). After a while, time the guy out. */
275 if (astate) { /* this guy is a sync site */
276 syncHost = otherHost;
278 } else if (syncTime + BIGTIME < now) {
281 ("Ubik: Lost contact with sync-site %d.%d.%d.%d (NOT in quorum)\n",
282 ((syncHost >> 24) & 0xff), ((syncHost >> 16) & 0xff),
283 ((syncHost >> 8) & 0xff), (syncHost & 0xff));
288 /* decide how to vote */
289 vote = 0; /* start off voting no */
291 /* if we this guy isn't a sync site, we don't really have to vote for him.
292 * We get to apply some heuristics to try to avoid weird oscillation sates
293 * in the voting procedure. */
295 /* in here only if this guy doesn't claim to be a sync site */
297 /* lowestHost is also trying for our votes, then just say no. */
298 if (ntohl(lowestHost) != ntohl(otherHost)) {
302 /* someone else *is* a sync site, just say no */
303 if (syncHost && syncHost != otherHost)
305 } else /* fast startup if this is the only non-clone */ if (lastYesHost ==
312 for (ts = ubik_servers; ts; ts = ts->next) {
313 if (ts->addr[0] == otherHost)
319 lastYesHost = otherHost;
324 return 0; /* clone never can become sync site */
326 /* Don't promise sync site support to more than one host every BIGTIME
327 * seconds. This is the heart of our invariants in this system. */
328 if (ubik_lastYesTime + BIGTIME < now || otherHost == lastYesHost) {
329 if ((ubik_lastYesTime + BIGTIME < now) || (otherHost != lastYesHost)
330 || (lastYesState != astate)) {
331 /* A new vote or a change in the vote or changed quorum */
332 ubik_dprint("Ubik: vote 'yes' for %s %s\n",
333 afs_inet_ntoa(otherHost),
334 (astate ? "(in quorum)" : "(NOT in quorum)"));
337 vote = now; /* vote yes */
338 ubik_lastYesTime = now; /* remember when we voted yes */
339 lastYesClaim = astart; /* remember for computing when sync site expires */
340 lastYesHost = otherHost; /* and who for */
341 lastYesState = astate; /* remember if site is a sync site */
342 ubik_dbVersion = *avers; /* resync value */
343 ubik_dbTid = *atid; /* transaction id, if any, of active trans */
344 urecovery_CheckTid(atid); /* check if current write trans needs aborted */
350 * \brief Handle per-server debug command, where 0 is the first server.
352 * Basic network debugging hooks.
355 SVOTE_SDebug(struct rx_call * rxcall, afs_int32 awhich,
356 register struct ubik_sdebug * aparm)
358 afs_int32 code, isClone;
359 code = SVOTE_XSDebug(rxcall, awhich, aparm, &isClone);
364 SVOTE_XSDebug(struct rx_call * rxcall, afs_int32 awhich,
365 register struct ubik_sdebug * aparm, afs_int32 * isclone)
367 register struct ubik_server *ts;
369 for (ts = ubik_servers; ts; ts = ts->next) {
372 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
373 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR - 1; i++)
374 aparm->altAddr[i] = ntohl(ts->addr[i + 1]);
375 aparm->lastVoteTime = ts->lastVoteTime;
376 aparm->lastBeaconSent = ts->lastBeaconSent;
377 memcpy(&aparm->remoteVersion, &ts->version,
378 sizeof(struct ubik_version));
379 aparm->lastVote = ts->lastVote;
381 aparm->beaconSinceDown = ts->beaconSinceDown;
382 aparm->currentDB = ts->currentDB;
383 *isclone = ts->isClone;
391 SVOTE_XDebug(struct rx_call * rxcall, register struct ubik_debug * aparm,
396 code = SVOTE_Debug(rxcall, aparm);
402 * \brief Handle basic network debug command. This is the global state dumper.
405 SVOTE_Debug(struct rx_call * rxcall, register struct ubik_debug * aparm)
408 /* fill in the basic debug structure. Note the the RPC protocol transfers,
409 * integers in host order. */
411 aparm->now = FT_ApproxTime();
412 aparm->lastYesTime = ubik_lastYesTime;
413 aparm->lastYesHost = ntohl(lastYesHost);
414 aparm->lastYesState = lastYesState;
415 aparm->lastYesClaim = lastYesClaim;
416 aparm->lowestHost = ntohl(lowestHost);
417 aparm->lowestTime = lowestTime;
418 aparm->syncHost = ntohl(syncHost);
419 aparm->syncTime = syncTime;
421 /* fill in all interface addresses of myself in hostbyte order */
422 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR; i++)
423 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
425 aparm->amSyncSite = ubik_amSyncSite;
426 ubeacon_Debug(aparm);
432 /* Get the recovery state. The label of the database may not have
433 * been written yet but set the flag so udebug behavior remains.
436 aparm->recoveryState = urecovery_state;
437 if ((urecovery_state & UBIK_RECSYNCSITE)
438 && (urecovery_state & UBIK_RECFOUNDDB)
439 && (urecovery_state & UBIK_RECHAVEDB)) {
440 aparm->recoveryState |= UBIK_RECLABELDB;
442 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
443 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
444 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
445 aparm->tidCounter = ubik_dbase->tidCounter;
447 if (ubik_currentTrans) {
448 aparm->currentTrans = 1;
449 if (ubik_currentTrans->type == UBIK_WRITETRANS)
450 aparm->writeTrans = 1;
452 aparm->writeTrans = 0;
454 aparm->currentTrans = 0;
457 aparm->epochTime = ubik_epochTime;
463 SVOTE_SDebugOld(struct rx_call * rxcall, afs_int32 awhich,
464 register struct ubik_sdebug_old * aparm)
466 register struct ubik_server *ts;
468 for (ts = ubik_servers; ts; ts = ts->next) {
471 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
472 aparm->lastVoteTime = ts->lastVoteTime;
473 aparm->lastBeaconSent = ts->lastBeaconSent;
474 memcpy(&aparm->remoteVersion, &ts->version,
475 sizeof(struct ubik_version));
476 aparm->lastVote = ts->lastVote;
478 aparm->beaconSinceDown = ts->beaconSinceDown;
479 aparm->currentDB = ts->currentDB;
488 * \brief Handle basic network debug command. This is the global state dumper.
491 SVOTE_DebugOld(struct rx_call * rxcall,
492 register struct ubik_debug_old * aparm)
495 /* fill in the basic debug structure. Note the the RPC protocol transfers,
496 * integers in host order. */
498 aparm->now = FT_ApproxTime();
499 aparm->lastYesTime = ubik_lastYesTime;
500 aparm->lastYesHost = ntohl(lastYesHost);
501 aparm->lastYesState = lastYesState;
502 aparm->lastYesClaim = lastYesClaim;
503 aparm->lowestHost = ntohl(lowestHost);
504 aparm->lowestTime = lowestTime;
505 aparm->syncHost = ntohl(syncHost);
506 aparm->syncTime = syncTime;
508 aparm->amSyncSite = ubik_amSyncSite;
509 ubeacon_Debug((ubik_debug *)aparm);
511 udisk_Debug((ubik_debug *)aparm);
513 ulock_Debug((ubik_debug *)aparm);
515 /* Get the recovery state. The label of the database may not have
516 * been written yet but set the flag so udebug behavior remains.
519 aparm->recoveryState = urecovery_state;
520 if ((urecovery_state & UBIK_RECSYNCSITE)
521 && (urecovery_state & UBIK_RECFOUNDDB)
522 && (urecovery_state & UBIK_RECHAVEDB)) {
523 aparm->recoveryState |= UBIK_RECLABELDB;
525 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
526 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
527 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
528 aparm->tidCounter = ubik_dbase->tidCounter;
530 if (ubik_currentTrans) {
531 aparm->currentTrans = 1;
532 if (ubik_currentTrans->type == UBIK_WRITETRANS)
533 aparm->writeTrans = 1;
535 aparm->writeTrans = 0;
537 aparm->currentTrans = 0;
540 aparm->epochTime = ubik_epochTime;
547 * \brief Get the sync site; called by remote servers to find where they should go.
550 SVOTE_GetSyncSite(register struct rx_call * rxcall,
551 register afs_int32 * ahost)
553 register afs_int32 temp;
555 temp = uvote_GetSyncSite();
556 *ahost = ntohl(temp);
561 ubik_dprint_25(const char *format, ...)
565 va_start(ap, format);
566 vViceLog(25, (format, ap));
571 ubik_dprint(const char *format, ...)
575 va_start(ap, format);
576 vViceLog(5, (format, ap));
581 ubik_vprint(const char *format, va_list ap)
583 vViceLog(0, (format, ap));
587 ubik_print(const char *format, ...)
591 va_start(ap, format);
592 ubik_vprint(format, ap);
597 * \brief Called once/run to init the vote module
602 /* pretend we just voted for someone else, since we just restarted */
603 ubik_lastYesTime = FT_ApproxTime();