2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
16 #include <sys/types.h>
21 #include <netinet/in.h>
23 #include <afs/afsutil.h>
28 #include <afs/afsutil.h>
31 #define UBIK_INTERNALS
37 The goal is to provide reliable operation among N servers, such that any
38 server can crash with the remaining servers continuing operation within a
39 short period of time. While a *short* outage is acceptable, this time
40 should be order of 3 minutes or less.
44 Note: SMALLTIME and BIGTIME are essentially the same time value, separated
45 only by the clock skew, MAXSKEW. In general, if you are making guarantees
46 for someone else, promise them no more than SMALLTIME seconds of whatever
47 invariant you provide. If you are waiting to be sure some invariant is now
48 *false*, wait at least BIGTIME seconds to be sure that SMALLTIME seconds
49 has passed at the other site.
51 Now, back to the design:
52 One site in the collection is a special site, designated the *sync* site.
53 The sync site sends periodic messages, which can be thought of as
54 keep-alive messages. When a non-sync site hears from the sync site, it
55 knows that it is getting updates for the next SMALLTIME seconds from that
58 If a server does not hear from the sync site in SMALLTIME seconds, it
59 determines that it no longer is getting updates, and thus refuses to give
60 out potentially out-of-date data. If a sync site can not muster a majority
61 of servers to agree that it is the sync site, then there is a possibility
62 that a network partition has occurred, allowing another server to claim to
63 be the sync site. Thus, any time that the sync site has not heard from a
64 majority of the servers in the last SMALLTIME seconds, it voluntarily
65 relinquishes its role as sync site.
67 While attempting to nominate a new sync site, certain rules apply. First,
68 a server can not reply "ok" (return 1 from ServBeacon) to two different
69 hosts in less than BIGTIME seconds; this allows a server that has heard
70 affirmative replies from a majority of the servers to know that no other
71 server in the network has heard enough affirmative replies in the last
72 BIGTIME seconds to become sync site, too. The variables ubik_lastYesTime
73 and lastYesHost are used by all servers to keep track of which host they
74 have last replied affirmatively to, when queried by a potential new sync
77 Once a sync site has become a sync site, it periodically sends beacon
78 messages with a parameter of 1, indicating that it already has determined
79 it is supposed to be the sync site. The servers treat such a message as a
80 guarantee that no other site will become sync site for the next SMALLTIME
81 seconds. In the interim, these servers can answer a query concerning which
82 site is the sync site without any communication with any server. The
83 variables lastBeaconArrival and lastBeaconHost are used by all servers to
84 keep track of which sync site has last contacted them.
86 One complication occurs while nominating a new sync site: each site may be
87 trying to nominate a different site (based on the value of lastYesHost),
88 yet we must nominate the smallest host (under some order), to prevent this
89 process from looping. The process could loop by having each server give
90 one vote to another server, but with no server getting a majority of the
91 votes. To avoid this, we try to withhold our votes for the server with the
92 lowest internet address (an easy-to-generate order). To this effect, we
93 keep track (in lowestTime and lowestHost) of the lowest server trying to
94 become a sync site. We wait for this server unless there is already a sync
95 site (indicated by ServBeacon's parameter being 1).
98 afs_int32 ubik_debugFlag = 0; /* print out debugging messages? */
100 /* these statics are used by all sites in nominating new sync sites */
101 afs_int32 ubik_lastYesTime = 0; /* time we sent the last *yes* vote */
102 static afs_uint32 lastYesHost = 0xffffffff; /* host to which we sent *yes* vote */
103 /* Next is time sync site began this vote: guarantees sync site until this + SMALLTIME */
104 static afs_int32 lastYesClaim = 0;
105 static int lastYesState = 0; /* did last site we voted for claim to be sync site? */
107 /* used to guarantee that nomination process doesn't loop */
108 static afs_int32 lowestTime = 0;
109 static afs_uint32 lowestHost = 0xffffffff;
110 static afs_int32 syncTime = 0;
111 static afs_int32 syncHost = 0;
113 /* used to remember which dbase version is the one at the sync site (for non-sync sites */
114 struct ubik_version ubik_dbVersion; /* sync site's dbase version */
115 struct ubik_tid ubik_dbTid; /* sync site's tid, or 0 if none */
117 /* decide if we should try to become sync site. The basic rule is that we
118 * don't run if there is a valid sync site and it ain't us (we have to run if
119 * it is us, in order to keep our votes). If there is no sync site, then we
120 * want to run if we're the lowest numbered host running, otherwise we defer to
121 * the lowest host. However, if the lowest host hasn't been heard from for a
122 * while, then we start running again, in case he crashed.
124 * This function returns true if we should run, and false otherwise.
127 uvote_ShouldIRun(void)
129 register afs_int32 now;
131 now = FT_ApproxTime();
132 if (BIGTIME + ubik_lastYesTime < now)
133 return 1; /* no valid guy even trying */
134 if (lastYesState && lastYesHost != ubik_host[0])
135 return 0; /* other guy is sync site, leave him alone */
136 if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0]))
137 return 0; /* if someone is valid and better than us, don't run */
138 /* otherwise we should run */
142 /* Return the current synchronization site, if any. Simple approach: if the
143 * last guy we voted yes for claims to be the sync site, then we we're happy to
144 * use that guy for a sync site until the time his mandate expires. If the guy
145 * does not claim to be sync site, then, of course, there's none.
147 * In addition, if we lost the sync, we set urecovery_syncSite to an invalid
148 * value, indicating that we no longer know which version of the dbase is the
149 * one we should have. We'll get a new one when we next hear from the sync
152 * This function returns 0 or currently valid sync site. It can return our own
153 * address, if we're the sync site.
156 uvote_GetSyncSite(void)
158 register afs_int32 now;
159 register afs_int32 code;
164 now = FT_ApproxTime();
165 if (SMALLTIME + lastYesClaim < now)
166 code = 0; /* last guy timed out */
173 /* called by the sync site to handle vote beacons; if aconn is null, this is a
174 * local call; returns 0 or time whe the vote was sent. It returns 0 if we are
175 * not voting for this sync site, or the time we actually voted yes, if
179 SVOTE_Beacon(register struct rx_call * rxcall, afs_int32 astate,
180 afs_int32 astart, struct ubik_version * avers,
181 struct ubik_tid * atid)
183 register afs_int32 otherHost;
184 register afs_int32 now;
186 struct rx_connection *aconn;
188 struct ubik_server *ts;
191 now = FT_ApproxTime(); /* close to current time */
192 if (rxcall) { /* caller's host */
193 aconn = rx_ConnectionOf(rxcall);
194 rxp = rx_PeerOf(aconn);
195 otherHost = rx_HostOf(rxp);
197 /* get the primary interface address for this host. */
198 /* This is the identifier that ubik uses. */
199 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
201 ubik_dprint("Received beacon from unknown host %s\n",
202 afs_inet_ntoa(rx_HostOf(rxp)));
203 return 0; /* I don't know about you: vote no */
205 for (ts = ubik_servers; ts; ts = ts->next) {
206 if (ts->addr[0] == otherHost)
210 ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
211 if (ts && ts->isClone)
214 otherHost = ubik_host[0]; /* this host */
218 ubik_dprint("Received beacon type %d from host %s\n", astate,
219 afs_inet_ntoa(otherHost));
221 /* compute the lowest server we've heard from. We'll try to only vote for
222 * this dude if we don't already have a synchronization site. Also, don't
223 * let a very old lowestHost confusing things forever. We pick a new
224 * lowestHost after BIGTIME seconds to limit the damage if this host
225 * actually crashes. Finally, we also count in this computation: don't
226 * pick someone else if we're even better!
228 * Note that the test below must be <=, not <, so that we keep refreshing
229 * lowestTime. Otherwise it will look like we haven't heard from
230 * lowestHost in a while and another host could slip in. */
233 /* First compute the lowest host we've heard from, whether we want them
234 * for a sync site or not. If we haven't heard from a site in BIGTIME
235 * seconds, we ignore its presence in lowestHost: it may have crashed.
236 * Note that we don't ever let anyone appear in our lowestHost if we're
237 * lower than them, 'cause we know we're up. */
238 /* But do not consider clones for lowesHost since they never may become
241 && (ntohl((afs_uint32) otherHost) <= ntohl((afs_uint32) lowestHost)
242 || lowestTime + BIGTIME < now)) {
244 lowestHost = otherHost;
246 /* why do we need this next check? Consider the case where each of two
247 * servers decides the other is lowestHost. Each stops sending beacons
248 * 'cause the other is there. Not obvious that this process terminates:
249 * i.e. each guy could restart procedure and again think other side is
250 * lowest. Need to prove: if one guy in the system is lowest and knows
251 * he's lowest, these loops don't occur. because if someone knows he's
252 * lowest, he will send out beacons telling others to vote for him. */
254 && (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32) lowestHost)
255 || lowestTime + BIGTIME < now)) {
257 lowestHost = ubik_host[0];
260 /* tell if we've heard from a sync site recently (even if we're not voting
261 * for this dude yet). After a while, time the guy out. */
262 if (astate) { /* this guy is a sync site */
263 syncHost = otherHost;
265 } else if (syncTime + BIGTIME < now) {
268 ("Ubik: Lost contact with sync-site %d.%d.%d.%d (NOT in quorum)\n",
269 ((syncHost >> 24) & 0xff), ((syncHost >> 16) & 0xff),
270 ((syncHost >> 8) & 0xff), (syncHost & 0xff));
275 /* decide how to vote */
276 vote = 0; /* start off voting no */
278 /* if we this guy isn't a sync site, we don't really have to vote for him.
279 * We get to apply some heuristics to try to avoid weird oscillation sates
280 * in the voting procedure. */
282 /* in here only if this guy doesn't claim to be a sync site */
284 /* lowestHost is also trying for our votes, then just say no. */
285 if (ntohl(lowestHost) != ntohl(otherHost)) {
289 /* someone else *is* a sync site, just say no */
290 if (syncHost && syncHost != otherHost)
292 } else /* fast startup if this is the only non-clone */ if (lastYesHost ==
299 for (ts = ubik_servers; ts; ts = ts->next) {
300 if (ts->addr[0] == otherHost)
306 lastYesHost = otherHost;
311 return 0; /* clone never can become sync site */
313 /* Don't promise sync site support to more than one host every BIGTIME
314 * seconds. This is the heart of our invariants in this system. */
315 if (ubik_lastYesTime + BIGTIME < now || otherHost == lastYesHost) {
316 if ((ubik_lastYesTime + BIGTIME < now) || (otherHost != lastYesHost)
317 || (lastYesState != astate)) {
318 /* A new vote or a change in the vote or changed quorum */
319 ubik_dprint("Ubik: vote 'yes' for %s %s\n",
320 afs_inet_ntoa(otherHost),
321 (astate ? "(in quorum)" : "(NOT in quorum)"));
324 vote = now; /* vote yes */
325 ubik_lastYesTime = now; /* remember when we voted yes */
326 lastYesClaim = astart; /* remember for computing when sync site expires */
327 lastYesHost = otherHost; /* and who for */
328 lastYesState = astate; /* remember if site is a sync site */
329 ubik_dbVersion = *avers; /* resync value */
330 ubik_dbTid = *atid; /* transaction id, if any, of active trans */
331 urecovery_CheckTid(atid); /* check if current write trans needs aborted */
336 /* handle per-server debug command, where 0 is the first server. Basic network
339 SVOTE_SDebug(struct rx_call * rxcall, afs_int32 awhich,
340 register struct ubik_sdebug * aparm)
342 afs_int32 code, isClone;
343 code = SVOTE_XSDebug(rxcall, awhich, aparm, &isClone);
348 SVOTE_XSDebug(struct rx_call * rxcall, afs_int32 awhich,
349 register struct ubik_sdebug * aparm, afs_int32 * isclone)
351 register struct ubik_server *ts;
353 for (ts = ubik_servers; ts; ts = ts->next) {
356 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
357 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR - 1; i++)
358 aparm->altAddr[i] = ntohl(ts->addr[i + 1]);
359 aparm->lastVoteTime = ts->lastVoteTime;
360 aparm->lastBeaconSent = ts->lastBeaconSent;
361 memcpy(&aparm->remoteVersion, &ts->version,
362 sizeof(struct ubik_version));
363 aparm->lastVote = ts->lastVote;
365 aparm->beaconSinceDown = ts->beaconSinceDown;
366 aparm->currentDB = ts->currentDB;
367 *isclone = ts->isClone;
375 SVOTE_XDebug(struct rx_call * rxcall, register struct ubik_debug * aparm,
380 code = SVOTE_Debug(rxcall, aparm);
385 /* handle basic network debug command. This is the global state dumper */
387 SVOTE_Debug(struct rx_call * rxcall, register struct ubik_debug * aparm)
390 /* fill in the basic debug structure. Note the the RPC protocol transfers,
391 * integers in host order. */
393 aparm->now = FT_ApproxTime();
394 aparm->lastYesTime = ubik_lastYesTime;
395 aparm->lastYesHost = ntohl(lastYesHost);
396 aparm->lastYesState = lastYesState;
397 aparm->lastYesClaim = lastYesClaim;
398 aparm->lowestHost = ntohl(lowestHost);
399 aparm->lowestTime = lowestTime;
400 aparm->syncHost = ntohl(syncHost);
401 aparm->syncTime = syncTime;
403 /* fill in all interface addresses of myself in hostbyte order */
404 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR; i++)
405 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
407 aparm->amSyncSite = ubik_amSyncSite;
408 ubeacon_Debug(aparm);
414 /* Get the recovery state. The label of the database may not have
415 * been written yet but set the flag so udebug behavior remains.
418 aparm->recoveryState = urecovery_state;
419 if ((urecovery_state & UBIK_RECSYNCSITE)
420 && (urecovery_state & UBIK_RECFOUNDDB)
421 && (urecovery_state & UBIK_RECHAVEDB)) {
422 aparm->recoveryState |= UBIK_RECLABELDB;
424 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
425 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
426 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
427 aparm->tidCounter = ubik_dbase->tidCounter;
429 if (ubik_currentTrans) {
430 aparm->currentTrans = 1;
431 if (ubik_currentTrans->type == UBIK_WRITETRANS)
432 aparm->writeTrans = 1;
434 aparm->writeTrans = 0;
436 aparm->currentTrans = 0;
439 aparm->epochTime = ubik_epochTime;
445 SVOTE_SDebugOld(struct rx_call * rxcall, afs_int32 awhich,
446 register struct ubik_sdebug_old * aparm)
448 register struct ubik_server *ts;
450 for (ts = ubik_servers; ts; ts = ts->next) {
453 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
454 aparm->lastVoteTime = ts->lastVoteTime;
455 aparm->lastBeaconSent = ts->lastBeaconSent;
456 memcpy(&aparm->remoteVersion, &ts->version,
457 sizeof(struct ubik_version));
458 aparm->lastVote = ts->lastVote;
460 aparm->beaconSinceDown = ts->beaconSinceDown;
461 aparm->currentDB = ts->currentDB;
469 /* handle basic network debug command. This is the global state dumper */
471 SVOTE_DebugOld(struct rx_call * rxcall,
472 register struct ubik_debug_old * aparm)
475 /* fill in the basic debug structure. Note the the RPC protocol transfers,
476 * integers in host order. */
478 aparm->now = FT_ApproxTime();
479 aparm->lastYesTime = ubik_lastYesTime;
480 aparm->lastYesHost = ntohl(lastYesHost);
481 aparm->lastYesState = lastYesState;
482 aparm->lastYesClaim = lastYesClaim;
483 aparm->lowestHost = ntohl(lowestHost);
484 aparm->lowestTime = lowestTime;
485 aparm->syncHost = ntohl(syncHost);
486 aparm->syncTime = syncTime;
488 aparm->amSyncSite = ubik_amSyncSite;
489 ubeacon_Debug(aparm);
495 /* Get the recovery state. The label of the database may not have
496 * been written yet but set the flag so udebug behavior remains.
499 aparm->recoveryState = urecovery_state;
500 if ((urecovery_state & UBIK_RECSYNCSITE)
501 && (urecovery_state & UBIK_RECFOUNDDB)
502 && (urecovery_state & UBIK_RECHAVEDB)) {
503 aparm->recoveryState |= UBIK_RECLABELDB;
505 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
506 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
507 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
508 aparm->tidCounter = ubik_dbase->tidCounter;
510 if (ubik_currentTrans) {
511 aparm->currentTrans = 1;
512 if (ubik_currentTrans->type == UBIK_WRITETRANS)
513 aparm->writeTrans = 1;
515 aparm->writeTrans = 0;
517 aparm->currentTrans = 0;
520 aparm->epochTime = ubik_epochTime;
526 /* get the sync site; called by remote servers to find where they should go */
528 SVOTE_GetSyncSite(register struct rx_call * rxcall,
529 register afs_int32 * ahost)
531 register afs_int32 temp;
533 temp = uvote_GetSyncSite();
534 *ahost = ntohl(temp);
539 ubik_dprint(char *a, char *b, char *c, char *d, char *e, char *f, char *g,
542 ViceLog(5, (a, b, c, d, e, f, g, h));
547 ubik_print(char *a, char *b, char *c, char *d, char *e, char *f, char *g,
550 ViceLog(0, (a, b, c, d, e, f, g, h));
554 /* called once/run to init the vote module */
558 /* pretend we just voted for someone else, since we just restarted */
559 ubik_lastYesTime = FT_ApproxTime();