2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
16 #include <sys/types.h>
21 #include <netinet/in.h>
23 #include <afs/afsutil.h>
34 #include <afs/afsutil.h>
37 #define UBIK_INTERNALS
43 The goal is to provide reliable operation among N servers, such that any
44 server can crash with the remaining servers continuing operation within a
45 short period of time. While a *short* outage is acceptable, this time
46 should be order of 3 minutes or less.
50 Note: SMALLTIME and BIGTIME are essentially the same time value, separated
51 only by the clock skew, MAXSKEW. In general, if you are making guarantees
52 for someone else, promise them no more than SMALLTIME seconds of whatever
53 invariant you provide. If you are waiting to be sure some invariant is now
54 *false*, wait at least BIGTIME seconds to be sure that SMALLTIME seconds
55 has passed at the other site.
57 Now, back to the design:
58 One site in the collection is a special site, designated the *sync* site.
59 The sync site sends periodic messages, which can be thought of as
60 keep-alive messages. When a non-sync site hears from the sync site, it
61 knows that it is getting updates for the next SMALLTIME seconds from that
64 If a server does not hear from the sync site in SMALLTIME seconds, it
65 determines that it no longer is getting updates, and thus refuses to give
66 out potentially out-of-date data. If a sync site can not muster a majority
67 of servers to agree that it is the sync site, then there is a possibility
68 that a network partition has occurred, allowing another server to claim to
69 be the sync site. Thus, any time that the sync site has not heard from a
70 majority of the servers in the last SMALLTIME seconds, it voluntarily
71 relinquishes its role as sync site.
73 While attempting to nominate a new sync site, certain rules apply. First,
74 a server can not reply "ok" (return 1 from ServBeacon) to two different
75 hosts in less than BIGTIME seconds; this allows a server that has heard
76 affirmative replies from a majority of the servers to know that no other
77 server in the network has heard enough affirmative replies in the last
78 BIGTIME seconds to become sync site, too. The variables ubik_lastYesTime
79 and lastYesHost are used by all servers to keep track of which host they
80 have last replied affirmatively to, when queried by a potential new sync
83 Once a sync site has become a sync site, it periodically sends beacon
84 messages with a parameter of 1, indicating that it already has determined
85 it is supposed to be the sync site. The servers treat such a message as a
86 guarantee that no other site will become sync site for the next SMALLTIME
87 seconds. In the interim, these servers can answer a query concerning which
88 site is the sync site without any communication with any server. The
89 variables lastBeaconArrival and lastBeaconHost are used by all servers to
90 keep track of which sync site has last contacted them.
92 One complication occurs while nominating a new sync site: each site may be
93 trying to nominate a different site (based on the value of lastYesHost),
94 yet we must nominate the smallest host (under some order), to prevent this
95 process from looping. The process could loop by having each server give
96 one vote to another server, but with no server getting a majority of the
97 votes. To avoid this, we try to withhold our votes for the server with the
98 lowest internet address (an easy-to-generate order). To this effect, we
99 keep track (in lowestTime and lowestHost) of the lowest server trying to
100 become a sync site. We wait for this server unless there is already a sync
101 site (indicated by ServBeacon's parameter being 1).
104 afs_int32 ubik_debugFlag = 0; /* print out debugging messages? */
106 /* these statics are used by all sites in nominating new sync sites */
107 afs_int32 ubik_lastYesTime = 0; /* time we sent the last *yes* vote */
108 static afs_uint32 lastYesHost = 0xffffffff; /* host to which we sent *yes* vote */
109 /* Next is time sync site began this vote: guarantees sync site until this + SMALLTIME */
110 static afs_int32 lastYesClaim = 0;
111 static int lastYesState = 0; /* did last site we voted for claim to be sync site? */
113 /* used to guarantee that nomination process doesn't loop */
114 static afs_int32 lowestTime = 0;
115 static afs_uint32 lowestHost = 0xffffffff;
116 static afs_int32 syncTime = 0;
117 static afs_int32 syncHost = 0;
119 /* used to remember which dbase version is the one at the sync site (for non-sync sites */
120 struct ubik_version ubik_dbVersion; /* sync site's dbase version */
121 struct ubik_tid ubik_dbTid; /* sync site's tid, or 0 if none */
123 /* decide if we should try to become sync site. The basic rule is that we
124 * don't run if there is a valid sync site and it ain't us (we have to run if
125 * it is us, in order to keep our votes). If there is no sync site, then we
126 * want to run if we're the lowest numbered host running, otherwise we defer to
127 * the lowest host. However, if the lowest host hasn't been heard from for a
128 * while, then we start running again, in case he crashed.
130 * This function returns true if we should run, and false otherwise.
133 uvote_ShouldIRun(void)
135 register afs_int32 now;
137 now = FT_ApproxTime();
138 if (BIGTIME + ubik_lastYesTime < now)
139 return 1; /* no valid guy even trying */
140 if (lastYesState && lastYesHost != ubik_host[0])
141 return 0; /* other guy is sync site, leave him alone */
142 if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0]))
143 return 0; /* if someone is valid and better than us, don't run */
144 /* otherwise we should run */
148 /* Return the current synchronization site, if any. Simple approach: if the
149 * last guy we voted yes for claims to be the sync site, then we we're happy to
150 * use that guy for a sync site until the time his mandate expires. If the guy
151 * does not claim to be sync site, then, of course, there's none.
153 * In addition, if we lost the sync, we set urecovery_syncSite to an invalid
154 * value, indicating that we no longer know which version of the dbase is the
155 * one we should have. We'll get a new one when we next hear from the sync
158 * This function returns 0 or currently valid sync site. It can return our own
159 * address, if we're the sync site.
162 uvote_GetSyncSite(void)
164 register afs_int32 now;
165 register afs_int32 code;
170 now = FT_ApproxTime();
171 if (SMALLTIME + lastYesClaim < now)
172 code = 0; /* last guy timed out */
179 /* called by the sync site to handle vote beacons; if aconn is null, this is a
180 * local call; returns 0 or time whe the vote was sent. It returns 0 if we are
181 * not voting for this sync site, or the time we actually voted yes, if
185 SVOTE_Beacon(register struct rx_call * rxcall, afs_int32 astate,
186 afs_int32 astart, struct ubik_version * avers,
187 struct ubik_tid * atid)
189 register afs_int32 otherHost;
190 register afs_int32 now;
192 struct rx_connection *aconn;
194 struct ubik_server *ts;
197 now = FT_ApproxTime(); /* close to current time */
198 if (rxcall) { /* caller's host */
199 aconn = rx_ConnectionOf(rxcall);
200 rxp = rx_PeerOf(aconn);
201 otherHost = rx_HostOf(rxp);
203 /* get the primary interface address for this host. */
204 /* This is the identifier that ubik uses. */
205 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
207 ubik_dprint("Received beacon from unknown host %s\n",
208 afs_inet_ntoa(rx_HostOf(rxp)));
209 return 0; /* I don't know about you: vote no */
211 for (ts = ubik_servers; ts; ts = ts->next) {
212 if (ts->addr[0] == otherHost)
216 ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
217 if (ts && ts->isClone)
220 otherHost = ubik_host[0]; /* this host */
224 ubik_dprint("Received beacon type %d from host %s\n", astate,
225 afs_inet_ntoa(otherHost));
227 /* compute the lowest server we've heard from. We'll try to only vote for
228 * this dude if we don't already have a synchronization site. Also, don't
229 * let a very old lowestHost confusing things forever. We pick a new
230 * lowestHost after BIGTIME seconds to limit the damage if this host
231 * actually crashes. Finally, we also count in this computation: don't
232 * pick someone else if we're even better!
234 * Note that the test below must be <=, not <, so that we keep refreshing
235 * lowestTime. Otherwise it will look like we haven't heard from
236 * lowestHost in a while and another host could slip in. */
239 /* First compute the lowest host we've heard from, whether we want them
240 * for a sync site or not. If we haven't heard from a site in BIGTIME
241 * seconds, we ignore its presence in lowestHost: it may have crashed.
242 * Note that we don't ever let anyone appear in our lowestHost if we're
243 * lower than them, 'cause we know we're up. */
244 /* But do not consider clones for lowesHost since they never may become
247 && (ntohl((afs_uint32) otherHost) <= ntohl((afs_uint32) lowestHost)
248 || lowestTime + BIGTIME < now)) {
250 lowestHost = otherHost;
252 /* why do we need this next check? Consider the case where each of two
253 * servers decides the other is lowestHost. Each stops sending beacons
254 * 'cause the other is there. Not obvious that this process terminates:
255 * i.e. each guy could restart procedure and again think other side is
256 * lowest. Need to prove: if one guy in the system is lowest and knows
257 * he's lowest, these loops don't occur. because if someone knows he's
258 * lowest, he will send out beacons telling others to vote for him. */
260 && (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32) lowestHost)
261 || lowestTime + BIGTIME < now)) {
263 lowestHost = ubik_host[0];
266 /* tell if we've heard from a sync site recently (even if we're not voting
267 * for this dude yet). After a while, time the guy out. */
268 if (astate) { /* this guy is a sync site */
269 syncHost = otherHost;
271 } else if (syncTime + BIGTIME < now) {
274 ("Ubik: Lost contact with sync-site %d.%d.%d.%d (NOT in quorum)\n",
275 ((syncHost >> 24) & 0xff), ((syncHost >> 16) & 0xff),
276 ((syncHost >> 8) & 0xff), (syncHost & 0xff));
281 /* decide how to vote */
282 vote = 0; /* start off voting no */
284 /* if we this guy isn't a sync site, we don't really have to vote for him.
285 * We get to apply some heuristics to try to avoid weird oscillation sates
286 * in the voting procedure. */
288 /* in here only if this guy doesn't claim to be a sync site */
290 /* lowestHost is also trying for our votes, then just say no. */
291 if (ntohl(lowestHost) != ntohl(otherHost)) {
295 /* someone else *is* a sync site, just say no */
296 if (syncHost && syncHost != otherHost)
298 } else /* fast startup if this is the only non-clone */ if (lastYesHost ==
305 for (ts = ubik_servers; ts; ts = ts->next) {
306 if (ts->addr[0] == otherHost)
312 lastYesHost = otherHost;
317 return 0; /* clone never can become sync site */
319 /* Don't promise sync site support to more than one host every BIGTIME
320 * seconds. This is the heart of our invariants in this system. */
321 if (ubik_lastYesTime + BIGTIME < now || otherHost == lastYesHost) {
322 if ((ubik_lastYesTime + BIGTIME < now) || (otherHost != lastYesHost)
323 || (lastYesState != astate)) {
324 /* A new vote or a change in the vote or changed quorum */
325 ubik_dprint("Ubik: vote 'yes' for %s %s\n",
326 afs_inet_ntoa(otherHost),
327 (astate ? "(in quorum)" : "(NOT in quorum)"));
330 vote = now; /* vote yes */
331 ubik_lastYesTime = now; /* remember when we voted yes */
332 lastYesClaim = astart; /* remember for computing when sync site expires */
333 lastYesHost = otherHost; /* and who for */
334 lastYesState = astate; /* remember if site is a sync site */
335 ubik_dbVersion = *avers; /* resync value */
336 ubik_dbTid = *atid; /* transaction id, if any, of active trans */
337 urecovery_CheckTid(atid); /* check if current write trans needs aborted */
342 /* handle per-server debug command, where 0 is the first server. Basic network
345 SVOTE_SDebug(struct rx_call * rxcall, afs_int32 awhich,
346 register struct ubik_sdebug * aparm)
348 afs_int32 code, isClone;
349 code = SVOTE_XSDebug(rxcall, awhich, aparm, &isClone);
354 SVOTE_XSDebug(struct rx_call * rxcall, afs_int32 awhich,
355 register struct ubik_sdebug * aparm, afs_int32 * isclone)
357 register struct ubik_server *ts;
359 for (ts = ubik_servers; ts; ts = ts->next) {
362 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
363 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR - 1; i++)
364 aparm->altAddr[i] = ntohl(ts->addr[i + 1]);
365 aparm->lastVoteTime = ts->lastVoteTime;
366 aparm->lastBeaconSent = ts->lastBeaconSent;
367 memcpy(&aparm->remoteVersion, &ts->version,
368 sizeof(struct ubik_version));
369 aparm->lastVote = ts->lastVote;
371 aparm->beaconSinceDown = ts->beaconSinceDown;
372 aparm->currentDB = ts->currentDB;
373 *isclone = ts->isClone;
381 SVOTE_XDebug(struct rx_call * rxcall, register struct ubik_debug * aparm,
386 code = SVOTE_Debug(rxcall, aparm);
391 /* handle basic network debug command. This is the global state dumper */
393 SVOTE_Debug(struct rx_call * rxcall, register struct ubik_debug * aparm)
396 /* fill in the basic debug structure. Note the the RPC protocol transfers,
397 * integers in host order. */
399 aparm->now = FT_ApproxTime();
400 aparm->lastYesTime = ubik_lastYesTime;
401 aparm->lastYesHost = ntohl(lastYesHost);
402 aparm->lastYesState = lastYesState;
403 aparm->lastYesClaim = lastYesClaim;
404 aparm->lowestHost = ntohl(lowestHost);
405 aparm->lowestTime = lowestTime;
406 aparm->syncHost = ntohl(syncHost);
407 aparm->syncTime = syncTime;
409 /* fill in all interface addresses of myself in hostbyte order */
410 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR; i++)
411 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
413 aparm->amSyncSite = ubik_amSyncSite;
414 ubeacon_Debug(aparm);
420 /* Get the recovery state. The label of the database may not have
421 * been written yet but set the flag so udebug behavior remains.
424 aparm->recoveryState = urecovery_state;
425 if ((urecovery_state & UBIK_RECSYNCSITE)
426 && (urecovery_state & UBIK_RECFOUNDDB)
427 && (urecovery_state & UBIK_RECHAVEDB)) {
428 aparm->recoveryState |= UBIK_RECLABELDB;
430 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
431 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
432 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
433 aparm->tidCounter = ubik_dbase->tidCounter;
435 if (ubik_currentTrans) {
436 aparm->currentTrans = 1;
437 if (ubik_currentTrans->type == UBIK_WRITETRANS)
438 aparm->writeTrans = 1;
440 aparm->writeTrans = 0;
442 aparm->currentTrans = 0;
445 aparm->epochTime = ubik_epochTime;
451 SVOTE_SDebugOld(struct rx_call * rxcall, afs_int32 awhich,
452 register struct ubik_sdebug_old * aparm)
454 register struct ubik_server *ts;
456 for (ts = ubik_servers; ts; ts = ts->next) {
459 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
460 aparm->lastVoteTime = ts->lastVoteTime;
461 aparm->lastBeaconSent = ts->lastBeaconSent;
462 memcpy(&aparm->remoteVersion, &ts->version,
463 sizeof(struct ubik_version));
464 aparm->lastVote = ts->lastVote;
466 aparm->beaconSinceDown = ts->beaconSinceDown;
467 aparm->currentDB = ts->currentDB;
475 /* handle basic network debug command. This is the global state dumper */
477 SVOTE_DebugOld(struct rx_call * rxcall,
478 register struct ubik_debug_old * aparm)
481 /* fill in the basic debug structure. Note the the RPC protocol transfers,
482 * integers in host order. */
484 aparm->now = FT_ApproxTime();
485 aparm->lastYesTime = ubik_lastYesTime;
486 aparm->lastYesHost = ntohl(lastYesHost);
487 aparm->lastYesState = lastYesState;
488 aparm->lastYesClaim = lastYesClaim;
489 aparm->lowestHost = ntohl(lowestHost);
490 aparm->lowestTime = lowestTime;
491 aparm->syncHost = ntohl(syncHost);
492 aparm->syncTime = syncTime;
494 aparm->amSyncSite = ubik_amSyncSite;
495 ubeacon_Debug(aparm);
501 /* Get the recovery state. The label of the database may not have
502 * been written yet but set the flag so udebug behavior remains.
505 aparm->recoveryState = urecovery_state;
506 if ((urecovery_state & UBIK_RECSYNCSITE)
507 && (urecovery_state & UBIK_RECFOUNDDB)
508 && (urecovery_state & UBIK_RECHAVEDB)) {
509 aparm->recoveryState |= UBIK_RECLABELDB;
511 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
512 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
513 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
514 aparm->tidCounter = ubik_dbase->tidCounter;
516 if (ubik_currentTrans) {
517 aparm->currentTrans = 1;
518 if (ubik_currentTrans->type == UBIK_WRITETRANS)
519 aparm->writeTrans = 1;
521 aparm->writeTrans = 0;
523 aparm->currentTrans = 0;
526 aparm->epochTime = ubik_epochTime;
532 /* get the sync site; called by remote servers to find where they should go */
534 SVOTE_GetSyncSite(register struct rx_call * rxcall,
535 register afs_int32 * ahost)
537 register afs_int32 temp;
539 temp = uvote_GetSyncSite();
540 *ahost = ntohl(temp);
545 ubik_dprint(char *a, char *b, char *c, char *d, char *e, char *f, char *g,
548 ViceLog(5, (a, b, c, d, e, f, g, h));
553 ubik_print(char *a, char *b, char *c, char *d, char *e, char *f, char *g,
556 ViceLog(0, (a, b, c, d, e, f, g, h));
560 /* called once/run to init the vote module */
564 /* pretend we just voted for someone else, since we just restarted */
565 ubik_lastYesTime = FT_ApproxTime();