2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
20 #include <netinet/in.h>
22 #include <afs/afsutil.h>
33 #include <afs/afsutil.h>
36 #define UBIK_INTERNALS
42 The goal is to provide reliable operation among N servers, such that any
43 server can crash with the remaining servers continuing operation within a
44 short period of time. While a *short* outage is acceptable, this time
45 should be order of 3 minutes or less.
49 Note: SMALLTIME and BIGTIME are essentially the same time value, separated
50 only by the clock skew, MAXSKEW. In general, if you are making guarantees
51 for someone else, promise them no more than SMALLTIME seconds of whatever
52 invariant you provide. If you are waiting to be sure some invariant is now
53 *false*, wait at least BIGTIME seconds to be sure that SMALLTIME seconds
54 has passed at the other site.
56 Now, back to the design:
57 One site in the collection is a special site, designated the *sync* site.
58 The sync site sends periodic messages, which can be thought of as
59 keep-alive messages. When a non-sync site hears from the sync site, it
60 knows that it is getting updates for the next SMALLTIME seconds from that
63 If a server does not hear from the sync site in SMALLTIME seconds, it
64 determines that it no longer is getting updates, and thus refuses to give
65 out potentially out-of-date data. If a sync site can not muster a majority
66 of servers to agree that it is the sync site, then there is a possibility
67 that a network partition has occurred, allowing another server to claim to
68 be the sync site. Thus, any time that the sync site has not heard from a
69 majority of the servers in the last SMALLTIME seconds, it voluntarily
70 relinquishes its role as sync site.
72 While attempting to nominate a new sync site, certain rules apply. First,
73 a server can not reply "ok" (return 1 from ServBeacon) to two different
74 hosts in less than BIGTIME seconds; this allows a server that has heard
75 affirmative replies from a majority of the servers to know that no other
76 server in the network has heard enough affirmative replies in the last
77 BIGTIME seconds to become sync site, too. The variables ubik_lastYesTime
78 and lastYesHost are used by all servers to keep track of which host they
79 have last replied affirmatively to, when queried by a potential new sync
82 Once a sync site has become a sync site, it periodically sends beacon
83 messages with a parameter of 1, indicating that it already has determined
84 it is supposed to be the sync site. The servers treat such a message as a
85 guarantee that no other site will become sync site for the next SMALLTIME
86 seconds. In the interim, these servers can answer a query concerning which
87 site is the sync site without any communication with any server. The
88 variables lastBeaconArrival and lastBeaconHost are used by all servers to
89 keep track of which sync site has last contacted them.
91 One complication occurs while nominating a new sync site: each site may be
92 trying to nominate a different site (based on the value of lastYesHost),
93 yet we must nominate the smallest host (under some order), to prevent this
94 process from looping. The process could loop by having each server give
95 one vote to another server, but with no server getting a majority of the
96 votes. To avoid this, we try to withhold our votes for the server with the
97 lowest internet address (an easy-to-generate order). To this effect, we
98 keep track (in lowestTime and lowestHost) of the lowest server trying to
99 become a sync site. We wait for this server unless there is already a sync
100 site (indicated by ServBeacon's parameter being 1).
103 afs_int32 ubik_debugFlag = 0; /* print out debugging messages? */
105 /* these statics are used by all sites in nominating new sync sites */
106 afs_int32 ubik_lastYesTime = 0; /* time we sent the last *yes* vote */
107 static afs_uint32 lastYesHost = 0xffffffff; /* host to which we sent *yes* vote */
108 /* Next is time sync site began this vote: guarantees sync site until this + SMALLTIME */
109 static afs_int32 lastYesClaim = 0;
110 static int lastYesState = 0; /* did last site we voted for claim to be sync site? */
112 /* used to guarantee that nomination process doesn't loop */
113 static afs_int32 lowestTime = 0;
114 static afs_uint32 lowestHost = 0xffffffff;
115 static afs_int32 syncTime = 0;
116 static afs_int32 syncHost = 0;
118 /* used to remember which dbase version is the one at the sync site (for non-sync sites */
119 struct ubik_version ubik_dbVersion; /* sync site's dbase version */
120 struct ubik_tid ubik_dbTid; /* sync site's tid, or 0 if none */
122 /* decide if we should try to become sync site. The basic rule is that we
123 * don't run if there is a valid sync site and it ain't us (we have to run if
124 * it is us, in order to keep our votes). If there is no sync site, then we
125 * want to run if we're the lowest numbered host running, otherwise we defer to
126 * the lowest host. However, if the lowest host hasn't been heard from for a
127 * while, then we start running again, in case he crashed.
129 * This function returns true if we should run, and false otherwise.
131 int uvote_ShouldIRun() {
132 register afs_int32 now;
134 now = FT_ApproxTime();
135 if (BIGTIME + ubik_lastYesTime < now) return 1; /* no valid guy even trying */
136 if (lastYesState && lastYesHost != ubik_host[0]) return 0; /* other guy is sync site, leave him alone */
137 if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0]))
138 return 0; /* if someone is valid and better than us, don't run */
139 /* otherwise we should run */
143 /* Return the current synchronization site, if any. Simple approach: if the
144 * last guy we voted yes for claims to be the sync site, then we we're happy to
145 * use that guy for a sync site until the time his mandate expires. If the guy
146 * does not claim to be sync site, then, of course, there's none.
148 * In addition, if we lost the sync, we set urecovery_syncSite to an invalid
149 * value, indicating that we no longer know which version of the dbase is the
150 * one we should have. We'll get a new one when we next hear from the sync
153 * This function returns 0 or currently valid sync site. It can return our own
154 * address, if we're the sync site.
156 afs_int32 uvote_GetSyncSite() {
157 register afs_int32 now;
158 register afs_int32 code;
160 if (!lastYesState) code = 0;
162 now = FT_ApproxTime();
163 if (SMALLTIME + lastYesClaim < now) code = 0; /* last guy timed out */
164 else code = lastYesHost;
169 /* called by the sync site to handle vote beacons; if aconn is null, this is a
170 * local call; returns 0 or time whe the vote was sent. It returns 0 if we are
171 * not voting for this sync site, or the time we actually voted yes, if
174 afs_int32 SVOTE_Beacon(rxcall, astate, astart, avers, atid)
175 register struct rx_call *rxcall;
177 struct ubik_version *avers;
178 struct ubik_tid *atid;
181 register afs_int32 otherHost;
182 register afs_int32 now;
184 struct rx_connection *aconn;
186 struct ubik_server *ts;
189 now = FT_ApproxTime(); /* close to current time */
190 if (rxcall) { /* caller's host */
191 aconn = rx_ConnectionOf(rxcall);
192 rxp = rx_PeerOf(aconn);
193 otherHost = rx_HostOf(rxp);
195 /* get the primary interface address for this host. */
196 /* This is the identifier that ubik uses. */
197 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
200 ubik_dprint("Received beacon from unknown host %s\n",
201 afs_inet_ntoa(rx_HostOf(rxp)));
202 return 0; /* I don't know about you: vote no */
204 for (ts = ubik_servers; ts; ts = ts->next) {
205 if (ts->addr[0] == otherHost) break;
207 if (!ts) ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
208 if (ts && ts->isClone) isClone = 1;
210 otherHost = ubik_host[0]; /* this host */
214 ubik_dprint("Received beacon type %d from host %s\n", astate,
215 afs_inet_ntoa(otherHost));
217 /* compute the lowest server we've heard from. We'll try to only vote for
218 this dude if we don't already have a synchronization site. Also, don't
219 let a very old lowestHost confusing things forever. We pick a new
220 lowestHost after BIGTIME seconds to limit the damage if this host
221 actually crashes. Finally, we also count in this computation: don't
222 pick someone else if we're even better!
224 Note that the test below must be <=, not <, so that we keep refreshing
225 lowestTime. Otherwise it will look like we haven't heard from
226 lowestHost in a while and another host could slip in. */
229 /* First compute the lowest host we've heard from, whether we want them
230 for a sync site or not. If we haven't heard from a site in BIGTIME
231 seconds, we ignore its presence in lowestHost: it may have crashed.
232 Note that we don't ever let anyone appear in our lowestHost if we're
233 lower than them, 'cause we know we're up. */
234 /* But do not consider clones for lowesHost since they never may become
237 (ntohl((afs_uint32) otherHost) <= ntohl((afs_uint32) lowestHost)
238 || lowestTime + BIGTIME < now)) {
240 lowestHost = otherHost;
242 /* why do we need this next check? Consider the case where each of two
243 servers decides the other is lowestHost. Each stops sending beacons
244 'cause the other is there. Not obvious that this process terminates:
245 i.e. each guy could restart procedure and again think other side is
246 lowest. Need to prove: if one guy in the system is lowest and knows
247 he's lowest, these loops don't occur. because if someone knows he's
248 lowest, he will send out beacons telling others to vote for him. */
250 (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32) lowestHost)
251 || lowestTime + BIGTIME < now)) {
253 lowestHost = ubik_host[0];
256 /* tell if we've heard from a sync site recently (even if we're not voting
257 for this dude yet). After a while, time the guy out. */
258 if (astate) { /* this guy is a sync site */
259 syncHost = otherHost;
262 else if (syncTime + BIGTIME < now) {
264 ubik_dprint("Ubik: Lost contact with sync-site %d.%d.%d.%d (NOT in quorum)\n",
265 ((syncHost>>24)&0xff), ((syncHost>>16)&0xff),
266 ((syncHost>> 8)&0xff), (syncHost &0xff));
271 /* decide how to vote */
272 vote = 0; /* start off voting no */
274 /* if we this guy isn't a sync site, we don't really have to vote for him.
275 We get to apply some heuristics to try to avoid weird oscillation sates
276 in the voting procedure. */
278 /* in here only if this guy doesn't claim to be a sync site */
280 /* lowestHost is also trying for our votes, then just say no. */
281 if (ntohl(lowestHost) != ntohl(otherHost)) {
285 /* someone else *is* a sync site, just say no */
286 if (syncHost && syncHost != otherHost)
288 } else /* fast startup if this is the only non-clone */
289 if (lastYesHost == 0xffffffff && otherHost == ubik_host[0]) {
291 for (ts = ubik_servers; ts; ts = ts->next) {
292 if (ts->addr[0] == otherHost) continue;
293 if (!ts->isClone) i++;
295 if (!i) lastYesHost = otherHost;
299 if (isClone) return 0; /* clone never can become sync site */
301 /* Don't promise sync site support to more than one host every BIGTIME
302 seconds. This is the heart of our invariants in this system. */
303 if (ubik_lastYesTime + BIGTIME < now || otherHost == lastYesHost) {
304 if ((ubik_lastYesTime+BIGTIME < now) ||
305 (otherHost != lastYesHost) ||
306 (lastYesState != astate)) {
307 /* A new vote or a change in the vote or changed quorum */
308 ubik_dprint("Ubik: vote 'yes' for %s %s\n",
309 afs_inet_ntoa(otherHost),
310 (astate?"(in quorum)":"(NOT in quorum)"));
313 vote = now; /* vote yes */
314 ubik_lastYesTime = now; /* remember when we voted yes */
315 lastYesClaim = astart; /* remember for computing when sync site expires */
316 lastYesHost = otherHost; /* and who for */
317 lastYesState = astate; /* remember if site is a sync site */
318 ubik_dbVersion = *avers; /* resync value */
319 ubik_dbTid = *atid; /* transaction id, if any, of active trans */
320 urecovery_CheckTid(atid); /* check if current write trans needs aborted */
325 /* handle per-server debug command, where 0 is the first server. Basic network
327 afs_int32 SVOTE_SDebug(rxcall, awhich, aparm)
328 struct rx_call *rxcall;
330 register struct ubik_sdebug *aparm;
332 afs_int32 code, isClone;
333 code = SVOTE_XSDebug(rxcall, awhich, aparm, &isClone);
337 afs_int32 SVOTE_XSDebug(rxcall, awhich, aparm, isclone)
339 struct rx_call *rxcall;
341 register struct ubik_sdebug *aparm;
343 register struct ubik_server *ts;
345 for(ts=ubik_servers; ts; ts=ts->next) {
348 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
349 for ( i=0; i < UBIK_MAX_INTERFACE_ADDR-1; i++)
350 aparm->altAddr[i] = ntohl(ts->addr[i+1]);
351 aparm->lastVoteTime = ts->lastVoteTime;
352 aparm->lastBeaconSent = ts->lastBeaconSent;
353 memcpy(&aparm->remoteVersion, &ts->version, sizeof(struct ubik_version));
354 aparm->lastVote = ts->lastVote;
356 aparm->beaconSinceDown = ts->beaconSinceDown;
357 aparm->currentDB = ts->currentDB;
358 *isclone = ts->isClone;
365 afs_int32 SVOTE_XDebug(rxcall, aparm, isclone)
366 struct rx_call *rxcall;
367 register struct ubik_debug *aparm;
372 code = SVOTE_Debug(rxcall, aparm);
377 /* handle basic network debug command. This is the global state dumper */
378 afs_int32 SVOTE_Debug(rxcall, aparm)
379 struct rx_call *rxcall;
380 register struct ubik_debug *aparm; {
382 /* fill in the basic debug structure. Note the the RPC protocol transfers,
383 * integers in host order. */
385 aparm->now = FT_ApproxTime();
386 aparm->lastYesTime = ubik_lastYesTime;
387 aparm->lastYesHost = ntohl(lastYesHost);
388 aparm->lastYesState = lastYesState;
389 aparm->lastYesClaim = lastYesClaim;
390 aparm->lowestHost = ntohl(lowestHost);
391 aparm->lowestTime = lowestTime;
392 aparm->syncHost = ntohl(syncHost);
393 aparm->syncTime = syncTime;
395 /* fill in all interface addresses of myself in hostbyte order */
396 for ( i=0; i < UBIK_MAX_INTERFACE_ADDR; i++)
397 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
399 aparm->amSyncSite = ubik_amSyncSite;
400 ubeacon_Debug(aparm);
406 /* Get the recovery state. The label of the database may not have
407 * been written yet but set the flag so udebug behavior remains.
410 aparm->recoveryState = urecovery_state;
411 if ((urecovery_state & UBIK_RECSYNCSITE) &&
412 (urecovery_state & UBIK_RECFOUNDDB ) &&
413 (urecovery_state & UBIK_RECHAVEDB ) ) {
414 aparm->recoveryState |= UBIK_RECLABELDB;
416 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
417 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
418 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
419 aparm->tidCounter = ubik_dbase->tidCounter;
421 if (ubik_currentTrans) {
422 aparm->currentTrans = 1;
423 if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1;
424 else aparm->writeTrans = 0;
427 aparm->currentTrans = 0;
430 aparm->epochTime = ubik_epochTime;
435 afs_int32 SVOTE_SDebugOld(rxcall, awhich, aparm)
436 struct rx_call *rxcall;
438 register struct ubik_sdebug_old *aparm; {
439 register struct ubik_server *ts;
441 for(ts=ubik_servers; ts; ts=ts->next) {
444 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
445 aparm->lastVoteTime = ts->lastVoteTime;
446 aparm->lastBeaconSent = ts->lastBeaconSent;
447 memcpy(&aparm->remoteVersion, &ts->version, sizeof(struct ubik_version));
448 aparm->lastVote = ts->lastVote;
450 aparm->beaconSinceDown = ts->beaconSinceDown;
451 aparm->currentDB = ts->currentDB;
459 /* handle basic network debug command. This is the global state dumper */
460 afs_int32 SVOTE_DebugOld(rxcall, aparm)
461 struct rx_call *rxcall;
462 register struct ubik_debug_old *aparm; {
464 /* fill in the basic debug structure. Note the the RPC protocol transfers,
465 * integers in host order. */
467 aparm->now = FT_ApproxTime();
468 aparm->lastYesTime = ubik_lastYesTime;
469 aparm->lastYesHost = ntohl(lastYesHost);
470 aparm->lastYesState = lastYesState;
471 aparm->lastYesClaim = lastYesClaim;
472 aparm->lowestHost = ntohl(lowestHost);
473 aparm->lowestTime = lowestTime;
474 aparm->syncHost = ntohl(syncHost);
475 aparm->syncTime = syncTime;
477 aparm->amSyncSite = ubik_amSyncSite;
478 ubeacon_Debug(aparm);
484 /* Get the recovery state. The label of the database may not have
485 * been written yet but set the flag so udebug behavior remains.
488 aparm->recoveryState = urecovery_state;
489 if ((urecovery_state & UBIK_RECSYNCSITE) &&
490 (urecovery_state & UBIK_RECFOUNDDB ) &&
491 (urecovery_state & UBIK_RECHAVEDB ) ) {
492 aparm->recoveryState |= UBIK_RECLABELDB;
494 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
495 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
496 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
497 aparm->tidCounter = ubik_dbase->tidCounter;
499 if (ubik_currentTrans) {
500 aparm->currentTrans = 1;
501 if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1;
502 else aparm->writeTrans = 0;
505 aparm->currentTrans = 0;
508 aparm->epochTime = ubik_epochTime;
514 /* get the sync site; called by remote servers to find where they should go */
515 afs_int32 SVOTE_GetSyncSite(rxcall, ahost)
516 register struct rx_call *rxcall;
517 register afs_int32 *ahost; {
518 register afs_int32 temp;
520 temp = uvote_GetSyncSite();
521 *ahost = ntohl(temp);
525 ubik_dprint(a,b,c,d,e,f,g,h)
526 char *a, *b, *c, *d, *e, *f, *g, *h; {
527 ViceLog(5, (a,b,c,d,e,f,g,h));
530 ubik_print(a,b,c,d,e,f,g,h)
531 char *a, *b, *c, *d, *e, *f, *g, *h;
533 ViceLog(0, (a,b,c,d,e,f,g,h));
536 /* called once/run to init the vote module */
538 /* pretend we just voted for someone else, since we just restarted */
539 ubik_lastYesTime = FT_ApproxTime();