2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
20 #include <netinet/in.h>
22 #include <afs/afsutil.h>
26 #include <afs/afsutil.h>
29 #define UBIK_INTERNALS
35 The goal is to provide reliable operation among N servers, such that any
36 server can crash with the remaining servers continuing operation within a
37 short period of time. While a *short* outage is acceptable, this time
38 should be order of 3 minutes or less.
42 Note: SMALLTIME and BIGTIME are essentially the same time value, separated
43 only by the clock skew, MAXSKEW. In general, if you are making guarantees
44 for someone else, promise them no more than SMALLTIME seconds of whatever
45 invariant you provide. If you are waiting to be sure some invariant is now
46 *false*, wait at least BIGTIME seconds to be sure that SMALLTIME seconds
47 has passed at the other site.
49 Now, back to the design:
50 One site in the collection is a special site, designated the *sync* site.
51 The sync site sends periodic messages, which can be thought of as
52 keep-alive messages. When a non-sync site hears from the sync site, it
53 knows that it is getting updates for the next SMALLTIME seconds from that
56 If a server does not hear from the sync site in SMALLTIME seconds, it
57 determines that it no longer is getting updates, and thus refuses to give
58 out potentially out-of-date data. If a sync site can not muster a majority
59 of servers to agree that it is the sync site, then there is a possibility
60 that a network partition has occurred, allowing another server to claim to
61 be the sync site. Thus, any time that the sync site has not heard from a
62 majority of the servers in the last SMALLTIME seconds, it voluntarily
63 relinquishes its role as sync site.
65 While attempting to nominate a new sync site, certain rules apply. First,
66 a server can not reply "ok" (return 1 from ServBeacon) to two different
67 hosts in less than BIGTIME seconds; this allows a server that has heard
68 affirmative replies from a majority of the servers to know that no other
69 server in the network has heard enough affirmative replies in the last
70 BIGTIME seconds to become sync site, too. The variables ubik_lastYesTime
71 and lastYesHost are used by all servers to keep track of which host they
72 have last replied affirmatively to, when queried by a potential new sync
75 Once a sync site has become a sync site, it periodically sends beacon
76 messages with a parameter of 1, indicating that it already has determined
77 it is supposed to be the sync site. The servers treat such a message as a
78 guarantee that no other site will become sync site for the next SMALLTIME
79 seconds. In the interim, these servers can answer a query concerning which
80 site is the sync site without any communication with any server. The
81 variables lastBeaconArrival and lastBeaconHost are used by all servers to
82 keep track of which sync site has last contacted them.
84 One complication occurs while nominating a new sync site: each site may be
85 trying to nominate a different site (based on the value of lastYesHost),
86 yet we must nominate the smallest host (under some order), to prevent this
87 process from looping. The process could loop by having each server give
88 one vote to another server, but with no server getting a majority of the
89 votes. To avoid this, we try to withhold our votes for the server with the
90 lowest internet address (an easy-to-generate order). To this effect, we
91 keep track (in lowestTime and lowestHost) of the lowest server trying to
92 become a sync site. We wait for this server unless there is already a sync
93 site (indicated by ServBeacon's parameter being 1).
96 afs_int32 ubik_debugFlag = 0; /* print out debugging messages? */
98 /* these statics are used by all sites in nominating new sync sites */
99 afs_int32 ubik_lastYesTime = 0; /* time we sent the last *yes* vote */
100 static afs_uint32 lastYesHost = 0xffffffff; /* host to which we sent *yes* vote */
101 /* Next is time sync site began this vote: guarantees sync site until this + SMALLTIME */
102 static afs_int32 lastYesClaim = 0;
103 static int lastYesState = 0; /* did last site we voted for claim to be sync site? */
105 /* used to guarantee that nomination process doesn't loop */
106 static afs_int32 lowestTime = 0;
107 static afs_uint32 lowestHost = 0xffffffff;
108 static afs_int32 syncTime = 0;
109 static afs_int32 syncHost = 0;
111 /* used to remember which dbase version is the one at the sync site (for non-sync sites */
112 struct ubik_version ubik_dbVersion; /* sync site's dbase version */
113 struct ubik_tid ubik_dbTid; /* sync site's tid, or 0 if none */
115 /* decide if we should try to become sync site. The basic rule is that we
116 * don't run if there is a valid sync site and it ain't us (we have to run if
117 * it is us, in order to keep our votes). If there is no sync site, then we
118 * want to run if we're the lowest numbered host running, otherwise we defer to
119 * the lowest host. However, if the lowest host hasn't been heard from for a
120 * while, then we start running again, in case he crashed.
122 * This function returns true if we should run, and false otherwise.
124 int uvote_ShouldIRun() {
125 register afs_int32 now;
127 now = FT_ApproxTime();
128 if (BIGTIME + ubik_lastYesTime < now) return 1; /* no valid guy even trying */
129 if (lastYesState && lastYerHost != ubik_host[0]) return 0; /* other guy is sync site, leave him alone */
130 if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0]))
131 return 0; /* if someone is valid and better than us, don't run */
132 /* otherwise we should run */
136 /* Return the current synchronization site, if any. Simple approach: if the
137 * last guy we voted yes for claims to be the sync site, then we we're happy to
138 * use that guy for a sync site until the time his mandate expires. If the guy
139 * does not claim to be sync site, then, of course, there's none.
141 * In addition, if we lost the sync, we set urecovery_syncSite to an invalid
142 * value, indicating that we no longer know which version of the dbase is the
143 * one we should have. We'll get a new one when we next hear from the sync
146 * This function returns 0 or currently valid sync site. It can return our own
147 * address, if we're the sync site.
149 afs_int32 uvote_GetSyncSite() {
150 register afs_int32 now;
151 register afs_int32 code;
153 if (!lastYesState) code = 0;
155 now = FT_ApproxTime();
156 if (SMALLTIME + lastYesClaim < now) code = 0; /* last guy timed out */
157 else code = lastYesHost;
162 /* called by the sync site to handle vote beacons; if aconn is null, this is a
163 * local call; returns 0 or time whe the vote was sent. It returns 0 if we are
164 * not voting for this sync site, or the time we actually voted yes, if
167 SVOTE_Beacon(rxcall, astate, astart, avers, atid)
168 register struct rx_call *rxcall;
170 struct ubik_version *avers;
171 struct ubik_tid *atid;
174 register afs_int32 otherHost;
175 register afs_int32 now;
177 struct rx_connection *aconn;
179 struct ubik_server *ts;
182 now = FT_ApproxTime(); /* close to current time */
183 if (rxcall) { /* caller's host */
184 aconn = rx_ConnectionOf(rxcall);
185 rxp = rx_PeerOf(aconn);
186 otherHost = rx_HostOf(rxp);
188 /* get the primary interface address for this host. */
189 /* This is the identifier that ubik uses. */
190 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
193 ubik_dprint("Received beacon from unknown host %s\n",
194 afs_inet_ntoa(rx_HostOf(rxp)));
195 return 0; /* I don't know about you: vote no */
197 for (ts = ubik_servers; ts; ts = ts->next) {
198 if (ts->addr[0] == otherHost) break;
200 if (!ts) ubik_dprint("Unknown host %x has sent a beacon\n", otherHost);
201 if (ts && ts->isClone) isClone = 1;
203 otherHost = ubik_host[0]; /* this host */
207 ubik_dprint("Received beacon type %d from host %s\n", astate,
208 afs_inet_ntoa(otherHost));
210 /* compute the lowest server we've heard from. We'll try to only vote for
211 this dude if we don't already have a synchronization site. Also, don't
212 let a very old lowestHost confusing things forever. We pick a new
213 lowestHost after BIGTIME seconds to limit the damage if this host
214 actually crashes. Finally, we also count in this computation: don't
215 pick someone else if we're even better!
217 Note that the test below must be <=, not <, so that we keep refreshing
218 lowestTime. Otherwise it will look like we haven't heard from
219 lowestHost in a while and another host could slip in. */
222 /* First compute the lowest host we've heard from, whether we want them
223 for a sync site or not. If we haven't heard from a site in BIGTIME
224 seconds, we ignore its presence in lowestHost: it may have crashed.
225 Note that we don't ever let anyone appear in our lowestHost if we're
226 lower than them, 'cause we know we're up. */
227 /* But do not consider clones for lowesHost since they never may become
230 (ntohl((afs_uint32) otherHost) <= ntohl((afs_uint32) lowestHost)
231 || lowestTime + BIGTIME < now)) {
233 lowestHost = otherHost;
235 /* why do we need this next check? Consider the case where each of two
236 servers decides the other is lowestHost. Each stops sending beacons
237 'cause the other is there. Not obvious that this process terminates:
238 i.e. each guy could restart procedure and again think other side is
239 lowest. Need to prove: if one guy in the system is lowest and knows
240 he's lowest, these loops don't occur. because if someone knows he's
241 lowest, he will send out beacons telling others to vote for him. */
243 (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32) lowestHost)
244 || lowestTime + BIGTIME < now)) {
246 lowestHost = ubik_host[0];
249 /* tell if we've heard from a sync site recently (even if we're not voting
250 for this dude yet). After a while, time the guy out. */
251 if (astate) { /* this guy is a sync site */
252 syncHost = otherHost;
255 else if (syncTime + BIGTIME < now) {
257 ubik_dprint("Ubik: Lost contact with sync-site %d.%d.%d.%d (NOT in quorum)\n",
258 ((syncHost>>24)&0xff), ((syncHost>>16)&0xff),
259 ((syncHost>> 8)&0xff), (syncHost &0xff));
264 /* decide how to vote */
265 vote = 0; /* start off voting no */
267 /* if we this guy isn't a sync site, we don't really have to vote for him.
268 We get to apply some heuristics to try to avoid weird oscillation sates
269 in the voting procedure. */
271 /* in here only if this guy doesn't claim to be a sync site */
273 /* lowestHost is also trying for our votes, then just say no. */
274 if (ntohl(lowestHost) != ntohl(otherHost)) {
278 /* someone else *is* a sync site, just say no */
279 if (syncHost && syncHost != otherHost)
281 } else /* fast startup if this is the only non-clone */
282 if (lastYesHost == 0xffffffff && otherHost == ubik_host[0]) {
284 for (ts = ubik_servers; ts; ts = ts->next) {
285 if (ts->addr[0] == otherHost) continue;
286 if (!ts->isClone) i++;
288 if (!i) lastYesHost = otherHost;
292 if (isClone) return 0; /* clone never can become sync site */
294 /* Don't promise sync site support to more than one host every BIGTIME
295 seconds. This is the heart of our invariants in this system. */
296 if (ubik_lastYesTime + BIGTIME < now || otherHost == lastYesHost) {
297 if ((ubik_lastYesTime+BIGTIME < now) ||
298 (otherHost != lastYesHost) ||
299 (lastYesState != astate)) {
300 /* A new vote or a change in the vote or changed quorum */
301 ubik_dprint("Ubik: vote 'yes' for %s %s\n",
302 afs_inet_ntoa(otherHost),
303 (astate?"(in quorum)":"(NOT in quorum)"));
306 vote = now; /* vote yes */
307 ubik_lastYesTime = now; /* remember when we voted yes */
308 lastYesClaim = astart; /* remember for computing when sync site expires */
309 lastYesHost = otherHost; /* and who for */
310 lastYesState = astate; /* remember if site is a sync site */
311 ubik_dbVersion = *avers; /* resync value */
312 ubik_dbTid = *atid; /* transaction id, if any, of active trans */
313 urecovery_CheckTid(atid); /* check if current write trans needs aborted */
318 /* handle per-server debug command, where 0 is the first server. Basic network
320 SVOTE_SDebug(rxcall, awhich, aparm)
321 struct rx_call *rxcall;
323 register struct ubik_sdebug *aparm;
325 afs_int32 code, isClone;
326 code = SVOTE_XSDebug(rxcall, awhich, aparm, &isClone);
330 SVOTE_XSDebug(rxcall, awhich, aparm, isclone)
332 struct rx_call *rxcall;
334 register struct ubik_sdebug *aparm;
336 register struct ubik_server *ts;
338 for(ts=ubik_servers; ts; ts=ts->next) {
341 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
342 for ( i=0; i < UBIK_MAX_INTERFACE_ADDR-1; i++)
343 aparm->altAddr[i] = ntohl(ts->addr[i+1]);
344 aparm->lastVoteTime = ts->lastVoteTime;
345 aparm->lastBeaconSent = ts->lastBeaconSent;
346 memcpy(&aparm->remoteVersion, &ts->version, sizeof(struct ubik_version));
347 aparm->lastVote = ts->lastVote;
349 aparm->beaconSinceDown = ts->beaconSinceDown;
350 aparm->currentDB = ts->currentDB;
351 *isclone = ts->isClone;
358 SVOTE_XDebug(rxcall, aparm, isclone)
359 struct rx_call *rxcall;
360 register struct ubik_debug *aparm;
365 code = SVOTE_Debug(rxcall, aparm);
370 /* handle basic network debug command. This is the global state dumper */
371 SVOTE_Debug(rxcall, aparm)
372 struct rx_call *rxcall;
373 register struct ubik_debug *aparm; {
375 /* fill in the basic debug structure. Note the the RPC protocol transfers,
376 * integers in host order. */
378 aparm->now = FT_ApproxTime();
379 aparm->lastYesTime = ubik_lastYesTime;
380 aparm->lastYesHost = ntohl(lastYesHost);
381 aparm->lastYesState = lastYesState;
382 aparm->lastYesClaim = lastYesClaim;
383 aparm->lowestHost = ntohl(lowestHost);
384 aparm->lowestTime = lowestTime;
385 aparm->syncHost = ntohl(syncHost);
386 aparm->syncTime = syncTime;
388 /* fill in all interface addresses of myself in hostbyte order */
389 for ( i=0; i < UBIK_MAX_INTERFACE_ADDR; i++)
390 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
392 aparm->amSyncSite = ubik_amSyncSite;
393 ubeacon_Debug(aparm);
399 /* Get the recovery state. The label of the database may not have
400 * been written yet but set the flag so udebug behavior remains.
403 aparm->recoveryState = urecovery_state;
404 if ((urecovery_state & UBIK_RECSYNCSITE) &&
405 (urecovery_state & UBIK_RECFOUNDDB ) &&
406 (urecovery_state & UBIK_RECHAVEDB ) ) {
407 aparm->recoveryState |= UBIK_RECLABELDB;
409 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
410 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
411 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
412 aparm->tidCounter = ubik_dbase->tidCounter;
414 if (ubik_currentTrans) {
415 aparm->currentTrans = 1;
416 if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1;
417 else aparm->writeTrans = 0;
420 aparm->currentTrans = 0;
423 aparm->epochTime = ubik_epochTime;
428 SVOTE_SDebugOld(rxcall, awhich, aparm)
429 struct rx_call *rxcall;
431 register struct ubik_sdebug_old *aparm; {
432 register struct ubik_server *ts;
434 for(ts=ubik_servers; ts; ts=ts->next) {
437 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
438 aparm->lastVoteTime = ts->lastVoteTime;
439 aparm->lastBeaconSent = ts->lastBeaconSent;
440 memcpy(&aparm->remoteVersion, &ts->version, sizeof(struct ubik_version));
441 aparm->lastVote = ts->lastVote;
443 aparm->beaconSinceDown = ts->beaconSinceDown;
444 aparm->currentDB = ts->currentDB;
452 /* handle basic network debug command. This is the global state dumper */
453 SVOTE_DebugOld(rxcall, aparm)
454 struct rx_call *rxcall;
455 register struct ubik_debug_old *aparm; {
457 /* fill in the basic debug structure. Note the the RPC protocol transfers,
458 * integers in host order. */
460 aparm->now = FT_ApproxTime();
461 aparm->lastYesTime = ubik_lastYesTime;
462 aparm->lastYesHost = ntohl(lastYesHost);
463 aparm->lastYesState = lastYesState;
464 aparm->lastYesClaim = lastYesClaim;
465 aparm->lowestHost = ntohl(lowestHost);
466 aparm->lowestTime = lowestTime;
467 aparm->syncHost = ntohl(syncHost);
468 aparm->syncTime = syncTime;
470 aparm->amSyncSite = ubik_amSyncSite;
471 ubeacon_Debug(aparm);
477 /* Get the recovery state. The label of the database may not have
478 * been written yet but set the flag so udebug behavior remains.
481 aparm->recoveryState = urecovery_state;
482 if ((urecovery_state & UBIK_RECSYNCSITE) &&
483 (urecovery_state & UBIK_RECFOUNDDB ) &&
484 (urecovery_state & UBIK_RECHAVEDB ) ) {
485 aparm->recoveryState |= UBIK_RECLABELDB;
487 memcpy(&aparm->syncVersion, &ubik_dbVersion, sizeof(struct ubik_version));
488 memcpy(&aparm->syncTid, &ubik_dbTid, sizeof(struct ubik_tid));
489 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
490 aparm->tidCounter = ubik_dbase->tidCounter;
492 if (ubik_currentTrans) {
493 aparm->currentTrans = 1;
494 if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1;
495 else aparm->writeTrans = 0;
498 aparm->currentTrans = 0;
501 aparm->epochTime = ubik_epochTime;
507 /* get the sync site; called by remote servers to find where they should go */
508 SVOTE_GetSyncSite(rxcall, ahost)
509 register struct rx_call *rxcall;
510 register afs_int32 *ahost; {
511 register afs_int32 temp;
513 temp = uvote_GetSyncSite();
514 *ahost = ntohl(temp);
518 ubik_dprint(a,b,c,d,e,f,g,h)
519 char *a, *b, *c, *d, *e, *f, *g, *h; {
520 ViceLog(5, (a,b,c,d,e,f,g,h));
523 ubik_print(a,b,c,d,e,f,g,h)
524 char *a, *b, *c, *d, *e, *f, *g, *h;
526 ViceLog(0, (a,b,c,d,e,f,g,h));
529 /* called once/run to init the vote module */
531 /* pretend we just voted for someone else, since we just restarted */
532 ubik_lastYesTime = FT_ApproxTime();