2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afs/param.h>
11 #include <sys/types.h>
16 #include <netinet/in.h>
18 #include <afs/afsutil.h>
22 #include <afs/afsutil.h>
25 #define UBIK_INTERNALS
31 The goal is to provide reliable operation among N servers, such that any
32 server can crash with the remaining servers continuing operation within a
33 short period of time. While a *short* outage is acceptable, this time
34 should be order of 3 minutes or less.
38 Note: SMALLTIME and BIGTIME are essentially the same time value, separated
39 only by the clock skew, MAXSKEW. In general, if you are making guarantees
40 for someone else, promise them no more than SMALLTIME seconds of whatever
41 invariant you provide. If you are waiting to be sure some invariant is now
42 *false*, wait at least BIGTIME seconds to be sure that SMALLTIME seconds
43 has passed at the other site.
45 Now, back to the design:
46 One site in the collection is a special site, designated the *sync* site.
47 The sync site sends periodic messages, which can be thought of as
48 keep-alive messages. When a non-sync site hears from the sync site, it
49 knows that it is getting updates for the next SMALLTIME seconds from that
52 If a server does not hear from the sync site in SMALLTIME seconds, it
53 determines that it no longer is getting updates, and thus refuses to give
54 out potentially out-of-date data. If a sync site can not muster a majority
55 of servers to agree that it is the sync site, then there is a possibility
56 that a network partition has occurred, allowing another server to claim to
57 be the sync site. Thus, any time that the sync site has not heard from a
58 majority of the servers in the last SMALLTIME seconds, it voluntarily
59 relinquishes its role as sync site.
61 While attempting to nominate a new sync site, certain rules apply. First,
62 a server can not reply "ok" (return 1 from ServBeacon) to two different
63 hosts in less than BIGTIME seconds; this allows a server that has heard
64 affirmative replies from a majority of the servers to know that no other
65 server in the network has heard enough affirmative replies in the last
66 BIGTIME seconds to become sync site, too. The variables ubik_lastYesTime
67 and lastYesHost are used by all servers to keep track of which host they
68 have last replied affirmatively to, when queried by a potential new sync
71 Once a sync site has become a sync site, it periodically sends beacon
72 messages with a parameter of 1, indicating that it already has determined
73 it is supposed to be the sync site. The servers treat such a message as a
74 guarantee that no other site will become sync site for the next SMALLTIME
75 seconds. In the interim, these servers can answer a query concerning which
76 site is the sync site without any communication with any server. The
77 variables lastBeaconArrival and lastBeaconHost are used by all servers to
78 keep track of which sync site has last contacted them.
80 One complication occurs while nominating a new sync site: each site may be
81 trying to nominate a different site (based on the value of lastYesHost),
82 yet we must nominate the smallest host (under some order), to prevent this
83 process from looping. The process could loop by having each server give
84 one vote to another server, but with no server getting a majority of the
85 votes. To avoid this, we try to withhold our votes for the server with the
86 lowest internet address (an easy-to-generate order). To this effect, we
87 keep track (in lowestTime and lowestHost) of the lowest server trying to
88 become a sync site. We wait for this server unless there is already a sync
89 site (indicated by ServBeacon's parameter being 1).
92 afs_int32 ubik_debugFlag = 0; /* print out debugging messages? */
94 /* these statics are used by all sites in nominating new sync sites */
95 afs_int32 ubik_lastYesTime = 0; /* time we sent the last *yes* vote */
96 static afs_uint32 lastYesHost = 0xffffffff; /* host to which we sent *yes* vote */
97 /* Next is time sync site began this vote: guarantees sync site until this + SMALLTIME */
98 static afs_int32 lastYesClaim = 0;
99 static int lastYesState = 0; /* did last site we voted for claim to be sync site? */
101 /* used to guarantee that nomination process doesn't loop */
102 static afs_int32 lowestTime = 0;
103 static afs_uint32 lowestHost = 0xffffffff;
104 static afs_int32 syncTime = 0;
105 static afs_int32 syncHost = 0;
107 /* used to remember which dbase version is the one at the sync site (for non-sync sites */
108 struct ubik_version ubik_dbVersion; /* sync site's dbase version */
109 struct ubik_tid ubik_dbTid; /* sync site's tid, or 0 if none */
111 /* decide if we should try to become sync site. The basic rule is that we
112 * don't run if there is a valid sync site and it ain't us (we have to run if
113 * it is us, in order to keep our votes). If there is no sync site, then we
114 * want to run if we're the lowest numbered host running, otherwise we defer to
115 * the lowest host. However, if the lowest host hasn't been heard from for a
116 * while, then we start running again, in case he crashed.
118 * This function returns true if we should run, and false otherwise.
120 int uvote_ShouldIRun() {
121 register afs_int32 now;
123 now = FT_ApproxTime();
124 if (BIGTIME + ubik_lastYesTime < now) return 1; /* no valid guy even trying */
125 if (lastYesState && lastYerHost != ubik_host[0]) return 0; /* other guy is sync site, leave him alone */
126 if (ntohl((afs_uint32) lastYesHost) < ntohl((afs_uint32) ubik_host[0]))
127 return 0; /* if someone is valid and better than us, don't run */
128 /* otherwise we should run */
132 /* Return the current synchronization site, if any. Simple approach: if the
133 * last guy we voted yes for claims to be the sync site, then we we're happy to
134 * use that guy for a sync site until the time his mandate expires. If the guy
135 * does not claim to be sync site, then, of course, there's none.
137 * In addition, if we lost the sync, we set urecovery_syncSite to an invalid
138 * value, indicating that we no longer know which version of the dbase is the
139 * one we should have. We'll get a new one when we next hear from the sync
142 * This function returns 0 or currently valid sync site. It can return our own
143 * address, if we're the sync site.
145 afs_int32 uvote_GetSyncSite() {
146 register afs_int32 now;
147 register afs_int32 code;
149 if (!lastYesState) code = 0;
151 now = FT_ApproxTime();
152 if (SMALLTIME + lastYesClaim < now) code = 0; /* last guy timed out */
153 else code = lastYesHost;
158 /* called by the sync site to handle vote beacons; if aconn is null, this is a
159 * local call; returns 0 or time whe the vote was sent. It returns 0 if we are
160 * not voting for this sync site, or the time we actually voted yes, if
163 SVOTE_Beacon(rxcall, astate, astart, avers, atid)
164 register struct rx_call *rxcall;
166 struct ubik_version *avers;
167 struct ubik_tid *atid;
170 register afs_int32 otherHost;
171 register afs_int32 now;
173 struct rx_connection *aconn;
176 now = FT_ApproxTime(); /* close to current time */
177 if (rxcall) { /* caller's host */
178 aconn = rx_ConnectionOf(rxcall);
179 rxp = rx_PeerOf(aconn);
180 otherHost = rx_HostOf(rxp);
182 /* get the primary interface address for this host. */
183 /* This is the identifier that ubik uses. */
184 otherHost = ubikGetPrimaryInterfaceAddr(otherHost);
187 ubik_dprint("Received beacon from unknown host %s\n",
188 afs_inet_ntoa(rx_HostOf(rxp)));
189 return 0; /* I don't know about you: vote no */
192 otherHost = ubik_host[0]; /* this host */
194 ubik_dprint("Received beacon type %d from host %s\n", astate,
195 afs_inet_ntoa(otherHost));
197 /* compute the lowest server we've heard from. We'll try to only vote for
198 this dude if we don't already have a synchronization site. Also, don't
199 let a very old lowestHost confusing things forever. We pick a new
200 lowestHost after BIGTIME seconds to limit the damage if this host
201 actually crashes. Finally, we also count in this computation: don't
202 pick someone else if we're even better!
204 Note that the test below must be <=, not <, so that we keep refreshing
205 lowestTime. Otherwise it will look like we haven't heard from
206 lowestHost in a while and another host could slip in. */
209 /* First compute the lowest host we've heard from, whether we want them
210 for a sync site or not. If we haven't heard from a site in BIGTIME
211 seconds, we ignore its presence in lowestHost: it may have crashed.
212 Note that we don't ever let anyone appear in our lowestHost if we're
213 lower than them, 'cause we know we're up. */
214 if (ntohl((afs_uint32) otherHost) <= ntohl((afs_uint32) lowestHost)
215 || lowestTime + BIGTIME < now) {
217 lowestHost = otherHost;
219 /* why do we need this next check? Consider the case where each of two
220 servers decides the other is lowestHost. Each stops sending beacons
221 'cause the other is there. Not obvious that this process terminates:
222 i.e. each guy could restart procedure and again think other side is
223 lowest. Need to prove: if one guy in the system is lowest and knows
224 he's lowest, these loops don't occur. because if someone knows he's
225 lowest, he will send out beacons telling others to vote for him. */
226 if (ntohl((afs_uint32) ubik_host[0]) <= ntohl((afs_uint32) lowestHost)
227 || lowestTime + BIGTIME < now) {
229 lowestHost = ubik_host[0];
232 /* tell if we've heard from a sync site recently (even if we're not voting
233 for this dude yet). After a while, time the guy out. */
234 if (astate) { /* this guy is a sync site */
235 syncHost = otherHost;
238 else if (syncTime + BIGTIME < now) {
240 ubik_dprint("Ubik: Lost contact with sync-site %d.%d.%d.%d (NOT in quorum)\n",
241 ((syncHost>>24)&0xff), ((syncHost>>16)&0xff),
242 ((syncHost>> 8)&0xff), (syncHost &0xff));
247 /* decide how to vote */
248 vote = 0; /* start off voting no */
250 /* if we this guy isn't a sync site, we don't really have to vote for him.
251 We get to apply some heuristics to try to avoid weird oscillation sates
252 in the voting procedure. */
254 /* in here only if this guy doesn't claim to be a sync site */
256 /* lowestHost is also trying for our votes, then just say no. */
257 if (ntohl(lowestHost) != ntohl(otherHost)) {
261 /* someone else *is* a sync site, just say no */
262 if (syncHost && syncHost != otherHost)
266 /* Don't promise sync site support to more than one host every BIGTIME
267 seconds. This is the heart of our invariants in this system. */
268 if (ubik_lastYesTime + BIGTIME < now || otherHost == lastYesHost) {
269 if ((ubik_lastYesTime+BIGTIME < now) ||
270 (otherHost != lastYesHost) ||
271 (lastYesState != astate)) {
272 /* A new vote or a change in the vote or changed quorum */
273 ubik_dprint("Ubik: vote 'yes' for %d.%d.%d.%d %s\n",
274 ((otherHost>>24)&0xff), ((otherHost>>16)&0xff),
275 ((otherHost>> 8)&0xff), (otherHost &0xff),
276 (astate?"(in quorum)":"(NOT in quorum)"));
279 vote = now; /* vote yes */
280 ubik_lastYesTime = now; /* remember when we voted yes */
281 lastYesClaim = astart; /* remember for computing when sync site expires */
282 lastYesHost = otherHost; /* and who for */
283 lastYesState = astate; /* remember if site is a sync site */
284 ubik_dbVersion = *avers; /* resync value */
285 ubik_dbTid = *atid; /* transaction id, if any, of active trans */
286 urecovery_CheckTid(atid); /* check if current write trans needs aborted */
291 /* handle per-server debug command, where 0 is the first server. Basic network
293 SVOTE_SDebug(rxcall, awhich, aparm)
294 struct rx_call *rxcall;
296 register struct ubik_sdebug *aparm; {
297 register struct ubik_server *ts;
299 for(ts=ubik_servers; ts; ts=ts->next) {
302 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
303 for ( i=0; i < UBIK_MAX_INTERFACE_ADDR-1; i++)
304 aparm->altAddr[i] = ntohl(ts->addr[i+1]);
305 aparm->lastVoteTime = ts->lastVoteTime;
306 aparm->lastBeaconSent = ts->lastBeaconSent;
307 bcopy(&ts->version, &aparm->remoteVersion, sizeof(struct ubik_version));
308 aparm->lastVote = ts->lastVote;
310 aparm->beaconSinceDown = ts->beaconSinceDown;
311 aparm->currentDB = ts->currentDB;
319 /* handle basic network debug command. This is the global state dumper */
320 SVOTE_Debug(rxcall, aparm)
321 struct rx_call *rxcall;
322 register struct ubik_debug *aparm; {
324 /* fill in the basic debug structure. Note the the RPC protocol transfers,
325 * integers in host order. */
327 aparm->now = FT_ApproxTime();
328 aparm->lastYesTime = ubik_lastYesTime;
329 aparm->lastYesHost = ntohl(lastYesHost);
330 aparm->lastYesState = lastYesState;
331 aparm->lastYesClaim = lastYesClaim;
332 aparm->lowestHost = ntohl(lowestHost);
333 aparm->lowestTime = lowestTime;
334 aparm->syncHost = ntohl(syncHost);
335 aparm->syncTime = syncTime;
337 /* fill in all interface addresses of myself in hostbyte order */
338 for ( i=0; i < UBIK_MAX_INTERFACE_ADDR; i++)
339 aparm->interfaceAddr[i] = ntohl(ubik_host[i]);
341 aparm->amSyncSite = ubik_amSyncSite;
342 ubeacon_Debug(aparm);
348 /* Get the recovery state. The label of the database may not have
349 * been written yet but set the flag so udebug behavior remains.
352 aparm->recoveryState = urecovery_state;
353 if ((urecovery_state & UBIK_RECSYNCSITE) &&
354 (urecovery_state & UBIK_RECFOUNDDB ) &&
355 (urecovery_state & UBIK_RECHAVEDB ) ) {
356 aparm->recoveryState |= UBIK_RECLABELDB;
358 bcopy(&ubik_dbVersion, &aparm->syncVersion, sizeof(struct ubik_version));
359 bcopy(&ubik_dbTid, &aparm->syncTid, sizeof(struct ubik_tid));
360 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
361 aparm->tidCounter = ubik_dbase->tidCounter;
363 if (ubik_currentTrans) {
364 aparm->currentTrans = 1;
365 if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1;
366 else aparm->writeTrans = 0;
369 aparm->currentTrans = 0;
372 aparm->epochTime = ubik_epochTime;
377 SVOTE_SDebugOld(rxcall, awhich, aparm)
378 struct rx_call *rxcall;
380 register struct ubik_sdebug_old *aparm; {
381 register struct ubik_server *ts;
383 for(ts=ubik_servers; ts; ts=ts->next) {
386 aparm->addr = ntohl(ts->addr[0]); /* primary interface */
387 aparm->lastVoteTime = ts->lastVoteTime;
388 aparm->lastBeaconSent = ts->lastBeaconSent;
389 bcopy(&ts->version, &aparm->remoteVersion, sizeof(struct ubik_version));
390 aparm->lastVote = ts->lastVote;
392 aparm->beaconSinceDown = ts->beaconSinceDown;
393 aparm->currentDB = ts->currentDB;
401 /* handle basic network debug command. This is the global state dumper */
402 SVOTE_DebugOld(rxcall, aparm)
403 struct rx_call *rxcall;
404 register struct ubik_debug_old *aparm; {
406 /* fill in the basic debug structure. Note the the RPC protocol transfers,
407 * integers in host order. */
409 aparm->now = FT_ApproxTime();
410 aparm->lastYesTime = ubik_lastYesTime;
411 aparm->lastYesHost = ntohl(lastYesHost);
412 aparm->lastYesState = lastYesState;
413 aparm->lastYesClaim = lastYesClaim;
414 aparm->lowestHost = ntohl(lowestHost);
415 aparm->lowestTime = lowestTime;
416 aparm->syncHost = ntohl(syncHost);
417 aparm->syncTime = syncTime;
419 aparm->amSyncSite = ubik_amSyncSite;
420 ubeacon_Debug(aparm);
426 /* Get the recovery state. The label of the database may not have
427 * been written yet but set the flag so udebug behavior remains.
430 aparm->recoveryState = urecovery_state;
431 if ((urecovery_state & UBIK_RECSYNCSITE) &&
432 (urecovery_state & UBIK_RECFOUNDDB ) &&
433 (urecovery_state & UBIK_RECHAVEDB ) ) {
434 aparm->recoveryState |= UBIK_RECLABELDB;
436 bcopy(&ubik_dbVersion, &aparm->syncVersion, sizeof(struct ubik_version));
437 bcopy(&ubik_dbTid, &aparm->syncTid, sizeof(struct ubik_tid));
438 aparm->activeWrite = (ubik_dbase->flags & DBWRITING);
439 aparm->tidCounter = ubik_dbase->tidCounter;
441 if (ubik_currentTrans) {
442 aparm->currentTrans = 1;
443 if (ubik_currentTrans->type == UBIK_WRITETRANS) aparm->writeTrans = 1;
444 else aparm->writeTrans = 0;
447 aparm->currentTrans = 0;
450 aparm->epochTime = ubik_epochTime;
456 /* get the sync site; called by remote servers to find where they should go */
457 SVOTE_GetSyncSite(rxcall, ahost)
458 register struct rx_call *rxcall;
459 register afs_int32 *ahost; {
460 register afs_int32 temp;
462 temp = uvote_GetSyncSite();
463 *ahost = ntohl(temp);
467 ubik_dprint(a,b,c,d,e,f,g,h)
468 char *a, *b, *c, *d, *e, *f, *g, *h; {
469 ViceLog(5, (a,b,c,d,e,f,g,h));
472 ubik_print(a,b,c,d,e,f,g,h)
473 char *a, *b, *c, *d, *e, *f, *g, *h;
475 ViceLog(0, (a,b,c,d,e,f,g,h));
478 /* called once/run to init the vote module */
480 /* pretend we just voted for someone else, since we just restarted */
481 ubik_lastYesTime = FT_ApproxTime();