2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
16 #include <sys/types.h>
23 #include <sys/socket.h>
24 #include <netinet/in.h>
38 #include <rx/rx_multi.h>
39 #include <afs/cellconfig.h>
41 #include <afs/afsutil.h>
42 #include <afs/netutils.h>
45 #define UBIK_INTERNALS
49 /* statics used to determine if we're the sync site */
50 static afs_int32 syncSiteUntil = 0; /* valid only if amSyncSite */
51 int ubik_amSyncSite = 0; /* flag telling if I'm sync site */
52 static nServers; /* total number of servers (declared with implicit int) */
53 static char amIMagic = 0; /* is this host the magic host */
54 char amIClone = 0; /* is this a clone which doesn't vote */
55 static char ubik_singleServer = 0; /* set when there are no other servers; the beacon loop then skips vote collection */
56 int (*ubik_CRXSecurityProc) (); /* optional hook; when non-null it is called during server list setup to obtain the client security class */
57 char *ubik_CRXSecurityRock; /* opaque argument passed as the first parameter to ubik_CRXSecurityProc */
58 afs_int32 ubikSecIndex; /* security index handed to rx_NewConnection for the vote and disk connections */
59 struct rx_securityClass *ubikSecClass; /* security class handed to rx_NewConnection; an rxnull client object is used when no security module is available */
60 static verifyInterfaceAddress(); /* forward declaration; defined later in this file */
63 /* Module responsible for both deciding if we're currently the sync site,
64 * and continuing to collect votes so as to stay sync site.
66 * The basic module contacts all of the servers it can, trying to get them to vote
67 * for this server for sync site. The vote request message (called a beacon message)
68 * also specifies until which time this site claims to be the sync site, if at all, thus enabling
69 * receiving sites to know how long the sync site guarantee is made for.
71 * Each of these beacon messages is thus both a declaration of how long this site will
72 * remain sync site, and an attempt to extend that time by collecting votes for a later
73 * sync site extension.
75 * The voting module is responsible for choosing a reasonable time until which it promises
76 * not to vote for someone else. This parameter (BIG seconds) is not actually passed in
77 * the interface (perhaps it should be?) but is instead a compile time constant that both
80 * The beacon and vote modules work intimately together; the vote module decides how long
81 * it should promise the beacon module its vote, and the beacon module takes all of these
82 * votes and decides for how long it is the synchronization site.
85 /* procedure called from debug rpc call to get this module's state for debugging */
87 register struct ubik_debug *aparm;
89 /* fill in beacon's state fields in the ubik_debug structure */
90 aparm->syncSiteUntil = syncSiteUntil;
91 aparm->nServers = nServers;
94 /* procedure that determines whether this site has enough current votes to remain sync site.
95 * called from higher-level modules (everything but the vote module).
97 * If we're the sync site, check that our guarantees, obtained by the ubeacon_Interact
98 * light-weight process, haven't expired. We're sync site as long as a majority of the
99 * servers in existence have promised us unexpired guarantees. The variable ubik_syncSiteUntil
100 * contains the time at which the latest of the majority of the sync site guarantees expires
101 * (if the variable ubik_amSyncSite is true)
102 * This module also calls up to the recovery module if it thinks that the recovery module
103 * may have to pick up a new database (which occurs if we lose the sync site votes).
107 register afs_int32 now;
108 register afs_int32 rcode;
110 /* special case for fast startup */
111 if (nServers == 1 && !amIClone) {
112 return 1; /* one guy is always the sync site */
115 if (ubik_amSyncSite == 0 || amIClone)
116 rcode = 0; /* if I don't think I'm the sync site, say so */
118 now = FT_ApproxTime();
119 if (syncSiteUntil <= now) { /* if my votes have expired, say so */
121 ubik_dprint("Ubik: I am no longer the sync site\n");
125 rcode = 1; /* otherwise still have the required votes */
129 urecovery_ResetState(); /* force recovery to re-execute */
130 ubik_dprint("beacon: amSyncSite is %d\n", rcode);
134 /* setup server list; called with two parms, first is my address, second is list of other servers
135 * called only at initialization to set up the list of servers to contact for votes. Just creates
136 * the server structure. Note that there are two connections in every server structure, one for
137 * vote calls (which must always go through quickly) and one for database operations, which
138 * are subject to waiting for locks. If we used only one, the votes would sometimes get
139 * held up behind database operations, and the sync site guarantees would timeout
140 * even though the host would be up for communication.
142 * The "magic" host is the one with the lowest internet address. It is
143 * magic because its vote counts epsilon more than the others. This acts
144 * as a tie-breaker when we have an even number of hosts in the system.
145 * For example, if the "magic" host is up in a 2 site system, then it
146 * is sync site. Without the magic host hack, if anyone crashed in a 2
147 * site system, we'd be out of business.
149 ubeacon_InitServerListByInfo(ame, info, clones)
151 struct afsconf_cell *info;
156 code = ubeacon_InitServerListCommon(ame, info, clones, 0);
160 ubeacon_InitServerList(ame, aservers)
162 register afs_int32 aservers[];
167 ubeacon_InitServerListCommon(ame, (struct afsconf_cell *)0, 0,
172 ubeacon_InitServerListCommon(ame, info, clones, aservers)
174 struct afsconf_cell *info;
176 register afs_int32 aservers[];
178 register struct ubik_server *ts;
180 register afs_int32 servAddr;
181 register afs_int32 i, code;
183 struct ubik_server *magicServer;
185 /* verify that the addresses passed in are correct */
186 if ((code = verifyInterfaceAddress(&ame, info, aservers)))
189 /* get the security index to use, if we can */
190 if (ubik_CRXSecurityProc) {
191 i = (*ubik_CRXSecurityProc) (ubik_CRXSecurityRock, &ubikSecClass,
196 /* don't have sec module yet */
198 ubikSecClass = rxnull_NewClientSecurityObject();
200 magicHost = ntohl(ame); /* do comparisons in host order */
201 magicServer = (struct ubik_server *)0;
204 for (i = 0; i < info->numServers; i++) {
205 if (ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr) ==
206 ntohl((afs_uint32) ame)) {
215 for (i = 0; i < info->numServers; i++) {
218 ts = (struct ubik_server *)malloc(sizeof(struct ubik_server));
219 memset(ts, 0, sizeof(struct ubik_server));
220 ts->next = ubik_servers;
222 ts->addr[0] = info->hostAddr[i].sin_addr.s_addr;
227 || ntohl((afs_uint32) ts->addr[0]) <
228 (afs_uint32) magicHost) {
229 magicHost = ntohl(ts->addr[0]);
236 rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
237 ubik_callPortal, VOTE_SERVICE_ID,
238 ubikSecClass, ubikSecIndex);
241 rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
242 ubik_callPortal, DISK_SERVICE_ID,
243 ubikSecClass, ubikSecIndex);
248 while ((servAddr = *aservers++)) {
250 return UNHOSTS; /* too many hosts */
251 ts = (struct ubik_server *)malloc(sizeof(struct ubik_server));
252 memset(ts, 0, sizeof(struct ubik_server));
253 ts->next = ubik_servers;
255 ts->addr[0] = servAddr; /* primary address in net byte order */
256 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal, VOTE_SERVICE_ID, ubikSecClass, ubikSecIndex); /* for vote reqs */
257 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal, DISK_SERVICE_ID, ubikSecClass, ubikSecIndex); /* for disk reqs */
258 ts->isClone = 0; /* don't know about clones */
260 if (ntohl((afs_uint32) servAddr) < (afs_uint32) magicHost) {
261 magicHost = ntohl(servAddr);
268 magicServer->magic = 1; /* remember for when counting votes */
270 if (!amIClone && !magicServer)
274 ++nServers; /* count this server as well as the remotes */
276 nServers = i + 1; /* count this server as well as the remotes */
278 ubik_quorum = (nServers >> 1) + 1; /* compute the majority figure */
279 /* send addrs to all other servers */
280 code = updateUbikNetworkAddress(ubik_host);
284 /* Should we set some defaults for RX??
286 r_nRetries = (RPCTIMEOUT/r_retryInterval);
289 if (!ubik_servers) /* special case 1 server */
290 ubik_singleServer = 1;
291 if (nServers == 1 && !amIClone) {
292 ubik_amSyncSite = 1; /* let's start as sync site */
293 syncSiteUntil = 0x7fffffff; /* and be it quite a while */
296 if (nServers == 1) /* special case 1 server */
297 ubik_singleServer = 1;
300 if (ubik_singleServer) {
301 if (!ubik_amSyncSite)
302 ubik_dprint("Ubik: I am the sync site - 1 server\n");
304 syncSiteUntil = 0x7fffffff; /* quite a while */
309 /* main lwp loop for code that sends out beacons. This code only runs while
310 * we're sync site or we want to be the sync site. It runs in its very own light-weight
315 register afs_int32 code;
317 struct rx_connection *connections[MAXSERVERS];
318 struct ubik_server *servers[MAXSERVERS];
319 register afs_int32 i;
320 register struct ubik_server *ts;
321 afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
322 struct ubik_tid ttid;
325 /* loop forever getting votes */
326 lastWakeupTime = 0; /* keep track of time we last started a vote collection */
329 /* don't wakeup more than every POLLTIME seconds */
330 temp = (lastWakeupTime + POLLTIME) - FT_ApproxTime();
331 /* don't sleep if last collection phase took too long (probably timed someone out ) */
337 code = IOMGR_Select(0, 0, 0, 0, &tt);
341 lastWakeupTime = FT_ApproxTime(); /* started a new collection phase */
343 if (ubik_singleServer)
344 continue; /* special-case 1 server for speedy startup */
346 if (!uvote_ShouldIRun())
347 continue; /* if voter has heard from a better candidate than us, don't bother running */
349 /* otherwise we should run for election, or we're the sync site (and have already won);
350 * send out the beacon packets */
351 /* build list of all up hosts (noticing dead hosts are running again
352 * is a task for the recovery module, not the beacon module), and
353 * prepare to send them an r multi-call containing the beacon message */
354 i = 0; /* collect connections */
355 for (ts = ubik_servers; ts; ts = ts->next) {
356 if (ts->up && ts->addr[0] != ubik_host[0]) {
358 connections[i++] = ts->vote_rxcid;
361 servers[i] = (struct ubik_server *)0; /* end of list */
362 /* note that we assume in the vote module that we'll always get at least BIGTIME
363 * seconds of vote from anyone who votes for us, which means we can conservatively
364 * assume we'll be fine until SMALLTIME seconds after we start collecting votes */
365 /* this next is essentially an expansion of rgen's ServBeacon routine */
367 ttid.epoch = ubik_epochTime;
368 if (ubik_dbase->flags & DBWRITING) {
370 * if a write is in progress, we have to send the writeTidCounter
371 * which holds the tid counter of the write transaction , and not
372 * send the tidCounter value which holds the tid counter of the
375 ttid.counter = ubik_dbase->writeTidCounter;
377 ttid.counter = ubik_dbase->tidCounter + 1;
378 #if defined(UBIK_PAUSE)
379 ubik_dbase->flags |= DBVOTING;
380 #endif /* UBIK_PAUSE */
382 /* now analyze return codes, counting up our votes */
383 yesVotes = 0; /* count how many to ensure we have quorum */
384 oldestYesVote = 0x3fffffff; /* time quorum expires */
385 syncsite = ubeacon_AmSyncSite();
386 startTime = FT_ApproxTime();
388 * Don't waste time using mult Rx calls if there are no connections out there
391 multi_Rx(connections, i) {
392 multi_VOTE_Beacon(syncsite, startTime, &ubik_dbase->version,
394 temp = FT_ApproxTime(); /* now, more or less */
395 ts = servers[multi_i];
396 ts->lastBeaconSent = temp;
398 /* note that the vote time (the return code) represents the time
399 * the vote was computed, *not* the time the vote expires. We compute
400 * the latter down below if we got enough votes to go with */
402 ts->lastVoteTime = code;
403 if (code < oldestYesVote)
404 oldestYesVote = code;
409 yesVotes++; /* the extra epsilon */
410 ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
411 ts->beaconSinceDown = 1;
412 ubik_dprint("yes vote from host %s\n",
413 afs_inet_ntoa(ts->addr[0]));
414 } else if (code == 0) {
415 ts->lastVoteTime = temp;
417 ts->beaconSinceDown = 1;
418 ubik_dprint("no vote from %s\n",
419 afs_inet_ntoa(ts->addr[0]));
420 } else if (code < 0) {
422 ts->beaconSinceDown = 0;
423 urecovery_LostServer();
424 ubik_dprint("time out from %s\n",
425 afs_inet_ntoa(ts->addr[0]));
430 /* now call our own voter module to see if we'll vote for ourself. Note that
431 * the same restrictions apply for our voting for ourself as for our voting
432 * for anyone else. */
433 i = SVOTE_Beacon((struct rx_call *)0, ubeacon_AmSyncSite(), startTime,
434 &ubik_dbase->version, &ttid);
438 yesVotes++; /* extra epsilon */
439 if (i < oldestYesVote)
442 #if defined(UBIK_PAUSE)
443 ubik_dbase->flags &= ~DBVOTING;
444 #endif /* UBIK_PAUSE */
446 /* now decide if we have enough votes to become sync site.
447 * Note that we can still get enough votes even if we didn't for ourself. */
448 if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
449 if (!ubik_amSyncSite)
450 ubik_dprint("Ubik: I am the sync site\n");
452 syncSiteUntil = oldestYesVote + SMALLTIME;
453 LWP_NoYieldSignal(&ubik_amSyncSite);
456 ubik_dprint("Ubik: I am no longer the sync site\n");
458 urecovery_ResetState(); /* tell recovery we're no longer the sync site */
465 * Input Param : ame is the pointer to my IP address specified in the
466 * CellServDB file. aservers is an array containing IP
467 * addresses of remote ubik servers. The array is
468 * terminated by a zero address.
470 * Algorithm : Verify that my IP address 'ame' does actually exist
471 * on this machine. If any of my IP addresses are there
472 * in the remote server list 'aservers', remove them from
473 * this list. Update global variable ubik_host[] with
476 * Return Values : 0 on success, non-zero on failure
479 verifyInterfaceAddress(ame, info, aservers)
480 afs_uint32 *ame; /* one of my interface addr in net byte order */
481 struct afsconf_cell *info;
482 afs_uint32 aservers[]; /* list of all possible server addresses */
484 afs_uint32 myAddr[UBIK_MAX_INTERFACE_ADDR], *servList, tmpAddr;
485 afs_uint32 myAddr2[UBIK_MAX_INTERFACE_ADDR];
486 int tcount, count, found, i, j, totalServers, start, end, usednetfiles =
490 totalServers = info->numServers;
491 else { /* count the number of servers */
492 for (totalServers = 0, servList = aservers; *servList; servList++)
497 /* for now use getaddr(). use getAllAddr when implemented */
498 myAddr[0] = rxi_getaddr();
499 count = (myAddr[0] != 0);
501 if (AFSDIR_SERVER_NETRESTRICT_FILEPATH || AFSDIR_SERVER_NETINFO_FILEPATH) {
503 * Find addresses we are supposed to register as per the netrestrict file
504 * if it exists, else just register all the addresses we find on this
505 * host as returned by rx_getAllAddr (in NBO)
509 parseNetFiles(myAddr, NULL, NULL, UBIK_MAX_INTERFACE_ADDR, reason,
510 AFSDIR_SERVER_NETINFO_FILEPATH,
511 AFSDIR_SERVER_NETRESTRICT_FILEPATH);
513 ubik_print("ubik: Can't register any valid addresses:%s\n",
515 ubik_print("Aborting..\n");
520 /* get all my interface addresses in net byte order */
521 count = rx_getAllAddr(myAddr, UBIK_MAX_INTERFACE_ADDR);
525 if (count <= 0) { /* no address found */
526 ubik_print("ubik: No network addresses found, aborting..");
530 /* verify that the My-address passed in by ubik is correct */
531 for (j = 0, found = 0; j < count; j++) {
532 if (*ame == myAddr[j]) { /* both in net byte order */
539 ubik_print("ubik: primary address %s does not exist\n",
540 afs_inet_ntoa(*ame));
541 /* if we had the result of rx_getAllAddr already, avoid subverting
542 * the "is gethostbyname(gethostname()) us" check. If we're
543 * using NetInfo/NetRestrict, we assume they have enough clue
544 * to avoid that big hole in their foot from the loaded gun. */
546 /* take the address we did get, then see if ame was masked */
548 tcount = rx_getAllAddr(myAddr2, UBIK_MAX_INTERFACE_ADDR);
549 if (tcount <= 0) { /* no address found */
550 ubik_print("ubik: No network addresses found, aborting..");
554 /* verify that the My-address passed in by ubik is correct */
555 for (j = 0, found = 0; j < tcount; j++) {
556 if (*ame == myAddr2[j]) { /* both in net byte order */
566 /* if any of my addresses are there in serverList, then
567 ** use that as my primary addresses : the higher level
568 ** application screwed up in dealing with multihomed concepts
570 for (j = 0, found = 0; j < count; j++) {
571 for (i = 0; i < totalServers; i++) {
574 ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr);
576 tmpAddr = aservers[i];
577 if (myAddr[j] == tmpAddr) {
586 ubik_print("Using %s as my primary address\n", afs_inet_ntoa(*ame));
589 /* get rid of servers which were purged because all
590 ** those interface addresses are myself
592 for (start = 0, end = totalServers - 1; (start < end); start++, end--) {
593 /* find the first zero entry from the beginning */
594 for (; (start < end) && (aservers[start]); start++);
596 /* find the last non-zero entry from the end */
597 for (; (end >= 0) && (!aservers[end]); end--);
599 /* if there is nothing more to purge, exit from loop */
604 aservers[start] = aservers[end];
605 aservers[end] = 0; /* this entry was moved */
609 /* update all my addresses in ubik_host in such a way
610 ** that ubik_host[0] has the primary address
613 for (j = 0, i = 1; j < count; j++)
614 if (*ame != myAddr[j])
615 ubik_host[i++] = myAddr[j];
617 return 0; /* return success */
622 * Input Param : ubik_host is an array containing all my IP addresses.
624 * Algorithm : Do an RPC to all remote ubik servers informing them
625 * about my IP addresses. Get their IP addresses and
626 * update my linked list of ubik servers 'ubik_servers'
628 * Return Values : 0 on success, non-zero on failure
631 updateUbikNetworkAddress(ubik_host)
632 afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR];
634 int j, count, code = 0;
635 UbikInterfaceAddr inAddr, outAddr;
636 struct rx_connection *conns[MAXSERVERS];
637 struct ubik_server *ts, *server[MAXSERVERS];
640 for (count = 0, ts = ubik_servers; ts; count++, ts = ts->next) {
641 conns[count] = ts->disk_rxcid;
646 /* inform all other servers only if there are more than one
647 * database servers in the cell */
651 for (j = 0; j < UBIK_MAX_INTERFACE_ADDR; j++)
652 inAddr.hostAddr[j] = ntohl(ubik_host[j]);
655 /* do the multi-RX RPC to all other servers */
656 multi_Rx(conns, count) {
657 multi_DISK_UpdateInterfaceAddr(&inAddr, &outAddr);
658 ts = server[multi_i]; /* reply received from this server */
660 if (ts->addr[0] != htonl(outAddr.hostAddr[0])) {
662 strcpy(buffer, (char *)afs_inet_ntoa(ts->addr[0]));
663 ubik_print("ubik:Two primary addresses for same server \
664 %s %s\n", buffer, afs_inet_ntoa(htonl(outAddr.hostAddr[0])));
666 for (j = 1; j < UBIK_MAX_INTERFACE_ADDR; j++)
667 ts->addr[j] = htonl(outAddr.hostAddr[j]);
669 } else if (multi_error == RXGEN_OPCODE) { /* pre 3.5 remote server */
671 ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
672 afs_inet_ntoa(ts->addr[0]));
673 } else if (multi_error == UBADHOST) {
674 code = UBADHOST; /* remote CellServDB inconsistency */
675 ubik_print("Inconsistent Cell Info on server: ");
676 for (j = 0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
677 ubik_print("%s ", afs_inet_ntoa(ts->addr[j]));
680 ts->up = 0; /* mark the remote server as down */