2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
19 #include <rx/rx_multi.h>
20 #include <afs/cellconfig.h>
22 #include <afs/afsutil.h>
23 #include <afs/netutils.h>
26 #define UBIK_INTERNALS
30 /* These global variables were used to set the function to use to initialise
31 * the client security layer. They are retained for backwards compatibility with
32 * legacy callers - the ubik_SetClientSecurityProcs() interface should be used
35 int (*ubik_CRXSecurityProc) (void *rock, struct rx_securityClass **,
37 void *ubik_CRXSecurityRock;
39 /*! \name statics used to determine if we're the sync site */
40 static int nServers; /*!< total number of servers */
41 static char amIMagic = 0; /*!< is this host the magic host */
42 char amIClone = 0; /*!< is this a clone which doesn't vote */
43 static char ubik_singleServer = 0;
45 static int (*secLayerProc) (void *rock, struct rx_securityClass **,
47 static int (*tokenCheckProc) (void *rock) = NULL;
48 static void * securityRock = NULL;
50 afs_int32 ubikSecIndex;
51 struct rx_securityClass *ubikSecClass;
53 /* Values protected by the address lock */
54 struct addr_data addr_globals;
56 /* Values protected by the beacon lock */
57 struct beacon_data beacon_globals;
59 static int ubeacon_InitServerListCommon(afs_uint32 ame,
60 struct afsconf_cell *info,
62 afs_uint32 aservers[]);
63 static int verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
64 afs_uint32 aservers[]);
65 static int updateUbikNetworkAddress(afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR]);
69 * Module responsible for both deciding if we're currently the sync site,
70 * and for continuing to collect votes so as to remain the sync site.
72 * The basic module contacts all of the servers it can, trying to get them to vote
73 * for this server for sync site. The vote request message (called a beacon message)
74 * also specifies until which time this site claims to be the sync site, if at all, thus enabling
75 * receiving sites to know how long the sync site guarantee is made for.
77 * Each of these beacon messages is thus both a declaration of how long this site will
78 * remain sync site, and an attempt to extend that time by collecting votes for a later
79 * sync site extension.
81 * The voting module is responsible for choosing a reasonable time until which it promises
82 * not to vote for someone else. This parameter (BIG seconds) is not actually passed in
83 * the interface (perhaps it should be?) but is instead a compile time constant that both
86 * The beacon and vote modules work intimately together; the vote module decides how long
87 * it should promise the beacon module its vote, and the beacon module takes all of these
88 * votes and decides for how long it is the synchronization site.
91 /*! \brief procedure called from debug rpc call to get this module's state for debugging */
93 ubeacon_Debug(struct ubik_debug *aparm)
95 /* fill in beacon's state fields in the ubik_debug structure */
96 aparm->syncSiteUntil = beacon_globals.syncSiteUntil;
97 aparm->nServers = nServers;
105 /* special case for fast startup */
106 if (nServers == 1 && !amIClone)
107 return 1; /* one guy is always the sync site */
110 if (beacon_globals.ubik_amSyncSite == 0 || amIClone)
111 rcode = 0; /* if I don't think I'm the sync site, say so */
113 now = FT_ApproxTime();
114 if (beacon_globals.syncSiteUntil <= now) { /* if my votes have expired, say so */
115 if (beacon_globals.ubik_amSyncSite)
116 ubik_dprint("Ubik: I am no longer the sync site\n");
117 beacon_globals.ubik_amSyncSite = 0;
120 rcode = 1; /* otherwise still have the required votes */
124 ubik_dprint("beacon: amSyncSite is %d\n", rcode);
129 * \brief Procedure that determines whether this site has enough current votes to remain sync site.
131 * Called from higher-level modules (everything but the vote module).
133 * If we're the sync site, check that our guarantees, obtained by the ubeacon_Interact()
134 * light-weight process, haven't expired. We're sync site as long as a majority of the
135 * servers in existence have promised us unexpired guarantees. The variable #ubik_syncSiteUntil
136 * contains the time at which the latest of the majority of the sync site guarantees expires
137 * (if the variable #ubik_amSyncSite is true)
138 * This module also calls up to the recovery module if it thinks that the recovery module
139 * may have to pick up a new database (which occurs if we lose the sync site votes).
141 * \return 1 if local site is the sync site
142 * \return 0 if sync site is elsewhere
145 ubeacon_AmSyncSite(void)
149 rcode = amSyncSite();
152 urecovery_ResetState();
158 * \see ubeacon_InitServerListCommon()
161 ubeacon_InitServerListByInfo(afs_uint32 ame, struct afsconf_cell *info,
166 code = ubeacon_InitServerListCommon(ame, info, clones, 0);
171 * \param ame "address of me"
172 * \param aservers list of other servers
174 * \see ubeacon_InitServerListCommon()
177 ubeacon_InitServerList(afs_uint32 ame, afs_uint32 aservers[])
182 ubeacon_InitServerListCommon(ame, (struct afsconf_cell *)0, 0,
187 /* Must be called with address lock held */
189 ubeacon_InitSecurityClass(void)
192 /* get the security index to use, if we can */
194 i = (*secLayerProc) (securityRock, &addr_globals.ubikSecClass, &addr_globals.ubikSecIndex);
195 } else if (ubik_CRXSecurityProc) {
196 i = (*ubik_CRXSecurityProc) (ubik_CRXSecurityRock, &addr_globals.ubikSecClass,
197 &addr_globals.ubikSecIndex);
201 /* don't have sec module yet */
202 addr_globals.ubikSecIndex = 0;
203 addr_globals.ubikSecClass = rxnull_NewClientSecurityObject();
208 ubeacon_ReinitServer(struct ubik_server *ts)
210 if (tokenCheckProc && !(*tokenCheckProc) (securityRock)) {
211 struct rx_connection *disk_rxcid;
212 struct rx_connection *vote_rxcid;
213 struct rx_connection *tmp;
215 ubeacon_InitSecurityClass();
217 rx_NewConnection(rx_HostOf(rx_PeerOf(ts->disk_rxcid)),
218 ubik_callPortal, DISK_SERVICE_ID,
219 addr_globals.ubikSecClass, addr_globals.ubikSecIndex);
221 tmp = ts->disk_rxcid;
222 ts->disk_rxcid = disk_rxcid;
223 rx_PutConnection(tmp);
226 rx_NewConnection(rx_HostOf(rx_PeerOf(ts->vote_rxcid)),
227 ubik_callPortal, VOTE_SERVICE_ID,
228 addr_globals.ubikSecClass, addr_globals.ubikSecIndex);
230 tmp = ts->vote_rxcid;
231 ts->vote_rxcid = vote_rxcid;
232 rx_PutConnection(tmp);
239 * \brief setup server list
241 * \param ame "address of me"
242 * \param aservers list of other servers
244 * called only at initialization to set up the list of servers to
245 * contact for votes. Just creates the server structure.
247 * The "magic" host is the one with the lowest internet address. It is
248 * magic because its vote counts epsilon more than the others. This acts
249 * as a tie-breaker when we have an even number of hosts in the system.
250 * For example, if the "magic" host is up in a 2 site system, then it
251 * is sync site. Without the magic host hack, if anyone crashed in a 2
252 * site system, we'd be out of business.
254 * \note There are two connections in every server structure, one for
255 * vote calls (which must always go through quickly) and one for database
256 * operations, which are subject to waiting for locks. If we used only
257 * one, the votes would sometimes get held up behind database operations,
258 * and the sync site guarantees would timeout even though the host would be
259 * up for communication.
261 * \see ubeacon_InitServerList(), ubeacon_InitServerListByInfo()
264 ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
265 char clones[], afs_uint32 aservers[])
267 struct ubik_server *ts;
272 struct ubik_server *magicServer;
274 /* verify that the addresses passed in are correct */
275 if ((code = verifyInterfaceAddress(&ame, info, aservers)))
278 ubeacon_InitSecurityClass();
280 magicHost = ntohl(ame); /* do comparisons in host order */
281 magicServer = (struct ubik_server *)0;
284 for (i = 0; i < info->numServers; i++) {
285 if (ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr) ==
286 ntohl((afs_uint32) ame)) {
295 for (i = 0; i < info->numServers; i++) {
298 ts = (struct ubik_server *)malloc(sizeof(struct ubik_server));
299 memset(ts, 0, sizeof(struct ubik_server));
300 ts->next = ubik_servers;
302 ts->addr[0] = info->hostAddr[i].sin_addr.s_addr;
307 || ntohl((afs_uint32) ts->addr[0]) <
308 (afs_uint32) magicHost) {
309 magicHost = ntohl(ts->addr[0]);
316 rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
317 ubik_callPortal, VOTE_SERVICE_ID,
318 addr_globals.ubikSecClass, addr_globals.ubikSecIndex);
321 rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
322 ubik_callPortal, DISK_SERVICE_ID,
323 addr_globals.ubikSecClass, addr_globals.ubikSecIndex);
328 while ((servAddr = *aservers++)) {
330 return UNHOSTS; /* too many hosts */
331 ts = (struct ubik_server *)malloc(sizeof(struct ubik_server));
332 memset(ts, 0, sizeof(struct ubik_server));
333 ts->next = ubik_servers;
335 ts->addr[0] = servAddr; /* primary address in net byte order */
336 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal, VOTE_SERVICE_ID,
337 addr_globals.ubikSecClass, addr_globals.ubikSecIndex); /* for vote reqs */
338 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal, DISK_SERVICE_ID,
339 addr_globals.ubikSecClass, addr_globals.ubikSecIndex); /* for disk reqs */
340 ts->isClone = 0; /* don't know about clones */
342 if (ntohl((afs_uint32) servAddr) < (afs_uint32) magicHost) {
343 magicHost = ntohl(servAddr);
350 magicServer->magic = 1; /* remember for when counting votes */
352 if (!amIClone && !magicServer)
356 ++nServers; /* count this server as well as the remotes */
358 nServers = i + 1; /* count this server as well as the remotes */
360 ubik_quorum = (nServers >> 1) + 1; /* compute the majority figure */
361 /* send addrs to all other servers */
362 code = updateUbikNetworkAddress(ubik_host);
366 /* Should we set some defaults for RX??
368 r_nRetries = (RPCTIMEOUT/r_retryInterval);
371 if (!ubik_servers) /* special case 1 server */
372 ubik_singleServer = 1;
373 if (nServers == 1 && !amIClone) {
374 beacon_globals.ubik_amSyncSite = 1; /* let's start as sync site */
375 beacon_globals.syncSiteUntil = 0x7fffffff; /* and be it quite a while */
378 if (nServers == 1) /* special case 1 server */
379 ubik_singleServer = 1;
382 if (ubik_singleServer) {
383 if (!beacon_globals.ubik_amSyncSite)
384 ubik_dprint("Ubik: I am the sync site - 1 server\n");
385 beacon_globals.ubik_amSyncSite = 1;
386 beacon_globals.syncSiteUntil = 0x7fffffff; /* quite a while */
392 * \brief main lwp loop for code that sends out beacons.
394 * This code only runs while we're sync site or we want to be the sync site.
395 * It runs in its very own light-weight process.
398 ubeacon_Interact(void *dummy)
402 struct rx_connection *connections[MAXSERVERS];
403 struct ubik_server *servers[MAXSERVERS];
405 struct ubik_server *ts;
406 afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
407 struct ubik_tid ttid;
408 struct ubik_version tversion;
411 /* loop forever getting votes */
412 lastWakeupTime = 0; /* keep track of time we last started a vote collection */
415 /* don't wakeup more than every POLLTIME seconds */
416 temp = (lastWakeupTime + POLLTIME) - FT_ApproxTime();
417 /* don't sleep if last collection phase took too long (probably timed someone out ) */
423 #ifdef AFS_PTHREAD_ENV
424 code = select(0, 0, 0, 0, &tt);
426 code = IOMGR_Select(0, 0, 0, 0, &tt);
431 lastWakeupTime = FT_ApproxTime(); /* started a new collection phase */
433 if (ubik_singleServer)
434 continue; /* special-case 1 server for speedy startup */
436 if (!uvote_ShouldIRun())
437 continue; /* if voter has heard from a better candidate than us, don't bother running */
439 /* otherwise we should run for election, or we're the sync site (and have already won);
440 * send out the beacon packets */
441 /* build list of all up hosts (noticing dead hosts are running again
442 * is a task for the recovery module, not the beacon module), and
443 * prepare to send them an r multi-call containing the beacon message */
444 i = 0; /* collect connections */
447 for (ts = ubik_servers; ts; ts = ts->next) {
448 if (ts->up && ts->addr[0] != ubik_host[0]) {
450 connections[i++] = ts->vote_rxcid;
455 servers[i] = (struct ubik_server *)0; /* end of list */
456 /* note that we assume in the vote module that we'll always get at least BIGTIME
457 * seconds of vote from anyone who votes for us, which means we can conservatively
458 * assume we'll be fine until SMALLTIME seconds after we start collecting votes */
459 /* this next is essentially an expansion of rgen's ServBeacon routine */
462 ttid.epoch = version_globals.ubik_epochTime;
463 if (ubik_dbase->flags & DBWRITING) {
465 * if a write is in progress, we have to send the writeTidCounter
466 * which holds the tid counter of the write transaction , and not
467 * send the tidCounter value which holds the tid counter of the
470 ttid.counter = ubik_dbase->writeTidCounter;
472 ttid.counter = ubik_dbase->tidCounter + 1;
473 tversion.epoch = ubik_dbase->version.epoch;
474 tversion.counter = ubik_dbase->version.counter;
477 /* now analyze return codes, counting up our votes */
478 yesVotes = 0; /* count how many to ensure we have quorum */
479 oldestYesVote = 0x7fffffff; /* time quorum expires */
480 syncsite = amSyncSite();
482 /* Ok to use the DB lock here since we aren't sync site */
484 urecovery_ResetState();
487 startTime = FT_ApproxTime();
489 * Don't waste time using mult Rx calls if there are no connections out there
493 multi_Rx(connections, i) {
494 multi_VOTE_Beacon(syncsite, startTime, &tversion,
496 temp = FT_ApproxTime(); /* now, more or less */
497 ts = servers[multi_i];
499 ts->lastBeaconSent = temp;
501 /* note that the vote time (the return code) represents the time
502 * the vote was computed, *not* the time the vote expires. We compute
503 * the latter down below if we got enough votes to go with */
505 if ((code & ~0xff) == ERROR_TABLE_BASE_RXK) {
506 ubik_dprint("token error %d from host %s\n",
507 code, afs_inet_ntoa_r(ts->addr[0], hoststr));
509 ts->beaconSinceDown = 0;
510 urecovery_LostServer(ts);
512 ts->lastVoteTime = code;
513 if (code < oldestYesVote)
514 oldestYesVote = code;
519 yesVotes++; /* the extra epsilon */
520 ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
521 ts->beaconSinceDown = 1;
522 ubik_dprint("yes vote from host %s\n",
523 afs_inet_ntoa_r(ts->addr[0], hoststr));
525 } else if (code == 0) {
526 ts->lastVoteTime = temp;
528 ts->beaconSinceDown = 1;
529 ubik_dprint("no vote from %s\n",
530 afs_inet_ntoa_r(ts->addr[0], hoststr));
531 } else if (code < 0) {
533 ts->beaconSinceDown = 0;
534 urecovery_LostServer(ts);
535 ubik_dprint("time out from %s\n",
536 afs_inet_ntoa_r(ts->addr[0], hoststr));
542 /* now call our own voter module to see if we'll vote for ourself. Note that
543 * the same restrictions apply for our voting for ourself as for our voting
544 * for anyone else. */
545 i = SVOTE_Beacon((struct rx_call *)0, ubeacon_AmSyncSite(), startTime,
550 yesVotes++; /* extra epsilon */
551 if (i < oldestYesVote)
555 /* now decide if we have enough votes to become sync site.
556 * Note that we can still get enough votes even if we didn't for ourself. */
557 if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
559 if (!beacon_globals.ubik_amSyncSite)
560 ubik_dprint("Ubik: I am the sync site\n");
561 beacon_globals.ubik_amSyncSite = 1;
562 beacon_globals.syncSiteUntil = oldestYesVote + SMALLTIME;
563 #ifndef AFS_PTHREAD_ENV
564 /* I did not find a corresponding LWP_WaitProcess(&ubik_amSyncSite) --
565 this may be a spurious signal call -- sjenkins */
566 LWP_NoYieldSignal(&beacon_globals.ubik_amSyncSite);
571 if (beacon_globals.ubik_amSyncSite)
572 ubik_dprint("Ubik: I am no longer the sync site\n");
573 beacon_globals.ubik_amSyncSite = 0;
576 urecovery_ResetState(); /* tell recovery we're no longer the sync site */
585 * \brief Verify that a given IP address does actually exist on this machine.
587 * \param ame the pointer to my IP address specified in the
589 * \param aservers an array containing IP
590 * addresses of remote ubik servers. The array is
591 * terminated by a zero address.
593 * Algorithm : Verify that my IP address \p ame does actually exist
594 * on this machine. If any of my IP addresses are there
595 * in the remote server list \p aservers, remove them from
596 * this list. Update global variable \p ubik_host[] with
599 * \return 0 on success, non-zero on failure
602 verifyInterfaceAddress(afs_uint32 *ame, struct afsconf_cell *info,
603 afs_uint32 aservers[]) {
604 afs_uint32 myAddr[UBIK_MAX_INTERFACE_ADDR], *servList, tmpAddr;
605 afs_uint32 myAddr2[UBIK_MAX_INTERFACE_ADDR];
607 int tcount, count, found, i, j, totalServers, start, end, usednetfiles =
611 totalServers = info->numServers;
612 else { /* count the number of servers */
613 for (totalServers = 0, servList = aservers; *servList; servList++)
617 if (AFSDIR_SERVER_NETRESTRICT_FILEPATH || AFSDIR_SERVER_NETINFO_FILEPATH) {
619 * Find addresses we are supposed to register as per the netrestrict file
620 * if it exists, else just register all the addresses we find on this
621 * host as returned by rx_getAllAddr (in NBO)
625 parseNetFiles(myAddr, NULL, NULL, UBIK_MAX_INTERFACE_ADDR, reason,
626 AFSDIR_SERVER_NETINFO_FILEPATH,
627 AFSDIR_SERVER_NETRESTRICT_FILEPATH);
629 ubik_print("ubik: Can't register any valid addresses:%s\n",
631 ubik_print("Aborting..\n");
636 /* get all my interface addresses in net byte order */
637 count = rx_getAllAddr(myAddr, UBIK_MAX_INTERFACE_ADDR);
640 if (count <= 0) { /* no address found */
641 ubik_print("ubik: No network addresses found, aborting..");
645 /* verify that the My-address passed in by ubik is correct */
646 for (j = 0, found = 0; j < count; j++) {
647 if (*ame == myAddr[j]) { /* both in net byte order */
654 ubik_print("ubik: primary address %s does not exist\n",
655 afs_inet_ntoa_r(*ame, hoststr));
656 /* if we had the result of rx_getAllAddr already, avoid subverting
657 * the "is gethostbyname(gethostname()) us" check. If we're
658 * using NetInfo/NetRestrict, we assume they have enough clue
659 * to avoid that big hole in their foot from the loaded gun. */
661 /* take the address we did get, then see if ame was masked */
663 tcount = rx_getAllAddr(myAddr2, UBIK_MAX_INTERFACE_ADDR);
664 if (tcount <= 0) { /* no address found */
665 ubik_print("ubik: No network addresses found, aborting..");
669 /* verify that the My-address passed in by ubik is correct */
670 for (j = 0, found = 0; j < tcount; j++) {
671 if (*ame == myAddr2[j]) { /* both in net byte order */
681 /* if any of my addresses are there in serverList, then
682 ** use that as my primary address : the higher level
683 ** application screwed up in dealing with multihomed concepts
685 for (j = 0, found = 0; j < count; j++) {
686 for (i = 0; i < totalServers; i++) {
688 tmpAddr = (afs_uint32) info->hostAddr[i].sin_addr.s_addr;
690 tmpAddr = aservers[i];
691 if (myAddr[j] == tmpAddr) {
700 ubik_print("Using %s as my primary address\n", afs_inet_ntoa_r(*ame, hoststr));
703 /* get rid of servers which were purged because all
704 ** those interface addresses are myself
706 for (start = 0, end = totalServers - 1; (start < end); start++, end--) {
707 /* find the first zero entry from the beginning */
708 for (; (start < end) && (aservers[start]); start++);
710 /* find the last non-zero entry from the end */
711 for (; (end >= 0) && (!aservers[end]); end--);
713 /* if there is nothing more to purge, exit from loop */
718 aservers[start] = aservers[end];
719 aservers[end] = 0; /* this entry was moved */
723 /* update all my addresses in ubik_host in such a way
724 ** that ubik_host[0] has the primary address
727 for (j = 0, i = 1; j < count; j++)
728 if (*ame != myAddr[j])
729 ubik_host[i++] = myAddr[j];
731 return 0; /* return success */
736 * \brief Exchange IP address information with remote servers.
738 * \param ubik_host an array containing all my IP addresses.
740 * Algorithm : Do an RPC to all remote ubik servers informing them
741 * about my IP addresses. Get their IP addresses and
742 * update my linked list of ubik servers \p ubik_servers
744 * \return 0 on success, non-zero on failure
747 updateUbikNetworkAddress(afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR])
749 int j, count, code = 0;
750 UbikInterfaceAddr inAddr, outAddr;
751 struct rx_connection *conns[MAXSERVERS];
752 struct ubik_server *ts, *server[MAXSERVERS];
757 for (count = 0, ts = ubik_servers; ts; count++, ts = ts->next) {
758 conns[count] = ts->disk_rxcid;
764 /* inform all other servers only if there are more than one
765 * database servers in the cell */
769 for (j = 0; j < UBIK_MAX_INTERFACE_ADDR; j++)
770 inAddr.hostAddr[j] = ntohl(ubik_host[j]);
773 /* do the multi-RX RPC to all other servers */
774 multi_Rx(conns, count) {
775 multi_DISK_UpdateInterfaceAddr(&inAddr, &outAddr);
776 ts = server[multi_i]; /* reply received from this server */
779 if (ts->addr[0] != htonl(outAddr.hostAddr[0])) {
781 strcpy(buffer, afs_inet_ntoa_r(ts->addr[0], hoststr));
782 ubik_print("ubik:Two primary addresses for same server \
784 afs_inet_ntoa_r(htonl(outAddr.hostAddr[0]), hoststr));
786 for (j = 1; j < UBIK_MAX_INTERFACE_ADDR; j++)
787 ts->addr[j] = htonl(outAddr.hostAddr[j]);
790 } else if (multi_error == RXGEN_OPCODE) { /* pre 3.5 remote server */
793 ("ubik server %s does not support UpdateInterfaceAddr RPC\n",
794 afs_inet_ntoa_r(ts->addr[0], hoststr));
796 } else if (multi_error == UBADHOST) {
797 code = UBADHOST; /* remote CellServDB inconsistency */
798 ubik_print("Inconsistent Cell Info on server: ");
800 for (j = 0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
801 ubik_print("%s ", afs_inet_ntoa_r(ts->addr[j], hoststr));
806 ts->up = 0; /* mark the remote server as down */
816 ubik_SetClientSecurityProcs(int (*secproc) (void *,
817 struct rx_securityClass **,
819 int (*checkproc) (void *),
822 secLayerProc = secproc;
823 tokenCheckProc = checkproc;