2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
22 #include <sys/socket.h>
23 #include <netinet/in.h>
37 #include <rx/rx_multi.h>
38 #include <afs/cellconfig.h>
40 #include <afs/afsutil.h>
41 #include <afs/netutils.h>
44 #define UBIK_INTERNALS
48 /* statics used to determine if we're the sync site */
49 static afs_int32 syncSiteUntil = 0; /* valid only if amSyncSite */
50 int ubik_amSyncSite = 0; /* flag telling if I'm sync site */
51 static nServers; /* total number of servers */
52 static char amIMagic=0; /* is this host the magic host */
53 char amIClone=0; /* is this a clone which doesn't vote */
54 static char ubik_singleServer = 0;
55 extern struct rx_securityClass *rxnull_NewClientSecurityObject();
56 int (*ubik_CRXSecurityProc)();
57 char *ubik_CRXSecurityRock;
58 afs_int32 ubikSecIndex;
59 struct rx_securityClass *ubikSecClass;
60 static verifyInterfaceAddress();
63 /* Module responsible for both deciding if we're currently the sync site,
64 * and for continuing to collect votes so as to stay sync site.
66 * The basic module contacts all of the servers it can, trying to get them to vote
67 * for this server for sync site. The vote request message (called a beacon message)
68 * also specifies until which time this site claims to be the sync site, if at all, thus enabling
69 * receiving sites to know how long the sync site guarantee is made for.
71 * Each of these beacon messages is thus both a declaration of how long this site will
72 * remain sync site, and an attempt to extend that time by collecting votes for a later
73 * sync site extension.
75 * The voting module is responsible for choosing a reasonable time until which it promises
76 * not to vote for someone else. This parameter (BIG seconds) is not actually passed in
77 * the interface (perhaps it should be?) but is instead a compile time constant that both
80 * The beacon and vote modules work intimately together; the vote module decides how long
81 * it should promise the beacon module its vote, and the beacon module takes all of these
82 * votes and decides for how long it is the synchronization site.
85 /* procedure called from debug rpc call to get this module's state for debugging */
/* Copy this module's private state into the caller-supplied debug record.
 * aparm: ubik_debug structure to fill; the lines visible here write only
 *        its syncSiteUntil and nServers fields.
 * NOTE(review): the line naming this function is not visible in this
 * listing, so its name and return value are not documented here. */
87 register struct ubik_debug *aparm; {
88 /* fill in beacon's state fields in the ubik_debug structure */
89 aparm->syncSiteUntil = syncSiteUntil;
90 aparm->nServers = nServers;
93 /* procedure that determines whether this site has enough current votes to remain sync site.
94 * called from higher-level modules (everything but the vote module).
96 * If we're the sync site, check that our guarantees, obtained by the ubeacon_Interact
97 * light-weight process, haven't expired. We're sync site as long as a majority of the
98 * servers in existence have promised us unexpired guarantees. The variable syncSiteUntil
99 * contains the time at which the latest of the majority of the sync site guarantees expires
100 * (if the variable ubik_amSyncSite is true).
101 * This module also calls up to the recovery module if it thinks that the recovery module
102 * may have to pick up a new database (which occurs if we lose the sync site votes).
/* Decide whether this server may currently act as the sync site.
 *
 * Visible behaviour:
 *  - a lone, non-clone server answers 1 immediately, without consulting
 *    the clock or the recovery module;
 *  - otherwise the answer (rcode) is 0 when ubik_amSyncSite is clear or
 *    this host is a clone, and is based on comparing syncSiteUntil with
 *    FT_ApproxTime() in the remaining case;
 *  - a 0 answer triggers urecovery_ResetState(), and the answer is logged
 *    through ubik_dprint.
 *
 * NOTE(review): several original lines (braces, else arms, the final
 * return) are absent from this listing; presumably rcode is what gets
 * returned -- confirm against the full file. */
104 ubeacon_AmSyncSite() {
105 register afs_int32 now;
106 register afs_int32 rcode;
108 /* special case for fast startup */
109 if (nServers == 1 && !amIClone) {
110 return 1; /* one guy is always the sync site */
113 if (ubik_amSyncSite == 0 || amIClone) rcode = 0; /* if I don't think I'm the sync site, say so */
/* the guarantee counts as expired once syncSiteUntil is not strictly later than now */
115 now = FT_ApproxTime();
116 if (syncSiteUntil <= now) { /* if my votes have expired, say so */
117 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
122 rcode = 1; /* otherwise still have the required votes */
125 if (rcode == 0) urecovery_ResetState(); /* force recovery to re-execute */
126 ubik_dprint("beacon: amSyncSite is %d\n", rcode);
130 /* setup server list; called with two parms, first is my address, second is list of other servers
131 * called only at initialization to set up the list of servers to contact for votes. Just creates
132 * the server structure. Note that there are two connections in every server structure, one for
133 * vote calls (which must always go through quickly) and one for database operations, which
134 * are subject to waiting for locks. If we used only one, the votes would sometimes get
135 * held up behind database operations, and the sync site guarantees would timeout
136 * even though the host would be up for communication.
138 * The "magic" host is the one with the lowest internet address. It is
139 * magic because its vote counts epsilon more than the others. This acts
140 * as a tie-breaker when we have an even number of hosts in the system.
141 * For example, if the "magic" host is up in a 2 site system, then it
142 * is sync site. Without the magic host hack, if anyone crashed in a 2
143 * site system, we'd be out of business.
/* Build the server list from a cell configuration record.
 * Forwards ame, info and clones to ubeacon_InitServerListCommon, passing 0
 * in place of an explicit address array.
 * NOTE(review): the declarations of ame/clones/code and the return
 * statement are on lines not visible in this listing. */
145 ubeacon_InitServerListByInfo(ame, info, clones)
147 struct afsconf_cell *info;
152 code = ubeacon_InitServerListCommon(ame, info, clones, 0);
/* Build the server list from an explicit array of server addresses.
 * Calls ubeacon_InitServerListCommon with a null cell-info pointer and 0
 * for the clones argument.
 * NOTE(review): the final argument of that call is on a continuation line
 * not visible here; presumably it is aservers -- confirm. */
156 ubeacon_InitServerList(ame, aservers)
158 register afs_int32 aservers[];
162 code = ubeacon_InitServerListCommon(ame, (struct afsconf_cell *)0, 0,
/* Shared worker that creates one ubik_server record per remote server.
 *
 * ame      - this host's address (converted with ntohl for comparisons)
 * info     - cell configuration; its hostAddr[]/numServers drive the first
 *            pair of loops below
 * clones   - forwarded by the callers; its use is not visible in this listing
 * aservers - zero-terminated address array used by the while loop below
 *
 * Visible steps:
 *  1. verifyInterfaceAddress() checks the addresses that were passed in.
 *  2. The security class/index come from ubik_CRXSecurityProc when that hook
 *     is set; rxnull_NewClientSecurityObject() supplies the class otherwise.
 *  3. Each remote server gets a zeroed ubik_server with two rx connections,
 *     one stored in vote_rxcid and one in disk_rxcid.
 *  4. The numerically lowest host-order address seen is tracked in magicHost;
 *     afterwards either that server's magic flag or amIMagic is set.
 *  5. ubik_quorum is set to (nServers / 2) + 1 and this host's addresses are
 *     sent to the other servers via updateUbikNetworkAddress().
 *  6. With no remote servers the module marks itself sync site with
 *     syncSiteUntil = 0x7fffffff.
 *
 * Returns UNHOSTS from the aservers loop once i reaches MAXSERVERS; other
 * return paths are on lines not visible here.
 *
 * NOTE(review): both malloc() results are handed to memset() without a NULL
 * check in the visible lines -- confirm whether allocation failure is handled.
 * NOTE(review): which of the two loops runs, and how ubik_servers is updated
 * after ts->next is set, is decided on lines absent from this listing. */
167 ubeacon_InitServerListCommon(ame, info, clones, aservers)
169 struct afsconf_cell *info;
171 register afs_int32 aservers[];
173 register struct ubik_server *ts;
175 register afs_int32 servAddr;
176 register afs_int32 i, code;
178 struct ubik_server *magicServer;
180 /* verify that the addresses passed in are correct */
181 if ((code = verifyInterfaceAddress(&ame, info, aservers)))
184 /* get the security index to use, if we can */
185 if (ubik_CRXSecurityProc) {
186 i = (*ubik_CRXSecurityProc)(ubik_CRXSecurityRock, &ubikSecClass, &ubikSecIndex);
190 /* don't have sec module yet */
192 ubikSecClass = rxnull_NewClientSecurityObject();
194 magicHost = ntohl(ame); /* do comparisons in host order */
195 magicServer = (struct ubik_server *) 0;
/* cell-info path: first compare every configured address with ame ... */
198 for (i = 0; i < info->numServers; i++) {
199 if (ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr) ==
200 ntohl((afs_uint32) ame)) {
/* ... then create a record for every entry except index 'me' */
209 for (i = 0; i < info->numServers; i++) {
210 if (i == me) continue;
211 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
212 memset(ts, 0, sizeof(struct ubik_server));
213 ts->next = ubik_servers;
215 ts->addr[0] = info->hostAddr[i].sin_addr.s_addr;
220 ntohl((afs_uint32) ts->addr[0]) < (afs_uint32) magicHost) {
221 magicHost = ntohl(ts->addr[0]);
227 ts->vote_rxcid = rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
230 ubikSecClass, ubikSecIndex);
232 ts->disk_rxcid = rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
234 DISK_SERVICE_ID, ubikSecClass,
/* explicit-list path: walk aservers until its zero terminator */
240 while ((servAddr = *aservers++)) {
241 if (i >= MAXSERVERS) return UNHOSTS; /* too many hosts */
242 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
243 memset(ts, 0, sizeof(struct ubik_server));
244 ts->next = ubik_servers;
246 ts->addr[0] = servAddr; /* primary address in net byte order */
247 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
249 ubikSecClass, ubikSecIndex); /* for vote reqs */
250 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
251 DISK_SERVICE_ID, ubikSecClass,
252 ubikSecIndex); /* for disk reqs */
253 ts->isClone = 0; /* don't know about clones */
255 if (ntohl((afs_uint32) servAddr) < (afs_uint32) magicHost) {
256 magicHost = ntohl(servAddr);
262 if (magicServer) magicServer->magic = 1; /* remember for when counting votes */
264 if (!amIClone && !magicServer) amIMagic = 1;
267 ++nServers; /* count this server as well as the remotes */
269 nServers = i+1; /* count this server as well as the remotes */
271 ubik_quorum = (nServers>>1)+1; /* compute the majority figure */
272 /* send addrs to all other servers */
273 code = updateUbikNetworkAddress(ubik_host);
277 /* Should we set some defaults for RX??
279 r_nRetries = (RPCTIMEOUT/r_retryInterval);
282 if (!ubik_servers) /* special case 1 server */
283 ubik_singleServer = 1;
284 if (nServers == 1 && !amIClone) {
285 ubik_amSyncSite = 1; /* let's start as sync site */
286 syncSiteUntil = 0x7fffffff; /* and be it quite a while */
289 if (nServers == 1) /* special case 1 server */
290 ubik_singleServer = 1;
293 if (ubik_singleServer) {
294 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site - 1 server\n");
296 syncSiteUntil = 0x7fffffff; /* quite a while */
301 /* main lwp loop for code that sends out beacons. This code only runs while
302 * we're sync site or we want to be the sync site. It runs in its very own light-weight
/* Body of the beacon loop: repeatedly asks the other servers for their vote
 * and updates this host's sync-site status from the replies.
 *
 * Visible per-iteration steps:
 *  1. Compute how long remains until lastWakeupTime + POLLTIME, cap it at
 *     POLLTIME, and wait in IOMGR_Select().
 *  2. Skip the round when ubik_singleServer is set or uvote_ShouldIRun()
 *     returns false.
 *  3. Gather vote_rxcid for every server that is marked up and whose primary
 *     address differs from ubik_host[0]; servers[] is terminated with a null.
 *  4. Build ttid from ubik_epochTime plus writeTidCounter (when DBWRITING is
 *     set in ubik_dbase->flags) or tidCounter + 1.
 *  5. Issue VOTE_Beacon through multi_Rx and update lastBeaconSent,
 *     lastVoteTime, up and beaconSinceDown on each server from the reply;
 *     the oldest vote time is kept in oldestYesVote, and
 *     urecovery_LostServer() is called on the time-out path.
 *  6. Ask the local voter through SVOTE_Beacon with a null connection.
 *  7. If yesVotes exceeds nServers, set syncSiteUntil to
 *     oldestYesVote + SMALLTIME and signal ubik_amSyncSite with
 *     LWP_NoYieldSignal(); on the other path, log the loss and call
 *     urecovery_ResetState().
 *
 * NOTE(review): the function header, the loop statement, the declarations of
 * tt/startTime, the base yesVotes increments, and the assignments to
 * ubik_amSyncSite are on lines absent from this listing.
 * NOTE(review): connections[] and servers[] hold MAXSERVERS entries and the
 * terminator is stored at servers[i] after the gather loop; if MAXSERVERS
 * servers can qualify, that store is one past the end -- confirm the limit
 * enforced when the list is built. */
306 register afs_int32 code;
308 struct rx_connection *connections[MAXSERVERS];
309 struct ubik_server *servers[MAXSERVERS];
310 register afs_int32 i;
311 register struct ubik_server *ts;
312 afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
313 struct ubik_tid ttid;
316 /* loop forever getting votes */
317 lastWakeupTime = 0; /* keep track of time we last started a vote collection */
320 /* don't wakeup more than every POLLTIME seconds */
321 temp = (lastWakeupTime + POLLTIME) - FT_ApproxTime();
322 /* don't sleep if last collection phase took too long (probably timed someone out ) */
324 if (temp > POLLTIME) temp = POLLTIME;
327 code = IOMGR_Select(0, 0, 0, 0, &tt);
331 lastWakeupTime = FT_ApproxTime(); /* started a new collection phase */
333 if (ubik_singleServer) continue; /* special-case 1 server for speedy startup */
335 if (!uvote_ShouldIRun()) continue; /* if voter has heard from a better candidate than us, don't bother running */
337 /* otherwise we should run for election, or we're the sync site (and have already won);
338 send out the beacon packets */
339 /* build list of all up hosts (noticing dead hosts are running again
340 is a task for the recovery module, not the beacon module), and
341 prepare to send them an r multi-call containing the beacon message */
342 i = 0; /* collect connections */
343 for(ts = ubik_servers; ts; ts=ts->next) {
344 if (ts->up && ts->addr[0] != ubik_host[0]) {
346 connections[i++] = ts->vote_rxcid;
349 servers[i] = (struct ubik_server *) 0; /* end of list */
350 /* note that we assume in the vote module that we'll always get at least BIGTIME
351 seconds of vote from anyone who votes for us, which means we can conservatively
352 assume we'll be fine until SMALLTIME seconds after we start collecting votes */
353 /* this next is essentially an expansion of rgen's ServBeacon routine */
355 ttid.epoch = ubik_epochTime;
356 if (ubik_dbase->flags & DBWRITING) {
358 * if a write is in progress, we have to send the writeTidCounter
359 * which holds the tid counter of the write transaction , and not
360 * send the tidCounter value which holds the tid counter of the
363 ttid.counter = ubik_dbase->writeTidCounter;
366 ttid.counter = ubik_dbase->tidCounter+1;
368 /* now analyze return codes, counting up our votes */
369 yesVotes = 0; /* count how many to ensure we have quorum */
370 oldestYesVote = 0x3fffffff; /* time quorum expires */
371 syncsite= ubeacon_AmSyncSite();
372 startTime = FT_ApproxTime();
374 * Don't waste time using mult Rx calls if there are no connections out there
377 multi_Rx(connections, i) {
378 multi_VOTE_Beacon(syncsite, startTime, &ubik_dbase->version, &ttid);
379 temp = FT_ApproxTime(); /* now, more or less */
380 ts = servers[multi_i];
381 ts->lastBeaconSent = temp;
383 /* note that the vote time (the return code) represents the time
384 the vote was computed, *not* the time the vote expires. We compute
385 the latter down below if we got enough votes to go with */
387 ts->lastVoteTime = code;
388 if (code < oldestYesVote) oldestYesVote = code;
392 if (ts->magic) yesVotes++; /* the extra epsilon */
393 ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
394 ts->beaconSinceDown = 1;
395 ubik_dprint("yes vote from host %s\n",afs_inet_ntoa(ts->addr[0]));
397 else if (code == 0) {
398 ts->lastVoteTime = temp;
400 ts->beaconSinceDown = 1;
401 ubik_dprint("no vote from %s\n", afs_inet_ntoa(ts->addr[0]));
405 ts->beaconSinceDown = 0;
406 urecovery_LostServer();
407 ubik_dprint("time out from %s\n", afs_inet_ntoa(ts->addr[0]));
411 /* now call our own voter module to see if we'll vote for ourself. Note that
412 the same restrictions apply for our voting for ourself as for our voting
414 i = SVOTE_Beacon((struct rx_connection *) 0, ubeacon_AmSyncSite(), startTime, &ubik_dbase->version, &ttid);
417 if (amIMagic) yesVotes++; /* extra epsilon */
418 if (i < oldestYesVote) oldestYesVote = i;
421 /* now decide if we have enough votes to become sync site.
422 Note that we can still get enough votes even if we didn't for ourself. */
423 if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
424 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site\n");
426 syncSiteUntil = oldestYesVote + SMALLTIME;
427 LWP_NoYieldSignal(&ubik_amSyncSite);
430 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
432 urecovery_ResetState(); /* tell recovery we're no longer the sync site */
439 * Input Param : ame is the pointer to my IP address specified in the
440 * CellServDB file. aservers is an array containing IP
441 * addresses of remote ubik servers. The array is
442 * terminated by a zero address.
444 * Algorithm : Verify that my IP address 'ame' actually exists
445 * on this machine. If any of my IP addresses are there
446 * in the remote server list 'aservers', remove them from
447 * this list. Update global variable ubik_host[] with
450 * Return Values : 0 on success, non-zero on failure
/* Check that *ame is one of this machine's interface addresses, drop this
 * machine's own addresses from the server list, and record all local
 * addresses in ubik_host[].
 *
 * Visible steps:
 *  1. totalServers comes from info->numServers, or from counting aservers up
 *     to its zero terminator.
 *  2. The local address list myAddr[] is obtained from rxi_getaddr(),
 *     parseNetFiles() or rx_getAllAddr(); the conditions choosing between
 *     them are on lines not visible in this listing.
 *  3. *ame is searched for in myAddr[]; a diagnostic is printed when the
 *     primary address does not exist locally.
 *  4. Every local address is compared against every server-list entry.
 *  5. aservers is compacted by moving trailing non-zero entries into zeroed
 *     slots.
 *  6. ubik_host[1..] receives every local address other than *ame.
 *
 * Returns 0 on the success path shown at the end; failure returns are on
 * lines absent from this listing.
 *
 * NOTE(review): in step 4 the cell-info address is converted with ntohl()
 * before being compared with myAddr[j], while the surrounding comments
 * describe myAddr[] as network byte order and the aservers[] value is
 * compared unconverted. On little-endian hosts the info-path comparison
 * looks like it can never match -- confirm the intended byte order. */
452 static verifyInterfaceAddress(ame, info, aservers)
453 afs_uint32 *ame; /* one of my interface addr in net byte order */
454 struct afsconf_cell *info;
455 afs_uint32 aservers[]; /* list of all possible server addresses */
457 afs_uint32 myAddr[UBIK_MAX_INTERFACE_ADDR], *servList, tmpAddr;
458 int count, found, i, j, totalServers, start, end;
461 totalServers = info->numServers;
462 else { /* count the number of servers */
463 for ( totalServers=0, servList = aservers; *servList; servList++)
468 /* for now use getaddr(). use getAllAddr when implemented */
469 myAddr[0] = rxi_getaddr();
470 count = (myAddr[0] != 0);
472 if(AFSDIR_SERVER_NETRESTRICT_FILEPATH || AFSDIR_SERVER_NETINFO_FILEPATH) {
474 * Find addresses we are supposed to register as per the netrestrict file
475 * if it exists, else just register all the addresses we find on this
476 * host as returned by rx_getAllAddr (in NBO)
479 count=parseNetFiles(myAddr,NULL,NULL,UBIK_MAX_INTERFACE_ADDR,
480 reason,AFSDIR_SERVER_NETINFO_FILEPATH,
481 AFSDIR_SERVER_NETRESTRICT_FILEPATH);
483 ubik_print("ubik: Can't register any valid addresses:%s\n",reason);
484 ubik_print("Aborting..\n");
489 /* get all my interface addresses in net byte order */
490 count = rx_getAllAddr(myAddr, UBIK_MAX_INTERFACE_ADDR);
494 if ( count <= 0 ) /* no address found */
496 ubik_print("ubik: No network addresses found, aborting..");
500 /* verify that the My-address passed in by ubik is correct */
501 for ( j=0, found = 0; j < count; j++)
503 if ( *ame == myAddr[j] ) /* both in net byte order */
512 ubik_print("ubik: primary address %s does not exist\n",
513 afs_inet_ntoa(*ame));
517 /* if any of my addresses are there in serverList, then
518 ** use that as my primary addresses : the higher level
519 ** application screwed up in dealing with multihomed concepts
521 for ( j=0, found = 0; j < count; j++)
523 for ( i=0; i < totalServers; i++) {
525 tmpAddr = ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr);
527 tmpAddr = aservers[i];
528 if ( myAddr[j] == tmpAddr) {
537 ubik_print("Using %s as my primary address\n", afs_inet_ntoa(*ame) );
540 /* get rid of servers which were purged because all
541 ** those interface addresses are myself
543 for ( start=0, end=totalServers-1; (start<end) ; start++, end--)
545 /* find the first zero entry from the beginning */
546 for ( ; (start < end) && ( aservers[start] ); start++);
548 /* find the last non-zero entry from the end */
549 for ( ; (end >= 0) && ( !aservers[end] ); end-- );
551 /* if there is nothing more to purge, exit from loop */
552 if ( start >= end ) break;
555 aservers[start] = aservers[end];
556 aservers[end] = 0; /* this entry was moved */
560 /* update all my addresses in ubik_host in such a way
561 ** that ubik_host[0] has the primary address
564 for ( j=0, i=1; j < count; j++)
565 if ( *ame != myAddr[j] )
566 ubik_host[i++] = myAddr[j];
568 return 0; /* return success */
573 * Input Param : ubik_host is an array containing all my IP addresses.
575 * Algorithm : Do an RPC to all remote ubik servers informing them
576 * about my IP addresses. Get their IP addresses and
577 * update my linked list of ubik servers 'ubik_servers'
579 * Return Values : 0 on success, non-zero on failure
/* Exchange interface-address lists with every other ubik server.
 *
 * ubik_host - this host's addresses (the parameter hides the file-scope
 *             array of the same name inside this function).
 *
 * Visible steps:
 *  1. Walk ubik_servers, collecting each server's disk_rxcid into conns[].
 *  2. Copy ubik_host[] into inAddr.hostAddr[], converting each entry with
 *     ntohl().
 *  3. Issue DISK_UpdateInterfaceAddr to all collected connections through
 *     multi_Rx.
 *  4. Per reply:
 *     - no error: warn if the reported primary address differs from
 *       ts->addr[0], then store the remaining reported addresses in
 *       ts->addr[1..];
 *     - RXGEN_OPCODE: log that the remote server lacks this RPC;
 *     - UBADHOST: set code to UBADHOST and print the server's addresses.
 *
 * `code` starts at 0 and is set to UBADHOST on that error; the return
 * statement itself is past the end of this listing.
 *
 * NOTE(review): the collection loop shows no bound against MAXSERVERS, the
 * size of conns[] and server[] -- confirm the list length is limited
 * elsewhere.
 * NOTE(review): the strcpy() into `buffer` is unbounded, and the buffer's
 * declaration is not visible here -- confirm it is large enough for the
 * text returned by afs_inet_ntoa().
 * NOTE(review): the line filling server[] and the condition guarding the
 * final `ts->up= 0` are absent from this listing. */
582 updateUbikNetworkAddress(ubik_host)
583 afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR];
585 int j, count, code = 0;
586 UbikInterfaceAddr inAddr, outAddr;
587 struct rx_connection *conns[MAXSERVERS];
588 struct ubik_server *ts, *server[MAXSERVERS];
591 for ( count = 0, ts=ubik_servers; ts; count++, ts = ts->next )
593 conns[count] = ts->disk_rxcid;
598 /* inform all other servers only if there are more than one
599 database servers in the cell */
603 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR; j++)
604 inAddr.hostAddr[j] = ntohl(ubik_host[j]);
607 /* do the multi-RX RPC to all other servers */
608 multi_Rx(conns, count) {
609 multi_DISK_UpdateInterfaceAddr(&inAddr, &outAddr);
610 ts = server[multi_i]; /* reply received from this server */
611 if ( !multi_error ) {
612 if ( ts->addr[0] != htonl(outAddr.hostAddr[0]) ) {
614 strcpy(buffer, (char*)afs_inet_ntoa(ts->addr[0]));
615 ubik_print("ubik:Two primary addresses for same server \
616 %s %s\n", buffer, afs_inet_ntoa(htonl(outAddr.hostAddr[0])));
619 for ( j=1; j < UBIK_MAX_INTERFACE_ADDR; j++)
620 ts->addr[j] = htonl(outAddr.hostAddr[j]);
623 else if ( multi_error == RXGEN_OPCODE ) {/* pre 3.5 remote server */
624 ubik_print("ubik server %s does not support UpdateInterfaceAddr RPC\n", afs_inet_ntoa(ts->addr[0]));
626 else if ( multi_error == UBADHOST ) {
627 code = UBADHOST; /* remote CellServDB inconsistency */
628 ubik_print("Inconsistent Cell Info on server: ");
629 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
630 ubik_print("%s ", afs_inet_ntoa(ts->addr[j]));
634 ts->up= 0; /* mark the remote server as down */