2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
22 #include <sys/socket.h>
23 #include <netinet/in.h>
30 #include <rx/rx_multi.h>
31 #include <afs/cellconfig.h>
33 #include <afs/afsutil.h>
34 #include <afs/netutils.h>
37 #define UBIK_INTERNALS
41 /* statics used to determine if we're the sync site */
42 static afs_int32 syncSiteUntil = 0; /* valid only if amSyncSite */
43 int ubik_amSyncSite = 0; /* flag telling if I'm sync site */
44 static nServers; /* total number of servers */
45 static char amIMagic=0; /* is this host the magic host */
46 char amIClone=0; /* is this a clone which doesn't vote */
47 static char ubik_singleServer = 0;
48 extern struct rx_securityClass *rxnull_NewClientSecurityObject();
49 int (*ubik_CRXSecurityProc)();
50 char *ubik_CRXSecurityRock;
51 afs_int32 ubikSecIndex;
52 struct rx_securityClass *ubikSecClass;
55 /* Module responsible both for deciding if we're currently the sync site,
56 * and for continuing to collect votes so as to stay sync site.
58 * The basic module contacts all of the servers it can, trying to get them to vote
59 * for this server for sync site. The vote request message (called a beacon message)
60 * also specifies until which time this site claims to be the sync site, if at all, thus enabling
61 * receiving sites to know how long the sync site guarantee is made for.
63 * Each of these beacon messages is thus both a declaration of how long this site will
64 * remain sync site, and an attempt to extend that time by collecting votes for a later
65 * sync site extension.
67 * The voting module is responsible for choosing a reasonable time until which it promises
68 * not to vote for someone else. This parameter (BIG seconds) is not actually passed in
69 * the interface (perhaps it should be?) but is instead a compile time constant that both
72 * The beacon and vote modules work intimately together; the vote module decides how long
73 * it should promise the beacon module its vote, and the beacon module takes all of these
74 * votes and decides for how long it is the synchronization site.
77 /* procedure called from debug rpc call to get this module's state for debugging */
79 register struct ubik_debug *aparm; {
80 /* fill in beacon's state fields in the ubik_debug structure */
81 aparm->syncSiteUntil = syncSiteUntil;
82 aparm->nServers = nServers;
85 /* procedure that determines whether this site has enough current votes to remain sync site.
86 * called from higher-level modules (everything but the vote module).
88 * If we're the sync site, check that our guarantees, obtained by the ubeacon_Interact
89 * light-weight process, haven't expired. We're sync site as long as a majority of the
90 * servers in existence have promised us unexpired guarantees. The variable syncSiteUntil
91 * contains the time at which the latest of the majority of the sync site guarantees expires
92 * (if the variable ubik_amSyncSite is true)
93 * This module also calls up to the recovery module if it thinks that the recovery module
94 * may have to pick up a new database (which occurs if we lose the sync site votes).
96 ubeacon_AmSyncSite() {
97 register afs_int32 now;
98 register afs_int32 rcode;
100 /* special case for fast startup */
101 if (nServers == 1 && !amIClone) {
102 return 1; /* one guy is always the sync site */
105 if (ubik_amSyncSite == 0 || amIClone) rcode = 0; /* if I don't think I'm the sync site, say so */
107 now = FT_ApproxTime();
108 if (syncSiteUntil <= now) { /* if my votes have expired, say so */
109 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
114 rcode = 1; /* otherwise still have the required votes */
117 if (rcode == 0) urecovery_ResetState(); /* force recovery to re-execute */
118 ubik_dprint("beacon: amSyncSite is %d\n", rcode);
122 /* setup server list; called with two parms, first is my address, second is list of other servers
123 * called only at initialization to set up the list of servers to contact for votes. Just creates
124 * the server structure. Note that there are two connections in every server structure, one for
125 * vote calls (which must always go through quickly) and one for database operations, which
126 * are subject to waiting for locks. If we used only one, the votes would sometimes get
127 * held up behind database operations, and the sync site guarantees would timeout
128 * even though the host would be up for communication.
130 * The "magic" host is the one with the lowest internet address. It is
131 * magic because its vote counts epsilon more than the others. This acts
132 * as a tie-breaker when we have an even number of hosts in the system.
133 * For example, if the "magic" host is up in a 2 site system, then it
134 * is sync site. Without the magic host hack, if anyone crashed in a 2
135 * site system, we'd be out of business.
137 ubeacon_InitServerListByInfo(ame, info, clones)
139 struct afsconf_cell *info;
144 code = ubeacon_InitServerListCommon(ame, info, clones, 0);
148 ubeacon_InitServerList(ame, aservers)
150 register afs_int32 aservers[];
154 code = ubeacon_InitServerListCommon(ame, (struct afsconf_cell *)0, 0,
159 ubeacon_InitServerListCommon(ame, info, clones, aservers)
161 struct afsconf_cell *info;
163 register afs_int32 aservers[];
165 register struct ubik_server *ts;
167 register afs_int32 servAddr;
168 register afs_int32 i, code;
170 struct ubik_server *magicServer;
172 /* verify that the addresses passed in are correct */
173 if (code = verifyInterfaceAddress(&ame, info, aservers))
176 /* get the security index to use, if we can */
177 if (ubik_CRXSecurityProc) {
178 i = (*ubik_CRXSecurityProc)(ubik_CRXSecurityRock, &ubikSecClass, &ubikSecIndex);
182 /* don't have sec module yet */
184 ubikSecClass = rxnull_NewClientSecurityObject();
186 magicHost = ntohl(ame); /* do comparisons in host order */
187 magicServer = (struct ubik_server *) 0;
190 for (i = 0; i < info->numServers; i++) {
191 if (ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr) ==
192 ntohl((afs_uint32) ame)) {
201 for (i = 0; i < info->numServers; i++) {
202 if (i == me) continue;
203 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
204 memset(ts, 0, sizeof(struct ubik_server));
205 ts->next = ubik_servers;
207 ts->addr[0] = info->hostAddr[i].sin_addr.s_addr;
212 ntohl((afs_uint32) ts->addr[0]) < (afs_uint32) magicHost) {
213 magicHost = ntohl(ts->addr[0]);
219 ts->vote_rxcid = rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
222 ubikSecClass, ubikSecIndex);
224 ts->disk_rxcid = rx_NewConnection(info->hostAddr[i].sin_addr.s_addr,
226 DISK_SERVICE_ID, ubikSecClass,
232 while (servAddr = *aservers++) {
233 if (i >= MAXSERVERS) return UNHOSTS; /* too many hosts */
234 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
235 memset(ts, 0, sizeof(struct ubik_server));
236 ts->next = ubik_servers;
238 ts->addr[0] = servAddr; /* primary address in net byte order */
239 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
241 ubikSecClass, ubikSecIndex); /* for vote reqs */
242 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
243 DISK_SERVICE_ID, ubikSecClass,
244 ubikSecIndex); /* for disk reqs */
245 ts->isClone = 0; /* don't know about clones */
247 if (ntohl((afs_uint32) servAddr) < (afs_uint32) magicHost) {
248 magicHost = ntohl(servAddr);
254 if (magicServer) magicServer->magic = 1; /* remember for when counting votes */
256 if (!amIClone && !magicServer) amIMagic = 1;
259 ++nServers; /* count this server as well as the remotes */
261 nServers = i+1; /* count this server as well as the remotes */
263 ubik_quorum = (nServers>>1)+1; /* compute the majority figure */
264 /* send addrs to all other servers */
265 code = updateUbikNetworkAddress(ubik_host);
269 /* Should we set some defaults for RX??
271 r_nRetries = (RPCTIMEOUT/r_retryInterval);
274 if (!ubik_servers) /* special case 1 server */
275 ubik_singleServer = 1;
276 if (nServers == 1 && !amIClone) {
277 ubik_amSyncSite = 1; /* let's start as sync site */
278 syncSiteUntil = 0x7fffffff; /* and be it quite a while */
281 if (nServers == 1) /* special case 1 server */
282 ubik_singleServer = 1;
285 if (ubik_singleServer) {
286 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site - 1 server\n");
288 syncSiteUntil = 0x7fffffff; /* quite a while */
293 /* main lwp loop for code that sends out beacons. This code only runs while
294 * we're sync site or we want to be the sync site. It runs in its very own light-weight
298 register afs_int32 code;
300 struct rx_connection *connections[MAXSERVERS];
301 struct ubik_server *servers[MAXSERVERS];
302 register afs_int32 i;
303 register struct ubik_server *ts;
304 afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
305 struct ubik_tid ttid;
308 /* loop forever getting votes */
309 lastWakeupTime = 0; /* keep track of time we last started a vote collection */
312 /* don't wakeup more than every POLLTIME seconds */
313 temp = (lastWakeupTime + POLLTIME) - FT_ApproxTime();
314 /* don't sleep if last collection phase took too long (probably timed someone out ) */
316 if (temp > POLLTIME) temp = POLLTIME;
319 code = IOMGR_Select(0, 0, 0, 0, &tt);
323 lastWakeupTime = FT_ApproxTime(); /* started a new collection phase */
325 if (ubik_singleServer) continue; /* special-case 1 server for speedy startup */
327 if (!uvote_ShouldIRun()) continue; /* if voter has heard from a better candidate than us, don't bother running */
329 /* otherwise we should run for election, or we're the sync site (and have already won);
330 send out the beacon packets */
331 /* build list of all up hosts (noticing dead hosts are running again
332 is a task for the recovery module, not the beacon module), and
333 prepare to send them an r multi-call containing the beacon message */
334 i = 0; /* collect connections */
335 for(ts = ubik_servers; ts; ts=ts->next) {
336 if (ts->up && ts->addr[0] != ubik_host[0]) {
338 connections[i++] = ts->vote_rxcid;
341 servers[i] = (struct ubik_server *) 0; /* end of list */
342 /* note that we assume in the vote module that we'll always get at least BIGTIME
343 seconds of vote from anyone who votes for us, which means we can conservatively
344 assume we'll be fine until SMALLTIME seconds after we start collecting votes */
345 /* this next is essentially an expansion of rgen's ServBeacon routine */
347 ttid.epoch = ubik_epochTime;
348 if (ubik_dbase->flags & DBWRITING) {
350 * if a write is in progress, we have to send the writeTidCounter
351 * which holds the tid counter of the write transaction , and not
352 * send the tidCounter value which holds the tid counter of the
355 ttid.counter = ubik_dbase->writeTidCounter;
358 ttid.counter = ubik_dbase->tidCounter+1;
360 /* now analyze return codes, counting up our votes */
361 yesVotes = 0; /* count how many to ensure we have quorum */
362 oldestYesVote = 0x3fffffff; /* time quorum expires */
363 syncsite= ubeacon_AmSyncSite();
364 startTime = FT_ApproxTime();
366 * Don't waste time using mult Rx calls if there are no connections out there
369 multi_Rx(connections, i) {
370 multi_VOTE_Beacon(syncsite, startTime, &ubik_dbase->version, &ttid);
371 temp = FT_ApproxTime(); /* now, more or less */
372 ts = servers[multi_i];
373 ts->lastBeaconSent = temp;
375 /* note that the vote time (the return code) represents the time
376 the vote was computed, *not* the time the vote expires. We compute
377 the latter down below if we got enough votes to go with */
379 ts->lastVoteTime = code;
380 if (code < oldestYesVote) oldestYesVote = code;
384 if (ts->magic) yesVotes++; /* the extra epsilon */
385 ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
386 ts->beaconSinceDown = 1;
387 ubik_dprint("yes vote from host %s\n",afs_inet_ntoa(ts->addr[0]));
389 else if (code == 0) {
390 ts->lastVoteTime = temp;
392 ts->beaconSinceDown = 1;
393 ubik_dprint("no vote from %s\n", afs_inet_ntoa(ts->addr[0]));
397 ts->beaconSinceDown = 0;
398 urecovery_LostServer();
399 ubik_dprint("time out from %s\n", afs_inet_ntoa(ts->addr[0]));
403 /* now call our own voter module to see if we'll vote for ourself. Note that
404 the same restrictions apply for our voting for ourself as for our voting
406 i = SVOTE_Beacon((struct rx_connection *) 0, ubeacon_AmSyncSite(), startTime, &ubik_dbase->version, &ttid);
409 if (amIMagic) yesVotes++; /* extra epsilon */
410 if (i < oldestYesVote) oldestYesVote = i;
413 /* now decide if we have enough votes to become sync site.
414 Note that we can still get enough votes even if we didn't for ourself. */
415 if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
416 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site\n");
418 syncSiteUntil = oldestYesVote + SMALLTIME;
419 LWP_NoYieldSignal(&ubik_amSyncSite);
422 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
424 urecovery_ResetState(); /* tell recovery we're no longer the sync site */
431 * Input Param : ame is the pointer to my IP address specified in the
432 * CellServDB file. aservers is an array containing IP
433 * addresses of remote ubik servers. The array is
434 * terminated by a zero address.
436 * Algorithm : Verify that my IP address 'ame' actually exists
437 * on this machine. If any of my IP addresses are there
438 * in the remote server list 'aservers', remove them from
439 * this list. Update global variable ubik_host[] with
442 * Return Values : 0 on success, non-zero on failure
444 static verifyInterfaceAddress(ame, info, aservers)
445 afs_uint32 *ame; /* one of my interface addr in net byte order */
446 struct afsconf_cell *info;
447 afs_uint32 aservers[]; /* list of all possible server addresses */
449 afs_uint32 myAddr[UBIK_MAX_INTERFACE_ADDR], *servList, tmpAddr;
450 int count, found, i, j, totalServers, start, end;
453 totalServers = info->numServers;
454 else { /* count the number of servers */
455 for ( totalServers=0, servList = aservers; *servList; servList++)
460 /* for now use getaddr(). use getAllAddr when implemented */
461 myAddr[0] = rxi_getaddr();
462 count = (myAddr[0] != 0);
464 if(AFSDIR_SERVER_NETRESTRICT_FILEPATH || AFSDIR_SERVER_NETINFO_FILEPATH) {
466 * Find addresses we are supposed to register as per the netrestrict file
467 * if it exists, else just register all the addresses we find on this
468 * host as returned by rx_getAllAddr (in NBO)
471 count=parseNetFiles(myAddr,NULL,NULL,UBIK_MAX_INTERFACE_ADDR,
472 reason,AFSDIR_SERVER_NETINFO_FILEPATH,
473 AFSDIR_SERVER_NETRESTRICT_FILEPATH);
475 ubik_print("ubik: Can't register any valid addresses:%s\n",reason);
476 ubik_print("Aborting..\n");
481 /* get all my interface addresses in net byte order */
482 count = rx_getAllAddr(myAddr, UBIK_MAX_INTERFACE_ADDR);
486 if ( count <= 0 ) /* no address found */
488 ubik_print("ubik: No network addresses found, aborting..");
492 /* verify that the My-address passed in by ubik is correct */
493 for ( j=0, found = 0; j < count; j++)
495 if ( *ame == myAddr[j] ) /* both in net byte order */
504 ubik_print("ubik: primary address %s does not exist\n",
505 afs_inet_ntoa(*ame));
509 /* if any of my addresses are there in serverList, then
510 ** use that as my primary addresses : the higher level
511 ** application screwed up in dealing with multihomed concepts
513 for ( j=0, found = 0; j < count; j++)
515 for ( i=0; i < totalServers; i++) {
517 tmpAddr = ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr);
519 tmpAddr = aservers[i];
520 if ( myAddr[j] == tmpAddr) {
529 ubik_print("Using %s as my primary address\n", afs_inet_ntoa(*ame) );
532 /* get rid of servers which were purged because all
533 ** those interface addresses are myself
535 for ( start=0, end=totalServers-1; (start<end) ; start++, end--)
537 /* find the first zero entry from the beginning */
538 for ( ; (start < end) && ( aservers[start] ); start++);
540 /* find the last non-zero entry from the end */
541 for ( ; (end >= 0) && ( !aservers[end] ); end-- );
543 /* if there is nothing more to purge, exit from loop */
544 if ( start >= end ) break;
547 aservers[start] = aservers[end];
548 aservers[end] = 0; /* this entry was moved */
552 /* update all my addresses in ubik_host in such a way
553 ** that ubik_host[0] has the primary address
556 for ( j=0, i=1; j < count; j++)
557 if ( *ame != myAddr[j] )
558 ubik_host[i++] = myAddr[j];
560 return 0; /* return success */
565 * Input Param : ubik_host is an array containing all my IP addresses.
567 * Algorithm : Do an RPC to all remote ubik servers informing them
568 * about my IP addresses. Get their IP addresses and
569 * update my linked list of ubik servers 'ubik_servers'
571 * Return Values : 0 on success, non-zero on failure
574 updateUbikNetworkAddress(ubik_host)
575 afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR];
577 int j, count, code = 0;
578 UbikInterfaceAddr inAddr, outAddr;
579 struct rx_connection *conns[MAXSERVERS];
580 struct ubik_server *ts, *server[MAXSERVERS];
583 for ( count = 0, ts=ubik_servers; ts; count++, ts = ts->next )
585 conns[count] = ts->disk_rxcid;
590 /* inform all other servers only if there are more than one
591 database servers in the cell */
595 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR; j++)
596 inAddr.hostAddr[j] = ntohl(ubik_host[j]);
599 /* do the multi-RX RPC to all other servers */
600 multi_Rx(conns, count) {
601 multi_DISK_UpdateInterfaceAddr(&inAddr, &outAddr);
602 ts = server[multi_i]; /* reply received from this server */
603 if ( !multi_error ) {
604 if ( ts->addr[0] != htonl(outAddr.hostAddr[0]) ) {
606 strcpy(buffer, (char*)afs_inet_ntoa(ts->addr[0]));
607 ubik_print("ubik:Two primary addresses for same server \
608 %s %s\n", buffer, afs_inet_ntoa(htonl(outAddr.hostAddr[0])));
611 for ( j=1; j < UBIK_MAX_INTERFACE_ADDR; j++)
612 ts->addr[j] = htonl(outAddr.hostAddr[j]);
615 else if ( multi_error == RXGEN_OPCODE ) {/* pre 3.5 remote server */
616 ubik_print("ubik server %s does not support UpdateInterfaceAddr RPC\n", afs_inet_ntoa(ts->addr[0]));
618 else if ( multi_error == UBADHOST ) {
619 code = UBADHOST; /* remote CellServDB inconsistency */
620 ubik_print("Inconsistent Cell Info on server: ");
621 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
622 ubik_print("%s ", afs_inet_ntoa(ts->addr[j]));
626 ts->up= 0; /* mark the remote server as down */