2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afs/param.h>
11 #include <sys/types.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
26 #include <rx/rx_multi.h>
27 #include <afs/cellconfig.h>
29 #include <afs/afsutil.h>
30 #include <afs/netutils.h>
33 #define UBIK_INTERNALS
37 /* module state used to decide whether this host is the sync site, plus the rx security settings used for connections to the other servers */
38 static afs_int32 syncSiteUntil = 0; /* time, compared against FT_ApproxTime(), at which our sync-site votes expire; valid only if ubik_amSyncSite is set */
39 int ubik_amSyncSite = 0; /* flag telling if I'm sync site; its address is also passed to LWP_NoYieldSignal when enough votes are collected */
40 static nServers; /* total number of servers, counting this one; ubik_quorum is derived from it. NOTE(review): no type specifier, so this relies on implicit int */
41 static char amIMagic=0; /* is this host the magic host; when set, our own yes vote counts one extra */
42 char amIClone=0; /* is this a clone which doesn't vote */
43 static char ubik_singleServer = 0; /* set at initialization for the one-server special case; the beacon loop then skips vote collection */
44 extern struct rx_securityClass *rxnull_NewClientSecurityObject(); /* its result is assigned to ubikSecClass during server-list initialization */
45 int (*ubik_CRXSecurityProc)(); /* optional hook; when non-null it is called with (ubik_CRXSecurityRock, &ubikSecClass, &ubikSecIndex) */
46 char *ubik_CRXSecurityRock; /* opaque first argument handed to ubik_CRXSecurityProc */
47 afs_int32 ubikSecIndex; /* security index passed to rx_NewConnection for the vote and disk connections */
48 struct rx_securityClass *ubikSecClass; /* security class passed to rx_NewConnection for the vote and disk connections */
51 /* Module responsible for both deciding if we're currently the sync site,
52 * and continuing to collect votes so as to stay sync site.
54 * The basic module contacts all of the servers it can, trying to get them to vote
55 * for this server for sync site. The vote request message (called a beacon message)
56 * also specifies until which time this site claims to be the sync site, if at all, thus enabling
57 * receiving sites to know how long the sync site guarantee is made for.
59 * Each of these beacon messages is thus both a declaration of how long this site will
60 * remain sync site, and an attempt to extend that time by collecting votes for a later
61 * sync site extension.
63 * The voting module is responsible for choosing a reasonable time until which it promises
64 * not to vote for someone else. This parameter (BIG seconds) is not actually passed in
65 * the interface (perhaps it should be?) but is instead a compile time constant that both
68 * The beacon and vote modules work intimately together; the vote module decides how long
69 * it should promise the beacon module its vote, and the beacon module takes all of these
70 * votes and decides for how long it is the synchronization site.
73 /* procedure called from debug rpc call to get this module's state for debugging */
75 register struct ubik_debug *aparm; {
76 /* fill in beacon's state fields in the ubik_debug structure */
77 aparm->syncSiteUntil = syncSiteUntil;
78 aparm->nServers = nServers;
81 /* procedure that determines whether this site has enough current votes to remain sync site.
82 * called from higher-level modules (everything but the vote module).
84 * If we're the sync site, check that our guarantees, obtained by the ubeacon_Interact
85 * light-weight process, haven't expired. We're sync site as long as a majority of the
86 * servers in existence have promised us unexpired guarantees. The variable syncSiteUntil
87 * contains the time at which the latest of the majority of the sync site guarantees expires
88 * (if the variable ubik_amSyncSite is true)
89 * This module also calls up to the recovery module if it thinks that the recovery module
90 * may have to pick up a new database (which occurs if we lose the sync site votes).
92 ubeacon_AmSyncSite() {
93 register afs_int32 now;
94 register afs_int32 rcode;
96 /* special case for fast startup */
97 if (nServers == 1 && !amIClone) {
98 return 1; /* one guy is always the sync site */
101 if (ubik_amSyncSite == 0 || amIClone) rcode = 0; /* if I don't think I'm the sync site, say so */
103 now = FT_ApproxTime();
104 if (syncSiteUntil <= now) { /* if my votes have expired, say so */
105 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
110 rcode = 1; /* otherwise still have the required votes */
113 if (rcode == 0) urecovery_ResetState(); /* force recovery to re-execute */
114 ubik_dprint("beacon: amSyncSite is %d\n", rcode);
118 /* setup server list; called with two parms, first is my address, second is list of other servers
119 * called only at initialization to set up the list of servers to contact for votes. Just creates
120 * the server structure. Note that there are two connections in every server structure, one for
121 * vote calls (which must always go through quickly) and one for database operations, which
122 * are subject to waiting for locks. If we used only one, the votes would sometimes get
123 * held up behind database operations, and the sync site guarantees would timeout
124 * even though the host would be up for communication.
126 * The "magic" host is the one with the lowest internet address. It is
127 * magic because its vote counts epsilon more than the others. This acts
128 * as a tie-breaker when we have an even number of hosts in the system.
129 * For example, if the "magic" host is up in a 2 site system, then it
130 * is sync site. Without the magic host hack, if anyone crashed in a 2
131 * site system, we'd be out of business.
133 ubeacon_InitServerListByInfo(ame, info, clones)
135 struct afsconf_cell *info;
140 code = ubeacon_InitServerListCommon(ame, info, clones, 0);
144 ubeacon_InitServerList(ame, aservers)
146 register afs_int32 aservers[];
150 code = ubeacon_InitServerListCommon(ame, (struct afsconf_cell *)0, 0,
155 ubeacon_InitServerListCommon(ame, info, clones, aservers)
157 struct afsconf_cell *info;
159 register afs_int32 aservers[];
161 register struct ubik_server *ts;
163 register afs_int32 servAddr;
164 register afs_int32 i, code;
166 struct ubik_server *magicServer;
168 /* verify that the addresses passed in are correct */
169 if (code = verifyInterfaceAddress(&ame, info, aservers))
172 /* get the security index to use, if we can */
173 if (ubik_CRXSecurityProc) {
174 i = (*ubik_CRXSecurityProc)(ubik_CRXSecurityRock, &ubikSecClass, &ubikSecIndex);
178 /* don't have sec module yet */
180 ubikSecClass = rxnull_NewClientSecurityObject();
182 magicHost = ntohl(ame); /* do comparisons in host order */
183 magicServer = (struct ubik_server *) 0;
186 for (i = 0; i < info->numServers; i++) {
187 if (ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr) ==
188 ntohl((afs_uint32) ame)) {
197 for (i = 0; i < info->numServers; i++) {
198 if (i == me) continue;
199 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
200 bzero(ts, sizeof(struct ubik_server));
201 ts->next = ubik_servers;
203 ts->addr[0] = info->hostAddr[i].sin_addr.s_addr;
208 ntohl((afs_uint32) ts->addr[0]) < (afs_uint32) magicHost) {
209 magicHost = ntohl(ts->addr[0]);
214 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
216 ubikSecClass, ubikSecIndex); /* for vote reqs */
217 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
218 DISK_SERVICE_ID, ubikSecClass,
219 ubikSecIndex); /* for disk reqs */
224 while (servAddr = *aservers++) {
225 if (i >= MAXSERVERS) return UNHOSTS; /* too many hosts */
226 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
227 bzero(ts, sizeof(struct ubik_server));
228 ts->next = ubik_servers;
230 ts->addr[0] = servAddr; /* primary address in net byte order */
231 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
233 ubikSecClass, ubikSecIndex); /* for vote reqs */
234 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal,
235 DISK_SERVICE_ID, ubikSecClass,
236 ubikSecIndex); /* for disk reqs */
237 ts->isClone = 0; /* don't know about clones */
239 if (ntohl((afs_uint32) servAddr) < (afs_uint32) magicHost) {
240 magicHost = ntohl(servAddr);
246 if (magicServer) magicServer->magic = 1; /* remember for when counting votes */
248 if (!amIClone && !magicServer) amIMagic = 1;
251 ++nServers; /* count this server as well as the remotes */
253 nServers = i+1; /* count this server as well as the remotes */
255 ubik_quorum = (nServers>>1)+1; /* compute the majority figure */
256 /* send addrs to all other servers */
257 code = updateUbikNetworkAddress(ubik_host);
261 /* Should we set some defaults for RX??
263 r_nRetries = (RPCTIMEOUT/r_retryInterval);
266 if (!ubik_servers) /* special case 1 server */
267 ubik_singleServer = 1;
268 if (nServers == 1 && !amIClone) {
269 ubik_amSyncSite = 1; /* let's start as sync site */
270 syncSiteUntil = 0x7fffffff; /* and be it quite a while */
273 if (nServers == 1) /* special case 1 server */
274 ubik_singleServer = 1;
277 if (ubik_singleServer) {
278 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site - 1 server\n");
280 syncSiteUntil = 0x7fffffff; /* quite a while */
285 /* main lwp loop for code that sends out beacons. This code only runs while
286 * we're sync site or we want to be the sync site. It runs in its very own light-weight
290 register afs_int32 code;
292 struct rx_connection *connections[MAXSERVERS];
293 struct ubik_server *servers[MAXSERVERS];
294 register afs_int32 i;
295 register struct ubik_server *ts;
296 afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
297 struct ubik_tid ttid;
300 /* loop forever getting votes */
301 lastWakeupTime = 0; /* keep track of time we last started a vote collection */
304 /* don't wakeup more than every POLLTIME seconds */
305 temp = (lastWakeupTime + POLLTIME) - FT_ApproxTime();
306 /* don't sleep if last collection phase took too long (probably timed someone out ) */
308 if (temp > POLLTIME) temp = POLLTIME;
311 code = IOMGR_Select(0, 0, 0, 0, &tt);
315 lastWakeupTime = FT_ApproxTime(); /* started a new collection phase */
317 if (ubik_singleServer) continue; /* special-case 1 server for speedy startup */
319 if (!uvote_ShouldIRun()) continue; /* if voter has heard from a better candidate than us, don't bother running */
321 /* otherwise we should run for election, or we're the sync site (and have already won);
322 send out the beacon packets */
323 /* build list of all up hosts (noticing dead hosts are running again
324 is a task for the recovery module, not the beacon module), and
325 prepare to send them an r multi-call containing the beacon message */
326 i = 0; /* collect connections */
327 for(ts = ubik_servers; ts; ts=ts->next) {
328 if (ts->up && ts->addr[0] != ubik_host[0]) {
330 connections[i++] = ts->vote_rxcid;
333 servers[i] = (struct ubik_server *) 0; /* end of list */
334 /* note that we assume in the vote module that we'll always get at least BIGTIME
335 seconds of vote from anyone who votes for us, which means we can conservatively
336 assume we'll be fine until SMALLTIME seconds after we start collecting votes */
337 /* this next is essentially an expansion of rgen's ServBeacon routine */
339 ttid.epoch = ubik_epochTime;
340 if (ubik_dbase->flags & DBWRITING) {
342 * if a write is in progress, we have to send the writeTidCounter
343 * which holds the tid counter of the write transaction , and not
344 * send the tidCounter value which holds the tid counter of the
347 ttid.counter = ubik_dbase->writeTidCounter;
350 ttid.counter = ubik_dbase->tidCounter+1;
352 /* now analyze return codes, counting up our votes */
353 yesVotes = 0; /* count how many to ensure we have quorum */
354 oldestYesVote = 0x3fffffff; /* time quorum expires */
355 syncsite= ubeacon_AmSyncSite();
356 startTime = FT_ApproxTime();
358 * Don't waste time using multi Rx calls if there are no connections out there
361 multi_Rx(connections, i) {
362 multi_VOTE_Beacon(syncsite, startTime, &ubik_dbase->version, &ttid);
363 temp = FT_ApproxTime(); /* now, more or less */
364 ts = servers[multi_i];
365 ts->lastBeaconSent = temp;
367 /* note that the vote time (the return code) represents the time
368 the vote was computed, *not* the time the vote expires. We compute
369 the latter down below if we got enough votes to go with */
371 ts->lastVoteTime = code;
372 if (code < oldestYesVote) oldestYesVote = code;
376 if (ts->magic) yesVotes++; /* the extra epsilon */
377 ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
378 ts->beaconSinceDown = 1;
379 ubik_dprint("yes vote from host %s\n",afs_inet_ntoa(ts->addr[0]));
381 else if (code == 0) {
382 ts->lastVoteTime = temp;
384 ts->beaconSinceDown = 1;
385 ubik_dprint("no vote from %s\n", afs_inet_ntoa(ts->addr[0]));
389 ts->beaconSinceDown = 0;
390 urecovery_LostServer();
391 ubik_dprint("time out from %s\n", afs_inet_ntoa(ts->addr[0]));
395 /* now call our own voter module to see if we'll vote for ourself. Note that
396 the same restrictions apply for our voting for ourself as for our voting
398 i = SVOTE_Beacon((struct rx_connection *) 0, ubeacon_AmSyncSite(), startTime, &ubik_dbase->version, &ttid);
401 if (amIMagic) yesVotes++; /* extra epsilon */
402 if (i < oldestYesVote) oldestYesVote = i;
405 /* now decide if we have enough votes to become sync site.
406 Note that we can still get enough votes even if we didn't for ourself. */
407 if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
408 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site\n");
410 syncSiteUntil = oldestYesVote + SMALLTIME;
411 LWP_NoYieldSignal(&ubik_amSyncSite);
414 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
416 urecovery_ResetState(); /* tell recovery we're no longer the sync site */
423 * Input Param : ame is the pointer to my IP address specified in the
424 * CellServDB file. aservers is an array containing IP
425 * addresses of remote ubik servers. The array is
426 * terminated by a zero address.
428 * Algorithm : Verify that my IP address 'ame' does actually exist
429 * on this machine. If any of my IP addresses are there
430 * in the remote server list 'aservers', remove them from
431 * this list. Update global variable ubik_host[] with
434 * Return Values : 0 on success, non-zero on failure
436 static verifyInterfaceAddress(ame, info, aservers)
437 struct afsconf_cell *info;
438 afs_uint32 aservers[]; /* list of all possible server addresses */
439 afs_uint32 *ame; /* one of my interface addr in net byte order */
441 afs_uint32 myAddr[UBIK_MAX_INTERFACE_ADDR], *servList, tmpAddr;
442 int count, index, found, i, j, totalServers, start, end;
445 totalServers = info->numServers;
446 else { /* count the number of servers */
447 for ( totalServers=0, servList = aservers; *servList; servList++)
452 /* for now use getaddr(). use getAllAddr when implemented */
453 myAddr[0] = rxi_getaddr();
454 count = (myAddr[0] != 0);
456 if(AFSDIR_SERVER_NETRESTRICT_FILEPATH || AFSDIR_SERVER_NETINFO_FILEPATH) {
458 * Find addresses we are supposed to register as per the netrestrict file
459 * if it exists, else just register all the addresses we find on this
460 * host as returned by rx_getAllAddr (in NBO)
463 count=parseNetFiles(myAddr,NULL,NULL,UBIK_MAX_INTERFACE_ADDR,
464 reason,AFSDIR_SERVER_NETINFO_FILEPATH,
465 AFSDIR_SERVER_NETRESTRICT_FILEPATH);
467 ubik_print("ubik: Can't register any valid addresses:%s\n",reason);
468 ubik_print("Aborting..\n");
473 /* get all my interface addresses in net byte order */
474 count = rx_getAllAddr(myAddr, UBIK_MAX_INTERFACE_ADDR);
478 if ( count <= 0 ) /* no address found */
480 ubik_print("ubik: No network addresses found, aborting..");
484 /* verify that the My-address passed in by ubik is correct */
485 for ( j=0, found = 0; j < count; j++)
487 if ( *ame == myAddr[j] ) /* both in net byte order */
496 ubik_print("ubik: primary address %s does not exist\n",
497 afs_inet_ntoa(*ame));
501 /* if any of my addresses are there in serverList, then
502 ** use that as my primary address : the higher level
503 ** application screwed up in dealing with multihomed concepts
505 for ( j=0, found = 0; j < count; j++)
507 for ( i=0; i < totalServers; i++) {
509 tmpAddr = ntohl((afs_uint32) info->hostAddr[i].sin_addr.s_addr);
511 tmpAddr = aservers[i];
512 if ( myAddr[j] == tmpAddr) {
521 ubik_print("Using %s as my primary address\n", afs_inet_ntoa(*ame) );
524 /* get rid of servers which were purged because all
525 ** those interface addresses are myself
527 for ( start=0, end=totalServers-1; (start<end) ; start++, end--)
529 /* find the first zero entry from the beginning */
530 for ( ; (start < end) && ( aservers[start] ); start++);
532 /* find the last non-zero entry from the end */
533 for ( ; (end >= 0) && ( !aservers[end] ); end-- );
535 /* if there is nothing more to purge, exit from loop */
536 if ( start >= end ) break;
539 aservers[start] = aservers[end];
540 aservers[end] = 0; /* this entry was moved */
544 /* update all my addresses in ubik_host in such a way
545 ** that ubik_host[0] has the primary address
548 for ( j=0, i=1; j < count; j++)
549 if ( *ame != myAddr[j] )
550 ubik_host[i++] = myAddr[j];
552 return 0; /* return success */
557 * Input Param : ubik_host is an array containing all my IP addresses.
559 * Algorithm : Do an RPC to all remote ubik servers informing them
560 * about my IP addresses. Get their IP addresses and
561 * update my linked list of ubik servers 'ubik_servers'
563 * Return Values : 0 on success, non-zero on failure
566 updateUbikNetworkAddress(ubik_host)
567 afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR];
569 int j, count, found, index, code = 0;
570 UbikInterfaceAddr inAddr, outAddr;
571 struct rx_connection *conns[MAXSERVERS];
572 struct ubik_server *ts, *server[MAXSERVERS];
575 for ( count = 0, ts=ubik_servers; ts; count++, ts = ts->next )
577 conns[count] = ts->disk_rxcid;
582 /* inform all other servers only if there is more than one
583 database server in the cell */
587 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR; j++)
588 inAddr.hostAddr[j] = ntohl(ubik_host[j]);
591 /* do the multi-RX RPC to all other servers */
592 multi_Rx(conns, count) {
593 multi_DISK_UpdateInterfaceAddr(&inAddr, &outAddr);
594 ts = server[multi_i]; /* reply received from this server */
595 if ( !multi_error ) {
596 if ( ts->addr[0] != htonl(outAddr.hostAddr[0]) ) {
598 strcpy(buffer, (char*)afs_inet_ntoa(ts->addr[0]));
599 ubik_print("ubik:Two primary addresses for same server \
600 %s %s\n", buffer, afs_inet_ntoa(htonl(outAddr.hostAddr[0])));
603 for ( j=1; j < UBIK_MAX_INTERFACE_ADDR; j++)
604 ts->addr[j] = htonl(outAddr.hostAddr[j]);
607 else if ( multi_error == RXGEN_OPCODE ) {/* pre 3.5 remote server */
608 ubik_print("ubik server %s does not support UpdateInterfaceAddr RPC\n", afs_inet_ntoa(ts->addr[0]));
610 else if ( multi_error == UBADHOST ) {
611 code = UBADHOST; /* remote CellServDB inconsistency */
612 ubik_print("Inconsistent Cell Info on server: ");
613 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
614 printf("%s ", afs_inet_ntoa(ts->addr[j]));
616 fflush(stdout); fflush(stderr);
619 ts->up= 0; /* mark the remote server as down */