2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afs/param.h>
11 #include <sys/types.h>
18 #include <sys/socket.h>
19 #include <netinet/in.h>
26 #include <rx/rx_multi.h>
28 #include <afs/afsutil.h>
29 #include <afs/netutils.h>
32 #define UBIK_INTERNALS
36 /* statics used to determine if we're the sync site */
37 static afs_int32 syncSiteUntil = 0; /* valid only if amSyncSite */
38 int ubik_amSyncSite = 0; /* flag telling if I'm sync site */
39 static nServers; /* total number of servers */
40 static char amIMagic=0; /* is this host the magic host */
41 extern struct rx_securityClass *rxnull_NewClientSecurityObject();
42 int (*ubik_CRXSecurityProc)();
43 char *ubik_CRXSecurityRock;
44 afs_int32 ubikSecIndex;
45 struct rx_securityClass *ubikSecClass;
48 /* Module responsible for both deciding if we're currently the sync site,
49 * and continuing to collect votes so as to stay sync site.
51 * The basic module contacts all of the servers it can, trying to get them to vote
52 * for this server for sync site. The vote request message (called a beacon message)
53 * also specifies until which time this site claims to be the sync site, if at all, thus enabling
54 * receiving sites to know how long the sync site guarantee is made for.
56 * Each of these beacon messages is thus both a declaration of how long this site will
57 * remain sync site, and an attempt to extend that time by collecting votes for a later
58 * sync site extension.
60 * The voting module is responsible for choosing a reasonable time until which it promises
61 * not to vote for someone else. This parameter (BIG seconds) is not actually passed in
62 * the interface (perhaps it should be?) but is instead a compile time constant that both
65 * The beacon and vote modules work intimately together; the vote module decides how long
66 * it should promise the beacon module its vote, and the beacon module takes all of these
67 * votes and decides for how long it is the synchronization site.
70 /* procedure called from debug rpc call to get this module's state for debugging */
72 register struct ubik_debug *aparm; {
73 /* fill in beacon's state fields in the ubik_debug structure */
74 aparm->syncSiteUntil = syncSiteUntil;
75 aparm->nServers = nServers;
78 /* procedure that determines whether this site has enough current votes to remain sync site.
79 * called from higher-level modules (everything but the vote module).
81 * If we're the sync site, check that our guarantees, obtained by the ubeacon_Interact
82 * light-weight process, haven't expired. We're sync site as long as a majority of the
83 * servers in existence have promised us unexpired guarantees. The variable syncSiteUntil
84 * contains the time at which the latest of the majority of the sync site guarantees expires
85 * (if the variable ubik_amSyncSite is true)
86 * This module also calls up to the recovery module if it thinks that the recovery module
87 * may have to pick up a new database (which occurs if we lose the sync site votes).
89 ubeacon_AmSyncSite() {
90 register afs_int32 now;
91 register afs_int32 rcode;
93 /* special case for fast startup */
95 return 1; /* one guy is always the sync site */
98 if (ubik_amSyncSite == 0) rcode = 0; /* if I don't think I'm the sync site, say so */
100 now = FT_ApproxTime();
101 if (syncSiteUntil <= now) { /* if my votes have expired, say so */
102 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
107 rcode = 1; /* otherwise still have the required votes */
110 if (rcode == 0) urecovery_ResetState(); /* force recovery to re-execute */
111 ubik_dprint("beacon: amSyncSite is %d\n", rcode);
115 /* setup server list; called with two parms, first is my address, second is list of other servers
116 * called only at initialization to set up the list of servers to contact for votes. Just creates
117 * the server structure. Note that there are two connections in every server structure, one for
118 * vote calls (which must always go through quickly) and one for database operations, which
119 * are subject to waiting for locks. If we used only one, the votes would sometimes get
120 * held up behind database operations, and the sync site guarantees would timeout
121 * even though the host would be up for communication.
123 * The "magic" host is the one with the lowest internet address. It is
124 * magic because its vote counts epsilon more than the others. This acts
125 * as a tie-breaker when we have an even number of hosts in the system.
126 * For example, if the "magic" host is up in a 2 site system, then it
127 * is sync site. Without the magic host hack, if anyone crashed in a 2
128 * site system, we'd be out of business.
130 ubeacon_InitServerList(ame, aservers)
132 register afs_int32 aservers[]; {
133 register struct ubik_server *ts;
134 register afs_int32 servAddr;
135 register afs_int32 i, code;
137 struct ubik_server *magicServer;
139 /* verify that the addresses passed in are correct */
140 if ( code = verifyInterfaceAddress(&ame, aservers ))
143 /* get the security index to use, if we can */
144 if (ubik_CRXSecurityProc) {
145 i = (*ubik_CRXSecurityProc)(ubik_CRXSecurityRock, &ubikSecClass, &ubikSecIndex);
149 /* don't have sec module yet */
151 ubikSecClass = rxnull_NewClientSecurityObject();
154 magicHost = ntohl(ame); /* do comparisons in host order */
155 magicServer = (struct ubik_server *) 0;
156 while (servAddr = *aservers++) {
157 if (i >= MAXSERVERS) return UNHOSTS; /* too many hosts */
158 ts = (struct ubik_server *) malloc(sizeof(struct ubik_server));
159 bzero(ts, sizeof(struct ubik_server));
160 ts->next = ubik_servers;
162 ts->addr[0] = servAddr; /* primary address in net byte order */
163 ts->vote_rxcid = rx_NewConnection(servAddr, ubik_callPortal, VOTE_SERVICE_ID, ubikSecClass, ubikSecIndex); /* for vote reqs */
164 ts->disk_rxcid = rx_NewConnection(servAddr, ubik_callPortal, DISK_SERVICE_ID, ubikSecClass, ubikSecIndex); /* for disk reqs */
166 if (ntohl((afs_uint32) servAddr) < (afs_uint32) magicHost) {
167 magicHost = ntohl(servAddr);
172 if (magicServer) magicServer->magic = 1; /* remember for when counting votes */
174 nServers = i+1; /* count this server as well as the remotes */
175 ubik_quorum = (nServers>>1)+1; /* compute the majority figure */
176 /* send addrs to all other servers */
177 code = updateUbikNetworkAddress(ubik_host);
181 /* Should we set some defaults for RX??
183 r_nRetries = (RPCTIMEOUT/r_retryInterval);
185 if (nServers == 1) { /* special case 1 server */
186 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site - 1 server\n");
188 syncSiteUntil = 0x7fffffff; /* quite a while */
193 /* main lwp loop for code that sends out beacons. This code only runs while
194 * we're sync site or we want to be the sync site. It runs in its very own light-weight
198 register afs_int32 code;
200 struct rx_connection *connections[MAXSERVERS];
201 struct ubik_server *servers[MAXSERVERS];
202 register afs_int32 i;
203 register struct ubik_server *ts;
204 afs_int32 temp, yesVotes, lastWakeupTime, oldestYesVote, syncsite;
205 struct ubik_tid ttid;
208 /* loop forever getting votes */
209 lastWakeupTime = 0; /* keep track of time we last started a vote collection */
212 /* don't wakeup more than every POLLTIME seconds */
213 temp = (lastWakeupTime + POLLTIME) - FT_ApproxTime();
214 /* don't sleep if last collection phase took too long (probably timed someone out ) */
216 if (temp > POLLTIME) temp = POLLTIME;
219 code = IOMGR_Select(0, 0, 0, 0, &tt);
223 lastWakeupTime = FT_ApproxTime(); /* started a new collection phase */
225 if (nServers == 1) continue; /* special-case 1 server for speedy startup */
227 if (!uvote_ShouldIRun()) continue; /* if voter has heard from a better candidate than us, don't bother running */
229 /* otherwise we should run for election, or we're the sync site (and have already won);
230 send out the beacon packets */
231 /* build list of all up hosts (noticing dead hosts are running again
232 is a task for the recovery module, not the beacon module), and
233 prepare to send them an r multi-call containing the beacon message */
234 i = 0; /* collect connections */
235 for(ts = ubik_servers; ts; ts=ts->next) {
236 if (ts->up && ts->addr[0] != ubik_host[0]) {
238 connections[i++] = ts->vote_rxcid;
241 servers[i] = (struct ubik_server *) 0; /* end of list */
242 /* note that we assume in the vote module that we'll always get at least BIGTIME
243 seconds of vote from anyone who votes for us, which means we can conservatively
244 assume we'll be fine until SMALLTIME seconds after we start collecting votes */
245 /* this next is essentially an expansion of rgen's ServBeacon routine */
247 ttid.epoch = ubik_epochTime;
248 if (ubik_dbase->flags & DBWRITING) {
250 * if a write is in progress, we have to send the writeTidCounter
251 * which holds the tid counter of the write transaction , and not
252 * send the tidCounter value which holds the tid counter of the
255 ttid.counter = ubik_dbase->writeTidCounter;
258 ttid.counter = ubik_dbase->tidCounter+1;
260 /* now analyze return codes, counting up our votes */
261 yesVotes = 0; /* count how many to ensure we have quorum */
262 oldestYesVote = 0x3fffffff; /* time quorum expires */
263 syncsite= ubeacon_AmSyncSite();
264 startTime = FT_ApproxTime();
266 * Don't waste time using multi-Rx calls if there are no connections out there
269 multi_Rx(connections, i) {
270 multi_VOTE_Beacon(syncsite, startTime, &ubik_dbase->version, &ttid);
271 temp = FT_ApproxTime(); /* now, more or less */
272 ts = servers[multi_i];
273 ts->lastBeaconSent = temp;
275 /* note that the vote time (the return code) represents the time
276 the vote was computed, *not* the time the vote expires. We compute
277 the latter down below if we got enough votes to go with */
279 ts->lastVoteTime = code;
280 if (code < oldestYesVote) oldestYesVote = code;
283 if (ts->magic) yesVotes++; /* the extra epsilon */
284 ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
285 ts->beaconSinceDown = 1;
286 ubik_dprint("yes vote from host %s\n",afs_inet_ntoa(ts->addr[0]));
288 else if (code == 0) {
289 ts->lastVoteTime = temp;
291 ts->beaconSinceDown = 1;
292 ubik_dprint("no vote from %s\n", afs_inet_ntoa(ts->addr[0]));
296 ts->beaconSinceDown = 0;
297 urecovery_LostServer();
298 ubik_dprint("time out from %s\n", afs_inet_ntoa(ts->addr[0]));
302 /* now call our own voter module to see if we'll vote for ourself. Note that
303 the same restrictions apply for our voting for ourself as for our voting
305 i = SVOTE_Beacon((struct rx_connection *) 0, ubeacon_AmSyncSite(), startTime, &ubik_dbase->version, &ttid);
308 if (amIMagic) yesVotes++; /* extra epsilon */
309 if (i < oldestYesVote) oldestYesVote = i;
312 /* now decide if we have enough votes to become sync site.
313 Note that we can still get enough votes even if we didn't vote for ourself. */
314 if (yesVotes > nServers) { /* yesVotes is bumped by 2 or 3 for each site */
315 if (!ubik_amSyncSite) ubik_dprint("Ubik: I am the sync site\n");
317 syncSiteUntil = oldestYesVote + SMALLTIME;
318 LWP_NoYieldSignal(&ubik_amSyncSite);
321 if (ubik_amSyncSite) ubik_dprint("Ubik: I am no longer the sync site\n");
323 urecovery_ResetState(); /* tell recovery we're no longer the sync site */
330 * Input Param : ame is the pointer to my IP address specified in the
331 * CellServDB file. aservers is an array containing IP
332 * addresses of remote ubik servers. The array is
333 * terminated by a zero address.
335 * Algorithm : Verify that my IP address 'ame' does actually exist
336 * on this machine. If any of my IP addresses are there
337 * in the remote server list 'aservers', remove them from
338 * this list. Update global variable ubik_host[] with
341 * Return Values : 0 on success, non-zero on failure
343 verifyInterfaceAddress(ame, aservers)
344 afs_uint32 *ame; /* one of my interface addr in net byte order */
345 afs_uint32 aservers[]; /* list of all possible server addresses */
347 afs_uint32 myAddr[UBIK_MAX_INTERFACE_ADDR], *servList;
348 int count, index, found, i, j, totalServers, start, end;
350 /* count the number of servers */
351 for ( totalServers=0, servList = aservers; *servList; servList++)
355 /* for now use getaddr(). use getAllAddr when implemented */
356 myAddr[0] = rxi_getaddr();
357 count = (myAddr[0] != 0);
359 if(AFSDIR_SERVER_NETRESTRICT_FILEPATH || AFSDIR_SERVER_NETINFO_FILEPATH) {
361 * Find addresses we are supposed to register as per the netrestrict file
362 * if it exists, else just register all the addresses we find on this
363 * host as returned by rx_getAllAddr (in NBO)
366 count=parseNetFiles(myAddr,NULL,NULL,UBIK_MAX_INTERFACE_ADDR,
367 reason,AFSDIR_SERVER_NETINFO_FILEPATH,
368 AFSDIR_SERVER_NETRESTRICT_FILEPATH);
370 ubik_print("ubik: Can't register any valid addresses:%s\n",reason);
371 ubik_print("Aborting..\n");
376 /* get all my interface addresses in net byte order */
377 count = rx_getAllAddr(myAddr, UBIK_MAX_INTERFACE_ADDR);
381 if ( count <= 0 ) /* no address found */
383 ubik_print("ubik: No network addresses found, aborting..");
387 /* verify that the My-address passed in by ubik is correct */
388 for ( j=0, found = 0; j < count; j++)
390 if ( *ame == myAddr[j] ) /* both in net byte order */
399 ubik_print("ubik: primary address %s does not exist\n",
400 afs_inet_ntoa(*ame));
404 /* if any of my addresses are there in serverList, then
405 ** use that as my primary address : the higher level
406 ** application screwed up in dealing with multihomed concepts
408 for ( j=0, found = 0; j < count; j++)
410 for ( i=0; i < totalServers; i++)
411 if ( myAddr[j] == aservers[i] )
419 ubik_print("Using %s as my primary address\n", afs_inet_ntoa(*ame) );
421 /* get rid of servers which were purged because all
422 ** those interface addresses are myself
424 for ( start=0, end=totalServers-1; (start<end) ; start++, end--)
426 /* find the first zero entry from the beginning */
427 for ( ; (start < end) && ( aservers[start] ); start++);
429 /* find the last non-zero entry from the end */
430 for ( ; (end >= 0) && ( !aservers[end] ); end-- );
432 /* if there is nothing more to purge, exit from loop */
433 if ( start >= end ) break;
436 aservers[start] = aservers[end];
437 aservers[end] = 0; /* this entry was moved */
440 /* update all my addresses in ubik_host in such a way
441 ** that ubik_host[0] has the primary address
444 for ( j=0, i=1; j < count; j++)
445 if ( *ame != myAddr[j] )
446 ubik_host[i++] = myAddr[j];
448 return 0; /* return success */
453 * Input Param : ubik_host is an array containing all my IP addresses.
455 * Algorithm : Do an RPC to all remote ubik servers informing them
456 * about my IP addresses. Get their IP addresses and
457 * update my linked list of ubik servers 'ubik_servers'
459 * Return Values : 0 on success, non-zero on failure
462 updateUbikNetworkAddress(ubik_host)
463 afs_uint32 ubik_host[UBIK_MAX_INTERFACE_ADDR];
465 int j, count, found, index, code = 0;
466 UbikInterfaceAddr inAddr, outAddr;
467 struct rx_connection *conns[MAXSERVERS];
468 struct ubik_server *ts, *server[MAXSERVERS];
471 for ( count = 0, ts=ubik_servers; ts; count++, ts = ts->next )
473 conns[count] = ts->disk_rxcid;
478 /* inform all other servers only if there is more than one
479 database server in the cell */
483 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR; j++)
484 inAddr.hostAddr[j] = ntohl(ubik_host[j]);
487 /* do the multi-RX RPC to all other servers */
488 multi_Rx(conns, count) {
489 multi_DISK_UpdateInterfaceAddr(&inAddr, &outAddr);
490 ts = server[multi_i]; /* reply received from this server */
491 if ( !multi_error ) {
492 if ( ts->addr[0] != htonl(outAddr.hostAddr[0]) ) {
494 strcpy(buffer, (char*)afs_inet_ntoa(ts->addr[0]));
495 ubik_print("ubik:Two primary addresses for same server \
496 %s %s\n", buffer, afs_inet_ntoa(htonl(outAddr.hostAddr[0])));
499 for ( j=1; j < UBIK_MAX_INTERFACE_ADDR; j++)
500 ts->addr[j] = htonl(outAddr.hostAddr[j]);
503 else if ( multi_error == RXGEN_OPCODE ) {/* pre 3.5 remote server */
504 ubik_print("ubik server %s does not support UpdateInterfaceAddr RPC\n", afs_inet_ntoa(ts->addr[0]));
506 else if ( multi_error == UBADHOST ) {
507 code = UBADHOST; /* remote CellServDB inconsistency */
508 ubik_print("Inconsistent Cell Info on server: ");
509 for ( j=0; j < UBIK_MAX_INTERFACE_ADDR && ts->addr[j]; j++)
510 printf("%s ", afs_inet_ntoa(ts->addr[j]));
512 fflush(stdout); fflush(stderr);
515 ts->up= 0; /* mark the remote server as down */