2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
24 #include <netinet/in.h>
31 #include <afs/afsutil.h>
33 #define UBIK_INTERNALS
37 static void printServerInfo(void);
40 * routines for handling requests remotely-submitted by the sync site. These are
41 * only write transactions (we don't propagate read trans), and there is at most one
42 * write transaction extant at any one time.
/*
 * The write transaction this server is currently running at the request of
 * a remote caller.  Starts out as 0; SDISK_Begin hands its address to
 * udisk_begin(), and the other SDISK_* handlers test it for null before
 * touching it.
 */
45 struct ubik_trans *ubik_currentTrans = 0;
49 /* the rest of these guys handle remote execution of write
50 * transactions: this is the code executed on the other servers when a
51 * sync site is executing a write transaction.
/*
 * RPC handler: begin a write transaction on this server on behalf of the
 * caller.  After the ubik_CheckAuth() check, udisk_begin() is asked for a
 * UBIK_WRITETRANS transaction on ubik_dbase and stores it in
 * ubik_currentTrans; the new transaction is then stamped with the epoch and
 * counter supplied in atid.
 */
54 SDISK_Begin(struct rx_call *rxcall, struct ubik_tid *atid)
58 if ((code = ubik_CheckAuth(rxcall))) {
/* NOTE(review): the second argument is 1 here but 0 in every other handler
 * in this file; presumably it marks "a new transaction is about to start"
 * -- confirm against urecovery_CheckTid(). */
62 urecovery_CheckTid(atid, 1);
63 code = udisk_begin(ubik_dbase, UBIK_WRITETRANS, &ubik_currentTrans);
64 if (!code && ubik_currentTrans) {
65 /* label this trans with the right trans id */
66 ubik_currentTrans->tid.epoch = atid->epoch;
67 ubik_currentTrans->tid.counter = atid->counter;
/*
 * RPC handler: commit the write transaction currently running on this
 * server.  The handler checks ubik_CheckAuth(), that ubik_currentTrans is
 * set, and that it is a UBIK_WRITETRANS transaction.  The database's
 * cache_lock is taken for writing before the tid check and released after
 * udisk_commit(); ubik_dbVersion is refreshed from ubik_dbase->version
 * following the commit.
 */
75 SDISK_Commit(struct rx_call *rxcall, struct ubik_tid *atid)
78 struct ubik_dbase *dbase;
80 if ((code = ubik_CheckAuth(rxcall))) {
84 if (!ubik_currentTrans) {
88 * sanity check to make sure only write trans appear here
90 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
94 dbase = ubik_currentTrans->dbase;
96 ObtainWriteLock(&dbase->cache_lock);
100 urecovery_CheckTid(atid, 0);
/* ubik_currentTrans is tested again after urecovery_CheckTid(); on this
 * path the cache_lock taken above is dropped before leaving. */
101 if (!ubik_currentTrans) {
103 ReleaseWriteLock(&dbase->cache_lock);
107 code = udisk_commit(ubik_currentTrans);
109 /* sync site should now match */
110 ubik_dbVersion = ubik_dbase->version;
113 ReleaseWriteLock(&dbase->cache_lock);
/*
 * RPC handler: finish the write transaction currently running on this
 * server.  After the authorization, existence, type and tid checks, the
 * transaction is ended with udisk_end() unless its locktype is LOCKWAIT,
 * and ubik_currentTrans is reset to a null pointer.
 */
118 SDISK_ReleaseLocks(struct rx_call *rxcall, struct ubik_tid *atid)
120 struct ubik_dbase *dbase;
123 if ((code = ubik_CheckAuth(rxcall))) {
127 if (!ubik_currentTrans) {
130 /* sanity check to make sure only write trans appear here */
131 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
135 dbase = ubik_currentTrans->dbase;
137 urecovery_CheckTid(atid, 0);
/* Re-test: ubik_currentTrans is checked again once urecovery_CheckTid()
 * has returned. */
138 if (!ubik_currentTrans) {
143 /* If the thread is not waiting for lock - ok to end it */
144 if (ubik_currentTrans->locktype != LOCKWAIT) {
145 udisk_end(ubik_currentTrans);
147 ubik_currentTrans = (struct ubik_trans *)0;
/*
 * RPC handler: abort the write transaction currently running on this
 * server.  After the authorization, existence, type and tid checks,
 * udisk_abort() is called on the transaction; it is then ended with
 * udisk_end() unless its locktype is LOCKWAIT, and ubik_currentTrans is
 * reset to a null pointer.
 */
153 SDISK_Abort(struct rx_call *rxcall, struct ubik_tid *atid)
156 struct ubik_dbase *dbase;
158 if ((code = ubik_CheckAuth(rxcall))) {
162 if (!ubik_currentTrans) {
165 /* sanity check to make sure only write trans appear here */
166 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
170 dbase = ubik_currentTrans->dbase;
172 urecovery_CheckTid(atid, 0);
/* Re-test: ubik_currentTrans is checked again once urecovery_CheckTid()
 * has returned. */
173 if (!ubik_currentTrans) {
178 code = udisk_abort(ubik_currentTrans);
179 /* If the thread is not waiting for lock - ok to end it */
180 if (ubik_currentTrans->locktype != LOCKWAIT) {
181 udisk_end(ubik_currentTrans);
183 ubik_currentTrans = (struct ubik_trans *)0;
188 /* apos and alen are not used */
/*
 * RPC handler: take a lock of type atype for the write transaction
 * currently running on this server, using ulock_getLock().  The usual
 * authorization, existence, type and tid checks come first.
 * NOTE(review): the meaning of the third ulock_getLock() argument (1) is
 * not visible here -- confirm in ulock_getLock().
 */
190 SDISK_Lock(struct rx_call *rxcall, struct ubik_tid *atid,
191 afs_int32 afile, afs_int32 apos, afs_int32 alen, afs_int32 atype)
194 struct ubik_dbase *dbase;
195 struct ubik_trans *ubik_thisTrans;
197 if ((code = ubik_CheckAuth(rxcall))) {
200 if (!ubik_currentTrans) {
203 /* sanity check to make sure only write trans appear here */
204 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
210 dbase = ubik_currentTrans->dbase;
212 urecovery_CheckTid(atid, 0);
213 if (!ubik_currentTrans) {
/* Keep a private pointer to the transaction the lock is requested for, so
 * that a change of ubik_currentTrans during ulock_getLock() can be
 * detected afterwards by comparing the two. */
218 ubik_thisTrans = ubik_currentTrans;
219 code = ulock_getLock(ubik_currentTrans, atype, 1);
221 /* While waiting, the transaction may have been ended/
222 * aborted from under us (urecovery_CheckTid). In that
223 * case, end the transaction here.
225 if (!code && (ubik_currentTrans != ubik_thisTrans)) {
226 udisk_end(ubik_thisTrans);
235 * \brief Write a vector of data
/*
 * RPC handler: apply a vector of writes to the write transaction currently
 * running on this server.  Each io_vector entry names a file, a position
 * and a length; the data for all entries is taken from io_buffer, with
 * offset tracking where the next entry's data begins.  Every entry is
 * passed to udisk_write() after the usual authorization, existence, type
 * and tid checks.
 */
238 SDISK_WriteV(struct rx_call *rxcall, struct ubik_tid *atid,
239 iovec_wrt *io_vector, iovec_buf *io_buffer)
241 afs_int32 code, i, offset;
242 struct ubik_dbase *dbase;
243 struct ubik_iovec *iovec;
246 if ((code = ubik_CheckAuth(rxcall))) {
249 if (!ubik_currentTrans) {
252 /* sanity check to make sure only write trans appear here */
253 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
257 dbase = ubik_currentTrans->dbase;
259 urecovery_CheckTid(atid, 0);
260 if (!ubik_currentTrans) {
265 iovec = (struct ubik_iovec *)io_vector->iovec_wrt_val;
266 iobuf = (char *)io_buffer->iovec_buf_val;
267 for (i = 0, offset = 0; i < io_vector->iovec_wrt_len; i++) {
/* NOTE(review): offset is a signed afs_int32 and the lengths come from the
 * caller; confirm that a negative iovec[i].length cannot slip past the
 * bound check below. */
268 /* Sanity check for going off end of buffer */
269 if ((offset + iovec[i].length) > io_buffer->iovec_buf_len) {
273 udisk_write(ubik_currentTrans, iovec[i].file, &iobuf[offset],
274 iovec[i].position, iovec[i].length);
/* Advance past the bytes consumed by this entry. */
279 offset += iovec[i].length;
/*
 * RPC handler: write adata (bulkdata_len bytes at bulkdata_val) into file
 * afile at position apos, within the write transaction currently running
 * on this server.  The write goes through udisk_write() after the usual
 * authorization, existence, type and tid checks.
 */
287 SDISK_Write(struct rx_call *rxcall, struct ubik_tid *atid,
288 afs_int32 afile, afs_int32 apos, bulkdata *adata)
291 struct ubik_dbase *dbase;
293 if ((code = ubik_CheckAuth(rxcall))) {
296 if (!ubik_currentTrans) {
299 /* sanity check to make sure only write trans appear here */
300 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
304 dbase = ubik_currentTrans->dbase;
306 urecovery_CheckTid(atid, 0);
/* Re-test: ubik_currentTrans is checked again once urecovery_CheckTid()
 * has returned. */
307 if (!ubik_currentTrans) {
312 udisk_write(ubik_currentTrans, afile, adata->bulkdata_val, apos,
313 adata->bulkdata_len);
/*
 * RPC handler: truncate file afile to alen within the write transaction
 * currently running on this server, via udisk_truncate().  The usual
 * authorization, existence, type and tid checks come first.
 */
319 SDISK_Truncate(struct rx_call *rxcall, struct ubik_tid *atid,
320 afs_int32 afile, afs_int32 alen)
323 struct ubik_dbase *dbase;
325 if ((code = ubik_CheckAuth(rxcall))) {
328 if (!ubik_currentTrans) {
331 /* sanity check to make sure only write trans appear here */
332 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
336 dbase = ubik_currentTrans->dbase;
338 urecovery_CheckTid(atid, 0);
/* Re-test: ubik_currentTrans is checked again once urecovery_CheckTid()
 * has returned. */
339 if (!ubik_currentTrans) {
343 code = udisk_truncate(ubik_currentTrans, afile, alen);
/*
 * RPC handler: report this server's database version in *aversion.
 * After the ubik_CheckAuth() check, ubeacon_AmSyncSite() is consulted; the
 * comment inside the function explains that the call is meant to be
 * rejected while this server is the sync site.  Otherwise the label of
 * file 0 is fetched through the database's getlabel operation.  On the
 * "no database" path aversion->counter is set to 0.
 */
349 SDISK_GetVersion(struct rx_call *rxcall,
350 struct ubik_version *aversion)
354 if ((code = ubik_CheckAuth(rxcall))) {
359 * If we are the sync site, recovery shouldn't be running on any
360 * other site. We shouldn't be getting this RPC as long as we are
361 * the sync site. To prevent any unforseen activity, we should
362 * reject this RPC until we have recognized that we are not the
363 * sync site anymore, and/or if we have any pending WRITE
364 * transactions that have to complete. This way we can be assured
365 * that this RPC would not block any pending transactions that
366 * should either fail or pass. If we have recognized the fact that
367 * we are not the sync site any more, all write transactions would
368 * fail with UNOQUORUM anyway.
370 if (ubeacon_AmSyncSite()) {
375 code = (*ubik_dbase->getlabel) (ubik_dbase, 0, aversion);
378 /* tell other side there's no dbase */
380 aversion->counter = 0;
/*
 * RPC handler: stream database file `file` to the caller over rxcall and
 * return its label in *version.
 *
 * The file size reported by the database's stat operation is sent first,
 * as a 32-bit value converted with htonl().  The contents are then read
 * through the database's read operation into tbuffer and passed to
 * rx_Write(), at most sizeof(tbuffer) bytes at a time.  Failures of
 * rx_Write() and of the read operation are reported with ubik_dprint().
 * Finally the label is fetched with the getlabel operation.
 */
386 SDISK_GetFile(struct rx_call *rxcall, afs_int32 file,
387 struct ubik_version *version)
390 struct ubik_dbase *dbase;
392 struct ubik_stat ubikstat;
397 if ((code = ubik_CheckAuth(rxcall))) {
400 /* temporarily disabled because it causes problems for migration tool. Hey, it's just
401 * a sanity check, anyway.
402 if (ubeacon_AmSyncSite()) {
408 code = (*dbase->stat) (dbase, file, &ubikstat);
413 length = ubikstat.size;
414 tlen = htonl(length);
415 code = rx_Write(rxcall, (char *)&tlen, sizeof(afs_int32));
416 if (code != sizeof(afs_int32)) {
418 ubik_dprint("Rx-write length error=%d\n", code);
423 tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
424 code = (*dbase->read) (dbase, file, tbuffer, offset, tlen);
427 ubik_dprint("read failed error=%d\n", code);
430 code = rx_Write(rxcall, tbuffer, tlen);
433 ubik_dprint("Rx-write length error=%d\n", code);
439 code = (*dbase->getlabel) (dbase, file, version); /* return the dbase, too */
/*
 * RPC handler: receive a full copy of database file `file` (`length`
 * bytes) from the caller and install it under version *avers.
 *
 * Steps visible in this function:
 *  - ubik_CheckAuth() check, then the caller's primary interface address is
 *    compared with the sync site reported by uvote_GetSyncSite() (the
 *    `offset` variable temporarily holds that address);
 *  - urecovery_AbortAll() is called and the database is relabelled with
 *    epoch 0 for the duration of the transfer;
 *  - the data is read from rxcall and written to a ".TMP" file, beginning
 *    at file offset HDRSIZE;
 *  - the ".OLD" file is unlinked, the current file is renamed to ".OLD",
 *    the ".TMP" file is renamed into place, reopened and labelled with
 *    *avers;
 *  - disk buffers are invalidated and waiters on the version are woken;
 *  - on the failure path the label is written back using the saved epoch
 *    and the failure is logged.
 */
445 SDISK_SendFile(struct rx_call *rxcall, afs_int32 file,
446 afs_int32 length, struct ubik_version *avers)
449 struct ubik_dbase *dbase = NULL;
452 struct ubik_version tversion;
454 struct rx_peer *tpeer;
455 struct rx_connection *tconn;
456 afs_uint32 otherHost = 0;
463 /* send the file back to the requester */
467 if ((code = ubik_CheckAuth(rxcall))) {
472 /* next, we do a sanity check to see if the guy sending us the database is
473 * the guy we think is the sync site. It turns out that we might not have
474 * decided yet that someone's the sync site, but they could have enough
475 * votes from others to be sync site anyway, and could send us the database
476 * in advance of getting our votes. This is fine, what we're really trying
477 * to check is that some authenticated bogon isn't sending a random database
478 * into another configuration. This could happen on a bad configuration
479 * screwup. Thus, we only object if we're sure we know who the sync site
480 * is, and it ain't the guy talking to us.
482 offset = uvote_GetSyncSite();
483 tconn = rx_ConnectionOf(rxcall);
484 tpeer = rx_PeerOf(tconn);
485 otherHost = ubikGetPrimaryInterfaceAddr(rx_HostOf(tpeer));
486 if (offset && offset != otherHost) {
487 /* we *know* this is the wrong guy */
495 /* abort any active trans that may scribble over the database */
496 urecovery_AbortAll(dbase);
498 ubik_print("Ubik: Synchronize database with server %s\n",
499 afs_inet_ntoa_r(otherHost, hoststr));
/* NOTE(review): epoch is assigned 0 here together with tversion.epoch, yet
 * the failure path below copies epoch back into tversion.epoch with the
 * remark "Allow reads again".  Confirm whether epoch was meant to hold the
 * previous epoch instead. */
502 epoch = tversion.epoch = 0; /* start off by labelling in-transit db as invalid */
503 (*dbase->setlabel) (dbase, file, &tversion); /* setlabel does sync */
/* The incoming copy is staged in "<pathName>.DB<n>.TMP"; for a negative
 * file id "SYS" is inserted before the number and the id is negated. */
504 snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
505 ubik_dbase->pathName, (file<0)?"SYS":"",
506 (file<0)?-file:file);
507 fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
/* Position the staging file at HDRSIZE before any data is written. */
512 code = lseek(fd, HDRSIZE, 0);
513 if (code != HDRSIZE) {
518 memcpy(&ubik_dbase->version, &tversion, sizeof(struct ubik_version));
/* Move at most sizeof(tbuffer) bytes per rx_Read()/write() round. */
520 tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
521 #if !defined(AFS_PTHREAD_ENV)
525 code = rx_Read(rxcall, tbuffer, tlen);
527 ubik_dprint("Rx-read length error=%d\n", code);
532 code = write(fd, tbuffer, tlen);
535 ubik_dprint("write failed error=%d\n", code);
/* From here on tbuffer no longer carries file data: it is reused to hold
 * the path of the live database file, while pbuffer holds the ".OLD" and
 * ".TMP" paths in turn. */
547 /* sync data first, then write label and resync (resync done by setlabel call).
548 * This way, good label is only on good database. */
549 snprintf(tbuffer, sizeof(tbuffer), "%s.DB%s%d",
550 ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
552 snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
553 ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
554 code = unlink(pbuffer);
556 code = rename(tbuffer, pbuffer);
557 snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.TMP",
558 ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
561 code = rename(pbuffer, tbuffer);
563 (*ubik_dbase->open) (ubik_dbase, file);
564 code = (*ubik_dbase->setlabel) (dbase, file, avers);
567 snprintf(pbuffer, sizeof(pbuffer), "%s.DB%s%d.OLD",
568 ubik_dbase->pathName, (file<0)?"SYS":"", (file<0)?-file:file);
571 memcpy(&ubik_dbase->version, avers, sizeof(struct ubik_version));
572 udisk_Invalidate(dbase, file); /* new dbase, flush disk buffers */
/* Wake waiters: a broadcast on dbase->version_cond in the pthread build,
 * LWP_NoYieldSignal() on &dbase->version otherwise. */
573 #ifdef AFS_PTHREAD_ENV
574 assert(pthread_cond_broadcast(&dbase->version_cond) == 0);
576 LWP_NoYieldSignal(&dbase->version);
582 /* Failed to sync. Allow reads again for now. */
584 tversion.epoch = epoch;
585 (*dbase->setlabel) (dbase, file, &tversion);
588 ("Ubik: Synchronize database with server %s failed (error = %d)\n",
589 afs_inet_ntoa_r(otherHost, hoststr), code);
591 ubik_print("Ubik: Synchronize database completed\n");
/*
 * RPC handler for the Probe call; it receives nothing but the rx call
 * itself.  NOTE(review): the body is not visible in this excerpt, so the
 * return value is not described here.
 */
599 SDISK_Probe(struct rx_call *rxcall)
605 * \brief Update remote machines addresses in my server list
607 * Send back my addresses to caller of this RPC
608 * \return zero on success, else 1.
611 SDISK_UpdateInterfaceAddr(struct rx_call *rxcall,
612 UbikInterfaceAddr *inAddr,
613 UbikInterfaceAddr *outAddr)
615 struct ubik_server *ts, *tmp;
616 afs_uint32 remoteAddr; /* in net byte order */
617 int i, j, found = 0, probableMatch = 0;
620 /* copy the output parameters */
621 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR; i++)
622 outAddr->hostAddr[i] = ntohl(ubik_host[i]);
/* Look up the server entry whose first address (addr[0]) equals the
 * caller's first address; inAddr values are converted with htonl() before
 * being compared with the stored addresses. */
624 remoteAddr = htonl(inAddr->hostAddr[0]);
625 for (ts = ubik_servers; ts; ts = ts->next)
626 if (ts->addr[0] == remoteAddr) { /* both in net byte order */
632 /* verify that all addresses in the incoming RPC are
633 ** not part of other server entries in my CellServDB
635 for (i = 0; !found && (i < UBIK_MAX_INTERFACE_ADDR)
636 && inAddr->hostAddr[i]; i++) {
637 remoteAddr = htonl(inAddr->hostAddr[i]);
638 for (tmp = ubik_servers; (!found && tmp); tmp = tmp->next) {
639 if (ts == tmp) /* this is my server */
641 for (j = 0; (j < UBIK_MAX_INTERFACE_ADDR) && tmp->addr[j];
643 if (remoteAddr == tmp->addr[j]) {
/* The update is refused, and the offending addresses are logged, when
 * probableMatch is 0 or found is non-zero.  NOTE(review): the assignments
 * to probableMatch and found are on lines not visible in this excerpt. */
651 /* if (probableMatch) */
652 /* inconsistent addresses in CellServDB */
653 if (!probableMatch || found) {
654 ubik_print("Inconsistent Cell Info from server: ");
655 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR && inAddr->hostAddr[i]; i++)
656 ubik_print("%s ", afs_inet_ntoa_r(htonl(inAddr->hostAddr[i]), hoststr));
664 /* update our data structures */
/* The loop starts at index 1: addr[0] is left as it is and only the
 * remaining slots are overwritten with the caller's addresses. */
665 for (i = 1; i < UBIK_MAX_INTERFACE_ADDR; i++)
666 ts->addr[i] = htonl(inAddr->hostAddr[i]);
668 ubik_print("ubik: A Remote Server has addresses: ");
669 for (i = 0; i < UBIK_MAX_INTERFACE_ADDR && ts->addr[i]; i++)
670 ubik_print("%s ", afs_inet_ntoa_r(ts->addr[i], hoststr));
/*
 * Log the contents of the ubik_servers list through ubik_print(): a
 * "Local CellServDB:" heading, then for each server its running index j
 * followed by its addresses, formatted with afs_inet_ntoa_r().  The
 * address loop for a server stops at the first zero entry or after
 * UBIK_MAX_INTERFACE_ADDR entries.
 */
677 printServerInfo(void)
679 struct ubik_server *ts;
683 ubik_print("Local CellServDB:");
684 for (ts = ubik_servers; ts; ts = ts->next, j++) {
685 ubik_print("Server %d: ", j);
686 for (i = 0; (i < UBIK_MAX_INTERFACE_ADDR) && ts->addr[i]; i++)
687 ubik_print("%s ", afs_inet_ntoa_r(ts->addr[i], hoststr));
/*
 * RPC handler: relabel the database from *oldversionp to *newversionp
 * within the write transaction currently running on this server.
 *
 * Besides the usual authorization, existence, type and tid checks, the
 * handler tests ubeacon_AmSyncSite(), since this call is not expected on
 * the sync site.  The label is written with the database's setlabel
 * operation only when *oldversionp equals ubik_dbVersion in both epoch and
 * counter; ubik_dbase->version and ubik_dbVersion are then set to
 * *newversionp.
 */
693 SDISK_SetVersion(struct rx_call *rxcall, struct ubik_tid *atid,
694 struct ubik_version *oldversionp,
695 struct ubik_version *newversionp)
698 struct ubik_dbase *dbase;
700 if ((code = ubik_CheckAuth(rxcall))) {
704 if (!ubik_currentTrans) {
707 /* sanity check to make sure only write trans appear here */
708 if (ubik_currentTrans->type != UBIK_WRITETRANS) {
712 /* Should not get this for the sync site */
713 if (ubeacon_AmSyncSite()) {
717 dbase = ubik_currentTrans->dbase;
719 urecovery_CheckTid(atid, 0);
/* Re-test: ubik_currentTrans is checked again once urecovery_CheckTid()
 * has returned. */
720 if (!ubik_currentTrans) {
725 /* Set the label if its version matches the sync-site's */
726 if ((oldversionp->epoch == ubik_dbVersion.epoch)
727 && (oldversionp->counter == ubik_dbVersion.counter)) {
728 code = (*dbase->setlabel) (ubik_dbase, 0, newversionp);
730 ubik_dbase->version = *newversionp;
731 ubik_dbVersion = *newversionp;