src/ubik/recovery.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 RCSID
  14     ("$Header$");
  15
  16 #include <sys/types.h>
  17 #include <string.h>
  18 #include <stdarg.h>
  19 #include <errno.h>
  20 #include <assert.h>
  21
  22 #ifdef AFS_NT40_ENV
  23 #include <winsock2.h>
  24 #include <time.h>
  25 #include <fcntl.h>
  26 #else
  27 #include <sys/file.h>
  28 #include <netinet/in.h>
  29 #include <sys/time.h>
  30 #endif
  31
  32 #include <lock.h>
  33 #include <rx/xdr.h>
  34 #include <rx/rx.h>
  35 #include <afs/afsutil.h>
  36
  37 #define UBIK_INTERNALS
  38 #include "ubik.h"
  39 #include "ubik_int.h"
  40
  41 /*! \file
  42  * This module is responsible for determining when the system has
  43  * recovered to the point that it can handle new transactions.  It
  44  * replays logs, polls to determine the current dbase after a crash,
  45  * and distributes the new database to the others.
  46  *
  47  * The sync site associates a version number with each database.  It
  48  * broadcasts the version associated with its current dbase in every
  49  * one of its beacon messages.  When the sync site sends a dbase to a
  50  * server, it also sends the db's version.  A non-sync site server can
  51  * tell if it has the right dbase version by simply comparing the
  52  * version from the beacon message \p uvote_dbVersion with the version
  53  * associated with the database \p ubik_dbase->version.  The sync site
  54  * itself simply has one counter to keep track of all of this (again
  55  * \p ubik_dbase->version).
  56  *
  57  * sync site: routine called when the sync site loses its quorum; this
  58  * procedure is called "up" from the beacon package.  It resyncs the
  59  * dbase and nudges the recovery daemon to try to propagate out the
  60  * changes.  It also resets the recovery daemon's state, since
  61  * recovery must potentially find a new dbase to propagate out.  This
  62  * routine should not do anything with variables used by non-sync site
  63  * servers.
  64  */
  65
  66 /*!
  67  * If this flag is set, then ubik will use only the primary address
  68  * (the address specified in the CellServDB) to contact other
  69  * ubik servers.  Ubik recovery will not try opening connections
  70  * to the alternate interface addresses.
  71  */
  72 int ubikPrimaryAddrOnly;
  73
  74 int
  75 urecovery_ResetState(void)
  76 {
  77     urecovery_state = 0;
  78 #if !defined(AFS_PTHREAD_ENV)
  79     /*  No corresponding LWP_WaitProcess found anywhere for this -- klm */
  80     LWP_NoYieldSignal(&urecovery_state);
  81 #endif
  82     return 0;
  83 }
  84
  85 /*!
  86  * \brief sync site
  87  *
  88  * routine called when a non-sync site server goes down; restarts recovery
  89  * process to send missing server the new db when it comes back up.
  90  *
  91  * \note This routine should not do anything with variables used by non-sync site servers.
  92  */
  93 int
  94 urecovery_LostServer(void)
  95 {
  96 #if !defined(AFS_PTHREAD_ENV)
  97     /*  No corresponding LWP_WaitProcess found anywhere for this -- klm */
  98     LWP_NoYieldSignal(&urecovery_state);
  99     return 0;
 100 #endif
 101 }
 102
 103 /*!
 104  * return true iff we have a current database (called by both sync
 105  * sites and non-sync sites) How do we determine this?  If we're the
 106  * sync site, we wait until recovery has finished fetching and
 107  * re-labelling its dbase (it may still be trying to propagate it out
 108  * to everyone else; that's THEIR problem).  If we're not the sync
 109  * site, then we must have a dbase labelled with the right version,
 110  * and we must have a currently-good sync site.
 111  */
 112 int
 113 urecovery_AllBetter(register struct ubik_dbase *adbase, int areadAny)
 114 {
 115     register afs_int32 rcode;
 116
 117     ubik_dprint("allbetter checking\n");
 118     rcode = 0;
 119
 120
 121     if (areadAny) {
 122         if (ubik_dbase->version.epoch > 1)
 123             rcode = 1;          /* Happy with any good version of database */
 124     }
 125
 126     /* Check if we're sync site and we've got the right data */
 127     else if (ubeacon_AmSyncSite() && (urecovery_state & UBIK_RECHAVEDB)) {
 128         rcode = 1;
 129     }
 130
 131     /* next, check if we're aux site, and we've ever been sent the
 132      * right data (note that if a dbase update fails, we won't think
 133      * that the sync site is still the sync site, 'cause it won't talk
 134      * to us until a timeout period has gone by.  When we recover, we
 135      * leave this clear until we get a new dbase */
 136     else if ((uvote_GetSyncSite() && (vcmp(ubik_dbVersion, ubik_dbase->version) == 0))) {       /* && order is important */
 137         rcode = 1;
 138     }
 139
 140     ubik_dprint("allbetter: returning %d\n", rcode);
 141     return rcode;
 142 }
 143
 144 /*!
 145  * \brief abort all transactions on this database
 146  */
 147 int
 148 urecovery_AbortAll(struct ubik_dbase *adbase)
 149 {
 150     register struct ubik_trans *tt;
 151     for (tt = adbase->activeTrans; tt; tt = tt->next) {
 152         udisk_abort(tt);
 153     }
 154     return 0;
 155 }
 156
 157 /*!
 158  * \brief this routine aborts the current remote transaction, if any, if the tid is wrong
 159  */
 160 int
 161 urecovery_CheckTid(register struct ubik_tid *atid)
 162 {
 163     if (ubik_currentTrans) {
 164         /* there is remote write trans, see if we match, see if this
 165          * is a new transaction */
 166         if (atid->epoch != ubik_currentTrans->tid.epoch
 167             || atid->counter > ubik_currentTrans->tid.counter) {
 168             /* don't match, abort it */
 169             /* If the thread is not waiting for lock - ok to end it */
 170 #if !defined(UBIK_PAUSE)
 171             if (ubik_currentTrans->locktype != LOCKWAIT) {
 172 #endif /* UBIK_PAUSE */
 173                 udisk_end(ubik_currentTrans);
 174 #if !defined(UBIK_PAUSE)
 175             }
 176 #endif /* UBIK_PAUSE */
 177             ubik_currentTrans = (struct ubik_trans *)0;
 178         }
 179     }
 180     return 0;
 181 }
 182
 183 /*!
 184  * \brief replay logs
 185  *
 186  * log format is defined here, and implicitly in disk.c
 187  *
 188  * 4 byte opcode, followed by parameters, each 4 bytes long.  All integers
 189  * are in logged in network standard byte order, in case we want to move logs
 190  * from machine-to-machine someday.
 191  *
 192  * Begin transaction: opcode \n
 193  * Commit transaction: opcode, version (8 bytes) \n
 194  * Truncate file: opcode, file number, length \n
 195  * Abort transaction: opcode \n
 196  * Write data: opcode, file, position, length, <length> data bytes \n
 197  *
 198  * A very simple routine, it just replays the log.  Note that this is a new-value only log, which
 199  * implies that no uncommitted data is written to the dbase: one writes data to the log, including
 200  * the commit record, then we allow data to be written through to the dbase.  In our particular
 201  * implementation, once a transaction is done, we write out the pages to the database, so that
 202  * our buffer package doesn't have to know about stable and uncommitted data in the memory buffers:
 203  * any changed data while there is an uncommitted write transaction can be zapped during an
 204  * abort and the remaining dbase on the disk is exactly the right dbase, without having to read
 205  * the log.
 206  */
 207 static int
 208 ReplayLog(register struct ubik_dbase *adbase)
 209 {
 210     afs_int32 opcode;
 211     register afs_int32 code, tpos;
 212     int logIsGood;
 213     afs_int32 len, thisSize, tfile, filePos;
 214     afs_int32 buffer[4];
 215     afs_int32 syncFile = -1;
 216     afs_int32 data[1024];
 217
 218     /* read the lock twice, once to see whether we have a transaction to deal
 219      * with that committed, (theoretically, we should support more than one
 220      * trans in the log at once, but not yet), and once replaying the
 221      * transactions.  */
 222     tpos = 0;
 223     logIsGood = 0;
 224     /* for now, assume that all ops in log pertain to one transaction; see if there's a commit */
 225     while (1) {
 226         code =
 227             (*adbase->read) (adbase, LOGFILE, (char *)&opcode, tpos,
 228                              sizeof(afs_int32));
 229         if (code != sizeof(afs_int32))
 230             break;
 231         if (opcode == LOGNEW) {
 232             /* handle begin trans */
 233             tpos += sizeof(afs_int32);
 234         } else if (opcode == LOGABORT)
 235             break;
 236         else if (opcode == LOGEND) {
 237             logIsGood = 1;
 238             break;
 239         } else if (opcode == LOGTRUNCATE) {
 240             tpos += 4;
 241             code =
 242                 (*adbase->read) (adbase, LOGFILE, (char *)buffer, tpos,
 243                                  2 * sizeof(afs_int32));
 244             if (code != 2 * sizeof(afs_int32))
 245                 break;          /* premature eof or io error */
 246             tpos += 2 * sizeof(afs_int32);
 247         } else if (opcode == LOGDATA) {
 248             tpos += 4;
 249             code =
 250                 (*adbase->read) (adbase, LOGFILE, (char *)buffer, tpos,
 251                                  3 * sizeof(afs_int32));
 252             if (code != 3 * sizeof(afs_int32))
 253                 break;
 254             /* otherwise, skip over the data bytes, too */
 255             tpos += buffer[2] + 3 * sizeof(afs_int32);
 256         } else {
 257             ubik_dprint("corrupt log opcode (%d) at position %d\n", opcode,
 258                         tpos);
 259             break;              /* corrupt log! */
 260         }
 261     }
 262     if (logIsGood) {
 263         /* actually do the replay; log should go all the way through the commit record, since
 264          * we just read it above. */
 265         tpos = 0;
 266         logIsGood = 0;
 267         syncFile = -1;
 268         while (1) {
 269             code =
 270                 (*adbase->read) (adbase, LOGFILE, (char *)&opcode, tpos,
 271                                  sizeof(afs_int32));
 272             if (code != sizeof(afs_int32))
 273                 break;
 274             if (opcode == LOGNEW) {
 275                 /* handle begin trans */
 276                 tpos += sizeof(afs_int32);
 277             } else if (opcode == LOGABORT)
 278                 panic("log abort\n");
 279             else if (opcode == LOGEND) {
 280                 tpos += 4;
 281                 code =
 282                     (*adbase->read) (adbase, LOGFILE, (char *)buffer, tpos,
 283                                      2 * sizeof(afs_int32));
 284                 if (code != 2 * sizeof(afs_int32))
 285                     return UBADLOG;
 286                 code = (*adbase->setlabel) (adbase, 0, (ubik_version *)buffer);
 287                 if (code)
 288                     return code;
 289                 logIsGood = 1;
 290                 break;          /* all done now */
 291             } else if (opcode == LOGTRUNCATE) {
 292                 tpos += 4;
 293                 code =
 294                     (*adbase->read) (adbase, LOGFILE, (char *)buffer, tpos,
 295                                      2 * sizeof(afs_int32));
 296                 if (code != 2 * sizeof(afs_int32))
 297                     break;      /* premature eof or io error */
 298                 tpos += 2 * sizeof(afs_int32);
 299                 code =
 300                     (*adbase->truncate) (adbase, ntohl(buffer[0]),
 301                                          ntohl(buffer[1]));
 302                 if (code)
 303                     return code;
 304             } else if (opcode == LOGDATA) {
 305                 tpos += 4;
 306                 code =
 307                     (*adbase->read) (adbase, LOGFILE, (char *)buffer, tpos,
 308                                      3 * sizeof(afs_int32));
 309                 if (code != 3 * sizeof(afs_int32))
 310                     break;
 311                 tpos += 3 * sizeof(afs_int32);
 312                 /* otherwise, skip over the data bytes, too */
 313                 len = ntohl(buffer[2]); /* total number of bytes to copy */
 314                 filePos = ntohl(buffer[1]);
 315                 tfile = ntohl(buffer[0]);
 316                 /* try to minimize file syncs */
 317                 if (syncFile != tfile) {
 318                     if (syncFile >= 0)
 319                         code = (*adbase->sync) (adbase, syncFile);
 320                     else
 321                         code = 0;
 322                     syncFile = tfile;
 323                     if (code)
 324                         return code;
 325                 }
 326                 while (len > 0) {
 327                     thisSize = (len > sizeof(data) ? sizeof(data) : len);
 328                     /* copy sizeof(data) buffer bytes at a time */
 329                     code =
 330                         (*adbase->read) (adbase, LOGFILE, (char *)data, tpos,
 331                                          thisSize);
 332                     if (code != thisSize)
 333                         return UBADLOG;
 334                     code =
 335                         (*adbase->write) (adbase, tfile, (char *)data, filePos,
 336                                           thisSize);
 337                     if (code != thisSize)
 338                         return UBADLOG;
 339                     filePos += thisSize;
 340                     tpos += thisSize;
 341                     len -= thisSize;
 342                 }
 343             } else {
 344                 ubik_dprint("corrupt log opcode (%d) at position %d\n",
 345                             opcode, tpos);
 346                 break;          /* corrupt log! */
 347             }
 348         }
 349         if (logIsGood) {
 350             if (syncFile >= 0)
 351                 code = (*adbase->sync) (adbase, syncFile);
 352             if (code)
 353                 return code;
 354         } else {
 355             ubik_dprint("Log read error on pass 2\n");
 356             return UBADLOG;
 357         }
 358     }
 359
 360     /* now truncate the log, we're done with it */
 361     code = (*adbase->truncate) (adbase, LOGFILE, 0);
 362     return code;
 363 }
 364
 365 /*! \brief
 366  * Called at initialization to figure out version of the dbase we really have.
 367  *
 368  * This routine is called after replaying the log; it reads the restored labels.
 369  */
 370 static int
 371 InitializeDB(register struct ubik_dbase *adbase)
 372 {
 373     register afs_int32 code;
 374
 375     code = (*adbase->getlabel) (adbase, 0, &adbase->version);
 376     if (code) {
 377         /* try setting the label to a new value */
 378         adbase->version.epoch = 1;      /* value for newly-initialized db */
 379         adbase->version.counter = 1;
 380         code = (*adbase->setlabel) (adbase, 0, &adbase->version);
 381         if (code) {
 382             /* failed, try to set it back */
 383             adbase->version.epoch = 0;
 384             adbase->version.counter = 0;
 385             (*adbase->setlabel) (adbase, 0, &adbase->version);
 386         }
 387 #ifdef AFS_PTHREAD_ENV
 388         assert(pthread_cond_broadcast(&adbase->version_cond) == 0);
 389 #else
 390         LWP_NoYieldSignal(&adbase->version);
 391 #endif
 392     }
 393     return 0;
 394 }
 395
 396 /*!
 397  * \brief initialize the local ubik_dbase
 398  *
 399  * We replay the logs and then read the resulting file to figure out what version we've really got.
 400  */
 401 int
 402 urecovery_Initialize(register struct ubik_dbase *adbase)
 403 {
 404     register afs_int32 code;
 405
 406     code = ReplayLog(adbase);
 407     if (code)
 408         return code;
 409     code = InitializeDB(adbase);
 410     return code;
 411 }
 412
 413 /*!
 414  * \brief Main interaction loop for the recovery manager
 415  *
 416  * The recovery light-weight process only runs when you're the
 417  * synchronization site.  It performs the following tasks, if and only
 418  * if the prerequisite tasks have been performed successfully (it
 419  * keeps track of which ones have been performed in its bit map,
 420  * \p urecovery_state).
 421  *
 422  * First, it is responsible for probing that all servers are up.  This
 423  * is the only operation that must be performed even if this is not
 424  * yet the sync site, since otherwise this site may not notice that
 425  * enough other machines are running to even elect this guy to be the
 426  * sync site.
 427  *
 428  * After that, the recovery process does nothing until the beacon and
 429  * voting modules manage to get this site elected sync site.
 430  *
 431  * After becoming sync site, recovery first attempts to find the best
 432  * database available in the network (it must do this in order to
 433  * ensure finding the latest committed data).  After finding the right
 434  * database, it must fetch this dbase to the sync site.
 435  *
 436  * After fetching the dbase, it relabels it with a new version number,
 437  * to ensure that everyone recognizes this dbase as the most recent
 438  * dbase.
 439  *
 440  * Once the dbase has been relabelled, this machine can start handling
 441  * requests.  However, the recovery module still has one more task:
 442  * propagating the dbase out to everyone who is up in the network.
 443  */
 444 void *
 445 urecovery_Interact(void *dummy)
 446 {
 447     afs_int32 code, tcode;
 448     struct ubik_server *bestServer = NULL;
 449     struct ubik_server *ts;
 450     int dbok, doingRPC, now;
 451     afs_int32 lastProbeTime, lastDBVCheck;
 452     /* if we're the sync site, the best db version we've found yet */
 453     static struct ubik_version bestDBVersion;
 454     struct ubik_version tversion;
 455     struct timeval tv;
 456     int length, tlen, offset, file, nbytes;
 457     struct rx_call *rxcall;
 458     char tbuffer[1024];
 459     struct ubik_stat ubikstat;
 460     struct in_addr inAddr;
 461 #ifndef OLD_URECOVERY
 462     char pbuffer[1028];
 463     int flen, fd = -1;
 464     afs_int32 pass;
 465 #endif
 466
 467     /* otherwise, begin interaction */
 468     urecovery_state = 0;
 469     lastProbeTime = 0;
 470     lastDBVCheck = 0;
 471     while (1) {
 472         /* Run through this loop every 4 seconds */
 473         tv.tv_sec = 4;
 474         tv.tv_usec = 0;
 475 #ifdef AFS_PTHREAD_ENV
 476         select(0, 0, 0, 0, &tv);
 477 #else
 478         IOMGR_Select(0, 0, 0, 0, &tv);
 479 #endif
 480
 481         ubik_dprint("recovery running in state %x\n", urecovery_state);
 482
 483         /* Every 30 seconds, check all the down servers and mark them
 484          * as up if they respond. When a server comes up or found to
 485          * not be current, then re-find the the best database and
 486          * propogate it.
 487          */
 488         if ((now = FT_ApproxTime()) > 30 + lastProbeTime) {
 489             for (ts = ubik_servers, doingRPC = 0; ts; ts = ts->next) {
 490                 if (!ts->up) {
 491                     doingRPC = 1;
 492                     code = DoProbe(ts);
 493                     if (code == 0) {
 494                         ts->up = 1;
 495                         urecovery_state &= ~UBIK_RECFOUNDDB;
 496                     }
 497                 } else if (!ts->currentDB) {
 498                     urecovery_state &= ~UBIK_RECFOUNDDB;
 499                 }
 500             }
 501             if (doingRPC)
 502                 now = FT_ApproxTime();
 503             lastProbeTime = now;
 504         }
 505
 506         /* Mark whether we are the sync site */
 507         if (!ubeacon_AmSyncSite()) {
 508             urecovery_state &= ~UBIK_RECSYNCSITE;
 509             continue;           /* nothing to do */
 510         }
 511         urecovery_state |= UBIK_RECSYNCSITE;
 512
 513         /* If a server has just come up or if we have not found the
 514          * most current database, then go find the most current db.
 515          */
 516         if (!(urecovery_state & UBIK_RECFOUNDDB)) {
 517             bestServer = (struct ubik_server *)0;
 518             bestDBVersion.epoch = 0;
 519             bestDBVersion.counter = 0;
 520             for (ts = ubik_servers; ts; ts = ts->next) {
 521                 if (!ts->up)
 522                     continue;   /* don't bother with these guys */
 523                 if (ts->isClone)
 524                     continue;
 525                 code = DISK_GetVersion(ts->disk_rxcid, &ts->version);
 526                 if (code == 0) {
 527                     /* perhaps this is the best version */
 528                     if (vcmp(ts->version, bestDBVersion) > 0) {
 529                         /* new best version */
 530                         bestDBVersion = ts->version;
 531                         bestServer = ts;
 532                     }
 533                 }
 534             }
 535             /* take into consideration our version. Remember if we,
 536              * the sync site, have the best version. Also note that
 537              * we may need to send the best version out.
 538              */
 539             if (vcmp(ubik_dbase->version, bestDBVersion) >= 0) {
 540                 bestDBVersion = ubik_dbase->version;
 541                 bestServer = (struct ubik_server *)0;
 542                 urecovery_state |= UBIK_RECHAVEDB;
 543             } else {
 544                 /* Clear the flag only when we know we have to retrieve
 545                  * the db. Because urecovery_AllBetter() looks at it.
 546                  */
 547                 urecovery_state &= ~UBIK_RECHAVEDB;
 548             }
 549             lastDBVCheck = FT_ApproxTime();
 550             urecovery_state |= UBIK_RECFOUNDDB;
 551             urecovery_state &= ~UBIK_RECSENTDB;
 552         }
 553 #if defined(UBIK_PAUSE)
 554         /* it's not possible for UBIK_RECFOUNDDB not to be set here.
 555          * However, we might have lost UBIK_RECSYNCSITE, and that
 556          * IS important.
 557          */
 558         if (!(urecovery_state & UBIK_RECSYNCSITE))
 559             continue;           /* lost sync */
 560 #else
 561         if (!(urecovery_state & UBIK_RECFOUNDDB))
 562             continue;           /* not ready */
 563 #endif /* UBIK_PAUSE */
 564
 565         /* If we, the sync site, do not have the best db version, then
 566          * go and get it from the server that does.
 567          */
 568         if ((urecovery_state & UBIK_RECHAVEDB) || !bestServer) {
 569             urecovery_state |= UBIK_RECHAVEDB;
 570         } else {
 571             /* we don't have the best version; we should fetch it. */
 572             DBHOLD(ubik_dbase);
 573             urecovery_AbortAll(ubik_dbase);
 574
 575             /* Rx code to do the Bulk fetch */
 576             file = 0;
 577             offset = 0;
 578             rxcall = rx_NewCall(bestServer->disk_rxcid);
 579
 580             ubik_print("Ubik: Synchronize database with server %s\n",
 581                        afs_inet_ntoa(bestServer->addr[0]));
 582
 583             code = StartDISK_GetFile(rxcall, file);
 584             if (code) {
 585                 ubik_dprint("StartDiskGetFile failed=%d\n", code);
 586                 goto FetchEndCall;
 587             }
 588             nbytes = rx_Read(rxcall, (char *)&length, sizeof(afs_int32));
 589             length = ntohl(length);
 590             if (nbytes != sizeof(afs_int32)) {
 591                 ubik_dprint("Rx-read length error=%d\n", code = BULK_ERROR);
 592                 code = EIO;
 593                 goto FetchEndCall;
 594             }
 595
 596 #ifdef OLD_URECOVERY
 597             /* Truncate the file first */
 598             code = (*ubik_dbase->truncate) (ubik_dbase, file, 0);
 599             if (code) {
 600                 ubik_dprint("truncate io error=%d\n", code);
 601                 goto FetchEndCall;
 602             }
 603             tversion.counter = 0;
 604 #endif
 605             /* give invalid label during file transit */
 606             tversion.epoch = 0;
 607             code = (*ubik_dbase->setlabel) (ubik_dbase, file, &tversion);
 608             if (code) {
 609                 ubik_dprint("setlabel io error=%d\n", code);
 610                 goto FetchEndCall;
 611             }
 612 #ifndef OLD_URECOVERY
 613             flen = length;
 614             afs_snprintf(pbuffer, sizeof(pbuffer), "%s.DB0.TMP", ubik_dbase->pathName);
 615             fd = open(pbuffer, O_CREAT | O_RDWR | O_TRUNC, 0600);
 616             if (fd < 0) {
 617                 code = errno;
 618                 goto FetchEndCall;
 619             }
 620             code = lseek(fd, HDRSIZE, 0);
 621             if (code != HDRSIZE) {
 622                 close(fd);
 623                 goto FetchEndCall;
 624             }
 625 #endif
 626
 627             pass = 0;
 628             while (length > 0) {
 629                 tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
 630 #ifndef AFS_PTHREAD_ENV
 631                 if (pass % 4 == 0)
 632                     IOMGR_Poll();
 633 #endif
 634                 nbytes = rx_Read(rxcall, tbuffer, tlen);
 635                 if (nbytes != tlen) {
 636                     ubik_dprint("Rx-read bulk error=%d\n", code = BULK_ERROR);
 637                     code = EIO;
 638                     close(fd);
 639                     goto FetchEndCall;
 640                 }
 641 #ifdef OLD_URECOVERY
 642                 nbytes =
 643                     (*ubik_dbase->write) (ubik_dbase, file, tbuffer, offset,
 644                                           tlen);
 645 #else
 646                 nbytes = write(fd, tbuffer, tlen);
 647                 pass++;
 648 #endif
 649                 if (nbytes != tlen) {
 650                     code = UIOERROR;
 651                     close(fd);
 652                     goto FetchEndCall;
 653                 }
 654                 offset += tlen;
 655                 length -= tlen;
 656             }
 657 #ifndef OLD_URECOVERY
 658             code = close(fd);
 659             if (code)
 660                 goto FetchEndCall;
 661 #endif
 662             code = EndDISK_GetFile(rxcall, &tversion);
 663           FetchEndCall:
 664             tcode = rx_EndCall(rxcall, code);
 665             if (!code)
 666                 code = tcode;
 667             if (!code) {
 668                 /* we got a new file, set up its header */
 669                 urecovery_state |= UBIK_RECHAVEDB;
 670                 memcpy(&ubik_dbase->version, &tversion,
 671                        sizeof(struct ubik_version));
 672 #ifdef OLD_URECOVERY
 673                 (*ubik_dbase->sync) (ubik_dbase, 0);    /* get data out first */
 674 #else
 675                 afs_snprintf(tbuffer, sizeof(tbuffer), "%s.DB0", ubik_dbase->pathName);
 676 #ifdef AFS_NT40_ENV
 677                 afs_snprintf(pbuffer, sizeof(pbuffer), "%s.DB0.OLD", ubik_dbase->pathName);
 678                 code = unlink(pbuffer);
 679                 if (!code)
 680                     code = rename(tbuffer, pbuffer);
 681                 afs_snprintf(pbuffer, sizeof(pbuffer), "%s.DB0.TMP", ubik_dbase->pathName);
 682 #endif
 683                 if (!code)
 684                     code = rename(pbuffer, tbuffer);
 685                 if (!code) {
 686                     (*ubik_dbase->open) (ubik_dbase, 0);
 687 #endif
 688                     /* after data is good, sync disk with correct label */
 689                     code =
 690                         (*ubik_dbase->setlabel) (ubik_dbase, 0,
 691                                                  &ubik_dbase->version);
 692 #ifndef OLD_URECOVERY
 693                 }
 694 #ifdef AFS_NT40_ENV
 695                 afs_snprintf(pbuffer, sizeof(pbuffer), "%s.DB0.OLD", ubik_dbase->pathName);
 696                 unlink(pbuffer);
 697 #endif
 698 #endif
 699             }
 700             if (code) {
 701 #ifndef OLD_URECOVERY
 702                 unlink(pbuffer);
 703                 /*
 704                  * We will effectively invalidate the old data forever now.
 705                  * Unclear if we *should* but we do.
 706                  */
 707 #endif
 708                 ubik_dbase->version.epoch = 0;
 709                 ubik_dbase->version.counter = 0;
 710                 ubik_print("Ubik: Synchronize database failed (error = %d)\n",
 711                            code);
 712             } else {
 713                 ubik_print("Ubik: Synchronize database completed\n");
 714                 urecovery_state |= UBIK_RECHAVEDB;
 715             }
 716             udisk_Invalidate(ubik_dbase, 0);    /* data has changed */
 717 #ifdef AFS_PTHREAD_ENV
 718             assert(pthread_cond_broadcast(&ubik_dbase->version_cond) == 0);
 719 #else
 720             LWP_NoYieldSignal(&ubik_dbase->version);
 721 #endif
 722             DBRELE(ubik_dbase);
 723         }
 724 #if defined(UBIK_PAUSE)
 725         if (!(urecovery_state & UBIK_RECSYNCSITE))
 726             continue;           /* lost sync */
 727 #endif /* UBIK_PAUSE */
 728         if (!(urecovery_state & UBIK_RECHAVEDB))
 729             continue;           /* not ready */
 730
 731         /* If the database was newly initialized, then when we establish quorum, write
 732          * a new label. This allows urecovery_AllBetter() to allow access for reads.
 733          * Setting it to 2 also allows another site to come along with a newer
 734          * database and overwrite this one.
 735          */
 736         if (ubik_dbase->version.epoch == 1) {
 737             DBHOLD(ubik_dbase);
 738             urecovery_AbortAll(ubik_dbase);
 739             ubik_epochTime = 2;
 740             ubik_dbase->version.epoch = ubik_epochTime;
 741             ubik_dbase->version.counter = 1;
 742             code =
 743                 (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version);
 744             udisk_Invalidate(ubik_dbase, 0);    /* data may have changed */
 745 #ifdef AFS_PTHREAD_ENV
 746             assert(pthread_cond_broadcast(&ubik_dbase->version_cond) == 0);
 747 #else
 748             LWP_NoYieldSignal(&ubik_dbase->version);
 749 #endif
 750             DBRELE(ubik_dbase);
 751         }
 752
 753         /* Check the other sites and send the database to them if they
 754          * do not have the current db.
 755          */
 756         if (!(urecovery_state & UBIK_RECSENTDB)) {
 757             /* now propagate out new version to everyone else */
 758             dbok = 1;           /* start off assuming they all worked */
 759
 760             DBHOLD(ubik_dbase);
 761             /*
 762              * Check if a write transaction is in progress. We can't send the
 763              * db when a write is in progress here because the db would be
 764              * obsolete as soon as it goes there. Also, ops after the begin
 765              * trans would reach the recepient and wouldn't find a transaction
 766              * pending there.  Frankly, I don't think it's possible to get past
 767              * the write-lock above if there is a write transaction in progress,
 768              * but then, it won't hurt to check, will it?
 769              */
 770             if (ubik_dbase->flags & DBWRITING) {
 771                 struct timeval tv;
 772                 int safety = 0;
 773                 tv.tv_sec = 0;
 774                 tv.tv_usec = 50000;
 775                 while ((ubik_dbase->flags & DBWRITING) && (safety < 500)) {
 776                     DBRELE(ubik_dbase);
 777                     /* sleep for a little while */
 778 #ifdef AFS_PTHREAD_ENV
 779                     select(0, 0, 0, 0, &tv);
 780 #else
 781                     IOMGR_Select(0, 0, 0, 0, &tv);
 782 #endif
 783                     tv.tv_usec += 10000;
 784                     safety++;
 785                     DBHOLD(ubik_dbase);
 786                 }
 787             }
 788
 789             for (ts = ubik_servers; ts; ts = ts->next) {
 790                 inAddr.s_addr = ts->addr[0];
 791                 if (!ts->up) {
 792                     ubik_dprint("recovery cannot send version to %s\n",
 793                                 afs_inet_ntoa(inAddr.s_addr));
 794                     dbok = 0;
 795                     continue;
 796                 }
 797                 ubik_dprint("recovery sending version to %s\n",
 798                             afs_inet_ntoa(inAddr.s_addr));
 799                 if (vcmp(ts->version, ubik_dbase->version) != 0) {
 800                     ubik_dprint("recovery stating local database\n");
 801
 802                     /* Rx code to do the Bulk Store */
 803                     code = (*ubik_dbase->stat) (ubik_dbase, 0, &ubikstat);
 804                     if (!code) {
 805                         length = ubikstat.size;
 806                         file = offset = 0;
 807                         rxcall = rx_NewCall(ts->disk_rxcid);
 808                         code =
 809                             StartDISK_SendFile(rxcall, file, length,
 810                                                &ubik_dbase->version);
 811                         if (code) {
 812                             ubik_dprint("StartDiskSendFile failed=%d\n",
 813                                         code);
 814                             goto StoreEndCall;
 815                         }
 816                         while (length > 0) {
 817                             tlen =
 818                                 (length >
 819                                  sizeof(tbuffer) ? sizeof(tbuffer) : length);
 820                             nbytes =
 821                                 (*ubik_dbase->read) (ubik_dbase, file,
 822                                                      tbuffer, offset, tlen);
 823                             if (nbytes != tlen) {
 824                                 ubik_dprint("Local disk read error=%d\n",
 825                                             code = UIOERROR);
 826                                 goto StoreEndCall;
 827                             }
 828                             nbytes = rx_Write(rxcall, tbuffer, tlen);
 829                             if (nbytes != tlen) {
 830                                 ubik_dprint("Rx-write bulk error=%d\n", code =
 831                                             BULK_ERROR);
 832                                 goto StoreEndCall;
 833                             }
 834                             offset += tlen;
 835                             length -= tlen;
 836                         }
 837                         code = EndDISK_SendFile(rxcall);
 838                       StoreEndCall:
 839                         code = rx_EndCall(rxcall, code);
 840                     }
 841                     if (code == 0) {
 842                         /* we set a new file, process its header */
 843                         ts->version = ubik_dbase->version;
 844                         ts->currentDB = 1;
 845                     } else
 846                         dbok = 0;
 847                 } else {
 848                     /* mark file up to date */
 849                     ts->currentDB = 1;
 850                 }
 851             }
 852             DBRELE(ubik_dbase);
 853             if (dbok)
 854                 urecovery_state |= UBIK_RECSENTDB;
 855         }
 856     }
 857     return NULL;
 858 }
 859
 860 /*!
 861  * \brief send a Probe to all the network address of this server
 862  *
 863  * \return 0 if success, else return 1
 864  */
 865 int
 866 DoProbe(struct ubik_server *server)
 867 {
 868     struct rx_connection *conns[UBIK_MAX_INTERFACE_ADDR];
 869     struct rx_connection *connSuccess = 0;
 870     int i, j;
 871     afs_uint32 addr;
 872     char buffer[32];
 873     extern afs_int32 ubikSecIndex;
 874     extern struct rx_securityClass *ubikSecClass;
 875
 876     for (i = 0; (addr = server->addr[i]) && (i < UBIK_MAX_INTERFACE_ADDR);
 877          i++) {
 878         conns[i] =
 879             rx_NewConnection(addr, ubik_callPortal, DISK_SERVICE_ID,
 880                              ubikSecClass, ubikSecIndex);
 881
 882         /* user requirement to use only the primary interface */
 883         if (ubikPrimaryAddrOnly) {
 884             i = 1;
 885             break;
 886         }
 887     }
 888     assert(i);                  /* at least one interface address for this server */
 889
 890     multi_Rx(conns, i) {
 891         multi_DISK_Probe();
 892         if (!multi_error) {     /* first success */
 893             addr = server->addr[multi_i];       /* successful interface addr */
 894
 895             if (server->disk_rxcid)     /* destroy existing conn */
 896                 rx_DestroyConnection(server->disk_rxcid);
 897             if (server->vote_rxcid)
 898                 rx_DestroyConnection(server->vote_rxcid);
 899
 900             /* make new connections */
 901             server->disk_rxcid = conns[multi_i];
 902             server->vote_rxcid = rx_NewConnection(addr, ubik_callPortal, VOTE_SERVICE_ID, ubikSecClass, ubikSecIndex);  /* for vote reqs */
 903
 904             connSuccess = conns[multi_i];
 905             strcpy(buffer, (char *)afs_inet_ntoa(server->addr[0]));
 906             ubik_print
 907                 ("ubik:server %s is back up: will be contacted through %s\n",
 908                  buffer, afs_inet_ntoa(addr));
 909
 910             multi_Abort;
 911         }
 912     } multi_End_Ignore;
 913
 914     /* Destroy all connections except the one on which we succeeded */
 915     for (j = 0; j < i; j++)
 916         if (conns[j] != connSuccess)
 917             rx_DestroyConnection(conns[j]);
 918
 919     if (!connSuccess)
 920         ubik_dprint("ubik:server %s still down\n",
 921                     afs_inet_ntoa(server->addr[0]));
 922
 923     if (connSuccess)
 924         return 0;               /* success */
 925     else
 926         return 1;               /* failure */
 927 }