src/ubik/recovery.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 RCSID
  14     ("$Header$");
  15
  16 #include <sys/types.h>
  17 #ifdef AFS_NT40_ENV
  18 #include <winsock2.h>
  19 #include <time.h>
  20 #else
  21 #include <sys/file.h>
  22 #include <netinet/in.h>
  23 #include <sys/time.h>
  24 #endif
  25 #include <assert.h>
  26 #include <lock.h>
  27 #ifdef HAVE_STRING_H
  28 #include <string.h>
  29 #else
  30 #ifdef HAVE_STRINGS_H
  31 #include <strings.h>
  32 #endif
  33 #endif
  34 #include <rx/xdr.h>
  35 #include <rx/rx.h>
  36 #include <errno.h>
  37 #include <afs/afsutil.h>
  38
  39 #define UBIK_INTERNALS
  40 #include "ubik.h"
  41 #include "ubik_int.h"
  42
  43 /* This module is responsible for determining when the system has
  44  * recovered to the point that it can handle new transactions.  It
  45  * replays logs, polls to determine the current dbase after a crash,
  46  * and distributes the new database to the others.
  47  */
  48
  49 /* The sync site associates a version number with each database.  It
  50  * broadcasts the version associated with its current dbase in every
   51  * one of its beacon messages.  When the sync site sends a dbase to a
  52  * server, it also sends the db's version.  A non-sync site server can
  53  * tell if it has the right dbase version by simply comparing the
  54  * version from the beacon message (uvote_dbVersion) with the version
  55  * associated with the database (ubik_dbase->version).  The sync site
  56  * itself simply has one counter to keep track of all of this (again
  57  * ubik_dbase->version).
  58  */
  59
  60 /* sync site: routine called when the sync site loses its quorum; this
  61  * procedure is called "up" from the beacon package.  It resyncs the
  62  * dbase and nudges the recovery daemon to try to propagate out the
  63  * changes.  It also resets the recovery daemon's state, since
  64  * recovery must potentially find a new dbase to propagate out.  This
  65  * routine should not do anything with variables used by non-sync site
  66  * servers.
  67  */
  68
/* If this flag is set, ubik uses only the primary address (the address
 * specified in the CellServDB) to contact other ubik servers.  Ubik
 * recovery will not try opening connections to the alternate interface
 * addresses.
 */
int ubikPrimaryAddrOnly;
  75
  76 int
  77 urecovery_ResetState(void)
  78 {
  79     urecovery_state = 0;
  80     LWP_NoYieldSignal(&urecovery_state);
  81     return 0;
  82 }
  83
  84 /* sync site: routine called when a non-sync site server goes down; restarts recovery
  85  * process to send missing server the new db when it comes back up.
  86  * This routine should not do anything with variables used by non-sync site servers.
  87  */
  88 int
  89 urecovery_LostServer(void)
  90 {
  91     LWP_NoYieldSignal(&urecovery_state);
  92     return 0;
  93 }
  94
  95 /* return true iff we have a current database (called by both sync
  96  * sites and non-sync sites) How do we determine this?  If we're the
  97  * sync site, we wait until recovery has finished fetching and
  98  * re-labelling its dbase (it may still be trying to propagate it out
  99  * to everyone else; that's THEIR problem).  If we're not the sync
 100  * site, then we must have a dbase labelled with the right version,
 101  * and we must have a currently-good sync site.
 102  */
 103 int
 104 urecovery_AllBetter(register struct ubik_dbase *adbase, int areadAny)
 105 {
 106     register afs_int32 rcode;
 107
 108     ubik_dprint("allbetter checking\n");
 109     rcode = 0;
 110
 111
 112     if (areadAny) {
 113         if (ubik_dbase->version.epoch > 1)
 114             rcode = 1;          /* Happy with any good version of database */
 115     }
 116
 117     /* Check if we're sync site and we've got the right data */
 118     else if (ubeacon_AmSyncSite() && (urecovery_state & UBIK_RECHAVEDB)) {
 119         rcode = 1;
 120     }
 121
 122     /* next, check if we're aux site, and we've ever been sent the
 123      * right data (note that if a dbase update fails, we won't think
 124      * that the sync site is still the sync site, 'cause it won't talk
 125      * to us until a timeout period has gone by.  When we recover, we
 126      * leave this clear until we get a new dbase */
 127     else if ((uvote_GetSyncSite() && (vcmp(ubik_dbVersion, ubik_dbase->version) == 0))) {       /* && order is important */
 128         rcode = 1;
 129     }
 130
 131     ubik_dprint("allbetter: returning %d\n", rcode);
 132     return rcode;
 133 }
 134
 135 /* abort all transactions on this database */
 136 int
 137 urecovery_AbortAll(struct ubik_dbase *adbase)
 138 {
 139     register struct ubik_trans *tt;
 140     for (tt = adbase->activeTrans; tt; tt = tt->next) {
 141         udisk_abort(tt);
 142     }
 143     return 0;
 144 }
 145
 146 /* this routine aborts the current remote transaction, if any, if the tid is wrong */
 147 int
 148 urecovery_CheckTid(register struct ubik_tid *atid)
 149 {
 150     if (ubik_currentTrans) {
 151         /* there is remote write trans, see if we match, see if this
 152          * is a new transaction */
 153         if (atid->epoch != ubik_currentTrans->tid.epoch
 154             || atid->counter > ubik_currentTrans->tid.counter) {
 155             /* don't match, abort it */
 156             /* If the thread is not waiting for lock - ok to end it */
 157 #if !defined(UBIK_PAUSE)
 158             if (ubik_currentTrans->locktype != LOCKWAIT) {
 159 #endif /* UBIK_PAUSE */
 160                 udisk_end(ubik_currentTrans);
 161 #if !defined(UBIK_PAUSE)
 162             }
 163 #endif /* UBIK_PAUSE */
 164             ubik_currentTrans = (struct ubik_trans *)0;
 165         }
 166     }
 167 }
 168
 169 /* log format is defined here, and implicitly in disk.c
 170  *
 171  * 4 byte opcode, followed by parameters, each 4 bytes long.  All integers
  172  * are logged in network standard byte order, in case we want to move logs
 173  * from machine-to-machine someday.
 174  *
 175  * Begin transaction: opcode
 176  * Commit transaction: opcode, version (8 bytes)
 177  * Truncate file: opcode, file number, length
 178  * Abort transaction: opcode
 179  * Write data: opcode, file, position, length, <length> data bytes
 180  *
 181  * A very simple routine, it just replays the log.  Note that this is a new-value only log, which
 182  * implies that no uncommitted data is written to the dbase: one writes data to the log, including
 183  * the commit record, then we allow data to be written through to the dbase.  In our particular
 184  * implementation, once a transaction is done, we write out the pages to the database, so that
 185  * our buffer package doesn't have to know about stable and uncommitted data in the memory buffers:
 186  * any changed data while there is an uncommitted write transaction can be zapped during an
 187  * abort and the remaining dbase on the disk is exactly the right dbase, without having to read
 188  * the log.
 189  */
 190
 191 /* replay logs */
 192 static int
 193 ReplayLog(register struct ubik_dbase *adbase)
 194 {
 195     afs_int32 opcode;
 196     register afs_int32 code, tpos;
 197     int logIsGood;
 198     afs_int32 len, thisSize, tfile, filePos;
 199     afs_int32 buffer[4];
 200     afs_int32 syncFile = -1;
 201     afs_int32 data[1024];
 202
 203     /* read the lock twice, once to see whether we have a transaction to deal
 204      * with that committed, (theoretically, we should support more than one
 205      * trans in the log at once, but not yet), and once replaying the
 206      * transactions.  */
 207     tpos = 0;
 208     logIsGood = 0;
 209     /* for now, assume that all ops in log pertain to one transaction; see if there's a commit */
 210     while (1) {
 211         code =
 212             (*adbase->read) (adbase, LOGFILE, &opcode, tpos,
 213                              sizeof(afs_int32));
 214         if (code != sizeof(afs_int32))
 215             break;
 216         if (opcode == LOGNEW) {
 217             /* handle begin trans */
 218             tpos += sizeof(afs_int32);
 219         } else if (opcode == LOGABORT)
 220             break;
 221         else if (opcode == LOGEND) {
 222             logIsGood = 1;
 223             break;
 224         } else if (opcode == LOGTRUNCATE) {
 225             tpos += 4;
 226             code =
 227                 (*adbase->read) (adbase, LOGFILE, buffer, tpos,
 228                                  2 * sizeof(afs_int32));
 229             if (code != 2 * sizeof(afs_int32))
 230                 break;          /* premature eof or io error */
 231             tpos += 2 * sizeof(afs_int32);
 232         } else if (opcode == LOGDATA) {
 233             tpos += 4;
 234             code =
 235                 (*adbase->read) (adbase, LOGFILE, buffer, tpos,
 236                                  3 * sizeof(afs_int32));
 237             if (code != 3 * sizeof(afs_int32))
 238                 break;
 239             /* otherwise, skip over the data bytes, too */
 240             tpos += buffer[2] + 3 * sizeof(afs_int32);
 241         } else {
 242             ubik_dprint("corrupt log opcode (%d) at position %d\n", opcode,
 243                         tpos);
 244             break;              /* corrupt log! */
 245         }
 246     }
 247     if (logIsGood) {
 248         /* actually do the replay; log should go all the way through the commit record, since
 249          * we just read it above. */
 250         tpos = 0;
 251         logIsGood = 0;
 252         syncFile = -1;
 253         while (1) {
 254             code =
 255                 (*adbase->read) (adbase, LOGFILE, &opcode, tpos,
 256                                  sizeof(afs_int32));
 257             if (code != sizeof(afs_int32))
 258                 break;
 259             if (opcode == LOGNEW) {
 260                 /* handle begin trans */
 261                 tpos += sizeof(afs_int32);
 262             } else if (opcode == LOGABORT)
 263                 panic("log abort\n");
 264             else if (opcode == LOGEND) {
 265                 tpos += 4;
 266                 code =
 267                     (*adbase->read) (adbase, LOGFILE, buffer, tpos,
 268                                      2 * sizeof(afs_int32));
 269                 if (code != 2 * sizeof(afs_int32))
 270                     return UBADLOG;
 271                 code = (*adbase->setlabel) (adbase, 0, buffer);
 272                 if (code)
 273                     return code;
 274                 logIsGood = 1;
 275                 break;          /* all done now */
 276             } else if (opcode == LOGTRUNCATE) {
 277                 tpos += 4;
 278                 code =
 279                     (*adbase->read) (adbase, LOGFILE, buffer, tpos,
 280                                      2 * sizeof(afs_int32));
 281                 if (code != 2 * sizeof(afs_int32))
 282                     break;      /* premature eof or io error */
 283                 tpos += 2 * sizeof(afs_int32);
 284                 code =
 285                     (*adbase->truncate) (adbase, ntohl(buffer[0]),
 286                                          ntohl(buffer[1]));
 287                 if (code)
 288                     return code;
 289             } else if (opcode == LOGDATA) {
 290                 tpos += 4;
 291                 code =
 292                     (*adbase->read) (adbase, LOGFILE, buffer, tpos,
 293                                      3 * sizeof(afs_int32));
 294                 if (code != 3 * sizeof(afs_int32))
 295                     break;
 296                 tpos += 3 * sizeof(afs_int32);
 297                 /* otherwise, skip over the data bytes, too */
 298                 len = ntohl(buffer[2]); /* total number of bytes to copy */
 299                 filePos = ntohl(buffer[1]);
 300                 tfile = ntohl(buffer[0]);
 301                 /* try to minimize file syncs */
 302                 if (syncFile != tfile) {
 303                     if (syncFile >= 0)
 304                         code = (*adbase->sync) (adbase, syncFile);
 305                     else
 306                         code = 0;
 307                     syncFile = tfile;
 308                     if (code)
 309                         return code;
 310                 }
 311                 while (len > 0) {
 312                     thisSize = (len > sizeof(data) ? sizeof(data) : len);
 313                     /* copy sizeof(data) buffer bytes at a time */
 314                     code =
 315                         (*adbase->read) (adbase, LOGFILE, data, tpos,
 316                                          thisSize);
 317                     if (code != thisSize)
 318                         return UBADLOG;
 319                     code =
 320                         (*adbase->write) (adbase, tfile, data, filePos,
 321                                           thisSize);
 322                     if (code != thisSize)
 323                         return UBADLOG;
 324                     filePos += thisSize;
 325                     tpos += thisSize;
 326                     len -= thisSize;
 327                 }
 328             } else {
 329                 ubik_dprint("corrupt log opcode (%d) at position %d\n",
 330                             opcode, tpos);
 331                 break;          /* corrupt log! */
 332             }
 333         }
 334         if (logIsGood) {
 335             if (syncFile >= 0)
 336                 code = (*adbase->sync) (adbase, syncFile);
 337             if (code)
 338                 return code;
 339         } else {
 340             ubik_dprint("Log read error on pass 2\n");
 341             return UBADLOG;
 342         }
 343     }
 344
 345     /* now truncate the log, we're done with it */
 346     code = (*adbase->truncate) (adbase, LOGFILE, 0);
 347     return code;
 348 }
 349
 350 /* Called at initialization to figure out version of the dbase we really have.
 351  * This routine is called after replaying the log; it reads the restored labels.
 352  */
 353 static int
 354 InitializeDB(register struct ubik_dbase *adbase)
 355 {
 356     register afs_int32 code;
 357
 358     code = (*adbase->getlabel) (adbase, 0, &adbase->version);
 359     if (code) {
 360         /* try setting the label to a new value */
 361         adbase->version.epoch = 1;      /* value for newly-initialized db */
 362         adbase->version.counter = 1;
 363         code = (*adbase->setlabel) (adbase, 0, &adbase->version);
 364         if (code) {
 365             /* failed, try to set it back */
 366             adbase->version.epoch = 0;
 367             adbase->version.counter = 0;
 368             (*adbase->setlabel) (adbase, 0, &adbase->version);
 369         }
 370         LWP_NoYieldSignal(&adbase->version);
 371     }
 372     return 0;
 373 }
 374
 375 /* initialize the local dbase
 376  * We replay the logs and then read the resulting file to figure out what version we've really got.
 377  */
 378 int
 379 urecovery_Initialize(register struct ubik_dbase *adbase)
 380 {
 381     register afs_int32 code;
 382
 383     code = ReplayLog(adbase);
 384     if (code)
 385         return code;
 386     code = InitializeDB(adbase);
 387     return code;
 388 }
 389
 390 /* Main interaction loop for the recovery manager
 391  * The recovery light-weight process only runs when you're the
 392  * synchronization site.  It performs the following tasks, if and only
 393  * if the prerequisite tasks have been performed successfully (it
 394  * keeps track of which ones have been performed in its bit map,
 395  * urecovery_state).
 396  *
 397  * First, it is responsible for probing that all servers are up.  This
 398  * is the only operation that must be performed even if this is not
 399  * yet the sync site, since otherwise this site may not notice that
 400  * enough other machines are running to even elect this guy to be the
 401  * sync site.
 402  *
 403  * After that, the recovery process does nothing until the beacon and
 404  * voting modules manage to get this site elected sync site.
 405  *
 406  * After becoming sync site, recovery first attempts to find the best
 407  * database available in the network (it must do this in order to
 408  * ensure finding the latest committed data).  After finding the right
 409  * database, it must fetch this dbase to the sync site.
 410  *
 411  * After fetching the dbase, it relabels it with a new version number,
 412  * to ensure that everyone recognizes this dbase as the most recent
 413  * dbase.
 414  *
 415  * One the dbase has been relabelled, this machine can start handling
 416  * requests.  However, the recovery module still has one more task:
 417  * propagating the dbase out to everyone who is up in the network.
 418  */
 419 int
 420 urecovery_Interact(void)
 421 {
 422     afs_int32 code, tcode;
 423     struct ubik_server *bestServer = NULL;
 424     struct ubik_server *ts;
 425     int dbok, doingRPC, now;
 426     afs_int32 lastProbeTime, lastDBVCheck;
 427     /* if we're the sync site, the best db version we've found yet */
 428     static struct ubik_version bestDBVersion;
 429     struct ubik_version tversion;
 430     struct timeval tv;
 431     int length, tlen, offset, file, nbytes;
 432     struct rx_call *rxcall;
 433     char tbuffer[256];
 434     struct ubik_stat ubikstat;
 435     struct in_addr inAddr;
 436
 437     /* otherwise, begin interaction */
 438     urecovery_state = 0;
 439     lastProbeTime = 0;
 440     lastDBVCheck = 0;
 441     while (1) {
 442         /* Run through this loop every 4 seconds */
 443         tv.tv_sec = 4;
 444         tv.tv_usec = 0;
 445         IOMGR_Select(0, 0, 0, 0, &tv);
 446
 447         ubik_dprint("recovery running in state %x\n", urecovery_state);
 448
 449         /* Every 30 seconds, check all the down servers and mark them
 450          * as up if they respond. When a server comes up or found to
 451          * not be current, then re-find the the best database and
 452          * propogate it.
 453          */
 454         if ((now = FT_ApproxTime()) > 30 + lastProbeTime) {
 455             for (ts = ubik_servers, doingRPC = 0; ts; ts = ts->next) {
 456                 if (!ts->up) {
 457                     doingRPC = 1;
 458                     code = DoProbe(ts);
 459                     if (code == 0) {
 460                         ts->up = 1;
 461                         urecovery_state &= ~UBIK_RECFOUNDDB;
 462                     }
 463                 } else if (!ts->currentDB) {
 464                     urecovery_state &= ~UBIK_RECFOUNDDB;
 465                 }
 466             }
 467             if (doingRPC)
 468                 now = FT_ApproxTime();
 469             lastProbeTime = now;
 470         }
 471
 472         /* Mark whether we are the sync site */
 473         if (!ubeacon_AmSyncSite()) {
 474             urecovery_state &= ~UBIK_RECSYNCSITE;
 475             continue;           /* nothing to do */
 476         }
 477         urecovery_state |= UBIK_RECSYNCSITE;
 478
 479         /* If a server has just come up or if we have not found the
 480          * most current database, then go find the most current db.
 481          */
 482         if (!(urecovery_state & UBIK_RECFOUNDDB)) {
 483             bestServer = (struct ubik_server *)0;
 484             bestDBVersion.epoch = 0;
 485             bestDBVersion.counter = 0;
 486             for (ts = ubik_servers; ts; ts = ts->next) {
 487                 if (!ts->up)
 488                     continue;   /* don't bother with these guys */
 489                 if (ts->isClone)
 490                     continue;
 491                 code = DISK_GetVersion(ts->disk_rxcid, &ts->version);
 492                 if (code == 0) {
 493                     /* perhaps this is the best version */
 494                     if (vcmp(ts->version, bestDBVersion) > 0) {
 495                         /* new best version */
 496                         bestDBVersion = ts->version;
 497                         bestServer = ts;
 498                     }
 499                 }
 500             }
 501             /* take into consideration our version. Remember if we,
 502              * the sync site, have the best version. Also note that
 503              * we may need to send the best version out.
 504              */
 505             if (vcmp(ubik_dbase->version, bestDBVersion) >= 0) {
 506                 bestDBVersion = ubik_dbase->version;
 507                 bestServer = (struct ubik_server *)0;
 508                 urecovery_state |= UBIK_RECHAVEDB;
 509             } else {
 510                 /* Clear the flag only when we know we have to retrieve
 511                  * the db. Because urecovery_AllBetter() looks at it.
 512                  */
 513                 urecovery_state &= ~UBIK_RECHAVEDB;
 514             }
 515             lastDBVCheck = FT_ApproxTime();
 516             urecovery_state |= UBIK_RECFOUNDDB;
 517             urecovery_state &= ~UBIK_RECSENTDB;
 518         }
 519 #if defined(UBIK_PAUSE)
 520         /* it's not possible for UBIK_RECFOUNDDB not to be set here.
 521          * However, we might have lost UBIK_RECSYNCSITE, and that
 522          * IS important.
 523          */
 524         if (!(urecovery_state & UBIK_RECSYNCSITE))
 525             continue;           /* lost sync */
 526 #else
 527         if (!(urecovery_state & UBIK_RECFOUNDDB))
 528             continue;           /* not ready */
 529 #endif /* UBIK_PAUSE */
 530
 531         /* If we, the sync site, do not have the best db version, then
 532          * go and get it from the server that does.
 533          */
 534         if ((urecovery_state & UBIK_RECHAVEDB) || !bestServer) {
 535             urecovery_state |= UBIK_RECHAVEDB;
 536         } else {
 537             /* we don't have the best version; we should fetch it. */
 538 #if defined(UBIK_PAUSE)
 539             DBHOLD(ubik_dbase);
 540 #else
 541             ObtainWriteLock(&ubik_dbase->versionLock);
 542 #endif /* UBIK_PAUSE */
 543             urecovery_AbortAll(ubik_dbase);
 544
 545             /* Rx code to do the Bulk fetch */
 546             file = 0;
 547             offset = 0;
 548             rxcall = rx_NewCall(bestServer->disk_rxcid);
 549
 550             ubik_print("Ubik: Synchronize database with server %s\n",
 551                        afs_inet_ntoa(bestServer->addr[0]));
 552
 553             code = StartDISK_GetFile(rxcall, file);
 554             if (code) {
 555                 ubik_dprint("StartDiskGetFile failed=%d\n", code);
 556                 goto FetchEndCall;
 557             }
 558             nbytes = rx_Read(rxcall, &length, sizeof(afs_int32));
 559             length = ntohl(length);
 560             if (nbytes != sizeof(afs_int32)) {
 561                 ubik_dprint("Rx-read length error=%d\n", code = BULK_ERROR);
 562                 code = EIO;
 563                 goto FetchEndCall;
 564             }
 565
 566             /* Truncate the file firest */
 567             code = (*ubik_dbase->truncate) (ubik_dbase, file, 0);
 568             if (code) {
 569                 ubik_dprint("truncate io error=%d\n", code);
 570                 goto FetchEndCall;
 571             }
 572
 573             /* give invalid label during file transit */
 574             tversion.epoch = 0;
 575             tversion.counter = 0;
 576             code = (*ubik_dbase->setlabel) (ubik_dbase, file, &tversion);
 577             if (code) {
 578                 ubik_dprint("setlabel io error=%d\n", code);
 579                 goto FetchEndCall;
 580             }
 581
 582             while (length > 0) {
 583                 tlen = (length > sizeof(tbuffer) ? sizeof(tbuffer) : length);
 584                 nbytes = rx_Read(rxcall, tbuffer, tlen);
 585                 if (nbytes != tlen) {
 586                     ubik_dprint("Rx-read bulk error=%d\n", code = BULK_ERROR);
 587                     code = EIO;
 588                     goto FetchEndCall;
 589                 }
 590                 nbytes =
 591                     (*ubik_dbase->write) (ubik_dbase, file, tbuffer, offset,
 592                                           tlen);
 593                 if (nbytes != tlen) {
 594                     code = UIOERROR;
 595                     goto FetchEndCall;
 596                 }
 597                 offset += tlen;
 598                 length -= tlen;
 599             }
 600             code = EndDISK_GetFile(rxcall, &tversion);
 601           FetchEndCall:
 602             tcode = rx_EndCall(rxcall, code);
 603             if (!code)
 604                 code = tcode;
 605             if (!code) {
 606                 /* we got a new file, set up its header */
 607                 urecovery_state |= UBIK_RECHAVEDB;
 608                 memcpy(&ubik_dbase->version, &tversion,
 609                        sizeof(struct ubik_version));
 610                 (*ubik_dbase->sync) (ubik_dbase, 0);    /* get data out first */
 611                 /* after data is good, sync disk with correct label */
 612                 code =
 613                     (*ubik_dbase->setlabel) (ubik_dbase, 0,
 614                                              &ubik_dbase->version);
 615             }
 616             if (code) {
 617                 ubik_dbase->version.epoch = 0;
 618                 ubik_dbase->version.counter = 0;
 619                 ubik_print("Ubik: Synchronize database failed (error = %d)\n",
 620                            code);
 621             } else {
 622                 ubik_print("Ubik: Synchronize database completed\n");
 623             }
 624             udisk_Invalidate(ubik_dbase, 0);    /* data has changed */
 625             LWP_NoYieldSignal(&ubik_dbase->version);
 626 #if defined(UBIK_PAUSE)
 627             DBRELE(ubik_dbase);
 628 #else
 629             ReleaseWriteLock(&ubik_dbase->versionLock);
 630 #endif /* UBIK_PAUSE */
 631         }
 632 #if defined(UBIK_PAUSE)
 633         if (!(urecovery_state & UBIK_RECSYNCSITE))
 634             continue;           /* lost sync */
 635 #endif /* UBIK_PAUSE */
 636         if (!(urecovery_state & UBIK_RECHAVEDB))
 637             continue;           /* not ready */
 638
 639         /* If the database was newly initialized, then when we establish quorum, write
 640          * a new label. This allows urecovery_AllBetter() to allow access for reads.
 641          * Setting it to 2 also allows another site to come along with a newer
 642          * database and overwrite this one.
 643          */
 644         if (ubik_dbase->version.epoch == 1) {
 645 #if defined(UBIK_PAUSE)
 646             DBHOLD(ubik_dbase);
 647 #else
 648             ObtainWriteLock(&ubik_dbase->versionLock);
 649 #endif /* UBIK_PAUSE */
 650             urecovery_AbortAll(ubik_dbase);
 651             ubik_epochTime = 2;
 652             ubik_dbase->version.epoch = ubik_epochTime;
 653             ubik_dbase->version.counter = 1;
 654             code =
 655                 (*ubik_dbase->setlabel) (ubik_dbase, 0, &ubik_dbase->version);
 656             udisk_Invalidate(ubik_dbase, 0);    /* data may have changed */
 657             LWP_NoYieldSignal(&ubik_dbase->version);
 658 #if defined(UBIK_PAUSE)
 659             DBRELE(ubik_dbase);
 660 #else
 661             ReleaseWriteLock(&ubik_dbase->versionLock);
 662 #endif /* UBIK_PAUSE */
 663         }
 664
 665         /* Check the other sites and send the database to them if they
 666          * do not have the current db.
 667          */
 668         if (!(urecovery_state & UBIK_RECSENTDB)) {
 669             /* now propagate out new version to everyone else */
 670             dbok = 1;           /* start off assuming they all worked */
 671
 672 #if defined(UBIK_PAUSE)
 673             DBHOLD(ubik_dbase);
 674 #else
 675             ObtainWriteLock(&ubik_dbase->versionLock);
 676 #endif /* UBIK_PAUSE */
 677             /*
 678              * Check if a write transaction is in progress. We can't send the
 679              * db when a write is in progress here because the db would be
 680              * obsolete as soon as it goes there. Also, ops after the begin
 681              * trans would reach the recepient and wouldn't find a transaction
 682              * pending there.  Frankly, I don't think it's possible to get past
 683              * the write-lock above if there is a write transaction in progress,
 684              * but then, it won't hurt to check, will it?
 685              */
 686             if (ubik_dbase->flags & DBWRITING) {
 687                 struct timeval tv;
 688                 int safety = 0;
 689                 tv.tv_sec = 0;
 690                 tv.tv_usec = 50000;
 691                 while ((ubik_dbase->flags & DBWRITING) && (safety < 500)) {
 692 #if defined(UBIK_PAUSE)
 693                     DBRELE(ubik_dbase);
 694 #else
 695                     ReleaseWriteLock(&ubik_dbase->versionLock);
 696 #endif /* UBIK_PAUSE */
 697                     /* sleep for a little while */
 698                     IOMGR_Select(0, 0, 0, 0, &tv);
 699                     tv.tv_usec += 10000;
 700                     safety++;
 701 #if defined(UBIK_PAUSE)
 702                     DBHOLD(ubik_dbase);
 703 #else
 704                     ObtainWriteLock(&ubik_dbase->versionLock);
 705 #endif /* UBIK_PAUSE */
 706                 }
 707             }
 708
 709             for (ts = ubik_servers; ts; ts = ts->next) {
 710                 inAddr.s_addr = ts->addr[0];
 711                 if (!ts->up) {
 712                     ubik_dprint("recovery cannot send version to %s\n",
 713                                 afs_inet_ntoa(inAddr.s_addr));
 714                     dbok = 0;
 715                     continue;
 716                 }
 717                 ubik_dprint("recovery sending version to %s\n",
 718                             afs_inet_ntoa(inAddr.s_addr));
 719                 if (vcmp(ts->version, ubik_dbase->version) != 0) {
 720                     ubik_dprint("recovery stating local database\n");
 721
 722                     /* Rx code to do the Bulk Store */
 723                     code = (*ubik_dbase->stat) (ubik_dbase, 0, &ubikstat);
 724                     if (!code) {
 725                         length = ubikstat.size;
 726                         file = offset = 0;
 727                         rxcall = rx_NewCall(ts->disk_rxcid);
 728                         code =
 729                             StartDISK_SendFile(rxcall, file, length,
 730                                                &ubik_dbase->version);
 731                         if (code) {
 732                             ubik_dprint("StartDiskSendFile failed=%d\n",
 733                                         code);
 734                             goto StoreEndCall;
 735                         }
 736                         while (length > 0) {
 737                             tlen =
 738                                 (length >
 739                                  sizeof(tbuffer) ? sizeof(tbuffer) : length);
 740                             nbytes =
 741                                 (*ubik_dbase->read) (ubik_dbase, file,
 742                                                      tbuffer, offset, tlen);
 743                             if (nbytes != tlen) {
 744                                 ubik_dprint("Local disk read error=%d\n",
 745                                             code = UIOERROR);
 746                                 goto StoreEndCall;
 747                             }
 748                             nbytes = rx_Write(rxcall, tbuffer, tlen);
 749                             if (nbytes != tlen) {
 750                                 ubik_dprint("Rx-write bulk error=%d\n", code =
 751                                             BULK_ERROR);
 752                                 goto StoreEndCall;
 753                             }
 754                             offset += tlen;
 755                             length -= tlen;
 756                         }
 757                         code = EndDISK_SendFile(rxcall);
 758                       StoreEndCall:
 759                         code = rx_EndCall(rxcall, code);
 760                     }
 761                     if (code == 0) {
 762                         /* we set a new file, process its header */
 763                         ts->version = ubik_dbase->version;
 764                         ts->currentDB = 1;
 765                     } else
 766                         dbok = 0;
 767                 } else {
 768                     /* mark file up to date */
 769                     ts->currentDB = 1;
 770                 }
 771             }
 772 #if defined(UBIK_PAUSE)
 773             DBRELE(ubik_dbase);
 774 #else
 775             ReleaseWriteLock(&ubik_dbase->versionLock);
 776 #endif /* UBIK_PAUSE */
 777             if (dbok)
 778                 urecovery_state |= UBIK_RECSENTDB;
 779         }
 780     }
 781 }
 782
 783 /*
 784 ** send a Probe to all the network address of this server
 785 ** Return 0  if success, else return 1
 786 */
 787 int
 788 DoProbe(struct ubik_server *server)
 789 {
 790     struct rx_connection *conns[UBIK_MAX_INTERFACE_ADDR];
 791     struct rx_connection *connSuccess = 0;
 792     int i, j;
 793     afs_uint32 addr;
 794     char buffer[32];
 795     extern afs_int32 ubikSecIndex;
 796     extern struct rx_securityClass *ubikSecClass;
 797
 798     for (i = 0; (addr = server->addr[i]) && (i < UBIK_MAX_INTERFACE_ADDR);
 799          i++) {
 800         conns[i] =
 801             rx_NewConnection(addr, ubik_callPortal, DISK_SERVICE_ID,
 802                              ubikSecClass, ubikSecIndex);
 803
 804         /* user requirement to use only the primary interface */
 805         if (ubikPrimaryAddrOnly) {
 806             i = 1;
 807             break;
 808         }
 809     }
 810     assert(i);                  /* at least one interface address for this server */
 811
 812     multi_Rx(conns, i) {
 813         multi_DISK_Probe();
 814         if (!multi_error) {     /* first success */
 815             addr = server->addr[multi_i];       /* successful interface addr */
 816
 817             if (server->disk_rxcid)     /* destroy existing conn */
 818                 rx_DestroyConnection(server->disk_rxcid);
 819             if (server->vote_rxcid)
 820                 rx_DestroyConnection(server->vote_rxcid);
 821
 822             /* make new connections */
 823             server->disk_rxcid = conns[multi_i];
 824             server->vote_rxcid = rx_NewConnection(addr, ubik_callPortal, VOTE_SERVICE_ID, ubikSecClass, ubikSecIndex);  /* for vote reqs */
 825
 826             connSuccess = conns[multi_i];
 827             strcpy(buffer, (char *)afs_inet_ntoa(server->addr[0]));
 828             ubik_print
 829                 ("ubik:server %s is back up: will be contacted through %s\n",
 830                  buffer, afs_inet_ntoa(addr));
 831
 832             multi_Abort;
 833         }
 834     } multi_End_Ignore;
 835
 836     /* Destroy all connections except the one on which we succeeded */
 837     for (j = 0; j < i; j++)
 838         if (conns[j] != connSuccess)
 839             rx_DestroyConnection(conns[j]);
 840
 841     if (!connSuccess)
 842         ubik_dprint("ubik:server %s still down\n",
 843                     afs_inet_ntoa(server->addr[0]));
 844
 845     if (connSuccess)
 846         return 0;               /* success */
 847     else
 848         return 1;               /* failure */
 849 }