ubik sync client error recovery
author Derrick Brashear <shadow@dementia.org>
Tue, 16 Feb 2010 06:13:57 +0000 (01:13 -0500)
committer Derrick Brashear <shadow@dementia.org>
Sat, 27 Nov 2010 17:05:55 +0000 (09:05 -0800)
give ubik server "client mode" error recovery for token errors

Change-Id: Ibd6cad6ecf067da7da5724491756576d1ffedb03
Reviewed-on: http://gerrit.openafs.org/3150
Reviewed-by: Derrick Brashear <shadow@dementia.org>
Tested-by: BuildBot <buildbot@rampaginggeek.com>

src/auth/cellconfig.c
src/auth/cellconfig.p.h
src/ubik/Makefile.in
src/ubik/beacon.c
src/ubik/recovery.c
src/ubik/ubik.c
src/ubik/ubik.p.h

index 25c0a84..5830f91 100644 (file)
@@ -1500,6 +1500,39 @@ afsconf_GetLocalCell(struct afsconf_dir *adir, char *aname,
 }
 
 int
+afsconf_UpToDate(struct afsconf_dir *adir)
+{   /* Report whether the key file under adir->name is unchanged since
+     * adir->timeRead.  Returns 1 when the file's st_mtime is not later
+     * than adir->timeRead (or, under AFS_NT40_ENV, when adir->name is a
+     * client config directory), 0 when the file is newer, and the
+     * non-zero stat() result when the file cannot be stat'ed.
+     * NOTE(review): a caller that tests the result as a boolean will
+     * read a stat() failure as "up to date" -- confirm that is intended.
+     */
+    char tbuffer[256];
+#ifdef AFS_NT40_ENV
+    char *p;   /* NOTE(review): not referenced in this function as shown */
+#endif
+    struct stat tstat;
+    afs_int32 code = 0; /* default to not up to date */
+    LOCK_GLOBAL_MUTEX;
+#ifdef AFS_NT40_ENV
+    /* NT client config dir has no KeyFile; don't risk attempting open
+     * because there might be a random file of this name if dir is shared.
+     */
+    if (IsClientConfigDirectory(adir->name)) {
+       /* Not a server, nothing to reread */
+       code = 1;
+    } else {
+#endif
+       strcompose(tbuffer, 256, adir->name, "/", AFSDIR_KEY_FILE, NULL);
+
+       /* did file change?  stat() leaves code at 0 on success; it is
+        * promoted to 1 only when the file's st_mtime is no later than
+        * adir->timeRead, so 0 survives only for a newer file */
+       code = stat(tbuffer, &tstat);
+       if ((code == 0) && (tstat.st_mtime <= adir->timeRead)) {
+           code = 1;
+       }
+#ifdef AFS_NT40_ENV
+    }
+#endif
+    UNLOCK_GLOBAL_MUTEX;
+    return code;
+}
+
+int
 afsconf_Close(struct afsconf_dir *adir)
 {
     LOCK_GLOBAL_MUTEX;
index 056f7fa..65b0244 100644 (file)
@@ -111,6 +111,7 @@ extern int afsconf_GetCellInfo(struct afsconf_dir *adir, char *acellName,
 extern int afsconf_GetLocalCell(struct afsconf_dir *adir,
                                char *aname, afs_int32 alen);
 extern int afsconf_Close(struct afsconf_dir *adir);
+extern int afsconf_UpToDate(struct afsconf_dir *adir);
 extern int afsconf_IntGetKeys(struct afsconf_dir *adir);
 extern int afsconf_GetKeys(struct afsconf_dir *adir,
                           struct afsconf_keys *astr);
index 6b8fe74..e23d85f 100644 (file)
@@ -22,7 +22,7 @@ INCLS=${TOP_INCDIR}/lwp.h ${TOP_INCDIR}/lock.h \
        ${TOP_INCDIR}/rx/rx.h ${TOP_INCDIR}/rx/xdr.h \
        ${TOP_INCDIR}/lock.h ubik.h ubik_int.h
 
-LIBS=${TOP_LIBDIR}/librx.a ${TOP_LIBDIR}/liblwp.a \
+LIBS=${TOP_LIBDIR}/libauth.a ${TOP_LIBDIR}/librx.a ${TOP_LIBDIR}/liblwp.a \
      ${TOP_LIBDIR}/libcom_err.a ${TOP_LIBDIR}/libcmd.a \
      ${TOP_LIBDIR}/util.a ${TOP_LIBDIR}/libsys.a ${XLIBS}
 
index e803d1f..2f7d66e 100644 (file)
@@ -31,6 +31,7 @@
 #include <lock.h>
 #include <rx/xdr.h>
 #include <rx/rx.h>
+#include <rx/rxkad.h>
 #include <rx/rx_multi.h>
 #include <afs/cellconfig.h>
 #ifndef AFS_NT40_ENV
@@ -172,6 +173,52 @@ ubeacon_InitServerList(afs_uint32 ame, afs_uint32 aservers[])
     return code;
 }
 
+void
+ubeacon_InitSecurityClass(void)
+{
+    int i;
+    /* get the security index to use, if we can: when ubik_CRXSecurityProc
+     * is set, call it with ubik_CRXSecurityRock so it can fill in
+     * ubikSecClass and ubikSecIndex; when it is not set, behave as if
+     * the call had failed */
+    if (ubik_CRXSecurityProc) {
+       i = (*ubik_CRXSecurityProc) (ubik_CRXSecurityRock, &ubikSecClass,
+                                    &ubikSecIndex);
+    } else
+       i = 1;
+    if (i) {
+       /* don't have sec module yet: a non-zero result (or no callback)
+        * falls back to index 0 with the object returned by
+        * rxnull_NewClientSecurityObject() */
+       ubikSecIndex = 0;
+       ubikSecClass = rxnull_NewClientSecurityObject();
+    }
+}
+
+void
+ubeacon_ReinitServer(struct ubik_server *ts)
+{   /* Rebuild ts's disk and vote connections, but only when
+     * afsconf_UpToDate() returns 0 for ubik_CRXSecurityRock; otherwise
+     * ts is left untouched.
+     * NOTE(review): the rock is passed without a NULL check, and
+     * afsconf_UpToDate() dereferences its argument, while
+     * ubeacon_InitSecurityClass() treats ubik_CRXSecurityProc as
+     * optional -- confirm the rock is always a valid
+     * struct afsconf_dir * when this is reached.
+     */
+    if (!afsconf_UpToDate(ubik_CRXSecurityRock)) {
+       struct rx_connection *disk_rxcid;
+       struct rx_connection *vote_rxcid;
+       struct rx_connection *tmp;
+       ubeacon_InitSecurityClass();    /* refresh ubikSecClass/ubikSecIndex before reconnecting */
+       disk_rxcid =
+           rx_NewConnection(rx_HostOf(rx_PeerOf(ts->disk_rxcid)),
+                            ubik_callPortal, DISK_SERVICE_ID,
+                            ubikSecClass, ubikSecIndex);
+       if (disk_rxcid) {       /* swap only when a new connection was returned; else keep the old one */
+           tmp = ts->disk_rxcid;
+           ts->disk_rxcid = disk_rxcid;
+           rx_PutConnection(tmp);      /* presumably drops our reference to the replaced connection */
+       }
+       vote_rxcid =
+           rx_NewConnection(rx_HostOf(rx_PeerOf(ts->vote_rxcid)),
+                            ubik_callPortal, VOTE_SERVICE_ID,
+                            ubikSecClass, ubikSecIndex);
+       if (vote_rxcid) {       /* same swap as above, for the vote connection */
+           tmp = ts->vote_rxcid;
+           ts->vote_rxcid = vote_rxcid;
+           rx_PutConnection(tmp);
+       }
+    }
+}
+
 /*!
  * \brief setup server list
  *
@@ -212,17 +259,8 @@ ubeacon_InitServerListCommon(afs_uint32 ame, struct afsconf_cell *info,
     if ((code = verifyInterfaceAddress(&ame, info, aservers)))
        return code;
 
-    /* get the security index to use, if we can */
-    if (ubik_CRXSecurityProc) {
-       i = (*ubik_CRXSecurityProc) (ubik_CRXSecurityRock, &ubikSecClass,
-                                    &ubikSecIndex);
-    } else
-       i = 1;
-    if (i) {
-       /* don't have sec module yet */
-       ubikSecIndex = 0;
-       ubikSecClass = rxnull_NewClientSecurityObject();
-    }
+    ubeacon_InitSecurityClass();
+
     magicHost = ntohl(ame);    /* do comparisons in host order */
     magicServer = (struct ubik_server *)0;
 
@@ -433,18 +471,26 @@ ubeacon_Interact(void *dummy)
                 * the vote was computed, *not* the time the vote expires.  We compute
                 * the latter down below if we got enough votes to go with */
                if (code > 0) {
-                   ts->lastVoteTime = code;
-                   if (code < oldestYesVote)
-                       oldestYesVote = code;
-                   ts->lastVote = 1;
-                   if (!ts->isClone)
-                       yesVotes += 2;
-                   if (ts->magic)
-                       yesVotes++;     /* the extra epsilon */
-                   ts->up = 1; /* server is up (not really necessary: recovery does this for real) */
-                   ts->beaconSinceDown = 1;
-                   ubik_dprint("yes vote from host %s\n",
-                               afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   if ((code & ~0xff) == ERROR_TABLE_BASE_RXK) {
+                       ubik_dprint("token error %d from host %s\n",
+                                   code, afs_inet_ntoa_r(ts->addr[0], hoststr));
+                       ts->up = 0;
+                       ts->beaconSinceDown = 0;
+                       urecovery_LostServer(ts);
+                   } else {
+                       ts->lastVoteTime = code;
+                       if (code < oldestYesVote)
+                           oldestYesVote = code;
+                       ts->lastVote = 1;
+                       if (!ts->isClone)
+                           yesVotes += 2;
+                       if (ts->magic)
+                           yesVotes++; /* the extra epsilon */
+                       ts->up = 1;     /* server is up (not really necessary: recovery does this for real) */
+                       ts->beaconSinceDown = 1;
+                       ubik_dprint("yes vote from host %s\n",
+                                   afs_inet_ntoa_r(ts->addr[0], hoststr));
+                   }
                } else if (code == 0) {
                    ts->lastVoteTime = temp;
                    ts->lastVote = 0;
@@ -454,7 +500,7 @@ ubeacon_Interact(void *dummy)
                } else if (code < 0) {
                    ts->up = 0;
                    ts->beaconSinceDown = 0;
-                   urecovery_LostServer();
+                   urecovery_LostServer(ts);
                    ubik_dprint("time out from %s\n",
                                afs_inet_ntoa_r(ts->addr[0], hoststr));
                }
index e47beb5..cc5d951 100644 (file)
@@ -31,6 +31,7 @@
 #include <rx/xdr.h>
 #include <rx/rx.h>
 #include <afs/afsutil.h>
+#include <afs/cellconfig.h>
 
 #define UBIK_INTERNALS
 #include "ubik.h"
@@ -84,13 +85,15 @@ urecovery_ResetState(void)
  * \brief sync site
  *
  * routine called when a non-sync site server goes down; restarts recovery
- * process to send missing server the new db when it comes back up.
+ * process to send missing server the new db when it comes back up for
+ * non-sync site servers.
  *
  * \note This routine should not do anything with variables used by non-sync site servers.
  */
 int
-urecovery_LostServer(void)
+urecovery_LostServer(struct ubik_server *ts)
 {
+    ubeacon_ReinitServer(ts);
 #if !defined(AFS_PTHREAD_ENV)
     /*  No corresponding LWP_WaitProcess found anywhere for this -- klm */
     LWP_NoYieldSignal(&urecovery_state);
index 082b437..bd25f48 100644 (file)
@@ -163,7 +163,7 @@ ContactQuorum_NoArguments(afs_int32 (*proc)(struct rx_connection *, ubik_tid *),
            ts->up = 0;         /* mark as down now; beacons will no longer be sent */
            ts->currentDB = 0;
            ts->beaconSinceDown = 0;
-           urecovery_LostServer();     /* tell recovery to try to resend dbase later */
+           urecovery_LostServer(ts);   /* tell recovery to try to resend dbase later */
        } else {                /* success */
            if (!ts->isClone)
                okcalls++;      /* count up how many worked */
@@ -209,7 +209,7 @@ ContactQuorum_DISK_Lock(struct ubik_trans *atrans, int aflags,afs_int32 file,
            ts->up = 0;         /* mark as down now; beacons will no longer be sent */
            ts->currentDB = 0;
            ts->beaconSinceDown = 0;
-           urecovery_LostServer();     /* tell recovery to try to resend dbase later */
+           urecovery_LostServer(ts);   /* tell recovery to try to resend dbase later */
        } else {                /* success */
            if (!ts->isClone)
                okcalls++;      /* count up how many worked */
@@ -255,7 +255,7 @@ ContactQuorum_DISK_Write(struct ubik_trans *atrans, int aflags,
            ts->up = 0;         /* mark as down now; beacons will no longer be sent */
            ts->currentDB = 0;
            ts->beaconSinceDown = 0;
-           urecovery_LostServer();     /* tell recovery to try to resend dbase later */
+           urecovery_LostServer(ts);   /* tell recovery to try to resend dbase later */
        } else {                /* success */
            if (!ts->isClone)
                okcalls++;      /* count up how many worked */
@@ -301,7 +301,7 @@ ContactQuorum_DISK_Truncate(struct ubik_trans *atrans, int aflags,
            ts->up = 0;         /* mark as down now; beacons will no longer be sent */
            ts->currentDB = 0;
            ts->beaconSinceDown = 0;
-           urecovery_LostServer();     /* tell recovery to try to resend dbase later */
+           urecovery_LostServer(ts);   /* tell recovery to try to resend dbase later */
        } else {                /* success */
            if (!ts->isClone)
                okcalls++;      /* count up how many worked */
@@ -382,7 +382,7 @@ ContactQuorum_DISK_WriteV(struct ubik_trans *atrans, int aflags,
            ts->up = 0;         /* mark as down now; beacons will no longer be sent */
            ts->currentDB = 0;
            ts->beaconSinceDown = 0;
-           urecovery_LostServer();     /* tell recovery to try to resend dbase later */
+           urecovery_LostServer(ts);   /* tell recovery to try to resend dbase later */
        } else {                /* success */
            if (!ts->isClone)
                okcalls++;      /* count up how many worked */
@@ -429,7 +429,7 @@ ContactQuorum_DISK_SetVersion(struct ubik_trans *atrans, int aflags,
            ts->up = 0;         /* mark as down now; beacons will no longer be sent */
            ts->currentDB = 0;
            ts->beaconSinceDown = 0;
-           urecovery_LostServer();     /* tell recovery to try to resend dbase later */
+           urecovery_LostServer(ts);   /* tell recovery to try to resend dbase later */
        } else {                /* success */
            if (!ts->isClone)
                okcalls++;      /* count up how many worked */
index ebe209a..0f4ebb7 100644 (file)
@@ -368,7 +368,7 @@ extern void uphys_invalidate(struct ubik_dbase *adbase,
 
 /*! \name recovery.c */
 extern int urecovery_ResetState(void);
-extern int urecovery_LostServer(void);
+extern int urecovery_LostServer(struct ubik_server *server);
 extern int urecovery_AllBetter(struct ubik_dbase *adbase,
                               int areadAny);
 extern int urecovery_AbortAll(struct ubik_dbase *adbase);
@@ -417,6 +417,8 @@ extern afs_uint32 ubikGetPrimaryInterfaceAddr(afs_uint32 addr);
 
 /*! \name beacon.c */
 struct afsconf_cell;
+extern void ubeacon_InitSecurityClass(void);
+extern void ubeacon_ReinitServer(struct ubik_server *ts);
 extern void ubeacon_Debug(struct ubik_debug *aparm);
 extern int ubeacon_AmSyncSite(void);
 extern int ubeacon_InitServerListByInfo(afs_uint32 ame,