#endif
#define BNODE_LWP_STACKSIZE (16 * 1024)
+#define BNODE_ERROR_COUNT_MAX 16 /* maximum number of retries */
int bnode_waiting = 0;
static PROCESS bproc_pid; /* pid of waker-upper */
}
int
+bnode_ResetErrorCount(struct bnode *abnode) /* clear abnode's error-stop retry state */
+{
+ abnode->errorStopCount = 0; /* forget the recent error stops */
+ abnode->errorStopDelay = 0; /* no delay pending before the next start attempt */
+ return 0; /* unconditionally 0 */
+}
+
+int
bnode_SetStat(struct bnode *abnode, int agoal)
{
abnode->goal = agoal;
tb = tp->bnode;
bnode_Hold(tb);
- /* count restarts in last 10 seconds */
+ /* count restarts in last 30 seconds */
if (temp > tb->rsTime + 30) {
- /* it's been 10 seconds we've been counting */
+ /* it's been 30 seconds we've been counting */
tb->rsTime = temp;
tb->rsCount = 0;
}
+
if (WIFSIGNALED(status) == 0) {
/* exited, not signalled */
tp->lastExit = WEXITSTATUS(status);
tb->notifier);
hdl_notifier(tp);
}
- BOP_PROCEXIT(tb, tp);
- bnode_Check(tb);
- if (tb->rsCount++ > 10) {
- /* 10 in 10 seconds */
+ if (tb->goal && tb->rsCount++ > 10) {
+ /* 10 in 30 seconds */
+ if (tb->errorStopCount >= BNODE_ERROR_COUNT_MAX) {
+ tb->errorStopDelay = 0; /* max reached, give up. */
+ } else {
+ tb->errorStopCount++;
+ if (!tb->errorStopDelay) {
+ tb->errorStopDelay = 1;
+ } else {
+ tb->errorStopDelay *= 2;
+ }
+ }
tb->flags |= BNODE_ERRORSTOP;
bnode_SetGoal(tb, BSTAT_SHUTDOWN);
bozo_Log
("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n",
tb->name);
}
+ BOP_PROCEXIT(tb, tp);
+ bnode_Check(tb);
bnode_Release(tb); /* bnode delete can happen here */
DeleteProc(tp);
} else
free(tp);
return errno;
}
+ bozo_Log("%s started pid %ld: %s\n", abnode->name, cpid, aexecString);
bnode_FreeTokens(tlist);
allProcs = tp;
short flags; /* random flags */
char goal; /* 1=running or 0=not running */
char fileGoal; /* same, but to be stored in file */
+ afs_int32 errorStopCount; /* number of recent error stops */
+ afs_int32 errorStopDelay; /* seconds to wait before retrying start */
};
struct bnode_proc {
extern int bnode_SetStat(struct bnode *abnode, int agoal);
extern int bnode_CreatePidFile(struct bnode *abnode, struct bnode_proc *aproc, char *name);
extern int bnode_DestroyPidFile(struct bnode *abnode, struct bnode_proc *aproc);
+extern int bnode_ResetErrorCount(struct bnode *abnode);
return 0; /* don't do these guys */
bnode_Hold(abnode);
+ bnode_ResetErrorCount(abnode);
bnode_SetStat(abnode, BSTAT_NORMAL);
bnode_Release(abnode);
return 0;
bnode_Hold(tb);
bnode_SetStat(tb, BSTAT_SHUTDOWN);
code = bnode_WaitStatus(tb, BSTAT_SHUTDOWN); /* this can fail */
+ bnode_ResetErrorCount(tb);
bnode_SetStat(tb, BSTAT_NORMAL);
bnode_Release(tb);
goto fail;
}
bnode_Hold(tb);
+ bnode_ResetErrorCount(tb);
code = bnode_SetStat(tb, astatus);
bnode_Release(tb);
static int ez_procstarted(struct bnode *bnode, struct bnode_proc *proc);
#define SDTIME 60 /* time in seconds given to a process to evaporate */
+#define ERROR_RESET_TIME 60 /* time in seconds to wait before resetting error count state */
struct bnode_ops ezbnode_ops = {
ez_create,
return (struct bnode *)te;
}
-/* called to SIGKILL a process if it doesn't terminate normally */
+/* Timeout handler for an ez bnode.  Called to SIGKILL a process if it
+ * doesn't terminate normally, or to retry start after an error stop.
+ * In any other state the timer is cleared and the error-stop state of
+ * the bnode is reset.  Always returns 0. */
static int
ez_timeout(struct bnode *bn)
{
struct ezbnode *abnode = (struct ezbnode *)bn;
- if (!abnode->waitingForShutdown)
- return 0; /* spurious */
- /* send kill and turn off timer */
- bnode_StopProc(abnode->proc, SIGKILL);
- abnode->killSent = 1;
- bnode_SetTimeout((struct bnode *)abnode, 0);
+ if (abnode->waitingForShutdown) {
+ /* send kill and turn off timer */
+ bnode_StopProc(abnode->proc, SIGKILL);
+ abnode->killSent = 1; /* record that SIGKILL has been sent */
+ bnode_SetTimeout((struct bnode *)abnode, 0);
+ } else if (!abnode->running && abnode->b.flags & BNODE_ERRORSTOP) { /* `&` binds tighter than `&&`: not running AND the BNODE_ERRORSTOP bit is set */
+ /* was stopped for too many errors, retrying */
+ /* re-arm the timer for ERROR_RESET_TIME seconds so that, after
+  * running for a bit, the next timeout resets the error count */
+ bnode_SetTimeout(bn, ERROR_RESET_TIME);
+ bnode_SetStat(bn, BSTAT_NORMAL); /* request the normal (running) state again */
+ } else {
+ bnode_SetTimeout(bn, 0); /* one shot timer */
+ bnode_ResetErrorCount(bn); /* neither shutting down nor waiting to retry: clear error-stop count and delay */
+ }
return 0;
}
temp = BSTAT_SHUTTINGDOWN;
else if (abnode->running)
temp = BSTAT_NORMAL;
+ else if (abnode->b.flags & BNODE_ERRORSTOP)
+ temp = BSTAT_STARTINGUP;
else
temp = BSTAT_SHUTDOWN;
*astatus = temp;
struct ezbnode *abnode = (struct ezbnode *)bn;
/* process has exited */
- afs_int32 code;
+ afs_int32 code = 0;
if (DoPidFiles) {
bozo_DeletePidFile(bn->name, NULL);
bnode_SetTimeout((struct bnode *) abnode, 0); /* clear timer */
if (abnode->b.goal)
code = ez_setstat((struct bnode *) abnode, BSTAT_NORMAL);
- else
- code = 0;
+ else if (abnode->b.flags & BNODE_ERRORSTOP && abnode->b.errorStopDelay) {
+ bozo_Log("%s will retry start in %d seconds\n", abnode->b.name,
+ abnode->b.errorStopDelay);
+ bnode_SetTimeout(bn, abnode->b.errorStopDelay);
+ }
return code;
}
goto done;
}
bnode_SetTimeout(fsbnode2bnode(te), POLLTIME);
- /* ask for timeout activations every 10 seconds */
+ /* ask for timeout activations every 20 seconds */
RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
SetNeedsClock(te); /* compute needsClock field */
goto done;
}
bnode_SetTimeout(fsbnode2bnode(te), POLLTIME);
- /* ask for timeout activations every 10 seconds */
+ /* ask for timeout activations every 20 seconds */
RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
SetNeedsClock(te); /* compute needsClock field */
SDTIME);
}
}
+
+ if ((abnode->b.flags & BNODE_ERRORSTOP) && !abnode->salRunning
+ && !abnode->volRunning && !abnode->fileRunning && !abnode->scanRunning
+ && !abnode->salsrvRunning) {
+ bnode_SetStat(bn, BSTAT_NORMAL);
+ }
+ else {
+ bnode_ResetErrorCount(bn);
+ }
+
SetNeedsClock(abnode);
return 0;
}
static void
SetNeedsClock(struct fsbnode *ab)
{
+ afs_int32 timeout = POLLTIME; /* timer period to request; POLLTIME unless waiting to retry after an error stop */
+ /* Recompute ab->needsClock from the bnode's goal, its error-stop state
+  * and which of its processes are running, then set or clear the bnode
+  * timeout to match. */
if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
&& (!ab->scancmd || ab->scanRunning)
- && (!ab->salsrvcmd || ab->salsrvRunning))
- ab->needsClock = 0; /* running normally */
- else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
- && !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning)
- ab->needsClock = 0; /* halted normally */
- else
+ && (!ab->salsrvcmd || ab->salsrvRunning)) {
+ if (ab->b.errorStopCount) {
+ /* reset error count after running for a bit; the clock is kept
+  * on for that (NOTE(review): presumably the timeout handler
+  * performs the reset - confirm) */
+ ab->needsClock = 1;
+ } else {
+ ab->needsClock = 0; /* running normally */
+ }
+ } else if ((ab->b.goal == 0) && !ab->fileRunning && !ab->volRunning
+ && !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning) {
+ if (ab->b.flags & BNODE_ERRORSTOP && ab->b.errorStopDelay) { /* `&` binds tighter than `&&`: error-stopped AND a retry delay is set */
+ bozo_Log("%s will retry start in %d seconds\n", ab->b.name,
+ ab->b.errorStopDelay);
+ ab->needsClock = 1; /* halted for errors, retry later */
+ timeout = ab->b.errorStopDelay; /* wake up after the retry delay instead of POLLTIME */
+ } else {
+ ab->needsClock = 0; /* halted normally */
+ }
+ } else
ab->needsClock = 1; /* other */
- if (ab->needsClock && !bnode_PendingTimeout(fsbnode2bnode(ab)))
- bnode_SetTimeout(fsbnode2bnode(ab), POLLTIME);
+ /* (re)arm the timer when none is pending or its period is not the one wanted */
+ if (ab->needsClock && (!bnode_PendingTimeout(fsbnode2bnode(ab))
+ || ab->b.period != timeout))
+ bnode_SetTimeout(fsbnode2bnode(ab), timeout);
if (!ab->needsClock)
bnode_SetTimeout(fsbnode2bnode(ab), 0);
}