2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
13 #include <afs/procmgmt.h>
20 #include <afs/audit.h>
21 #include <afs/afsutil.h>
22 #include <afs/fileutil.h>
25 #include "bosprototypes.h"
/* Tests bit 0200 of a wait status; used below when logging whether a
 * signalled child dumped core. */
28 #define WCOREDUMP(x) ((x) & 0200)
/* Stack size passed to LWP_CreateProcess for the "bnode-manager" LWP. */
31 #define BNODE_LWP_STACKSIZE (16 * 1024)
32 #define BNODE_ERROR_COUNT_MAX 16 /* maximum number of retries */
/* Set to (options | 0x800000) by the bnode LWP just before it calls waitpid. */
34 int bnode_waiting = 0;
35 static PROCESS bproc_pid; /* pid of waker-upper */
36 static struct bnode *allBnodes = 0; /* list of all bnodes */
37 static struct bnode_proc *allProcs = 0; /* list of all processes for which we're waiting */
38 static struct bnode_type *allTypes = 0; /* list of registered type handlers */
/* Module counters; the members are not visible in this excerpt, but the code
 * below zeroes the whole struct at initialization and increments weirdPids. */
40 static struct bnode_stats {
/* Defined elsewhere; DoCore is used below as a directory prefix for core files. */
44 extern const char *DoCore;
45 extern const char *DoPidFiles;
47 extern char **environ; /* env structure */
/* Forward declaration; the definition appears later in this file. */
50 int hdl_notifier(struct bnode_proc *tp);
/*
 * Records the failing process's core name on a bnode: any previously stored
 * lastErrorName is freed and reset to NULL, then a strdup() copy of
 * ap->coreName is stored, so the bnode owns the new string.
 * NOTE(review): the statement that assigns tbnodep is not visible in this
 * excerpt -- presumably ap->bnode; confirm.
 */
52 /* Remember the name of the process, if any, that failed last */
54 RememberProcName(struct bnode_proc *ap)
56 struct bnode *tbnodep;
59 if (tbnodep->lastErrorName) {
60 free(tbnodep->lastErrorName);
61 tbnodep->lastErrorName = NULL;
64 tbnodep->lastErrorName = strdup(ap->coreName);
/*
 * Assembles a core-file path in abuffer with strcpy/strcat. The path starts
 * from either DoCore (followed by AFSDIR_CORE_FILE) or
 * AFSDIR_SERVER_CORELOG_FILEPATH, and acoreName and/or abnode->name are
 * appended; the conditions selecting each piece are not visible here.
 * NOTE(review): no buffer length is passed in, so the caller's abuffer must
 * already be large enough for the assembled path.
 */
67 /* utility for use by BOP_HASCORE functions to determine where a core file might
71 bnode_CoreName(struct bnode *abnode, char *acoreName, char *abuffer)
74 strcpy(abuffer, DoCore);
76 strcat(abuffer, AFSDIR_CORE_FILE);
78 strcpy(abuffer, AFSDIR_SERVER_CORELOG_FILEPATH);
80 strcat(abuffer, acoreName);
83 strcat(abuffer, abnode->name);
/*
 * Locates the core file left by process aproc and renames it, via
 * renamefile(), to the per-bnode name produced by bnode_CoreName().
 * The source name is either the default core path assembled in tbuffer or,
 * on the directory-scan path, a "core.<pid>" entry in AFSDIR_LOGS_DIR whose
 * numeric suffix equals aproc->pid. With BOZO_SAVE_CORES defined, a
 * timestamp suffix is appended to the destination name first.
 * NOTE(review): the conditions choosing between these paths, and the cleanup
 * of corefile/logdir, are not visible in this excerpt.
 */
87 /* save core file, if any */
89 SaveCore(struct bnode *abnode, struct bnode_proc
95 char *corefile = NULL;
96 #ifdef BOZO_SAVE_CORES
98 struct tm *TimeFields;
102 /* Linux always appends the PID to core dumps from threaded processes, so
103 * we have to scan the directory to find core files under another name. */
105 strcpy(tbuffer, DoCore);
106 strcat(tbuffer, "/");
107 strcat(tbuffer, AFSDIR_CORE_FILE);
109 code = stat(AFSDIR_SERVER_CORELOG_FILEPATH, &tstat);
115 const char *coredir = AFSDIR_LOGS_DIR;
120 logdir = opendir(coredir);
123 while ((file = readdir(logdir)) != NULL) {
124 if (strncmp(file->d_name, "core.", 5) != 0)
/* the text following the "core." prefix is parsed as the dumping pid */
126 pid = atol(file->d_name + 5);
127 if (pid == aproc->pid) {
/* +2 leaves room for the separator and the terminating NUL written by snprintf */
128 length = strlen(coredir) + strlen(file->d_name) + 2;
129 corefile = malloc(length);
130 if (corefile == NULL) {
134 snprintf(corefile, length, "%s/%s", coredir, file->d_name);
141 corefile = strdup(tbuffer);
/* tbuffer is reused: from here on it holds the destination name */
146 bnode_CoreName(abnode, aproc->coreName, tbuffer);
147 #ifdef BOZO_SAVE_CORES
148 FT_GetTimeOfDay(&Start, 0);
149 TimeFields = localtime(&Start.tv_sec);
/* suffix is year, month, day, hour, minute, second in local time */
150 sprintf(FileName, "%s.%d%02d%02d%02d%02d%02d", tbuffer,
151 TimeFields->tm_year + 1900, TimeFields->tm_mon + 1, TimeFields->tm_mday,
152 TimeFields->tm_hour, TimeFields->tm_min, TimeFields->tm_sec);
153 strcpy(tbuffer, FileName);
155 code = renamefile(corefile, tbuffer);
/* Thin wrapper: returns the result of BOP_GETSTRING(abnode, abuffer, alen),
 * where BOP_GETSTRING is defined outside this file. */
160 bnode_GetString(struct bnode *abnode, char *abuffer,
163 return BOP_GETSTRING(abnode, abuffer, alen);
/* Thin wrapper: returns the result of BOP_GETPARM for parameter index aindex,
 * passing the caller's buffer abuffer and its length alen straight through. */
167 bnode_GetParm(struct bnode *abnode, afs_int32 aindex,
168 char *abuffer, afs_int32 alen)
170 return BOP_GETPARM(abnode, aindex, abuffer, alen);
/* Thin wrapper: returns the result of BOP_GETSTAT(abnode, astatus); astatus
 * is the out-parameter handed to the bnode's status operation. */
174 bnode_GetStat(struct bnode *abnode, afs_int32 * astatus)
176 return BOP_GETSTAT(abnode, astatus);
/* Thin wrapper: returns the result of BOP_RESTARTP(abnode). */
180 bnode_RestartP(struct bnode *abnode)
182 return BOP_RESTARTP(abnode);
/*
 * If the bnode is flagged BNODE_WAIT, clears the flag and signals the bnode
 * address with LWP_NoYieldSignal(). bnode_WaitStatus() sets the same flag and
 * blocks in LWP_WaitProcess() on the same address.
 */
186 bnode_Check(struct bnode *abnode)
188 if (abnode->flags & BNODE_WAIT) {
189 abnode->flags &= ~BNODE_WAIT;
190 LWP_NoYieldSignal(abnode);
/* Thin wrapper: returns the result of BOP_HASCORE(abnode). */
195 /* tell if an instance has a core file */
197 bnode_HasCore(struct bnode *abnode)
199 return BOP_HASCORE(abnode);
202 /* wait for all bnodes to stabilize */
211 for (tb = allBnodes; tb; tb = tb->next) {
213 code = BOP_GETSTAT(tb, &stat);
218 if (stat != tb->goal) {
219 tb->flags |= BNODE_WAIT;
/*
 * Waits for abnode to report status astatus. Each pass reads the status with
 * BOP_GETSTAT; when it equals astatus the bnode is released and the wait
 * ends. If astatus is no longer the bnode's goal, the bnode is released and
 * -1 is returned. Otherwise BNODE_WAIT is set and the caller blocks in
 * LWP_WaitProcess(abnode) until something signals that address.
 * NOTE(review): the matching hold, the loop construct and the handling of a
 * BOP_GETSTAT error are not visible in this excerpt.
 */
229 /* wait until bnode status is correct */
231 bnode_WaitStatus(struct bnode *abnode, int astatus)
239 code = BOP_GETSTAT(abnode, &stat);
243 /* otherwise, check if we're done */
244 if (stat == astatus) {
245 bnode_Release(abnode);
248 if (astatus != abnode->goal) {
249 bnode_Release(abnode);
250 return -1; /* no longer our goal, don't keep waiting */
252 /* otherwise, block */
253 abnode->flags |= BNODE_WAIT;
254 LWP_WaitProcess(abnode);
/* Zeroes the bnode's errorStopCount and errorStopDelay, the two fields the
 * bnode LWP adjusts when a bnode restarts too often. */
259 bnode_ResetErrorCount(struct bnode *abnode)
261 abnode->errorStopCount = 0;
262 abnode->errorStopDelay = 0;
/*
 * Sets the bnode's goal to agoal, invokes BOP_SETSTAT with the new goal and
 * then clears the BNODE_ERRORSTOP flag.
 * NOTE(review): any hold/release around BOP_SETSTAT and the return value are
 * not visible in this excerpt.
 */
267 bnode_SetStat(struct bnode *abnode, int agoal)
269 abnode->goal = agoal;
271 BOP_SETSTAT(abnode, agoal);
272 abnode->flags &= ~BNODE_ERRORSTOP;
/* Stores agoal as the bnode's goal. Unlike bnode_SetStat(), the visible code
 * does not call BOP_SETSTAT or touch BNODE_ERRORSTOP. */
277 bnode_SetGoal(struct bnode *abnode, int agoal)
279 abnode->goal = agoal;
/*
 * Updates the bnode's fileGoal. Returns 0 immediately when the value is
 * already agoal; otherwise stores the new value.
 * NOTE(review): what follows the assignment is not visible in this excerpt.
 */
285 bnode_SetFileGoal(struct bnode *abnode, int agoal)
287 if (abnode->fileGoal == agoal)
288 return 0; /* already done */
289 abnode->fileGoal = agoal;
/*
 * Walks the allBnodes list and calls aproc(tb, arock) for each entry. The
 * loop advances through nb rather than tb->next, which lets the successor be
 * captured separately from the callback.
 * NOTE(review): where nb is assigned and how a nonzero code from the callback
 * is handled are not visible in this excerpt.
 */
294 /* apply a function to all bnodes in the system */
296 bnode_ApplyInstance(int (*aproc) (struct bnode *tb, void *), void *arock)
298 struct bnode *tb, *nb;
301 for (tb = allBnodes; tb; tb = nb) {
303 code = (*aproc) (tb, arock);
/* Linear search of the allBnodes list for an entry whose name equals aname
 * (exact strcmp match). The return statements are not visible in this
 * excerpt; callers below treat a null result as "not found". */
311 bnode_FindInstance(char *aname)
315 for (tb = allBnodes; tb; tb = tb->next) {
316 if (!strcmp(tb->name, aname))
/* Linear search of the allTypes list for a registered type whose name equals
 * aname (exact strcmp match); returns a null pointer when the list is
 * exhausted without a match. */
322 static struct bnode_type *
323 FindType(char *aname)
325 struct bnode_type *tt;
327 for (tt = allTypes; tt; tt = tt->next) {
328 if (!strcmp(tt->name, aname))
331 return (struct bnode_type *)0;
/*
 * Registers a bnode type named atype. The allTypes list is first scanned for
 * an entry with the same name; a fresh, zero-filled bnode_type is allocated
 * on the path shown below.
 * NOTE(review): the malloc result is passed to memset without a NULL check.
 * How aprocs and anparms are stored, and what happens when the name already
 * exists, are not visible in this excerpt.
 */
335 bnode_Register(char *atype, struct bnode_ops *aprocs, int anparms)
337 struct bnode_type *tt;
339 for (tt = allTypes; tt; tt = tt->next) {
340 if (!strcmp(tt->name, atype))
344 tt = (struct bnode_type *)malloc(sizeof(struct bnode_type));
345 memset(tt, 0, sizeof(struct bnode_type));
/*
 * Creates a bnode of registered type atype with instance name ainstance.
 *
 * The instance name is looked up with bnode_FindInstance() and the type with
 * FindType() before anything is created. When notifier is non-null and
 * differs from NONOTIFIER, it is converted to a local path with
 * ConstructLocalBinPath() and that path must pass stat(); a failure of
 * either step is logged. The bnode itself comes from the type's create
 * operation, called with ainstance and ap1..ap5. The resulting bnode takes
 * the notifier path pointer, has its fileGoal overwritten with the fileGoal
 * argument, and is nudged once through bnode_SetStat() with its own goal.
 *
 * NOTE(review): the return codes, the store through abp and the action taken
 * when rewritefile is nonzero are not visible in this excerpt.
 */
355 bnode_Create(char *atype, char *ainstance, struct bnode ** abp, char *ap1,
356 char *ap2, char *ap3, char *ap4, char *ap5, char *notifier,
357 int fileGoal, int rewritefile)
359 struct bnode_type *type;
361 char *notifierpath = NULL;
364 if (bnode_FindInstance(ainstance))
366 type = FindType(atype);
370 if (notifier && strcmp(notifier, NONOTIFIER)) {
371 /* construct local path from canonical (wire-format) path */
/* notifierpath is the out-parameter: its address is passed so the helper can
 * fill it in, and it is then handed to stat() and stored on the bnode */
372 if (ConstructLocalBinPath(notifier, &notifierpath)) {
373 bozo_Log("BNODE-Create: Notifier program path invalid '%s'\n",
378 if (stat(notifierpath, &tstat)) {
379 bozo_Log("BNODE-Create: Notifier program '%s' not found\n",
385 tb = (*type->ops->create) (ainstance, ap1, ap2, ap3, ap4, ap5);
/* the bnode keeps the pointer itself; no copy is made here */
390 tb->notifier = notifierpath;
394 /* The fs_create above calls bnode_InitBnode() which always sets the
395 ** fileGoal to BSTAT_NORMAL .... overwrite it with whatever is passed into
396 ** this function as a parameter... */
397 tb->fileGoal = fileGoal;
399 bnode_SetStat(tb, tb->goal); /* nudge it once */
401 if (rewritefile != 0)
/* Looks up the bnode named ainstance with bnode_FindInstance() and returns
 * the result of bnode_Delete() on it. The handling of a failed lookup is not
 * visible in this excerpt. */
408 bnode_DeleteName(char *ainstance)
412 tb = bnode_FindInstance(ainstance);
416 return bnode_Delete(tb);
420 bnode_Hold(struct bnode *abnode)
/*
 * Drops a reference on abnode and carries out a deferred delete. When
 * refCount is zero and BNODE_DELETE is set (the flag bnode_Delete() leaves
 * behind when it finds outstanding references), the flag is cleared and
 * bnode_Delete() is called.
 * NOTE(review): the refCount decrement itself is not visible in this excerpt.
 */
427 bnode_Release(struct bnode *abnode)
/* parses as (refCount == 0) && (flags & BNODE_DELETE) */
430 if (abnode->refCount == 0 && abnode->flags & BNODE_DELETE) {
431 abnode->flags &= ~BNODE_DELETE; /* we're going for it */
432 bnode_Delete(abnode);
/*
 * Deletes a bnode. If references are outstanding (refCount != 0) the bnode is
 * only flagged BNODE_DELETE, which bnode_Release() acts on later. Otherwise
 * its status is read with BOP_GETSTAT and compared with BSTAT_SHUTDOWN, the
 * allBnodes list is walked to unlink it, its name is freed and BOP_DELETE is
 * invoked.
 * NOTE(review): the return values, and the hold that pairs with the
 * bnode_Release() after BOP_GETSTAT, are not visible in this excerpt.
 */
438 bnode_Delete(struct bnode *abnode)
441 struct bnode **lb, *ub;
444 if (abnode->refCount != 0) {
445 abnode->flags |= BNODE_DELETE;
449 /* make sure the bnode is idle before zapping */
451 code = BOP_GETSTAT(abnode, &temp);
452 bnode_Release(abnode);
455 if (temp != BSTAT_SHUTDOWN)
458 /* all clear to zap */
/* lb always addresses the link that points at ub, so unlinking needs no
 * special case for the list head */
459 for (lb = &allBnodes, ub = *lb; ub; lb = &ub->next, ub = *lb) {
461 /* unthread it from the list */
466 free(abnode->name); /* do this first, since bnode fields may be bad after BOP_DELETE */
467 code = BOP_DELETE(abnode); /* don't play games like holding over this one */
/* Returns the BNODE_NEEDTIMEOUT bit of abnode->flags: nonzero when a timeout
 * is armed for this bnode, zero otherwise. */
472 /* function to tell if there's a timeout coming up */
474 bnode_PendingTimeout(struct bnode *abnode)
476 return (abnode->flags & BNODE_NEEDTIMEOUT);
/*
 * Arms or clears the periodic timeout of a bnode. The arming path sets
 * nextTimeout to the current approximate time plus atimeout, sets
 * BNODE_NEEDTIMEOUT, records atimeout as the period and calls
 * IOMGR_Cancel(bproc_pid). The clearing path only drops BNODE_NEEDTIMEOUT.
 * NOTE(review): the condition choosing between the two paths is not visible
 * here. The IOMGR_Cancel call presumably wakes the bnode LWP so it
 * recomputes its sleep interval -- confirm.
 */
479 /* function called to set / clear periodic bnode wakeup times */
481 bnode_SetTimeout(struct bnode *abnode, afs_int32 atimeout)
484 abnode->nextTimeout = FT_ApproxTime() + atimeout;
485 abnode->flags |= BNODE_NEEDTIMEOUT;
486 abnode->period = atimeout;
487 IOMGR_Cancel(bproc_pid);
489 abnode->flags &= ~BNODE_NEEDTIMEOUT;
/*
 * Initializes the common bnode header. The whole struct is zeroed, then ops
 * is set to abnodeops, name to a strdup() copy of aname, flags to
 * BNODE_ACTIVE, fileGoal to BSTAT_NORMAL and goal to BSTAT_SHUTDOWN. The
 * allBnodes list is then walked to its tail.
 * NOTE(review): the strdup result is not checked in the visible lines, and
 * the statement that links the bnode in at the tail is not visible.
 */
494 /* used by new bnode creation code to format bnode header */
496 bnode_InitBnode(struct bnode *abnode, struct bnode_ops *abnodeops,
499 struct bnode **lb, *nb;
501 /* format the bnode properly */
502 memset(abnode, 0, sizeof(struct bnode));
503 abnode->ops = abnodeops;
504 abnode->name = strdup(aname);
507 abnode->flags = BNODE_ACTIVE;
508 abnode->fileGoal = BSTAT_NORMAL;
509 abnode->goal = BSTAT_SHUTDOWN;
511 /* put the bnode at the end of the list so we write bnode file in same order */
/* empty loop body: on exit lb addresses the terminating null link */
512 for (lb = &allBnodes, nb = *lb; nb; lb = &nb->next, nb = *lb);
/*
 * Walks the allProcs list with a pointer-to-link (pb) and the current entry
 * (tb). The loop advances through nb, a successor captured separately from
 * tb->next.
 * NOTE(review): the loop body is not visible in this excerpt; presumably it
 * unlinks and frees the entry equal to abproc -- confirm.
 */
519 DeleteProc(struct bnode_proc *abproc)
521 struct bnode_proc **pb, *tb;
522 struct bnode_proc *nb;
524 for (pb = &allProcs, tb = *pb; tb; pb = &tb->next, tb = nb) {
/*
 * Body of the bnode-manager LWP. The function's signature is not visible in
 * this excerpt; the initialization code passes `bproc` to LWP_CreateProcess.
 *
 * Each pass does the following:
 *  - finds the earliest nextTimeout among bnodes flagged BNODE_NEEDTIMEOUT
 *    and converts it to a delay from the current approximate time;
 *  - sleeps in IOMGR_Select;
 *  - refreshes nextTimeout for expired bnodes that still have
 *    BNODE_NEEDTIMEOUT set, then releases each one;
 *  - reaps children with waitpid until it returns 0 or -1.
 * For each reaped pid the allProcs list is searched. The exit code or
 * terminating signal is recorded on the bnode_proc and bnode and logged, and
 * BOP_PROCEXIT is invoked. A bnode whose restart counter passes the limit is
 * flagged BNODE_ERRORSTOP and its goal is set to BSTAT_SHUTDOWN.
 * bnode_stats.weirdPids is incremented on a path whose condition is not
 * visible here.
 */
535 /* bnode lwp executes this code repeatedly */
542 struct bnode_proc *tp;
544 int options; /* must not be register */
550 /* first figure out how long to sleep for */
551 temp = 0x7fffffff; /* afs_int32 time; maxint doesn't work in select */
553 for (tb = allBnodes; tb; tb = tb->next) {
554 if (tb->flags & BNODE_NEEDTIMEOUT) {
555 if (tb->nextTimeout < temp) {
557 temp = tb->nextTimeout;
561 /* now temp has the time at which we should wakeup next */
565 temp -= FT_ApproxTime(); /* how many seconds until next event */
571 code = IOMGR_Select(0, 0, 0, 0, &tv);
573 code = 0; /* fake timeout code */
575 /* figure out why we woke up; child exit or timeouts */
576 FT_GetTimeOfDay(&tv, 0); /* must do the real gettimeofday once in a while */
579 /* check all bnodes to see which ones need timeout events */
580 for (tb = allBnodes; tb; tb = nb) {
581 if ((tb->flags & BNODE_NEEDTIMEOUT) && temp > tb->nextTimeout) {
585 if (tb->flags & BNODE_NEEDTIMEOUT) { /* check again, BOP_TIMEOUT could change */
586 tb->nextTimeout = FT_ApproxTime() + tb->period;
589 bnode_Release(tb); /* delete may occur here */
595 /* signalled, probably by incoming signal */
598 bnode_waiting = options | 0x800000;
599 code = waitpid((pid_t) - 1, &status, options);
601 if (code == 0 || code == -1)
602 break; /* all done */
603 /* otherwise code has a process id, which we now search for */
604 for (tp = allProcs; tp; tp = tp->next)
612 /* count restarts in last 30 seconds */
613 if (temp > tb->rsTime + 30) {
614 /* it's been 30 seconds we've been counting */
620 if (WIFSIGNALED(status) == 0) {
621 /* exited, not signalled */
622 tp->lastExit = WEXITSTATUS(status);
625 tb->errorCode = tp->lastExit;
626 tb->lastErrorExit = FT_ApproxTime();
627 RememberProcName(tp);
631 bozo_Log("%s:%s exited with code %d\n", tb->name,
632 tp->coreName, tp->lastExit);
634 bozo_Log("%s exited with code %d\n", tb->name,
637 /* Signal occurred, perhaps spurious due to shutdown request.
638 * If due to a shutdown request, don't overwrite last error
641 tp->lastSignal = WTERMSIG(status);
643 if (tp->lastSignal != SIGQUIT
644 && tp->lastSignal != SIGTERM
645 && tp->lastSignal != SIGKILL) {
646 tb->errorSignal = tp->lastSignal;
647 tb->lastErrorExit = FT_ApproxTime();
648 RememberProcName(tp);
651 bozo_Log("%s:%s exited on signal %d%s\n",
652 tb->name, tp->coreName, tp->lastSignal,
653 WCOREDUMP(status) ? " (core dumped)" :
656 bozo_Log("%s exited on signal %d%s\n", tb->name,
658 WCOREDUMP(status) ? " (core dumped)" :
662 tb->lastAnyExit = FT_ApproxTime();
665 bozo_Log("BNODE: Notifier %s will be called\n",
670 if (tb->goal && tb->rsCount++ > 10) {
671 /* 10 in 30 seconds */
/* Back-off bookkeeping: errorStopDelay is zeroed once errorStopCount reaches
 * BNODE_ERROR_COUNT_MAX. The other assignments start it at 1 and double it;
 * the branch structure joining them is not visible in this excerpt. */
672 if (tb->errorStopCount >= BNODE_ERROR_COUNT_MAX) {
673 tb->errorStopDelay = 0; /* max reached, give up. */
675 tb->errorStopCount++;
676 if (!tb->errorStopDelay) {
677 tb->errorStopDelay = 1;
679 tb->errorStopDelay *= 2;
682 tb->flags |= BNODE_ERRORSTOP;
683 bnode_SetGoal(tb, BSTAT_SHUTDOWN);
685 ("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n",
688 BOP_PROCEXIT(tb, tp);
690 bnode_Release(tb); /* bnode delete can happen here */
693 bnode_stats.weirdPids++;
/*
 * Writes a text description of process tp and its owning bnode to fd.
 * Two sections are formatted as "key: value" lines into a 1000-byte stack
 * buffer and each is sent with a single write(): first the bnode_proc fields
 * between "BEGIN bnode_proc" and "END bnode_proc", then the bnode fields
 * between "BEGIN bnode" and "END bnode".
 *
 * NOTE(review): every field is appended with unbounded sprintf. The string
 * fields (comLine, coreName, name, lastErrorName) have no length limit
 * visible here, so a long value can run past the end of buffer; a bounded
 * snprintf would avoid that.
 * NOTE(review): the handling of a null coreName, the reset of bufp before
 * the second section and the error returns are not visible in this excerpt.
 */
701 SendNotifierData(int fd, struct bnode_proc *tp)
703 struct bnode *tb = tp->bnode;
704 char buffer[1000], *bufp = buffer, *buf1;
708 * First sent out the bnode_proc struct
710 (void)sprintf(bufp, "BEGIN bnode_proc\n");
711 bufp += strlen(bufp);
712 (void)sprintf(bufp, "comLine: %s\n", tp->comLine);
713 bufp += strlen(bufp);
714 if (!(buf1 = tp->coreName))
716 (void)sprintf(bufp, "coreName: %s\n", buf1);
717 bufp += strlen(bufp);
718 (void)sprintf(bufp, "pid: %ld\n", afs_printable_int32_ld(tp->pid));
719 bufp += strlen(bufp);
720 (void)sprintf(bufp, "lastExit: %ld\n", afs_printable_int32_ld(tp->lastExit));
721 bufp += strlen(bufp);
723 (void)sprintf(bufp, "lastSignal: %ld\n", afs_printable_int32_ld(tp->lastSignal));
724 bufp += strlen(bufp);
726 (void)sprintf(bufp, "flags: %ld\n", afs_printable_int32_ld(tp->flags));
727 bufp += strlen(bufp);
728 (void)sprintf(bufp, "END bnode_proc\n");
729 bufp += strlen(bufp);
/* number of bytes formatted so far; the section goes out in one write() */
730 len = (int)(bufp - buffer);
731 if (write(fd, buffer, len) < 0) {
736 * Now sent out the bnode struct
739 (void)sprintf(bufp, "BEGIN bnode\n");
740 bufp += strlen(bufp);
741 (void)sprintf(bufp, "name: %s\n", tb->name);
742 bufp += strlen(bufp);
743 (void)sprintf(bufp, "rsTime: %ld\n", afs_printable_int32_ld(tb->rsTime));
744 bufp += strlen(bufp);
745 (void)sprintf(bufp, "rsCount: %ld\n", afs_printable_int32_ld(tb->rsCount));
746 bufp += strlen(bufp);
747 (void)sprintf(bufp, "procStartTime: %ld\n", afs_printable_int32_ld(tb->procStartTime));
748 bufp += strlen(bufp);
749 (void)sprintf(bufp, "procStarts: %ld\n", afs_printable_int32_ld(tb->procStarts));
750 bufp += strlen(bufp);
751 (void)sprintf(bufp, "lastAnyExit: %ld\n", afs_printable_int32_ld(tb->lastAnyExit));
752 bufp += strlen(bufp);
753 (void)sprintf(bufp, "lastErrorExit: %ld\n", afs_printable_int32_ld(tb->lastErrorExit));
754 bufp += strlen(bufp);
755 (void)sprintf(bufp, "errorCode: %ld\n", afs_printable_int32_ld(tb->errorCode));
756 bufp += strlen(bufp);
757 (void)sprintf(bufp, "errorSignal: %ld\n", afs_printable_int32_ld(tb->errorSignal));
758 bufp += strlen(bufp);
760 (void) sprintf(bufp, "lastErrorName: %s\n", tb->lastErrorName);
761 bufp += strlen(bufp);
763 (void)sprintf(bufp, "goal: %d\n", tb->goal);
764 bufp += strlen(bufp);
765 (void)sprintf(bufp, "END bnode\n");
766 bufp += strlen(bufp);
767 len = (int)(bufp - buffer);
768 if (write(fd, buffer, len) < 0) {
/*
 * Runs the notifier program configured on tp's bnode; the whole body is
 * compiled out when AFS_NT40_ENV is defined. The notifier path is checked
 * with stat() first and a failure is logged as "ignored". The function then
 * forks. The child opens the notifier with popen(..., "w") and feeds it the
 * output of SendNotifierData() through the pipe's descriptor. The parent
 * logs a message if fork() failed.
 * NOTE(review): the platform-specific statements under the #if/#elif chain,
 * the child's exit path and the return values are not visible here.
 */
775 hdl_notifier(struct bnode_proc *tp)
777 #ifndef AFS_NT40_ENV /* NT notifier callout not yet implemented */
781 if (stat(tp->bnode->notifier, &tstat)) {
782 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
783 tp->bnode->notifier);
786 if ((pid = fork()) == 0) {
788 struct bnode *tb = tp->bnode;
790 #if defined(AFS_HPUX_ENV) || defined(AFS_SUN5_ENV) || defined(AFS_SGI51_ENV)
792 #elif defined(AFS_DARWIN90_ENV)
794 #elif defined(AFS_LINUX20_ENV) || defined(AFS_AIX_ENV)
/* the notifier reads the bnode description on its standard input */
799 fout = popen(tb->notifier, "w");
801 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
803 perror(tb->notifier);
806 SendNotifierData(fileno(fout), tp);
809 } else if (pid < 0) {
810 bozo_Log("Failed to fork creating process to handle notifier '%s'\n",
811 tp->bnode->notifier);
814 #endif /* AFS_NT40_ENV */
/* Soft-signal callback queued by bnode_Int(): calls IOMGR_Cancel(bproc_pid).
 * The param argument (the signal number cast to a pointer by bnode_Int) is
 * not used by the visible code. */
818 /* Called by IOMGR at low priority on IOMGR's stack shortly after a SIGCHLD
819 * occurs. Wakes up bproc to redo things */
821 bnode_SoftInt(void *param)
823 /* int asignal = (int) param; */
825 IOMGR_Cancel(bproc_pid);
/*
 * Signal handler that the initialization code installs via sigaction for
 * SIGCHLD, SIGQUIT and SIGTERM. It does no work itself and only queues a
 * soft-signal callback with IOMGR_SoftSig: bozo_ShutdownAndExit for SIGQUIT
 * or SIGTERM, bnode_SoftInt for anything else. The signal number is passed
 * to the callback as its pointer-sized argument.
 */
829 /* Called at signal interrupt level; queues function to be called
830 * when IOMGR runs again.
833 bnode_Int(int asignal)
835 if (asignal == SIGQUIT || asignal == SIGTERM) {
836 IOMGR_SoftSig(bozo_ShutdownAndExit, (void *)(intptr_t)asignal);
838 IOMGR_SoftSig(bnode_SoftInt, (void *)(intptr_t)asignal);
843 /* initialize the whole system */
849 struct sigaction newaction;
850 static int initDone = 0;
855 memset(&bnode_stats, 0, sizeof(bnode_stats));
856 LWP_InitializeProcessSupport(1, &junk); /* just in case */
858 code = LWP_CreateProcess(bproc, BNODE_LWP_STACKSIZE,
859 /* priority */ 1, (void *) /* parm */ 0,
860 "bnode-manager", &bproc_pid);
863 memset(&newaction, 0, sizeof(newaction));
864 newaction.sa_handler = bnode_Int;
865 code = sigaction(SIGCHLD, &newaction, NULL);
868 code = sigaction(SIGQUIT, &newaction, NULL);
871 code = sigaction(SIGTERM, &newaction, NULL);
/* Walks the token list starting at alist, advancing through nlist, a
 * successor pointer captured separately from the current node. The loop body
 * is not visible in this excerpt; presumably it saves alist->next and frees
 * the node and its key -- confirm. */
877 /* free token list returned by parseLine */
879 bnode_FreeTokens(struct bnode_token *alist)
881 struct bnode_token *nlist;
882 for (; alist; alist = nlist) {
/* Separator test used by the tokenizer below as space(tc): the condition
 * holds for NUL, blank, tab and newline. The enclosing definition and its
 * return statements are not visible in this excerpt. */
893 if (x == 0 || x == ' ' || x == '\t' || x == '\n')
/*
 * Splits aline into whitespace-separated tokens and builds a linked list of
 * bnode_token nodes. Characters are collected in tbuffer; when a separator
 * or the terminating NUL is reached, the finished token is copied with
 * strdup() into a newly allocated node. Returns -1 if a token would not fit
 * in tbuffer.
 *
 * NOTE(review): the malloc result is dereferenced on the next line without a
 * NULL check.
 * NOTE(review): the "token too long" return leaves immediately, so any nodes
 * already linked on `first` are not freed on that path.
 * NOTE(review): how the list is handed back through alist is not visible
 * here.
 */
900 bnode_ParseLine(char *aline, struct bnode_token **alist)
905 struct bnode_token *first, *last;
906 struct bnode_token *ttok;
909 inToken = 0; /* not copying token chars at start */
910 first = (struct bnode_token *)0;
911 last = (struct bnode_token *)0;
914 if (tc == 0 || space(tc)) { /* terminating null gets us in here, too */
916 inToken = 0; /* end of this token */
919 (struct bnode_token *)malloc(sizeof(struct bnode_token));
920 ttok->next = (struct bnode_token *)0;
921 ttok->key = strdup(tbuffer);
931 /* an alpha character */
936 if (tptr - tbuffer >= sizeof(tbuffer))
937 return -1; /* token too long */
941 /* last token flushed 'cause space(0) --> true */
943 last->next = (struct bnode_token *)0;
/*
 * Starts a child process for abnode from the command line aexecString.
 * The command line is tokenized with bnode_ParseLine(), a zero-filled
 * bnode_proc is allocated, and the tokens become an argv array for
 * spawnprocve() with the current environ. The spawn is audited. On failure
 * a message is logged and the tokens are freed. On success the start is
 * logged, the tokens are freed, the proc's flags are set to BPROC_STARTED
 * and BOP_PROCSTARTED is invoked.
 *
 * tp->comLine and tp->coreName store the caller's pointers without copying,
 * so both strings must outlive the bnode_proc.
 *
 * NOTE(review): the malloc result is passed to memset without a NULL check.
 * NOTE(review): the success log prints cpid with %ld. cpid's declaration is
 * not visible; if it is a pid_t, a cast to long would be needed.
 * NOTE(review): the return values, the store through aproc and the cleanup
 * of tp on spawn failure are not visible in this excerpt.
 */
952 bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName,
953 struct bnode_proc **aproc)
955 struct bnode_token *tlist, *tt;
957 struct bnode_proc *tp;
959 char *argv[MAXVARGS];
962 code = bnode_ParseLine(aexecString, &tlist); /* try parsing first */
965 tp = (struct bnode_proc *)malloc(sizeof(struct bnode_proc));
966 memset(tp, 0, sizeof(struct bnode_proc));
969 tp->comLine = aexecString;
970 tp->coreName = coreName; /* may be null */
971 abnode->procStartTime = FT_ApproxTime();
972 abnode->procStarts++;
974 /* convert linked list of tokens into argv structure */
/* at most MAXVARGS - 1 tokens are taken so one slot is left for the NULL */
975 for (tt = tlist, i = 0; i < (MAXVARGS - 1) && tt; tt = tt->next, i++) {
978 argv[i] = NULL; /* null-terminated */
980 cpid = spawnprocve(argv[0], argv, environ, -1);
981 osi_audit(BOSSpawnProcEvent, 0, AUD_STR, aexecString, AUD_END);
983 if (cpid == (pid_t) - 1) {
984 bozo_Log("Failed to spawn process for bnode '%s'\n", abnode->name);
985 bnode_FreeTokens(tlist);
989 bozo_Log("%s started pid %ld: %s\n", abnode->name, cpid, aexecString);
991 bnode_FreeTokens(tlist);
/* plain assignment already leaves BPROC_EXITED clear unless the two flags
 * share bits; the following mask makes that explicit */
995 tp->flags = BPROC_STARTED;
996 tp->flags &= ~BPROC_EXITED;
997 BOP_PROCSTARTED(abnode, tp);
/*
 * Sends signal asignal to the process described by aproc. The flags are
 * tested first so that only a process that is BPROC_STARTED and not
 * BPROC_EXITED is signalled. The stop is audited, kill() is issued, and
 * bnode_Check() is called on the owning bnode to wake any waiter.
 *
 * NOTE(review): aproc is dereferenced in the flags test before the audit
 * call's `aproc ? ... : NULL` guard, so that guard cannot protect against a
 * null aproc.
 * NOTE(review): the return values are not visible in this excerpt.
 */
1003 bnode_StopProc(struct bnode_proc *aproc, int asignal)
1006 if (!(aproc->flags & BPROC_STARTED) || (aproc->flags & BPROC_EXITED))
1009 osi_audit(BOSStopProcEvent, 0, AUD_STR, (aproc ? aproc->comLine : NULL),
1012 code = kill(aproc->pid, asignal);
1013 bnode_Check(aproc->bnode);
1018 bnode_Deactivate(struct bnode *abnode)
1020 struct bnode **pb, *tb;
1022 if (!(abnode->flags & BNODE_ACTIVE))
1024 for (pb = &allBnodes, tb = *pb; tb; tb = nb) {
1028 tb->flags &= ~BNODE_ACTIVE;