2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
19 #include <sys/types.h>
26 #ifdef BOZO_SAVE_CORES
32 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
35 #include <afs/audit.h>
36 #include <afs/afsutil.h>
37 #include <afs/fileutil.h>
39 #include "bosprototypes.h"
41 #if defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV)
42 /* All known versions of AIX lack WCOREDUMP but this works */
43 #define WCOREDUMP(x) ((x) & 0x80)
46 #define BNODE_LWP_STACKSIZE (16 * 1024)
48 int bnode_waiting = 0;
49 static PROCESS bproc_pid; /* pid of waker-upper */
50 static struct bnode *allBnodes = 0; /* list of all bnodes */
51 static struct bnode_proc *allProcs = 0; /* list of all processes for which we're waiting */
52 static struct bnode_type *allTypes = 0; /* list of registered type handlers */
54 static struct bnode_stats {
59 extern char **environ; /* env structure */
62 int hdl_notifier(struct bnode_proc *tp);
64 /* Remember the name of the process, if any, that failed last */
66 RememberProcName(register struct bnode_proc *ap)
68 register struct bnode *tbnodep;
71 if (tbnodep->lastErrorName) {
72 free(tbnodep->lastErrorName);
73 tbnodep->lastErrorName = NULL;
76 tbnodep->lastErrorName = (char *)malloc(strlen(ap->coreName) + 1);
77 strcpy(tbnodep->lastErrorName, ap->coreName);
81 /* utility for use by BOP_HASCORE functions to determine where a core file might
85 bnode_CoreName(register struct bnode *abnode, char *acoreName, char *abuffer)
87 strcpy(abuffer, AFSDIR_SERVER_CORELOG_FILEPATH);
89 strcat(abuffer, acoreName);
92 strcat(abuffer, abnode->name);
96 /* save core file, if any */
98 SaveCore(register struct bnode *abnode, register struct bnode_proc
103 register afs_int32 code;
104 char *corefile = NULL;
105 #ifdef BOZO_SAVE_CORES
106 struct timeval Start;
107 struct tm *TimeFields;
111 /* Linux always appends the PID to core dumps from threaded processes, so
112 * we have to scan the directory to find core files under another name. */
113 code = stat(AFSDIR_SERVER_CORELOG_FILEPATH, &tstat);
120 logdir = opendir(AFSDIR_LOGS_DIR);
123 while ((file = readdir(logdir)) != NULL) {
124 if (strncmp(file->d_name, "core.", 5) != 0)
126 pid = atol(file->d_name + 5);
127 if (pid == aproc->pid) {
128 length = strlen(AFSDIR_LOGS_DIR) + strlen(file->d_name) + 2;
129 corefile = malloc(length);
130 if (corefile == NULL) {
134 snprintf(corefile, length, "%s/%s", AFSDIR_LOGS_DIR,
145 bnode_CoreName(abnode, aproc->coreName, tbuffer);
146 #ifdef BOZO_SAVE_CORES
147 FT_GetTimeOfDay(&Start, 0);
148 TimeFields = localtime(&Start.tv_sec);
149 sprintf(FileName, "%s.%d%02d%02d%02d%02d%02d", tbuffer,
150 TimeFields->tm_year + 1900, TimeFields->tm_mon + 1, TimeFields->tm_mday,
151 TimeFields->tm_hour, TimeFields->tm_min, TimeFields->tm_sec);
152 strcpy(tbuffer, FileName);
154 if (corefile == NULL)
155 code = renamefile(AFSDIR_SERVER_CORELOG_FILEPATH, tbuffer);
157 code = renamefile(corefile, tbuffer);
163 bnode_GetString(register struct bnode *abnode, register char *abuffer,
164 register afs_int32 alen)
166 return BOP_GETSTRING(abnode, abuffer, alen);
170 bnode_GetParm(register struct bnode *abnode, register afs_int32 aindex,
171 register char *abuffer, afs_int32 alen)
173 return BOP_GETPARM(abnode, aindex, abuffer, alen);
177 bnode_GetStat(register struct bnode *abnode, register afs_int32 * astatus)
179 return BOP_GETSTAT(abnode, astatus);
183 bnode_RestartP(register struct bnode *abnode)
185 return BOP_RESTARTP(abnode);
189 bnode_Check(register struct bnode *abnode)
191 if (abnode->flags & BNODE_WAIT) {
192 abnode->flags &= ~BNODE_WAIT;
193 LWP_NoYieldSignal(abnode);
198 /* tell if an instance has a core file */
200 bnode_HasCore(register struct bnode *abnode)
202 return BOP_HASCORE(abnode);
205 /* wait for all bnodes to stabilize */
209 register struct bnode *tb;
210 register afs_int32 code;
214 for (tb = allBnodes; tb; tb = tb->next) {
216 code = BOP_GETSTAT(tb, &stat);
221 if (stat != tb->goal) {
222 tb->flags |= BNODE_WAIT;
232 /* wait until bnode status is correct */
234 bnode_WaitStatus(register struct bnode *abnode, int astatus)
236 register afs_int32 code;
242 code = BOP_GETSTAT(abnode, &stat);
246 /* otherwise, check if we're done */
247 if (stat == astatus) {
248 bnode_Release(abnode);
251 if (astatus != abnode->goal) {
252 bnode_Release(abnode);
253 return -1; /* no longer our goal, don't keep waiting */
255 /* otherwise, block */
256 abnode->flags |= BNODE_WAIT;
257 LWP_WaitProcess(abnode);
262 bnode_SetStat(register struct bnode *abnode, register int agoal)
264 abnode->goal = agoal;
266 BOP_SETSTAT(abnode, agoal);
267 abnode->flags &= ~BNODE_ERRORSTOP;
272 bnode_SetGoal(register struct bnode *abnode, register int agoal)
274 abnode->goal = agoal;
280 bnode_SetFileGoal(register struct bnode *abnode, register int agoal)
282 if (abnode->fileGoal == agoal)
283 return 0; /* already done */
284 abnode->fileGoal = agoal;
289 /* apply a function to all bnodes in the system */
291 bnode_ApplyInstance(int (*aproc) (struct bnode *tb, void *), void *arock)
293 register struct bnode *tb, *nb;
294 register afs_int32 code;
296 for (tb = allBnodes; tb; tb = nb) {
298 code = (*aproc) (tb, arock);
306 bnode_FindInstance(register char *aname)
308 register struct bnode *tb;
310 for (tb = allBnodes; tb; tb = tb->next) {
311 if (!strcmp(tb->name, aname))
317 static struct bnode_type *
318 FindType(register char *aname)
320 register struct bnode_type *tt;
322 for (tt = allTypes; tt; tt = tt->next) {
323 if (!strcmp(tt->name, aname))
326 return (struct bnode_type *)0;
330 bnode_Register(char *atype, struct bnode_ops *aprocs, int anparms)
332 register struct bnode_type *tt;
334 for (tt = allTypes; tt; tt = tt->next) {
335 if (!strcmp(tt->name, atype))
339 tt = (struct bnode_type *)malloc(sizeof(struct bnode_type));
340 memset(tt, 0, sizeof(struct bnode_type));
350 bnode_Create(char *atype, char *ainstance, struct bnode ** abp, char *ap1,
351 char *ap2, char *ap3, char *ap4, char *ap5, char *notifier,
352 int fileGoal, int rewritefile)
354 struct bnode_type *type;
356 char *notifierpath = NULL;
359 if (bnode_FindInstance(ainstance))
361 type = FindType(atype);
365 if (notifier && strcmp(notifier, NONOTIFIER)) {
366 /* construct local path from canonical (wire-format) path */
367 if (ConstructLocalBinPath(notifier, &notifierpath)) {
368 bozo_Log("BNODE-Create: Notifier program path invalid '%s'\n",
373 if (stat(notifierpath, &tstat)) {
374 bozo_Log("BNODE-Create: Notifier program '%s' not found\n",
380 tb = (*type->ops->create) (ainstance, ap1, ap2, ap3, ap4, ap5);
385 tb->notifier = notifierpath;
389 /* The create routine above (e.g. fs_create) calls bnode_InitBnode(), which
390 ** always sets fileGoal to BSTAT_NORMAL; overwrite it with the fileGoal
391 ** value passed into this function as a parameter. */
392 tb->fileGoal = fileGoal;
394 bnode_SetStat(tb, tb->goal); /* nudge it once */
396 if (rewritefile != 0)
403 bnode_DeleteName(char *ainstance)
405 register struct bnode *tb;
407 tb = bnode_FindInstance(ainstance);
411 return bnode_Delete(tb);
415 bnode_Hold(register struct bnode *abnode)
422 bnode_Release(register struct bnode *abnode)
425 if (abnode->refCount == 0 && abnode->flags & BNODE_DELETE) {
426 abnode->flags &= ~BNODE_DELETE; /* we're going for it */
427 bnode_Delete(abnode);
433 bnode_Delete(register struct bnode *abnode)
435 register afs_int32 code;
436 register struct bnode **lb, *ub;
439 if (abnode->refCount != 0) {
440 abnode->flags |= BNODE_DELETE;
444 /* make sure the bnode is idle before zapping */
446 code = BOP_GETSTAT(abnode, &temp);
447 bnode_Release(abnode);
450 if (temp != BSTAT_SHUTDOWN)
453 /* all clear to zap */
454 for (lb = &allBnodes, ub = *lb; ub; lb = &ub->next, ub = *lb) {
456 /* unthread it from the list */
461 free(abnode->name); /* do this first, since bnode fields may be bad after BOP_DELETE */
462 code = BOP_DELETE(abnode); /* don't play games like holding over this one */
467 /* function to tell if there's a timeout coming up */
469 bnode_PendingTimeout(register struct bnode *abnode)
471 return (abnode->flags & BNODE_NEEDTIMEOUT);
474 /* function called to set / clear periodic bnode wakeup times */
476 bnode_SetTimeout(register struct bnode *abnode, afs_int32 atimeout)
479 abnode->nextTimeout = FT_ApproxTime() + atimeout;
480 abnode->flags |= BNODE_NEEDTIMEOUT;
481 abnode->period = atimeout;
482 IOMGR_Cancel(bproc_pid);
484 abnode->flags &= ~BNODE_NEEDTIMEOUT;
489 /* used by new bnode creation code to format bnode header */
491 bnode_InitBnode(register struct bnode *abnode, struct bnode_ops *abnodeops,
494 struct bnode **lb, *nb;
496 /* format the bnode properly */
497 memset(abnode, 0, sizeof(struct bnode));
498 abnode->ops = abnodeops;
499 abnode->name = (char *)malloc(strlen(aname) + 1);
502 strcpy(abnode->name, aname);
503 abnode->flags = BNODE_ACTIVE;
504 abnode->fileGoal = BSTAT_NORMAL;
505 abnode->goal = BSTAT_SHUTDOWN;
507 /* put the bnode at the end of the list so we write bnode file in same order */
508 for (lb = &allBnodes, nb = *lb; nb; lb = &nb->next, nb = *lb);
515 DeleteProc(register struct bnode_proc *abproc)
517 register struct bnode_proc **pb, *tb;
518 struct bnode_proc *nb;
520 for (pb = &allProcs, tb = *pb; tb; pb = &tb->next, tb = nb) {
531 /* bnode lwp executes this code repeatedly */
535 register afs_int32 code;
536 register struct bnode *tb;
537 register afs_int32 temp;
538 register struct bnode_proc *tp;
540 int options; /* must not be register */
546 /* first figure out how long to sleep for */
547 temp = 0x7fffffff; /* afs_int32 time; maxint doesn't work in select */
549 for (tb = allBnodes; tb; tb = tb->next) {
550 if (tb->flags & BNODE_NEEDTIMEOUT) {
551 if (tb->nextTimeout < temp) {
553 temp = tb->nextTimeout;
557 /* now temp has the time at which we should wakeup next */
561 temp -= FT_ApproxTime(); /* how many seconds until next event */
567 code = IOMGR_Select(0, 0, 0, 0, &tv);
569 code = 0; /* fake timeout code */
571 /* figure out why we woke up; child exit or timeouts */
572 FT_GetTimeOfDay(&tv, 0); /* must do the real gettimeofday once in a while */
575 /* check all bnodes to see which ones need timeout events */
576 for (tb = allBnodes; tb; tb = nb) {
577 if ((tb->flags & BNODE_NEEDTIMEOUT) && temp > tb->nextTimeout) {
581 if (tb->flags & BNODE_NEEDTIMEOUT) { /* check again, BOP_TIMEOUT could change */
582 tb->nextTimeout = FT_ApproxTime() + tb->period;
585 bnode_Release(tb); /* delete may occur here */
591 /* signalled, probably by incoming signal */
594 bnode_waiting = options | 0x800000;
595 code = waitpid((pid_t) - 1, &status, options);
597 if (code == 0 || code == -1)
598 break; /* all done */
599 /* otherwise code has a process id, which we now search for */
600 for (tp = allProcs; tp; tp = tp->next)
608 /* count restarts in the last 30 seconds */
609 if (temp > tb->rsTime + 30) {
610 /* it's been more than 30 seconds that we've been counting */
615 if (WIFSIGNALED(status) == 0) {
616 /* exited, not signalled */
617 tp->lastExit = WEXITSTATUS(status);
620 tb->errorCode = tp->lastExit;
621 tb->lastErrorExit = FT_ApproxTime();
622 RememberProcName(tp);
626 bozo_Log("%s:%s exited with code %d\n", tb->name,
627 tp->coreName, tp->lastExit);
629 bozo_Log("%s exited with code %d\n", tb->name,
632 /* Signal occurred, perhaps spurious due to shutdown request.
633 * If due to a shutdown request, don't overwrite last error
636 tp->lastSignal = WTERMSIG(status);
638 if (tp->lastSignal != SIGQUIT
639 && tp->lastSignal != SIGTERM
640 && tp->lastSignal != SIGKILL) {
641 tb->errorSignal = tp->lastSignal;
642 tb->lastErrorExit = FT_ApproxTime();
643 RememberProcName(tp);
646 bozo_Log("%s:%s exited on signal %d%s\n",
647 tb->name, tp->coreName, tp->lastSignal,
648 WCOREDUMP(status) ? " (core dumped)" :
651 bozo_Log("%s exited on signal %d%s\n", tb->name,
653 WCOREDUMP(status) ? " (core dumped)" :
657 tb->lastAnyExit = FT_ApproxTime();
660 bozo_Log("BNODE: Notifier %s will be called\n",
664 BOP_PROCEXIT(tb, tp);
667 if (tb->rsCount++ > 10) {
668 /* too many restarts: rsCount exceeded 10 within the counting window */
669 tb->flags |= BNODE_ERRORSTOP;
670 bnode_SetGoal(tb, BSTAT_SHUTDOWN);
672 ("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n",
675 bnode_Release(tb); /* bnode delete can happen here */
678 bnode_stats.weirdPids++;
686 SendNotifierData(register int fd, register struct bnode_proc *tp)
688 register struct bnode *tb = tp->bnode;
689 char buffer[1000], *bufp = buffer, *buf1;
693 * First send out the bnode_proc struct
695 (void)sprintf(bufp, "BEGIN bnode_proc\n");
696 bufp += strlen(bufp);
697 (void)sprintf(bufp, "comLine: %s\n", tp->comLine);
698 bufp += strlen(bufp);
699 if (!(buf1 = tp->coreName))
701 (void)sprintf(bufp, "coreName: %s\n", buf1);
702 bufp += strlen(bufp);
703 (void)sprintf(bufp, "pid: %ld\n", afs_printable_int32_ld(tp->pid));
704 bufp += strlen(bufp);
705 (void)sprintf(bufp, "lastExit: %ld\n", afs_printable_int32_ld(tp->lastExit));
706 bufp += strlen(bufp);
708 (void)sprintf(bufp, "lastSignal: %ld\n", afs_printable_int32_ld(tp->lastSignal));
709 bufp += strlen(bufp);
711 (void)sprintf(bufp, "flags: %ld\n", afs_printable_int32_ld(tp->flags));
712 bufp += strlen(bufp);
713 (void)sprintf(bufp, "END bnode_proc\n");
714 bufp += strlen(bufp);
715 len = (int)(bufp - buffer);
716 if (write(fd, buffer, len) < 0) {
721 * Now send out the bnode struct
724 (void)sprintf(bufp, "BEGIN bnode\n");
725 bufp += strlen(bufp);
726 (void)sprintf(bufp, "name: %s\n", tb->name);
727 bufp += strlen(bufp);
728 (void)sprintf(bufp, "rsTime: %ld\n", afs_printable_int32_ld(tb->rsTime));
729 bufp += strlen(bufp);
730 (void)sprintf(bufp, "rsCount: %ld\n", afs_printable_int32_ld(tb->rsCount));
731 bufp += strlen(bufp);
732 (void)sprintf(bufp, "procStartTime: %ld\n", afs_printable_int32_ld(tb->procStartTime));
733 bufp += strlen(bufp);
734 (void)sprintf(bufp, "procStarts: %ld\n", afs_printable_int32_ld(tb->procStarts));
735 bufp += strlen(bufp);
736 (void)sprintf(bufp, "lastAnyExit: %ld\n", afs_printable_int32_ld(tb->lastAnyExit));
737 bufp += strlen(bufp);
738 (void)sprintf(bufp, "lastErrorExit: %ld\n", afs_printable_int32_ld(tb->lastErrorExit));
739 bufp += strlen(bufp);
740 (void)sprintf(bufp, "errorCode: %ld\n", afs_printable_int32_ld(tb->errorCode));
741 bufp += strlen(bufp);
742 (void)sprintf(bufp, "errorSignal: %ld\n", afs_printable_int32_ld(tb->errorSignal));
743 bufp += strlen(bufp);
745 (void) sprintf(bufp, "lastErrorName: %s\n", tb->lastErrorName);
746 bufp += strlen(bufp);
748 (void)sprintf(bufp, "goal: %d\n", tb->goal);
749 bufp += strlen(bufp);
750 (void)sprintf(bufp, "END bnode\n");
751 bufp += strlen(bufp);
752 len = (int)(bufp - buffer);
753 if (write(fd, buffer, len) < 0) {
760 hdl_notifier(struct bnode_proc *tp)
762 #ifndef AFS_NT40_ENV /* NT notifier callout not yet implemented */
766 if (stat(tp->bnode->notifier, &tstat)) {
767 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
768 tp->bnode->notifier);
771 if ((pid = fork()) == 0) {
773 struct bnode *tb = tp->bnode;
776 #if defined(AFS_HPUX_ENV) || defined(AFS_SUN5_ENV) || defined(AFS_SGI51_ENV)
778 #elif defined(AFS_DARWIN90_ENV)
780 #elif defined(AFS_LINUX20_ENV) || defined(AFS_AIX_ENV)
785 fout = popen(tb->notifier, "w");
787 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
789 perror(tb->notifier);
792 code = SendNotifierData(fileno(fout), tp);
795 } else if (pid < 0) {
796 bozo_Log("Failed to fork creating process to handle notifier '%s'\n",
797 tp->bnode->notifier);
800 #endif /* AFS_NT40_ENV */
804 /* Called by IOMGR at low priority on IOMGR's stack shortly after a SIGCHLD
805 * occurs. Wakes up bproc to redo things */
807 bnode_SoftInt(void *param)
809 /* int asignal = (int) param; */
811 IOMGR_Cancel(bproc_pid);
815 /* Called at signal interrupt level; queues function to be called
816 * when IOMGR runs again.
819 bnode_Int(int asignal)
821 if (asignal == SIGQUIT) {
822 IOMGR_SoftSig(bozo_ShutdownAndExit, (void *) asignal);
824 IOMGR_SoftSig(bnode_SoftInt, (void *) asignal);
829 /* initialize the whole system */
834 register afs_int32 code;
835 struct sigaction newaction;
836 static int initDone = 0;
841 memset(&bnode_stats, 0, sizeof(bnode_stats));
842 LWP_InitializeProcessSupport(1, &junk); /* just in case */
844 code = LWP_CreateProcess(bproc, BNODE_LWP_STACKSIZE,
845 /* priority */ 1, (void *) /* parm */ 0,
846 "bnode-manager", &bproc_pid);
849 memset((char *)&newaction, 0, sizeof(newaction));
850 newaction.sa_handler = bnode_Int;
851 code = sigaction(SIGCHLD, &newaction, NULL);
854 code = sigaction(SIGQUIT, &newaction, NULL);
860 /* free token list returned by parseLine */
862 bnode_FreeTokens(register struct bnode_token *alist)
864 register struct bnode_token *nlist;
865 for (; alist; alist = nlist) {
876 if (x == 0 || x == ' ' || x == '\t' || x == '\n')
883 bnode_ParseLine(char *aline, struct bnode_token **alist)
886 register char *tptr = NULL;
888 struct bnode_token *first, *last;
889 register struct bnode_token *ttok;
892 inToken = 0; /* not copying token chars at start */
893 first = (struct bnode_token *)0;
894 last = (struct bnode_token *)0;
897 if (tc == 0 || space(tc)) { /* terminating null gets us in here, too */
899 inToken = 0; /* end of this token */
902 (struct bnode_token *)malloc(sizeof(struct bnode_token));
903 ttok->next = (struct bnode_token *)0;
904 ttok->key = (char *)malloc(strlen(tbuffer) + 1);
905 strcpy(ttok->key, tbuffer);
915 /* an alpha character */
920 if (tptr - tbuffer >= sizeof(tbuffer))
921 return -1; /* token too long */
925 /* last token flushed 'cause space(0) --> true */
927 last->next = (struct bnode_token *)0;
937 bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName,
938 struct bnode_proc **aproc)
940 struct bnode_token *tlist, *tt;
942 struct bnode_proc *tp;
944 char *argv[MAXVARGS];
947 code = bnode_ParseLine(aexecString, &tlist); /* try parsing first */
950 tp = (struct bnode_proc *)malloc(sizeof(struct bnode_proc));
951 memset(tp, 0, sizeof(struct bnode_proc));
954 tp->comLine = aexecString;
955 tp->coreName = coreName; /* may be null */
956 abnode->procStartTime = FT_ApproxTime();
957 abnode->procStarts++;
959 /* convert linked list of tokens into argv structure */
960 for (tt = tlist, i = 0; i < (MAXVARGS - 1) && tt; tt = tt->next, i++) {
963 argv[i] = NULL; /* null-terminated */
965 cpid = spawnprocve(argv[0], argv, environ, -1);
966 osi_audit(BOSSpawnProcEvent, 0, AUD_STR, aexecString, AUD_END);
968 if (cpid == (pid_t) - 1) {
969 bozo_Log("Failed to spawn process for bnode '%s'\n", abnode->name);
970 bnode_FreeTokens(tlist);
975 bnode_FreeTokens(tlist);
979 tp->flags = BPROC_STARTED;
980 tp->flags &= ~BPROC_EXITED;
986 bnode_StopProc(register struct bnode_proc *aproc, int asignal)
989 if (!(aproc->flags & BPROC_STARTED) || (aproc->flags & BPROC_EXITED))
992 osi_audit(BOSStopProcEvent, 0, AUD_STR, (aproc ? aproc->comLine : NULL),
995 code = kill(aproc->pid, asignal);
996 bnode_Check(aproc->bnode);
1001 bnode_Deactivate(register struct bnode *abnode)
1003 register struct bnode **pb, *tb;
1005 if (!(abnode->flags & BNODE_ACTIVE))
1007 for (pb = &allBnodes, tb = *pb; tb; tb = nb) {
1011 tb->flags &= ~BNODE_ACTIVE;