2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
20 #include <sys/types.h>
27 #ifdef BOZO_SAVE_CORES
40 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
42 #include <afs/audit.h>
43 #include <afs/afsutil.h>
44 #include <afs/fileutil.h>
47 #if defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV)
48 /* All known versions of AIX lack WCOREDUMP but this works */
49 #define WCOREDUMP(x) ((x) & 0x80)
52 #define BNODE_LWP_STACKSIZE (16 * 1024)
54 int bnode_waiting = 0;
55 static PROCESS bproc_pid; /* pid of waker-upper */
56 static struct bnode *allBnodes = 0; /* list of all bnodes */
57 static struct bnode_proc *allProcs = 0; /* list of all processes for which we're waiting */
58 static struct bnode_type *allTypes = 0; /* list of registered type handlers */
60 static struct bnode_stats {
65 extern char **environ; /* env structure */
68 /* Remember the name of the process, if any, that failed last */
70 RememberProcName(register struct bnode_proc *ap)
72 register struct bnode *tbnodep;
75 if (tbnodep->lastErrorName) {
76 free(tbnodep->lastErrorName);
77 tbnodep->lastErrorName = NULL;
80 tbnodep->lastErrorName = (char *)malloc(strlen(ap->coreName) + 1);
81 strcpy(tbnodep->lastErrorName, ap->coreName);
85 /* utility for use by BOP_HASCORE functions to determine where a core file might
89 bnode_CoreName(register struct bnode *abnode, char *acoreName, char *abuffer)
91 strcpy(abuffer, AFSDIR_SERVER_CORELOG_FILEPATH);
93 strcat(abuffer, acoreName);
96 strcat(abuffer, abnode->name);
100 /* save core file, if any */
102 SaveCore(register struct bnode *abnode, register struct bnode_proc
107 register afs_int32 code;
108 #ifdef BOZO_SAVE_CORES
109 struct timeval Start;
110 struct tm *TimeFields;
114 code = stat(AFSDIR_SERVER_CORELOG_FILEPATH, &tstat);
118 bnode_CoreName(abnode, aproc->coreName, tbuffer);
119 #ifdef BOZO_SAVE_CORES
120 TM_GetTimeOfDay(&Start, 0);
121 TimeFields = localtime(&Start.tv_sec);
122 sprintf(FileName, "%s.%d%02d%02d%02d%02d%02d", tbuffer,
123 TimeFields->tm_year, TimeFields->tm_mon + 1, TimeFields->tm_mday,
124 TimeFields->tm_hour, TimeFields->tm_min, TimeFields->tm_sec);
125 strcpy(tbuffer, FileName);
127 code = renamefile(AFSDIR_SERVER_CORELOG_FILEPATH, tbuffer);
131 bnode_GetString(register struct bnode *abnode, register char *abuffer,
132 register afs_int32 alen)
134 return BOP_GETSTRING(abnode, abuffer, alen);
138 bnode_GetParm(register struct bnode *abnode, register afs_int32 aindex,
139 register char *abuffer, afs_int32 alen)
141 return BOP_GETPARM(abnode, aindex, abuffer, alen);
145 bnode_GetStat(register struct bnode *abnode, register afs_int32 * astatus)
147 return BOP_GETSTAT(abnode, astatus);
151 bnode_RestartP(register struct bnode *abnode)
153 return BOP_RESTARTP(abnode);
157 bnode_Check(register struct bnode *abnode)
159 if (abnode->flags & BNODE_WAIT) {
160 abnode->flags &= ~BNODE_WAIT;
161 LWP_NoYieldSignal(abnode);
166 /* tell if an instance has a core file */
168 bnode_HasCore(register struct bnode *abnode)
170 return BOP_HASCORE(abnode);
173 /* wait for all bnodes to stabilize */
177 register struct bnode *tb;
178 register afs_int32 code;
182 for (tb = allBnodes; tb; tb = tb->next) {
184 code = BOP_GETSTAT(tb, &stat);
189 if (stat != tb->goal) {
190 tb->flags |= BNODE_WAIT;
200 /* wait until bnode status is correct */
202 bnode_WaitStatus(register struct bnode *abnode, int astatus)
204 register afs_int32 code;
210 code = BOP_GETSTAT(abnode, &stat);
214 /* otherwise, check if we're done */
215 if (stat == astatus) {
216 bnode_Release(abnode);
219 if (astatus != abnode->goal) {
220 bnode_Release(abnode);
221 return -1; /* no longer our goal, don't keep waiting */
223 /* otherwise, block */
224 abnode->flags |= BNODE_WAIT;
225 LWP_WaitProcess(abnode);
230 bnode_SetStat(register struct bnode *abnode, register int agoal)
232 abnode->goal = agoal;
234 BOP_SETSTAT(abnode, agoal);
235 abnode->flags &= ~BNODE_ERRORSTOP;
240 bnode_SetGoal(register struct bnode *abnode, register int agoal)
242 abnode->goal = agoal;
248 bnode_SetFileGoal(register struct bnode *abnode, register int agoal)
250 if (abnode->fileGoal == agoal)
251 return 0; /* already done */
252 abnode->fileGoal = agoal;
257 /* apply a function to all bnodes in the system */
259 bnode_ApplyInstance(int (*aproc) (), char *arock)
261 register struct bnode *tb, *nb;
262 register afs_int32 code;
264 for (tb = allBnodes; tb; tb = nb) {
266 code = (*aproc) (tb, arock);
274 bnode_FindInstance(register char *aname)
276 register struct bnode *tb;
278 for (tb = allBnodes; tb; tb = tb->next) {
279 if (!strcmp(tb->name, aname))
285 static struct bnode_type *
286 FindType(register char *aname)
288 register struct bnode_type *tt;
290 for (tt = allTypes; tt; tt = tt->next) {
291 if (!strcmp(tt->name, aname))
294 return (struct bnode_type *)0;
298 bnode_Register(char *atype, struct bnode_ops *aprocs, int anparms)
300 register struct bnode_type *tt;
302 for (tt = allTypes; tt; tt = tt->next) {
303 if (!strcmp(tt->name, atype))
307 tt = (struct bnode_type *)malloc(sizeof(struct bnode_type));
308 memset(tt, 0, sizeof(struct bnode_type));
318 bnode_Create(char *atype, char *ainstance, struct bnode ** abp, char *ap1,
319 char *ap2, char *ap3, char *ap4, char *ap5, char *notifier,
322 struct bnode_type *type;
324 char *notifierpath = NULL;
327 if (bnode_FindInstance(ainstance))
329 type = FindType(atype);
333 if (notifier && strcmp(notifier, NONOTIFIER)) {
334 /* construct local path from canonical (wire-format) path */
335 if (ConstructLocalBinPath(notifier, &notifierpath)) {
336 bozo_Log("BNODE-Create: Notifier program path invalid '%s'\n",
341 if (stat(notifierpath, &tstat)) {
342 bozo_Log("BNODE-Create: Notifier program '%s' not found\n",
348 tb = (*type->ops->create) (ainstance, ap1, ap2, ap3, ap4, ap5);
353 tb->notifier = notifierpath;
357 /* The fs_create above calls bnode_InitBnode() which always sets the
358 ** fileGoal to BSTAT_NORMAL .... overwrite it with whatever is passed into
359 ** this function as a parameter... */
360 tb->fileGoal = fileGoal;
362 bnode_SetStat(tb, tb->goal); /* nudge it once */
368 bnode_DeleteName(char *ainstance)
370 register struct bnode *tb;
372 tb = bnode_FindInstance(ainstance);
376 return bnode_Delete(tb);
380 bnode_Hold(register struct bnode *abnode)
387 bnode_Release(register struct bnode *abnode)
390 if (abnode->refCount == 0 && abnode->flags & BNODE_DELETE) {
391 abnode->flags &= ~BNODE_DELETE; /* we're going for it */
392 bnode_Delete(abnode);
398 bnode_Delete(register struct bnode *abnode)
400 register afs_int32 code;
401 register struct bnode **lb, *ub;
404 if (abnode->refCount != 0) {
405 abnode->flags |= BNODE_DELETE;
409 /* make sure the bnode is idle before zapping */
411 code = BOP_GETSTAT(abnode, &temp);
412 bnode_Release(abnode);
415 if (temp != BSTAT_SHUTDOWN)
418 /* all clear to zap */
419 for (lb = &allBnodes, ub = *lb; ub; lb = &ub->next, ub = *lb) {
421 /* unthread it from the list */
426 free(abnode->name); /* do this first, since bnode fields may be bad after BOP_DELETE */
427 code = BOP_DELETE(abnode); /* don't play games like holding over this one */
432 /* function to tell if there's a timeout coming up */
434 bnode_PendingTimeout(register struct bnode *abnode)
436 return (abnode->flags & BNODE_NEEDTIMEOUT);
439 /* function called to set / clear periodic bnode wakeup times */
441 bnode_SetTimeout(register struct bnode *abnode, afs_int32 atimeout)
444 abnode->nextTimeout = FT_ApproxTime() + atimeout;
445 abnode->flags |= BNODE_NEEDTIMEOUT;
446 abnode->period = atimeout;
447 IOMGR_Cancel(bproc_pid);
449 abnode->flags &= ~BNODE_NEEDTIMEOUT;
454 /* used by new bnode creation code to format bnode header */
456 bnode_InitBnode(register struct bnode *abnode, struct bnode_ops *abnodeops,
459 struct bnode **lb, *nb;
461 /* format the bnode properly */
462 memset(abnode, 0, sizeof(struct bnode));
463 abnode->ops = abnodeops;
464 abnode->name = (char *)malloc(strlen(aname) + 1);
467 strcpy(abnode->name, aname);
468 abnode->flags = BNODE_ACTIVE;
469 abnode->fileGoal = BSTAT_NORMAL;
470 abnode->goal = BSTAT_SHUTDOWN;
472 /* put the bnode at the end of the list so we write bnode file in same order */
473 for (lb = &allBnodes, nb = *lb; nb; lb = &nb->next, nb = *lb);
480 DeleteProc(register struct bnode_proc *abproc)
482 register struct bnode_proc **pb, *tb;
483 struct bnode_proc *nb;
485 for (pb = &allProcs, tb = *pb; tb; pb = &tb->next, tb = nb) {
496 /* bnode lwp executes this code repeatedly */
500 register afs_int32 code;
501 register struct bnode *tb;
502 register afs_int32 temp;
503 register struct bnode_proc *tp;
505 int options; /* must not be register */
511 /* first figure out how long to sleep for */
512 temp = 0x7fffffff; /* afs_int32 time; maxint doesn't work in select */
514 for (tb = allBnodes; tb; tb = tb->next) {
515 if (tb->flags & BNODE_NEEDTIMEOUT) {
516 if (tb->nextTimeout < temp) {
518 temp = tb->nextTimeout;
522 /* now temp has the time at which we should wake up next */
526 temp -= FT_ApproxTime(); /* how many seconds until next event */
532 code = IOMGR_Select(0, 0, 0, 0, &tv);
534 code = 0; /* fake timeout code */
536 /* figure out why we woke up; child exit or timeouts */
537 FT_GetTimeOfDay(&tv, 0); /* must do the real gettimeofday once in a while */
540 /* check all bnodes to see which ones need timeout events */
541 for (tb = allBnodes; tb; tb = nb) {
542 if ((tb->flags & BNODE_NEEDTIMEOUT) && temp > tb->nextTimeout) {
546 if (tb->flags & BNODE_NEEDTIMEOUT) { /* check again, BOP_TIMEOUT could change */
547 tb->nextTimeout = FT_ApproxTime() + tb->period;
550 bnode_Release(tb); /* delete may occur here */
556 /* signalled, probably by incoming signal */
559 bnode_waiting = options | 0x800000;
560 code = waitpid((pid_t) - 1, &status, options);
562 if (code == 0 || code == -1)
563 break; /* all done */
564 /* otherwise code has a process id, which we now search for */
565 for (tp = allProcs; tp; tp = tp->next)
573 /* count restarts in last 30 seconds */
574 if (temp > tb->rsTime + 30) {
575 /* we've been counting for more than 30 seconds */
580 if (WIFSIGNALED(status) == 0) {
581 /* exited, not signalled */
582 tp->lastExit = WEXITSTATUS(status);
585 tb->errorCode = tp->lastExit;
586 tb->lastErrorExit = FT_ApproxTime();
587 RememberProcName(tp);
591 bozo_Log("%s:%s exited with code %d\n", tb->name,
592 tp->coreName, tp->lastExit);
594 bozo_Log("%s exited with code %d\n", tb->name,
597 /* Signal occurred, perhaps spurious due to shutdown request.
598 * If due to a shutdown request, don't overwrite last error
601 tp->lastSignal = WTERMSIG(status);
603 if (tp->lastSignal != SIGQUIT
604 && tp->lastSignal != SIGTERM
605 && tp->lastSignal != SIGKILL) {
606 tb->errorSignal = tp->lastSignal;
607 tb->lastErrorExit = FT_ApproxTime();
608 RememberProcName(tp);
611 bozo_Log("%s:%s exited on signal %d%s\n",
612 tb->name, tp->coreName, tp->lastSignal,
613 WCOREDUMP(status) ? " (core dumped)" :
616 bozo_Log("%s exited on signal %d%s\n", tb->name,
618 WCOREDUMP(status) ? " (core dumped)" :
622 tb->lastAnyExit = FT_ApproxTime();
625 bozo_Log("BNODE: Notifier %s will be called\n",
629 BOP_PROCEXIT(tb, tp);
632 if (tb->rsCount++ > 10) {
633 /* more than 10 restarts within the 30-second counting window */
634 tb->flags |= BNODE_ERRORSTOP;
635 bnode_SetGoal(tb, BSTAT_SHUTDOWN);
637 ("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n",
640 bnode_Release(tb); /* bnode delete can happen here */
643 bnode_stats.weirdPids++;
650 SendNotifierData(register int fd, register struct bnode_proc *tp)
652 register struct bnode *tb = tp->bnode;
653 char buffer[1000], *bufp = buffer, *buf1;
657 * First send out the bnode_proc struct
659 (void)sprintf(bufp, "BEGIN bnode_proc\n");
660 bufp += strlen(bufp);
661 (void)sprintf(bufp, "comLine: %s\n", tp->comLine);
662 bufp += strlen(bufp);
663 if (!(buf1 = tp->coreName))
665 (void)sprintf(bufp, "coreName: %s\n", buf1);
666 bufp += strlen(bufp);
667 (void)sprintf(bufp, "pid: %ld\n", tp->pid);
668 bufp += strlen(bufp);
669 (void)sprintf(bufp, "lastExit: %ld\n", tp->lastExit);
670 bufp += strlen(bufp);
672 (void)sprintf(bufp, "lastSignal: %ld\n", tp->lastSignal);
673 bufp += strlen(bufp);
675 (void)sprintf(bufp, "flags: %ld\n", tp->flags);
676 bufp += strlen(bufp);
677 (void)sprintf(bufp, "END bnode_proc\n");
678 bufp += strlen(bufp);
679 len = (int)(bufp - buffer);
680 if (write(fd, buffer, len) < 0) {
685 * Now send out the bnode struct
688 (void)sprintf(bufp, "BEGIN bnode\n");
689 bufp += strlen(bufp);
690 (void)sprintf(bufp, "name: %s\n", tb->name);
691 bufp += strlen(bufp);
692 (void)sprintf(bufp, "rsTime: %ld\n", tb->rsTime);
693 bufp += strlen(bufp);
694 (void)sprintf(bufp, "rsCount: %ld\n", tb->rsCount);
695 bufp += strlen(bufp);
696 (void)sprintf(bufp, "procStartTime: %ld\n", tb->procStartTime);
697 bufp += strlen(bufp);
698 (void)sprintf(bufp, "procStarts: %ld\n", tb->procStarts);
699 bufp += strlen(bufp);
700 (void)sprintf(bufp, "lastAnyExit: %ld\n", tb->lastAnyExit);
701 bufp += strlen(bufp);
702 (void)sprintf(bufp, "lastErrorExit: %ld\n", tb->lastErrorExit);
703 bufp += strlen(bufp);
704 (void)sprintf(bufp, "errorCode: %ld\n", tb->errorCode);
705 bufp += strlen(bufp);
706 (void)sprintf(bufp, "errorSignal: %ld\n", tb->errorSignal);
707 bufp += strlen(bufp);
709 (void) sprintf(bufp, "lastErrorName: %s\n", tb->lastErrorName);
710 bufp += strlen(bufp);
712 (void)sprintf(bufp, "goal: %d\n", tb->goal);
713 bufp += strlen(bufp);
714 (void)sprintf(bufp, "END bnode\n");
715 bufp += strlen(bufp);
716 len = (int)bufp - (int)buffer;
717 if (write(fd, buffer, len) < 0) {
723 hdl_notifier(struct bnode_proc *tp)
725 #ifndef AFS_NT40_ENV /* NT notifier callout not yet implemented */
726 int code, pid, status;
729 if (stat(tp->bnode->notifier, &tstat)) {
730 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
731 tp->bnode->notifier);
734 if ((pid = fork()) == 0) {
736 struct bnode *tb = tp->bnode;
739 #if defined(AFS_HPUX_ENV) || defined(AFS_SUN5_ENV) || defined(AFS_SGI51_ENV)
742 #ifdef AFS_LINUX20_ENV
748 fout = popen(tb->notifier, "w");
750 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
752 perror(tb->notifier);
755 code = SendNotifierData(fileno(fout), tp);
758 } else if (pid < 0) {
759 bozo_Log("Failed to fork creating process to handle notifier '%s'\n",
760 tp->bnode->notifier);
763 #endif /* AFS_NT40_ENV */
767 /* Called by IOMGR at low priority on IOMGR's stack shortly after a SIGCHLD
768 * occurs. Wakes up bproc to redo things */
770 bnode_SoftInt(int asignal)
772 IOMGR_Cancel(bproc_pid);
776 /* Called at signal interrupt level; queues function to be called
777 * when IOMGR runs again.
780 bnode_Int(int asignal)
782 extern void bozo_ShutdownAndExit();
784 if (asignal == SIGQUIT) {
785 IOMGR_SoftSig(bozo_ShutdownAndExit, (char *)asignal);
787 IOMGR_SoftSig(bnode_SoftInt, (char *)asignal);
792 /* initialize the whole system */
797 register afs_int32 code;
798 struct sigaction newaction;
804 memset(&bnode_stats, 0, sizeof(bnode_stats));
805 LWP_InitializeProcessSupport(1, &junk); /* just in case */
807 code = LWP_CreateProcess(bproc, BNODE_LWP_STACKSIZE,
808 /* priority */ 1, (void *) /* parm */ 0,
809 "bnode-manager", &bproc_pid);
812 memset((char *)&newaction, 0, sizeof(newaction));
813 newaction.sa_handler = bnode_Int;
814 code = sigaction(SIGCHLD, &newaction, NULL);
817 code = sigaction(SIGQUIT, &newaction, NULL);
823 /* free token list returned by parseLine */
825 bnode_FreeTokens(register struct bnode_token *alist)
827 register struct bnode_token *nlist;
828 for (; alist; alist = nlist) {
839 if (x == 0 || x == ' ' || x == '\t' || x == '\n')
846 bnode_ParseLine(char *aline, struct bnode_token **alist)
851 struct bnode_token *first, *last;
852 register struct bnode_token *ttok;
855 inToken = 0; /* not copying token chars at start */
856 first = (struct bnode_token *)0;
857 last = (struct bnode_token *)0;
860 if (tc == 0 || space(tc)) { /* terminating null gets us in here, too */
862 inToken = 0; /* end of this token */
865 (struct bnode_token *)malloc(sizeof(struct bnode_token));
866 ttok->next = (struct bnode_token *)0;
867 ttok->key = (char *)malloc(strlen(tbuffer) + 1);
868 strcpy(ttok->key, tbuffer);
878 /* an alpha character */
883 if (tptr - tbuffer >= sizeof(tbuffer))
884 return -1; /* token too long */
888 /* last token flushed 'cause space(0) --> true */
890 last->next = (struct bnode_token *)0;
899 bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName,
900 struct bnode_proc **aproc)
902 struct bnode_token *tlist, *tt;
904 struct bnode_proc *tp;
906 char *argv[MAXVARGS];
909 code = bnode_ParseLine(aexecString, &tlist); /* try parsing first */
912 tp = (struct bnode_proc *)malloc(sizeof(struct bnode_proc));
913 memset(tp, 0, sizeof(struct bnode_proc));
918 tp->comLine = aexecString;
919 tp->coreName = coreName; /* may be null */
920 abnode->procStartTime = FT_ApproxTime();
921 abnode->procStarts++;
923 /* convert linked list of tokens into argv structure */
924 for (tt = tlist, i = 0; i < (MAXVARGS - 1) && tt; tt = tt->next, i++) {
927 argv[i] = NULL; /* null-terminated */
929 cpid = spawnprocve(argv[0], argv, environ, -1);
930 osi_audit(BOSSpawnProcEvent, 0, AUD_STR, aexecString, AUD_END);
932 if (cpid == (pid_t) - 1) {
933 bozo_Log("Failed to spawn process for bnode '%s'\n", abnode->name);
934 bnode_FreeTokens(tlist);
939 bnode_FreeTokens(tlist);
941 tp->flags = BPROC_STARTED;
942 tp->flags &= ~BPROC_EXITED;
948 bnode_StopProc(register struct bnode_proc *aproc, int asignal)
951 if (!(aproc->flags & BPROC_STARTED) || (aproc->flags & BPROC_EXITED))
954 osi_audit(BOSStopProcEvent, 0, AUD_STR, (aproc ? aproc->comLine : NULL),
957 code = kill(aproc->pid, asignal);
958 bnode_Check(aproc->bnode);
963 bnode_Deactivate(register struct bnode *abnode)
965 register struct bnode **pb, *tb;
967 if (!(abnode->flags & BNODE_ACTIVE))
969 for (pb = &allBnodes, tb = *pb; tb; tb = nb) {
973 tb->flags &= ~BNODE_ACTIVE;