2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
13 #include <afs/procmgmt.h>
20 #include <afs/audit.h>
21 #include <afs/afsutil.h>
22 #include <afs/fileutil.h>
23 #include <opr/queue.h>
26 #include "bnode_internal.h"
27 #include "bosprototypes.h"
/*
 * File-scope macros, state and external references.
 *
 * - WCOREDUMP(x) tests the 0200 (octal) bit of a wait status; the manager
 *   loop uses it to append " (core dumped)" when logging signal exits.
 * - BNODE_LWP_STACKSIZE is the stack size handed to LWP_CreateProcess() for
 *   the "bnode-manager" LWP.
 * - BNODE_ERROR_COUNT_MAX is the ceiling compared against errorStopCount in
 *   the restart-failure path.
 * - bnode_waiting is set to (options | 0x800000) just before waitpid().
 * - bproc_pid is filled in by LWP_CreateProcess() and is the target of the
 *   IOMGR_Cancel() wakeups.
 *
 * NOTE(review): the body of struct bnode_stats is not visible in this
 * listing; the only member referenced in the visible code is weirdPids.
 * Any #ifndef guard around WCOREDUMP is likewise not visible - confirm.
 */
30 #define WCOREDUMP(x) ((x) & 0200)
33 #define BNODE_LWP_STACKSIZE (16 * 1024)
34 #define BNODE_ERROR_COUNT_MAX 16 /* maximum number of retries */
36 int bnode_waiting = 0;
37 static PROCESS bproc_pid; /* pid of waker-upper */
38 static struct opr_queue allBnodes; /**< List of all bnodes */
39 static struct opr_queue allProcs; /**< List of all processes for which we're waiting */
40 static struct opr_queue allTypes; /**< List of all registered type handlers */
42 static struct bnode_stats {
46 extern const char *DoCore;
47 extern const char *DoPidFiles;
49 extern char **environ; /* env structure */
52 int hdl_notifier(struct bnode_proc *tp);
/*
 * Replace the bnode's lastErrorName with a copy of this process's core name.
 * Any previously stored string is freed and the field cleared before the
 * strdup() copy of ap->coreName is installed.
 *
 * NOTE(review): the assignment of tbnodep and any guard around the strdup()
 * are not visible in this listing. coreName is described elsewhere in this
 * file as "may be null" - confirm strdup() is never reached with NULL.
 */
54 /* Remember the name of the process, if any, that failed last */
56 RememberProcName(struct bnode_proc *ap)
58 struct bnode *tbnodep;
61 if (tbnodep->lastErrorName) {
62 free(tbnodep->lastErrorName);
63 tbnodep->lastErrorName = NULL;
66 tbnodep->lastErrorName = strdup(ap->coreName);
69 /* utility for use by BOP_HASCORE functions to determine where a core file might be stored */
/*
 * Build a core-file path for a bnode into the caller's abuffer.
 *
 * The visible lines start the path either from DoCore followed by
 * AFSDIR_CORE_FILE or from AFSDIR_SERVER_CORELOG_FILEPATH, and then append
 * acoreName and abnode->name with strcat().
 *
 * NOTE(review): the conditions selecting each piece, any separators, and the
 * return value are not visible in this listing. All copies are unbounded
 * strcpy()/strcat(); the required size of abuffer is not visible here -
 * verify against callers.
 */
73 bnode_CoreName(struct bnode *abnode, char *acoreName, char *abuffer)
76 strcpy(abuffer, DoCore);
78 strcat(abuffer, AFSDIR_CORE_FILE);
80 strcpy(abuffer, AFSDIR_SERVER_CORELOG_FILEPATH);
82 strcat(abuffer, acoreName);
85 strcat(abuffer, abnode->name);
/*
 * Move a core file left behind by aproc to the name chosen by
 * bnode_CoreName().
 *
 * Visible flow: a candidate source path is built in tbuffer (DoCore "/"
 * AFSDIR_CORE_FILE) or AFSDIR_SERVER_CORELOG_FILEPATH is stat()ed; a
 * directory (AFSDIR_LOGS_DIR as initial value of coredir) is scanned for an
 * entry named "core.<pid>" whose pid equals aproc->pid and its full path is
 * built with asprintf(); otherwise corefile is a strdup() of tbuffer. The
 * destination is then computed with bnode_CoreName(), gets a
 * YYYYMMDDhhmmss suffix when BOZO_SAVE_CORES is defined, and the file is
 * moved with rk_rename().
 *
 * NOTE(review): branch conditions, cleanup (closedir/free) and return paths
 * are not visible in this listing. In the visible lines the asprintf()
 * return value is not checked; only corefile == NULL is tested - confirm
 * corefile is reliably NULL on failure on every target platform.
 */
89 /* save core file, if any */
91 SaveCore(struct bnode *abnode, struct bnode_proc
97 char *corefile = NULL;
98 #ifdef BOZO_SAVE_CORES
100 struct tm *TimeFields;
104 /* Linux always appends the PID to core dumps from threaded processes, so
105 * we have to scan the directory to find core files under another name. */
107 strcpy(tbuffer, DoCore);
108 strcat(tbuffer, "/");
109 strcat(tbuffer, AFSDIR_CORE_FILE);
111 code = stat(AFSDIR_SERVER_CORELOG_FILEPATH, &tstat);
116 const char *coredir = AFSDIR_LOGS_DIR;
121 logdir = opendir(coredir);
124 while ((file = readdir(logdir)) != NULL) {
125 if (strncmp(file->d_name, "core.", 5) != 0)
127 pid = atol(file->d_name + 5);
128 if (pid == aproc->pid) {
129 asprintf(&corefile, "%s/%s", coredir, file->d_name);
130 if (corefile == NULL) {
140 corefile = strdup(tbuffer);
145 bnode_CoreName(abnode, aproc->coreName, tbuffer);
146 #ifdef BOZO_SAVE_CORES
147 FT_GetTimeOfDay(&Start, 0);
148 TimeFields = localtime(&Start.tv_sec);
149 sprintf(FileName, "%s.%d%02d%02d%02d%02d%02d", tbuffer,
150 TimeFields->tm_year + 1900, TimeFields->tm_mon + 1, TimeFields->tm_mday,
151 TimeFields->tm_hour, TimeFields->tm_min, TimeFields->tm_sec);
152 strcpy(tbuffer, FileName);
154 rk_rename(corefile, tbuffer);
/*
 * Thin wrapper: forwards to the bnode's BOP_GETSTRING operation with the
 * caller's buffer and length (alen) and returns that operation's result.
 * The remainder of the parameter list is cut off in this listing.
 */
159 bnode_GetString(struct bnode *abnode, char *abuffer,
162 return BOP_GETSTRING(abnode, abuffer, alen);
/*
 * Thin wrapper: forwards to the bnode's BOP_GETPARM operation for parameter
 * index aindex, writing into abuffer (length alen), and returns that
 * operation's result.
 */
166 bnode_GetParm(struct bnode *abnode, afs_int32 aindex,
167 char *abuffer, afs_int32 alen)
169 return BOP_GETPARM(abnode, aindex, abuffer, alen);
/*
 * Thin wrapper: asks the bnode for its current status via BOP_GETSTAT,
 * storing it through astatus, and returns that operation's result.
 */
173 bnode_GetStat(struct bnode *abnode, afs_int32 * astatus)
175 return BOP_GETSTAT(abnode, astatus);
/*
 * Thin wrapper: returns the result of the bnode's BOP_RESTARTP operation.
 */
179 bnode_RestartP(struct bnode *abnode)
181 return BOP_RESTARTP(abnode);
/*
 * Wake any waiter parked on this bnode.
 *
 * If BNODE_WAIT is set in abnode->flags, the flag is cleared and
 * LWP_NoYieldSignal(abnode) is issued. This is the counterpart of
 * bnode_WaitStatus(), which sets BNODE_WAIT and then calls
 * LWP_WaitProcess(abnode).
 */
185 bnode_Check(struct bnode *abnode)
187 if (abnode->flags & BNODE_WAIT) {
188 abnode->flags &= ~BNODE_WAIT;
189 LWP_NoYieldSignal(abnode);
/*
 * Thin wrapper: returns the result of the bnode's BOP_HASCORE operation.
 */
194 /* tell if an instance has a core file */
196 bnode_HasCore(struct bnode *abnode)
198 return BOP_HASCORE(abnode);
/*
 * Walk allBnodes, query each bnode's status with BOP_GETSTAT, and set
 * BNODE_WAIT on any bnode whose status differs from its goal.
 *
 * NOTE(review): the function signature, the blocking call that follows the
 * flag being set, and the error/return handling are not visible in this
 * listing (the name is presumably bnode_WaitAll - confirm).
 */
201 /* wait for all bnodes to stabilize */
205 struct opr_queue *cursor;
210 for (opr_queue_Scan(&allBnodes, cursor)) {
211 struct bnode *tb = opr_queue_Entry(cursor, struct bnode, q);
214 code = BOP_GETSTAT(tb, &stat);
219 if (stat != tb->goal) {
220 tb->flags |= BNODE_WAIT;
/*
 * Block until the bnode reports status astatus.
 *
 * Visible logic: the status is fetched with BOP_GETSTAT; when it equals
 * astatus the bnode is released (success path); when astatus is no longer
 * the bnode's goal the bnode is released and -1 is returned; otherwise
 * BNODE_WAIT is set and the caller sleeps in LWP_WaitProcess(abnode) until
 * bnode_Check() signals it.
 *
 * NOTE(review): the enclosing loop, the matching bnode_Hold(), and the
 * handling of a BOP_GETSTAT error are not visible in this listing.
 */
230 /* wait until bnode status is correct */
232 bnode_WaitStatus(struct bnode *abnode, int astatus)
240 code = BOP_GETSTAT(abnode, &stat);
244 /* otherwise, check if we're done */
245 if (stat == astatus) {
246 bnode_Release(abnode);
249 if (astatus != abnode->goal) {
250 bnode_Release(abnode);
251 return -1; /* no longer our goal, don't keep waiting */
253 /* otherwise, block */
254 abnode->flags |= BNODE_WAIT;
255 LWP_WaitProcess(abnode);
/*
 * Reset the restart-failure bookkeeping: zeroes errorStopCount and
 * errorStopDelay, the two fields the manager loop advances when a bnode
 * keeps exiting.
 */
260 bnode_ResetErrorCount(struct bnode *abnode)
262 abnode->errorStopCount = 0;
263 abnode->errorStopDelay = 0;
/*
 * Set the bnode's goal to agoal and push it to the type handler.
 *
 * Stores agoal in abnode->goal, invokes BOP_SETSTAT(abnode, agoal), and then
 * clears BNODE_ERRORSTOP from the flags.
 *
 * NOTE(review): any hold/release bracketing and the return value are not
 * visible in this listing.
 */
268 bnode_SetStat(struct bnode *abnode, int agoal)
270 abnode->goal = agoal;
272 BOP_SETSTAT(abnode, agoal);
273 abnode->flags &= ~BNODE_ERRORSTOP;
/*
 * Record agoal as the bnode's goal without calling BOP_SETSTAT (contrast
 * with bnode_SetStat()). The manager loop uses this to force a goal of
 * BSTAT_SHUTDOWN on a bnode that keeps failing.
 *
 * NOTE(review): statements after the assignment are not visible here.
 */
278 bnode_SetGoal(struct bnode *abnode, int agoal)
280 abnode->goal = agoal;
/*
 * Update the bnode's fileGoal.
 *
 * Returns 0 immediately when fileGoal already equals agoal; otherwise the
 * new value is stored.
 *
 * NOTE(review): whatever follows the assignment (e.g. persisting the
 * change) and the final return are not visible in this listing.
 */
286 bnode_SetFileGoal(struct bnode *abnode, int agoal)
288 if (abnode->fileGoal == agoal)
289 return 0; /* already done */
290 abnode->fileGoal = agoal;
/*
 * Invoke aproc(tb, arock) on every bnode in allBnodes.
 *
 * The traversal uses opr_queue_ScanSafe, which keeps a saved next pointer
 * (store) while the current entry is being visited.
 *
 * NOTE(review): how a nonzero code from the callback is handled, and the
 * function's return value, are not visible in this listing.
 */
295 /* apply a function to all bnodes in the system */
297 bnode_ApplyInstance(int (*aproc) (struct bnode *tb, void *), void *arock)
299 struct opr_queue *cursor, *store;
302 for (opr_queue_ScanSafe(&allBnodes, cursor, store)) {
303 struct bnode *tb = opr_queue_Entry(cursor, struct bnode, q);
304 code = (*aproc) (tb, arock);
/*
 * Look up a bnode by instance name: scans allBnodes comparing each
 * entry's name to aname with strcmp().
 *
 * NOTE(review): the return statements are not visible in this listing.
 * Callers in this file treat a nonzero result as "instance exists" and pass
 * the result on as a struct bnode pointer.
 */
312 bnode_FindInstance(char *aname)
314 struct opr_queue *cursor;
316 for (opr_queue_Scan(&allBnodes, cursor)) {
317 struct bnode *tb = opr_queue_Entry(cursor, struct bnode, q);
319 if (!strcmp(tb->name, aname))
/*
 * Look up a registered bnode type by name: scans allTypes comparing each
 * entry's name to aname with strcmp().
 *
 * NOTE(review): the return statements are not visible in this listing; the
 * declared return type is struct bnode_type *.
 */
325 static struct bnode_type *
326 FindType(char *aname)
328 struct opr_queue *cursor;
330 for (opr_queue_Scan(&allTypes, cursor)) {
331 struct bnode_type *tt = opr_queue_Entry(cursor, struct bnode_type, q);
333 if (!strcmp(tt->name, aname))
/*
 * Register a bnode type handler under the name atype.
 *
 * The visible lines search allTypes for an entry with the same name, and
 * allocate a zeroed struct bnode_type, initialise its queue link and
 * prepend it to allTypes.
 *
 * NOTE(review): the condition that decides between reusing an existing
 * entry and allocating, and the assignments of name/ops/anparms, are not
 * visible in this listing. In the visible lines the calloc() result is
 * passed straight to opr_queue_Init() with no NULL check - confirm and
 * handle allocation failure.
 */
340 bnode_Register(char *atype, struct bnode_ops *aprocs, int anparms)
342 struct opr_queue *cursor;
343 struct bnode_type *tt = NULL;
345 for (opr_queue_Scan(&allTypes, cursor), tt = NULL) {
346 tt = opr_queue_Entry(cursor, struct bnode_type, q);
347 if (!strcmp(tt->name, atype))
351 tt = calloc(1, sizeof(struct bnode_type));
352 opr_queue_Init(&tt->q);
353 opr_queue_Prepend(&allTypes, &tt->q);
/*
 * Create a bnode of type atype named ainstance and hand it back via abp.
 *
 * Visible flow:
 *  - an existing instance with the same name (bnode_FindInstance) and an
 *    unknown type (FindType) are both checked for before anything is built;
 *  - when a notifier other than NONOTIFIER is given, its canonical
 *    (wire-format) path is converted to a local path with
 *    ConstructLocalBinPath(), which stores the result through
 *    &notifierpath, and the resulting path must pass stat();
 *    both failures are logged;
 *  - the type's create operation is called with ap1..ap5;
 *  - the new bnode gets notifierpath as its notifier and the caller's
 *    fileGoal (overriding the BSTAT_NORMAL default set by
 *    bnode_InitBnode()), and is nudged once with bnode_SetStat();
 *  - rewritefile != 0 triggers a further step that is not visible here.
 *
 * NOTE(review): the return codes of the failure paths and the cleanup of
 * notifierpath on those paths are not visible in this listing.
 */
361 bnode_Create(char *atype, char *ainstance, struct bnode ** abp, char *ap1,
362 char *ap2, char *ap3, char *ap4, char *ap5, char *notifier,
363 int fileGoal, int rewritefile)
365 struct bnode_type *type;
367 char *notifierpath = NULL;
370 if (bnode_FindInstance(ainstance))
372 type = FindType(atype);
376 if (notifier && strcmp(notifier, NONOTIFIER)) {
377 /* construct local path from canonical (wire-format) path */
378 if (ConstructLocalBinPath(notifier, &notifierpath)) {
379 bozo_Log("BNODE-Create: Notifier program path invalid '%s'\n",
384 if (stat(notifierpath, &tstat)) {
385 bozo_Log("BNODE-Create: Notifier program '%s' not found\n",
391 tb = (*type->ops->create) (ainstance, ap1, ap2, ap3, ap4, ap5);
396 tb->notifier = notifierpath;
400 /* The fs_create above calls bnode_InitBnode() which always sets the
401 ** fileGoal to BSTAT_NORMAL .... overwrite it with whatever is passed into
402 ** this function as a parameter... */
403 tb->fileGoal = fileGoal;
405 bnode_SetStat(tb, tb->goal); /* nudge it once */
407 if (rewritefile != 0)
/*
 * Delete a bnode by instance name: looks it up with bnode_FindInstance()
 * and returns the result of bnode_Delete() on it.
 *
 * NOTE(review): the handling of a failed lookup is not visible in this
 * listing.
 */
414 bnode_DeleteName(char *ainstance)
418 tb = bnode_FindInstance(ainstance);
422 return bnode_Delete(tb);
/*
 * NOTE(review): only the signature is visible in this listing. By name this
 * is the counterpart of bnode_Release(), which tests abnode->refCount -
 * presumably this takes a reference; confirm against the full source.
 */
426 bnode_Hold(struct bnode *abnode)
/*
 * Drop a reference on the bnode and carry out a deferred delete.
 *
 * When refCount is 0 and BNODE_DELETE is set (bnode_Delete() sets it when
 * asked to delete a still-referenced bnode), the flag is cleared and
 * bnode_Delete() is called to finish the job.
 *
 * NOTE(review): the statement that changes refCount is not visible in this
 * listing.
 */
433 bnode_Release(struct bnode *abnode)
436 if (abnode->refCount == 0 && abnode->flags & BNODE_DELETE) {
437 abnode->flags &= ~BNODE_DELETE; /* we're going for it */
438 bnode_Delete(abnode);
/*
 * Destroy a bnode, or schedule its destruction.
 *
 * Visible logic: while refCount is nonzero the bnode is only marked
 * BNODE_DELETE (bnode_Release() completes the delete later). Otherwise the
 * status is fetched with BOP_GETSTAT and compared with BSTAT_SHUTDOWN
 * before the bnode is unlinked from its queue, its name freed, and
 * BOP_DELETE invoked.
 *
 * NOTE(review): the return codes, the action taken when the status is not
 * BSTAT_SHUTDOWN, and the hold matching the visible bnode_Release() are not
 * visible in this listing.
 */
444 bnode_Delete(struct bnode *abnode)
449 if (abnode->refCount != 0) {
450 abnode->flags |= BNODE_DELETE;
454 /* make sure the bnode is idle before zapping */
456 code = BOP_GETSTAT(abnode, &temp);
457 bnode_Release(abnode);
460 if (temp != BSTAT_SHUTDOWN)
463 /* all clear to zap */
464 opr_queue_Remove(&abnode->q);
465 free(abnode->name); /* do this first, since bnode fields may be bad after BOP_DELETE */
466 code = BOP_DELETE(abnode); /* don't play games like holding over this one */
/*
 * Returns nonzero when the bnode has BNODE_NEEDTIMEOUT set in its flags
 * (the raw masked flag value is returned, not a normalised 0/1).
 */
471 /* function to tell if there's a timeout coming up */
473 bnode_PendingTimeout(struct bnode *abnode)
475 return (abnode->flags & BNODE_NEEDTIMEOUT);
/*
 * Arm or disarm the bnode's periodic wakeup.
 *
 * Arming: nextTimeout is set to FT_ApproxTime() + atimeout, the
 * BNODE_NEEDTIMEOUT flag is set, atimeout is stored as the period, and the
 * manager LWP is woken with IOMGR_Cancel(bproc_pid) so it can recompute its
 * sleep. Disarming: BNODE_NEEDTIMEOUT is cleared.
 *
 * NOTE(review): the condition choosing between the two paths is not visible
 * in this listing (presumably atimeout != 0 - confirm).
 */
478 /* function called to set / clear periodic bnode wakeup times */
480 bnode_SetTimeout(struct bnode *abnode, afs_int32 atimeout)
483 abnode->nextTimeout = FT_ApproxTime() + atimeout;
484 abnode->flags |= BNODE_NEEDTIMEOUT;
485 abnode->period = atimeout;
486 IOMGR_Cancel(bproc_pid);
488 abnode->flags &= ~BNODE_NEEDTIMEOUT;
/*
 * Initialise the common bnode header for a freshly created bnode.
 *
 * The whole struct is zeroed, the queue link initialised, ops and a
 * strdup() copy of aname installed, flags set to BNODE_ACTIVE, fileGoal to
 * BSTAT_NORMAL and goal to BSTAT_SHUTDOWN. The bnode is then appended to
 * allBnodes.
 *
 * NOTE(review): the rest of the parameter list and the return value are not
 * visible in this listing; the strdup() result is not checked in the
 * visible lines.
 */
493 /* used by new bnode creation code to format bnode header */
495 bnode_InitBnode(struct bnode *abnode, struct bnode_ops *abnodeops,
498 /* format the bnode properly */
499 memset(abnode, 0, sizeof(struct bnode));
500 opr_queue_Init(&abnode->q);
501 abnode->ops = abnodeops;
502 abnode->name = strdup(aname);
505 abnode->flags = BNODE_ACTIVE;
506 abnode->fileGoal = BSTAT_NORMAL;
507 abnode->goal = BSTAT_SHUTDOWN;
509 /* put the bnode at the end of the list so we write bnode file in same order */
510 opr_queue_Append(&allBnodes, &abnode->q);
/*
 * Body of the bnode manager LWP (started as "bnode-manager").
 *
 * As far as the visible lines show, each pass:
 *  1. scans allBnodes for the earliest nextTimeout among bnodes flagged
 *     BNODE_NEEDTIMEOUT and turns it into a delay relative to
 *     FT_ApproxTime();
 *  2. sleeps in IOMGR_Select() (bnode_SetTimeout() and bnode_SoftInt() cut
 *     the sleep short with IOMGR_Cancel());
 *  3. walks allBnodes with a deletion-safe scan; bnodes whose timeout has
 *     passed and that still need one get nextTimeout re-armed from period,
 *     and the subsequent bnode_Release() may delete the bnode;
 *  4. reaps children with waitpid(-1, ...), matches each pid against
 *     allProcs, records the exit code or terminating signal, logs it,
 *     applies the restart limit, calls BOP_PROCEXIT, releases the bnode and
 *     unlinks the bnode_proc from allProcs.
 *
 * Exit bookkeeping: a normal exit stores WEXITSTATUS in tp->lastExit and
 * copies it to tb->errorCode; a signal exit stores WTERMSIG in
 * tp->lastSignal and, unless the signal is SIGQUIT, SIGTERM or SIGKILL,
 * records it as the bnode's last error.
 *
 * Restart limit: rsCount is bumped on each exit while goal is nonzero; once
 * it exceeds 10 the bnode is flagged BNODE_ERRORSTOP and its goal forced to
 * BSTAT_SHUTDOWN. errorStopDelay starts at 1 and doubles on each such stop
 * until errorStopCount reaches BNODE_ERROR_COUNT_MAX, where it is set to 0.
 *
 * NOTE(review): the function signature, several branch conditions, the
 * BOP_TIMEOUT call, the notifier invocation and the condition guarding
 * bnode_stats.weirdPids++ are not visible in this listing.
 */
515 /* bnode lwp executes this code repeatedly */
522 struct opr_queue *cursor, *store;
523 struct bnode_proc *tp;
524 int options; /* must not be register */
530 /* first figure out how long to sleep for */
531 temp = 0x7fffffff; /* afs_int32 time; maxint doesn't work in select */
533 for (opr_queue_Scan(&allBnodes, cursor)) {
534 tb = opr_queue_Entry(cursor, struct bnode, q);
535 if (tb->flags & BNODE_NEEDTIMEOUT) {
536 if (tb->nextTimeout < temp) {
538 temp = tb->nextTimeout;
542 /* now temp has the time at which we should wakeup next */
546 temp -= FT_ApproxTime(); /* how many seconds until next event */
552 code = IOMGR_Select(0, 0, 0, 0, &tv);
554 code = 0; /* fake timeout code */
556 /* figure out why we woke up; child exit or timeouts */
557 FT_GetTimeOfDay(&tv, 0); /* must do the real gettimeofday once in a while */
560 /* check all bnodes to see which ones need timeout events */
561 for (opr_queue_ScanSafe(&allBnodes, cursor, store)) {
562 tb = opr_queue_Entry(cursor, struct bnode, q);
563 if ((tb->flags & BNODE_NEEDTIMEOUT) && temp > tb->nextTimeout) {
567 if (tb->flags & BNODE_NEEDTIMEOUT) { /* check again, BOP_TIMEOUT could change */
568 tb->nextTimeout = FT_ApproxTime() + tb->period;
570 bnode_Release(tb); /* delete may occur here */
575 /* signalled, probably by incoming signal */
578 bnode_waiting = options | 0x800000;
579 code = waitpid((pid_t) - 1, &status, options);
581 if (code == 0 || code == -1)
582 break; /* all done */
583 /* otherwise code has a process id, which we now search for */
584 for (tp = NULL, opr_queue_Scan(&allProcs, cursor), tp = NULL) {
585 tp = opr_queue_Entry(cursor, struct bnode_proc, q);
595 /* count restarts in last 30 seconds */
596 if (temp > tb->rsTime + 30) {
597 /* it's been 30 seconds we've been counting */
603 if (WIFSIGNALED(status) == 0) {
604 /* exited, not signalled */
605 tp->lastExit = WEXITSTATUS(status);
608 tb->errorCode = tp->lastExit;
609 tb->lastErrorExit = FT_ApproxTime();
610 RememberProcName(tp);
614 bozo_Log("%s:%s exited with code %d\n", tb->name,
615 tp->coreName, tp->lastExit);
617 bozo_Log("%s exited with code %d\n", tb->name,
620 /* Signal occurred, perhaps spurious due to shutdown request.
621 * If due to a shutdown request, don't overwrite last error
624 tp->lastSignal = WTERMSIG(status);
626 if (tp->lastSignal != SIGQUIT
627 && tp->lastSignal != SIGTERM
628 && tp->lastSignal != SIGKILL) {
629 tb->errorSignal = tp->lastSignal;
630 tb->lastErrorExit = FT_ApproxTime();
631 RememberProcName(tp);
634 bozo_Log("%s:%s exited on signal %d%s\n",
635 tb->name, tp->coreName, tp->lastSignal,
636 WCOREDUMP(status) ? " (core dumped)" :
639 bozo_Log("%s exited on signal %d%s\n", tb->name,
641 WCOREDUMP(status) ? " (core dumped)" :
645 tb->lastAnyExit = FT_ApproxTime();
648 bozo_Log("BNODE: Notifier %s will be called\n",
653 if (tb->goal && tb->rsCount++ > 10) {
654 /* 10 in 30 seconds */
655 if (tb->errorStopCount >= BNODE_ERROR_COUNT_MAX) {
656 tb->errorStopDelay = 0; /* max reached, give up. */
658 tb->errorStopCount++;
659 if (!tb->errorStopDelay) {
660 tb->errorStopDelay = 1;
662 tb->errorStopDelay *= 2;
665 tb->flags |= BNODE_ERRORSTOP;
666 bnode_SetGoal(tb, BSTAT_SHUTDOWN);
668 ("BNODE '%s' repeatedly failed to start, perhaps missing executable.\n",
671 BOP_PROCEXIT(tb, tp);
673 bnode_Release(tb); /* bnode delete can happen here */
674 opr_queue_Remove(&tp->q);
677 bnode_stats.weirdPids++;
/*
 * Serialise a bnode_proc and its owning bnode as text and write both to fd.
 *
 * Two records are produced, each built in the local 1000-byte buffer with
 * successive sprintf() calls and sent with a single write():
 *   "BEGIN bnode_proc" ... "END bnode_proc"  (comLine, coreName, pid,
 *       lastExit, lastSignal, flags)
 *   "BEGIN bnode" ... "END bnode"            (name, rsTime, rsCount,
 *       procStartTime, procStarts, lastAnyExit, lastErrorExit, errorCode,
 *       errorSignal, lastErrorName, goal)
 * Each field is emitted as "<label>: <value>\n".
 *
 * NOTE(review): every sprintf() here is unbounded; comLine, name and
 * lastErrorName are "%s"-formatted into the fixed 1000-byte buffer, so a
 * long command line could overflow it - consider snprintf() with remaining
 * space tracking. The substitute used when coreName is NULL, the guard
 * around lastErrorName, the reset of bufp between the two records, and the
 * error handling after write() are not visible in this listing.
 */
685 SendNotifierData(int fd, struct bnode_proc *tp)
687 struct bnode *tb = tp->bnode;
688 char buffer[1000], *bufp = buffer, *buf1;
692 * First sent out the bnode_proc struct
694 (void)sprintf(bufp, "BEGIN bnode_proc\n");
695 bufp += strlen(bufp);
696 (void)sprintf(bufp, "comLine: %s\n", tp->comLine);
697 bufp += strlen(bufp);
698 if (!(buf1 = tp->coreName))
700 (void)sprintf(bufp, "coreName: %s\n", buf1);
701 bufp += strlen(bufp);
702 (void)sprintf(bufp, "pid: %ld\n", afs_printable_int32_ld(tp->pid));
703 bufp += strlen(bufp);
704 (void)sprintf(bufp, "lastExit: %ld\n", afs_printable_int32_ld(tp->lastExit));
705 bufp += strlen(bufp);
707 (void)sprintf(bufp, "lastSignal: %ld\n", afs_printable_int32_ld(tp->lastSignal));
708 bufp += strlen(bufp);
710 (void)sprintf(bufp, "flags: %ld\n", afs_printable_int32_ld(tp->flags));
711 bufp += strlen(bufp);
712 (void)sprintf(bufp, "END bnode_proc\n");
713 bufp += strlen(bufp);
714 len = (int)(bufp - buffer);
715 if (write(fd, buffer, len) < 0) {
720 * Now sent out the bnode struct
723 (void)sprintf(bufp, "BEGIN bnode\n");
724 bufp += strlen(bufp);
725 (void)sprintf(bufp, "name: %s\n", tb->name);
726 bufp += strlen(bufp);
727 (void)sprintf(bufp, "rsTime: %ld\n", afs_printable_int32_ld(tb->rsTime));
728 bufp += strlen(bufp);
729 (void)sprintf(bufp, "rsCount: %ld\n", afs_printable_int32_ld(tb->rsCount));
730 bufp += strlen(bufp);
731 (void)sprintf(bufp, "procStartTime: %ld\n", afs_printable_int32_ld(tb->procStartTime));
732 bufp += strlen(bufp);
733 (void)sprintf(bufp, "procStarts: %ld\n", afs_printable_int32_ld(tb->procStarts));
734 bufp += strlen(bufp);
735 (void)sprintf(bufp, "lastAnyExit: %ld\n", afs_printable_int32_ld(tb->lastAnyExit));
736 bufp += strlen(bufp);
737 (void)sprintf(bufp, "lastErrorExit: %ld\n", afs_printable_int32_ld(tb->lastErrorExit));
738 bufp += strlen(bufp);
739 (void)sprintf(bufp, "errorCode: %ld\n", afs_printable_int32_ld(tb->errorCode));
740 bufp += strlen(bufp);
741 (void)sprintf(bufp, "errorSignal: %ld\n", afs_printable_int32_ld(tb->errorSignal));
742 bufp += strlen(bufp);
744 (void) sprintf(bufp, "lastErrorName: %s\n", tb->lastErrorName);
745 bufp += strlen(bufp);
747 (void)sprintf(bufp, "goal: %d\n", tb->goal);
748 bufp += strlen(bufp);
749 (void)sprintf(bufp, "END bnode\n");
750 bufp += strlen(bufp);
751 len = (int)(bufp - buffer);
752 if (write(fd, buffer, len) < 0) {
/*
 * Run the bnode's notifier program and feed it the process/bnode state.
 *
 * Compiled only when AFS_NT40_ENV is not defined. The notifier path is
 * first checked with stat(); a failure is logged and the notifier ignored.
 * Otherwise the process fork()s: the child opens the notifier with
 * popen(..., "w") and passes the pipe's descriptor to SendNotifierData();
 * a failed fork() (pid < 0) is logged in the parent.
 *
 * NOTE(review): the platform-specific statements under the #if/#elif
 * chain, the pclose()/exit in the child and the return values are not
 * visible in this listing.
 */
759 hdl_notifier(struct bnode_proc *tp)
761 #ifndef AFS_NT40_ENV /* NT notifier callout not yet implemented */
765 if (stat(tp->bnode->notifier, &tstat)) {
766 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
767 tp->bnode->notifier);
770 if ((pid = fork()) == 0) {
772 struct bnode *tb = tp->bnode;
774 #if defined(AFS_HPUX_ENV) || defined(AFS_SUN5_ENV) || defined(AFS_SGI51_ENV)
776 #elif defined(AFS_DARWIN90_ENV)
778 #elif defined(AFS_LINUX20_ENV) || defined(AFS_AIX_ENV)
783 fout = popen(tb->notifier, "w");
785 bozo_Log("BNODE: Failed to find notifier '%s'; ignored\n",
787 perror(tb->notifier);
790 SendNotifierData(fileno(fout), tp);
793 } else if (pid < 0) {
794 bozo_Log("Failed to fork creating process to handle notifier '%s'\n",
795 tp->bnode->notifier);
798 #endif /* AFS_NT40_ENV */
/*
 * Soft-signal callback queued by bnode_Int() for signals other than
 * SIGQUIT/SIGTERM. It ignores its parameter and wakes the manager LWP by
 * cancelling its IOMGR_Select() via IOMGR_Cancel(bproc_pid).
 */
802 /* Called by IOMGR at low priority on IOMGR's stack shortly after a SIGCHLD
803 * occurs. Wakes up bproc to redo things */
805 bnode_SoftInt(void *param)
807 /* int asignal = (int) param; */
809 IOMGR_Cancel(bproc_pid);
813 /* Called at signal interrupt level; queues function to be called
814 * when IOMGR runs again. */
/*
 * Signal handler installed for SIGCHLD, SIGQUIT and SIGTERM.
 *
 * It only queues work through IOMGR_SoftSig(), passing the signal number
 * as the callback argument: bozo_ShutdownAndExit for SIGQUIT/SIGTERM, and
 * bnode_SoftInt on the other visible path.
 *
 * NOTE(review): the "else" separating the two IOMGR_SoftSig() calls is not
 * visible in this listing - confirm.
 */
817 bnode_Int(int asignal)
819 if (asignal == SIGQUIT || asignal == SIGTERM) {
820 IOMGR_SoftSig(bozo_ShutdownAndExit, (void *)(intptr_t)asignal);
822 IOMGR_SoftSig(bnode_SoftInt, (void *)(intptr_t)asignal);
/*
 * One-time setup of the bnode subsystem.
 *
 * Visible steps: the three global queues (allTypes, allProcs, allBnodes)
 * are initialised, bnode_stats is zeroed, LWP support is initialised, the
 * manager LWP is created running bproc at priority 1 with
 * BNODE_LWP_STACKSIZE (its id is stored in bproc_pid), and bnode_Int is
 * installed with sigaction() for SIGCHLD, SIGQUIT and SIGTERM.
 *
 * NOTE(review): the function signature, the use of the static initDone
 * guard, and the checks on each returned code are not visible in this
 * listing (the name is presumably bnode_Init - confirm).
 */
827 /* initialize the whole system */
833 struct sigaction newaction;
834 static int initDone = 0;
839 opr_queue_Init(&allTypes);
840 opr_queue_Init(&allProcs);
841 opr_queue_Init(&allBnodes);
842 memset(&bnode_stats, 0, sizeof(bnode_stats));
843 LWP_InitializeProcessSupport(1, &junk); /* just in case */
845 code = LWP_CreateProcess(bproc, BNODE_LWP_STACKSIZE,
846 /* priority */ 1, (void *) /* parm */ 0,
847 "bnode-manager", &bproc_pid);
850 memset(&newaction, 0, sizeof(newaction));
851 newaction.sa_handler = bnode_Int;
852 code = sigaction(SIGCHLD, &newaction, NULL);
855 code = sigaction(SIGQUIT, &newaction, NULL);
858 code = sigaction(SIGTERM, &newaction, NULL);
/*
 * Release a token list. The loop advances through nlist rather than
 * alist->next, so the successor is available after the current node has
 * been dealt with.
 *
 * NOTE(review): the loop body (saving the next pointer and freeing the
 * node and its key) and the return value are not visible in this listing.
 */
864 /* free token list returned by parseLine */
866 bnode_FreeTokens(struct bnode_token *alist)
868 struct bnode_token *nlist;
869 for (; alist; alist = nlist) {
/*
 * Delimiter test used by the line parser (called there as space(tc)): the
 * condition is true for NUL, blank, tab and newline.
 *
 * NOTE(review): the signature and the return statements are not visible in
 * this listing.
 */
880 if (x == 0 || x == ' ' || x == '\t' || x == '\n')
/*
 * Split aline into delimiter-separated tokens and return them as a linked
 * list of struct bnode_token through alist.
 *
 * Characters are accumulated in tbuffer while inside a token; when a
 * delimiter (or the terminating NUL) is reached, a new token node is
 * allocated, its key set to a strdup() copy of tbuffer, and it is linked
 * onto the first/last list. A token that does not fit in tbuffer makes the
 * function return -1.
 *
 * NOTE(review): in the visible lines the malloc() result is dereferenced
 * and the strdup() result stored without NULL checks - confirm and handle
 * allocation failure. Whether the partially built list is freed on the -1
 * path is not visible in this listing.
 */
887 bnode_ParseLine(char *aline, struct bnode_token **alist)
892 struct bnode_token *first, *last;
893 struct bnode_token *ttok;
896 inToken = 0; /* not copying token chars at start */
897 first = (struct bnode_token *)0;
898 last = (struct bnode_token *)0;
901 if (tc == 0 || space(tc)) { /* terminating null gets us in here, too */
903 inToken = 0; /* end of this token */
905 ttok = malloc(sizeof(struct bnode_token));
906 ttok->next = (struct bnode_token *)0;
907 ttok->key = strdup(tbuffer);
917 /* an alpha character */
922 if (tptr - tbuffer >= sizeof(tbuffer))
923 return -1; /* token too long */
927 /* last token flushed 'cause space(0) --> true */
929 last->next = (struct bnode_token *)0;
/*
 * Spawn a child process for a bnode and start tracking it.
 *
 * Visible flow: aexecString is tokenised with bnode_ParseLine(); a zeroed
 * bnode_proc is allocated and records aexecString (the pointer itself, not
 * a copy) as comLine and coreName as given; the bnode's procStartTime and
 * procStarts are updated; up to MAXVARGS - 1 tokens become a
 * NULL-terminated argv; the program argv[0] is started with spawnprocve()
 * using the process environment, and the attempt is audited. On spawn
 * failure the error is logged and the tokens freed. On success the start is
 * logged, the tokens freed, the bnode_proc prepended to allProcs, flagged
 * BPROC_STARTED, and BOP_PROCSTARTED is called.
 *
 * NOTE(review): in the visible lines the calloc() result is used without a
 * NULL check, and cpid (compared against (pid_t)-1) is printed with "%ld"
 * without a cast - confirm both. The failure return codes and the
 * assignments of tp->bnode, tp->pid and *aproc are not visible in this
 * listing.
 */
938 bnode_NewProc(struct bnode *abnode, char *aexecString, char *coreName,
939 struct bnode_proc **aproc)
941 struct bnode_token *tlist, *tt;
943 struct bnode_proc *tp;
945 char *argv[MAXVARGS];
948 code = bnode_ParseLine(aexecString, &tlist); /* try parsing first */
951 tp = calloc(1, sizeof(struct bnode_proc));
952 opr_queue_Init(&tp->q);
954 tp->comLine = aexecString;
955 tp->coreName = coreName; /* may be null */
956 abnode->procStartTime = FT_ApproxTime();
957 abnode->procStarts++;
959 /* convert linked list of tokens into argv structure */
960 for (tt = tlist, i = 0; i < (MAXVARGS - 1) && tt; tt = tt->next, i++) {
963 argv[i] = NULL; /* null-terminated */
965 cpid = spawnprocve(argv[0], argv, environ, -1);
966 osi_audit(BOSSpawnProcEvent, 0, AUD_STR, aexecString, AUD_END);
968 if (cpid == (pid_t) - 1) {
969 bozo_Log("Failed to spawn process for bnode '%s'\n", abnode->name);
970 bnode_FreeTokens(tlist);
974 bozo_Log("%s started pid %ld: %s\n", abnode->name, cpid, aexecString);
976 bnode_FreeTokens(tlist);
977 opr_queue_Prepend(&allProcs, &tp->q);
980 tp->flags = BPROC_STARTED;
981 tp->flags &= ~BPROC_EXITED;
982 BOP_PROCSTARTED(abnode, tp);
/*
 * Send asignal to a tracked child process.
 *
 * Nothing is sent unless the process is flagged BPROC_STARTED and not
 * BPROC_EXITED. Otherwise the stop is audited, kill(aproc->pid, asignal) is
 * issued, and bnode_Check() wakes any waiter on the owning bnode.
 *
 * NOTE(review): aproc is dereferenced in the flag test before the
 * "aproc ? ... : NULL" guard in the audit call, so that guard is either
 * redundant or the earlier dereference is unsafe - confirm. The return
 * values are not visible in this listing.
 */
988 bnode_StopProc(struct bnode_proc *aproc, int asignal)
991 if (!(aproc->flags & BPROC_STARTED) || (aproc->flags & BPROC_EXITED))
994 osi_audit(BOSStopProcEvent, 0, AUD_STR, (aproc ? aproc->comLine : NULL),
997 code = kill(aproc->pid, asignal);
998 bnode_Check(aproc->bnode);
1004 bnode_Deactivate(struct bnode *abnode)
1006 struct opr_queue *cursor;
1007 if (!(abnode->flags & BNODE_ACTIVE))
1010 if (opr_queue_IsOnQueue(&abnode->q)) {
1011 tb->flags &= ~BNODE_ACTIVE;