2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
16 #include <sys/types.h>
38 #endif /* AFS_NT40_ENV */
40 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
41 #include <afs/afsutil.h>
/* Forward declarations for the handlers defined later in this file.
 * They use old-style empty parameter lists, so argument types are not
 * checked at the call sites. */
44 static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
45 static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
46 static int fs_hascore();
47 struct bnode *fs_create();
/* declared without a return type, i.e. implicit int */
49 static SetNeedsClock();
52 static int emergency = 0;
54 /* if this file exists, then we have to salvage the file system */
55 #define SALFILE "SALVAGE."
57 #define POLLTIME 20 /* interval handed to bnode_SetTimeout() for periodic state checks */
58 #define SDTIME 60 /* time in seconds given to a process to evaporate */
61 Normal operation involves having the file server and the vol server both running.
63 If the vol server terminates, it can simply be restarted.
65 If the file server terminates, the disk must be salvaged before the file server
66 can be restarted. In order to restart either the file server or the salvager,
67 the vol server must be shut down.
69 If the file server terminates *normally* (exits after receiving a SIGQUIT)
70 then we don't have to salvage it.
72 The needsSalvage flag is set when the file server is started. It is cleared
73 if the file server exits when fileSDW is true but fileKillSent is false,
74 indicating that it exited after receiving a quit, but before we sent it a kill.
76 The needsSalvage flag is cleared when the salvager exits.
79 struct bnode_ops fsbnode_ops = {
94 afs_int32 timeSDStarted; /* time shutdown operation started */
95 char *filecmd; /* command to start primary file server */
96 char *volcmd; /* command to start secondary vol server */
97 char *salcmd; /* command to start salvager */
98 char *scancmd; /* command to start scanner (MR-AFS) */
99 struct bnode_proc *fileProc; /* process for file server */
100 struct bnode_proc *volProc; /* process for vol server */
101 struct bnode_proc *salProc; /* process for salvager */
102 struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */
103 afs_int32 lastFileStart; /* last start for file */
104 afs_int32 lastVolStart; /* last start for vol */
105 afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */
106 char fileRunning; /* file process is running */
107 char volRunning; /* volser is running */
108 char salRunning; /* salvager is running */
109 char scanRunning; /* scanner is running (MR_AFS) */
110 char fileSDW; /* file shutdown wait */
111 char volSDW; /* vol shutdown wait */
112 char salSDW; /* waiting for the salvager to shutdown */
113 char scanSDW; /* scanner shutdown wait (MR_AFS) */
114 char fileKillSent; /* kill signal has been sent */
117 char scanKillSent; /* kill signal has been sent (MR_AFS) */
118 char needsSalvage; /* salvage before running */
119 char needsClock; /* do we need clock ticks */
122 /* Function to tell whether this bnode has a core file or not. You might
123 * think that this could be in bnode.c, and decide what core files to check
124 * for based on the bnode's coreName property, but that doesn't work because
125 * there may not be an active process for a bnode that dumped core at the
126 * time the query is done.
/*
 * fs_hascore: probe for core files left behind by this bnode's processes.
 *
 * For each of the process tags "file", "vol", "salv" and "scan" a core
 * file name is built with bnode_CoreName() into tbuffer and tested with
 * access(tbuffer, 0), i.e. a plain existence check.
 *
 * NOTE(review): the parameter is declared as struct ezbnode * while the
 * other handlers in this file take struct fsbnode * -- confirm this is
 * intended; only bnode_CoreName() consumes it here.
 */
129 fs_hascore(register struct ezbnode *abnode)
133 /* see if file server has a core file */
134 bnode_CoreName(abnode, "file", tbuffer);
135 if (access(tbuffer, 0) == 0)
138 /* see if volserver has a core file */
139 bnode_CoreName(abnode, "vol", tbuffer);
140 if (access(tbuffer, 0) == 0)
143 /* see if salvager left a core file */
144 bnode_CoreName(abnode, "salv", tbuffer);
145 if (access(tbuffer, 0) == 0)
148 /* see if scanner left a core file (MR-AFS) */
149 bnode_CoreName(abnode, "scan", tbuffer);
150 if (access(tbuffer, 0) == 0)
153 /* no one left a core file */
/*
 * fs_restartp: compare the binaries named by filecmd, volcmd and (when
 * set) scancmd against the time the matching process was last started.
 *
 * Each command line is tokenized with bnode_ParseLine(), tt->key is passed
 * to stat(), and the resulting st_ctime is compared with lastFileStart,
 * lastVolStart or lastScanStart respectively.  The token list is released
 * with bnode_FreeTokens() both after a stat() result is examined and on
 * the fall-through path.
 *
 * presumably a ctime newer than the last start means the binary was
 * replaced and a restart is wanted -- confirm against the caller.
 */
158 fs_restartp(register struct fsbnode *abnode)
160 struct bnode_token *tt;
161 register afs_int32 code;
/* file server binary */
164 code = bnode_ParseLine(abnode->filecmd, &tt);
169 code = stat(tt->key, &tstat);
171 bnode_FreeTokens(tt);
174 if (tstat.st_ctime > abnode->lastFileStart)
178 bnode_FreeTokens(tt);
182 /* now do same for volcmd */
183 code = bnode_ParseLine(abnode->volcmd, &tt);
188 code = stat(tt->key, &tstat);
190 bnode_FreeTokens(tt);
193 if (tstat.st_ctime > abnode->lastVolStart)
197 bnode_FreeTokens(tt);
201 if (abnode->scancmd) { /* Only in MR-AFS */
202 /* now do same for scancmd (MR-AFS) */
203 code = bnode_ParseLine(abnode->scancmd, &tt);
208 code = stat(tt->key, &tstat);
210 bnode_FreeTokens(tt);
213 if (tstat.st_ctime > abnode->lastScanStart)
217 bnode_FreeTokens(tt);
223 /* set needsSalvage flag, creating file SALVAGE.<instancename> if
224 we need to salvage the file system (so we can tell over panic reboots).
 *
 * abnode->needsSalvage is assigned aflag.  The marker file path is composed
 * with strcompose() as AFSDIR_SERVER_LOCAL_DIRPATH "/" SALFILE followed by
 * abnode->b.name, bounded by AFSDIR_PATH_MAX.  The open() call uses
 * O_CREAT | O_TRUNC | O_RDWR with mode 0666, so an existing marker is
 * truncated rather than appended to.
 */
226 SetSalFlag(register struct fsbnode *abnode, register int aflag)
228 char tbuffer[AFSDIR_PATH_MAX];
231 abnode->needsSalvage = aflag;
232 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
233 SALFILE, abnode->b.name, NULL);
235 fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
243 /* set the needsSalvage flag according to the existence of the salvage file.
 *
 * The path is built exactly as in SetSalFlag(): AFSDIR_SERVER_LOCAL_DIRPATH
 * "/" SALFILE followed by abnode->b.name.  access(path, 0) succeeding sets
 * needsSalvage to 1; otherwise it is set to 0.
 */
245 RestoreSalFlag(register struct fsbnode *abnode)
247 char tbuffer[AFSDIR_PATH_MAX];
249 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
250 SALFILE, abnode->b.name, NULL);
251 if (access(tbuffer, 0) == 0) {
252 /* file exists, so need to salvage */
253 abnode->needsSalvage = 1;
/* marker absent: no salvage pending */
255 abnode->needsSalvage = 0;
/*
 * copystr: allocate a buffer sized for string a.
 * The allocation is strlen(a) + 1 bytes, i.e. room for the characters of a
 * plus the terminating NUL.  a must not be NULL, since strlen() is applied
 * to it directly.
 * presumably the buffer receives a copy of a and is returned for the
 * caller to free -- confirm against the rest of the function.
 */
261 copystr(register char *a)
264 b = (char *)malloc(strlen(a) + 1);
/*
 * fs_delete: release the command strings held by the bnode.
 * Frees filecmd, volcmd, salcmd and scancmd with free().
 */
270 fs_delete(struct fsbnode *abnode)
272 free(abnode->filecmd);
273 free(abnode->volcmd);
274 free(abnode->salcmd);
276 free(abnode->scancmd);
/*
 * AppendExecutableExtension: make sure cmd names a file with an extension.
 * _splitpath() extracts only the extension component of cmd into cmdext
 * (the drive, directory and file-name outputs are passed as NULL).  An
 * empty cmdext means no extension was supplied.
 */
284 AppendExecutableExtension(char *cmd)
286 char cmdext[_MAX_EXT];
288 _splitpath(cmd, NULL, NULL, NULL, cmdext);
289 if (*cmdext == '\0') {
290 /* no filename extension supplied for cmd; append .exe */
294 #endif /* AFS_NT40_ENV */
/*
 * fs_create: build a new fs bnode for instance ainstance.
 *
 * Steps visible here:
 *   1. Each wire-format command (afilecmd, avolcmd, asalcmd and, when
 *      non-empty, ascancmd) is converted with ConstructLocalBinPath(); a
 *      failure is logged through bozo_Log().
 *   2. The first whitespace-delimited word of each local command is read
 *      into cmdname with sscanf(), passed through
 *      AppendExecutableExtension(), and checked with stat(); a missing
 *      binary is logged.
 *   3. A struct fsbnode is allocated and zeroed, the command strings are
 *      stored in it, and it is registered with bnode_InitBnode() using
 *      fsbnode_ops.
 *   4. The timeout is set to POLLTIME, needsSalvage is restored from the
 *      marker file, and needsClock is computed.
 *
 * Returns the new node cast to struct bnode * on the success path.
 *
 * NOTE(review): sscanf(..., "%s", cmdname) has no field width, so a first
 * word longer than AFSDIR_PATH_MAX - 1 would overrun cmdname.
 * NOTE(review): the malloc() result is handed straight to memset() with no
 * NULL check in between.
 */
298 fs_create(char *ainstance, char *afilecmd, char *avolcmd, char *asalcmd,
302 register struct fsbnode *te;
303 char cmdname[AFSDIR_PATH_MAX];
304 char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
/* scanCmdpath is not part of this initialization */
307 fileCmdpath = volCmdpath = salCmdpath = NULL;
309 /* construct local paths from canonical (wire-format) paths */
310 if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
311 bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
314 if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
315 bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
318 if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
319 bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
/* the scanner command is optional: only handled when non-NULL and non-empty */
323 if (ascancmd && strlen(ascancmd)) {
324 if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
325 bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
/* verify that each binary (first word of its command line) exists */
331 sscanf(fileCmdpath, "%s", cmdname);
333 AppendExecutableExtension(cmdname);
335 if (stat(cmdname, &tstat)) {
336 bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
340 sscanf(volCmdpath, "%s", cmdname);
342 AppendExecutableExtension(cmdname);
344 if (stat(cmdname, &tstat)) {
345 bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
349 sscanf(salCmdpath, "%s", cmdname);
351 AppendExecutableExtension(cmdname);
353 if (stat(cmdname, &tstat)) {
354 bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
358 if (ascancmd && strlen(ascancmd)) {
359 sscanf(scanCmdpath, "%s", cmdname);
361 AppendExecutableExtension(cmdname);
363 if (stat(cmdname, &tstat)) {
364 bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
/* allocate and populate the node; memset leaves every flag and pointer zero */
377 te = (struct fsbnode *)malloc(sizeof(struct fsbnode));
378 memset(te, 0, sizeof(struct fsbnode));
379 te->filecmd = fileCmdpath;
380 te->volcmd = volCmdpath;
381 te->salcmd = salCmdpath;
382 if (ascancmd && strlen(ascancmd))
383 te->scancmd = scanCmdpath;
386 if (bnode_InitBnode(te, &fsbnode_ops, ainstance) != 0) {
393 bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every POLLTIME seconds */
394 RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
395 SetNeedsClock(te); /* compute needsClock field */
396 return (struct bnode *)te;
399 /* called to SIGKILL a process if it doesn't terminate normally.
 *
 * For each of the volserver, salvager, file server and scanner: when the
 * process is in shutdown wait (xxxSDW), no kill has been sent yet
 * (xxxKillSent is clear) and more than the grace period has passed since
 * timeSDStarted, the process is sent SIGKILL via bnode_StopProc() and
 * xxxKillSent is set.  The grace period is SDTIME for every process except
 * the file server, which is compared against FSSDTIME.  The current time
 * comes from FT_ApproxTime().  SetNeedsClock() is called afterwards.
 */
401 fs_timeout(struct fsbnode *abnode)
403 register afs_int32 now;
405 now = FT_ApproxTime();
407 if (abnode->volSDW) {
408 if (!abnode->volKillSent && now - abnode->timeSDStarted > SDTIME) {
409 bnode_StopProc(abnode->volProc, SIGKILL);
410 abnode->volKillSent = 1;
412 ("bos shutdown: volserver failed to shutdown within %d seconds\n",
416 if (abnode->salSDW) {
417 if (!abnode->salKillSent && now - abnode->timeSDStarted > SDTIME) {
418 bnode_StopProc(abnode->salProc, SIGKILL);
419 abnode->salKillSent = 1;
421 ("bos shutdown: salvager failed to shutdown within %d seconds\n",
425 if (abnode->fileSDW) {
/* the file server uses its own limit, FSSDTIME, instead of SDTIME */
426 if (!abnode->fileKillSent && now - abnode->timeSDStarted > FSSDTIME) {
427 bnode_StopProc(abnode->fileProc, SIGKILL);
428 abnode->fileKillSent = 1;
430 ("bos shutdown: fileserver failed to shutdown within %d seconds\n",
434 if (abnode->scanSDW) {
435 if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
436 bnode_StopProc(abnode->scanProc, SIGKILL);
437 abnode->scanKillSent = 1;
439 ("bos shutdown: scanner failed to shutdown within %d seconds\n",
443 SetNeedsClock(abnode);
/*
 * fs_getstat: derive a BSTAT_* status for the bnode from its process flags.
 *
 * The checks are ordered; the first match wins:
 *   - any shutdown-wait flag set          -> BSTAT_SHUTTINGDOWN
 *   - salvager running                    -> its own status
 *   - volserver and file server running, and the scanner either not
 *     configured (scancmd NULL) or running -> its own status
 *   - nothing running at all              -> BSTAT_SHUTDOWN
 *   - anything else                       -> BSTAT_STARTINGUP
 * presumably the result is stored through astatus -- confirm.
 */
448 fs_getstat(struct fsbnode *abnode, afs_int32 * astatus)
450 register afs_int32 temp;
451 if (abnode->volSDW || abnode->fileSDW || abnode->salSDW
453 temp = BSTAT_SHUTTINGDOWN;
454 else if (abnode->salRunning)
456 else if (abnode->volRunning && abnode->fileRunning
457 && (!abnode->scancmd || abnode->scanRunning))
459 else if (!abnode->salRunning && !abnode->volRunning
460 && !abnode->fileRunning && !abnode->scanRunning)
461 temp = BSTAT_SHUTDOWN;
463 temp = BSTAT_STARTINGUP;
/*
 * fs_setstat: status-change handler.
 * Delegates to NudgeProcs() and returns its result; astatus is not
 * referenced in the lines shown here.
 */
469 fs_setstat(register struct fsbnode *abnode, afs_int32 astatus)
471 return NudgeProcs(abnode);
/*
 * fs_procexit: bookkeeping after one of the bnode's processes has exited.
 *
 * aproc is matched against volProc, fileProc, salProc and scanProc in that
 * order, and the matching process's Running and KillSent flags are
 * cleared.  For the file server, SetSalFlag(abnode, 0) is called when the
 * exit happened during shutdown wait, before any kill was sent and with
 * aproc->lastSignal == 0.  The salvager branch also calls
 * SetSalFlag(abnode, 0).  NudgeProcs() is then called and its result
 * returned.
 */
475 fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc)
477 /* process has exited */
479 if (aproc == abnode->volProc) {
481 abnode->volRunning = 0;
483 abnode->volKillSent = 0;
484 } else if (aproc == abnode->fileProc) {
485 /* if we were expecting a shutdown and we didn't send a kill signal
486 * and exited (didn't have a signal termination), then we assume that
487 * the file server exited after putting the appropriate volumes safely
488 * offline, and don't salvage next time.
490 if (abnode->fileSDW && !abnode->fileKillSent
491 && aproc->lastSignal == 0)
492 SetSalFlag(abnode, 0); /* shut down normally */
493 abnode->fileProc = 0;
494 abnode->fileRunning = 0;
496 abnode->fileKillSent = 0;
497 } else if (aproc == abnode->salProc) {
498 /* if we didn't shutdown the salvager, then assume it exited ok, and thus
499 * that we don't have to salvage again */
501 SetSalFlag(abnode, 0); /* salvage just completed */
503 abnode->salRunning = 0;
505 abnode->salKillSent = 0;
506 } else if (aproc == abnode->scanProc) {
507 abnode->scanProc = 0;
508 abnode->scanRunning = 0;
510 abnode->scanKillSent = 0;
513 /* now restart anyone who needs to restart */
514 return NudgeProcs(abnode);
517 /* make sure we're periodically checking the state if we need to.
 *
 * needsClock is cleared in the two steady states:
 *   - goal == 1 with file server and volserver running, and the scanner
 *     either not configured (scancmd NULL) or running;
 *   - goal == 0 with file server, volserver, salvager and scanner all
 *     stopped.
 * In every other state it is set to 1.  When the clock is needed and
 * bnode_PendingTimeout() reports none pending, a POLLTIME timeout is
 * requested; the bnode_SetTimeout(ab, 0) call presumably cancels the
 * timeout when the clock is not needed -- confirm its guard condition.
 */
519 SetNeedsClock(register struct fsbnode *ab)
521 if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
522 && (!ab->scancmd || ab->scanRunning))
523 ab->needsClock = 0; /* running normally */
524 else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
525 && !ab->salRunning && !ab->scanRunning)
526 ab->needsClock = 0; /* halted normally */
528 ab->needsClock = 1; /* other */
529 if (ab->needsClock && !bnode_PendingTimeout(ab))
530 bnode_SetTimeout(ab, POLLTIME);
532 bnode_SetTimeout(ab, 0);
/*
 * NudgeProcs: move the bnode's processes toward abnode->b.goal.
 *
 * goal == 1 (run):
 *   - File server running: if the salvager is running too, this is logged
 *     as an emergency, the goal is set to BSTAT_SHUTDOWN and the salvager
 *     is sent SIGKILL.  The volserver and, when scancmd is set, the
 *     scanner are started if they are not running.
 *   - File server not running, needsSalvage clear: the file server is
 *     started and SetSalFlag(abnode, 1) is called; the volserver and
 *     scanner are then started if they are not running.
 *   - File server not running, needsSalvage set: any running volserver
 *     (SIGTERM), file server (SIGQUIT) or scanner (SIGTERM) is signalled
 *     and timeSDStarted recorded; the salvager is started with
 *     bnode_NewProc() when it is not already running.
 * goal == 0 (shut down):
 *   Each running process not already in shutdown wait is signalled
 *   (SIGQUIT for the file server, SIGTERM for the others) and
 *   timeSDStarted is set to now.
 *
 * Start times (lastFileStart, lastVolStart, lastScanStart) and now are
 * taken from FT_ApproxTime().  Processes are launched with bnode_NewProc()
 * using the tags "file", "vol", "scanner" and "salv".  SetNeedsClock() is
 * called so the polling timeout matches the resulting state.
 */
536 NudgeProcs(register struct fsbnode *abnode)
538 struct bnode_proc *tp; /* not register */
539 register afs_int32 code;
542 now = FT_ApproxTime();
543 if (abnode->b.goal == 1) {
544 /* we're trying to run the system. If the file server is running, then we
545 * are trying to start up the system. If it is not running, then needsSalvage
546 * tells us if we need to run the salvager or not */
547 if (abnode->fileRunning) {
548 if (abnode->salRunning) {
549 bozo_Log("Salvager running along with file server!\n");
550 bozo_Log("Emergency shutdown\n");
552 bnode_SetGoal(abnode, BSTAT_SHUTDOWN);
553 bnode_StopProc(abnode->salProc, SIGKILL);
554 SetNeedsClock(abnode);
557 if (!abnode->volRunning) {
558 abnode->lastVolStart = FT_ApproxTime();
559 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
561 abnode->volProc = tp;
562 abnode->volRunning = 1;
565 if (abnode->scancmd) {
566 if (!abnode->scanRunning) {
567 abnode->lastScanStart = FT_ApproxTime();
569 bnode_NewProc(abnode, abnode->scancmd, "scanner",
572 abnode->scanProc = tp;
573 abnode->scanRunning = 1;
577 } else { /* file is not running */
578 /* see how to start */
579 if (!abnode->needsSalvage) {
580 /* no crash apparent, just start up normally */
581 if (!abnode->fileRunning) {
582 abnode->lastFileStart = FT_ApproxTime();
584 bnode_NewProc(abnode, abnode->filecmd, "file", &tp);
586 abnode->fileProc = tp;
587 abnode->fileRunning = 1;
/* salvage flag is raised while the file server runs */
588 SetSalFlag(abnode, 1);
591 if (!abnode->volRunning) {
592 abnode->lastVolStart = FT_ApproxTime();
593 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
595 abnode->volProc = tp;
596 abnode->volRunning = 1;
599 if (abnode->scancmd && !abnode->scanRunning) {
600 abnode->lastScanStart = FT_ApproxTime();
602 bnode_NewProc(abnode, abnode->scancmd, "scanner",
605 abnode->scanProc = tp;
606 abnode->scanRunning = 1;
609 } else { /* needs to be salvaged */
610 /* make sure file server and volser are gone */
611 if (abnode->volRunning) {
612 bnode_StopProc(abnode->volProc, SIGTERM);
614 abnode->timeSDStarted = now;
617 if (abnode->fileRunning) {
618 bnode_StopProc(abnode->fileProc, SIGQUIT);
619 if (!abnode->fileSDW)
620 abnode->timeSDStarted = now;
623 if (abnode->scanRunning) {
624 bnode_StopProc(abnode->scanProc, SIGTERM);
625 if (!abnode->scanSDW)
626 abnode->timeSDStarted = now;
629 if (abnode->volRunning || abnode->fileRunning
630 || abnode->scanRunning)
632 /* otherwise, it is safe to start salvager */
633 if (!abnode->salRunning) {
634 code = bnode_NewProc(abnode, abnode->salcmd, "salv", &tp);
636 abnode->salProc = tp;
637 abnode->salRunning = 1;
642 } else { /* goal is 0, we're shutting down */
643 /* trying to shutdown */
644 if (abnode->salRunning && !abnode->salSDW) {
645 bnode_StopProc(abnode->salProc, SIGTERM);
647 abnode->timeSDStarted = now;
649 if (abnode->fileRunning && !abnode->fileSDW) {
650 bnode_StopProc(abnode->fileProc, SIGQUIT);
652 abnode->timeSDStarted = now;
654 if (abnode->volRunning && !abnode->volSDW) {
655 bnode_StopProc(abnode->volProc, SIGTERM);
657 abnode->timeSDStarted = now;
659 if (abnode->scanRunning && !abnode->scanSDW) {
660 bnode_StopProc(abnode->scanProc, SIGTERM);
662 abnode->timeSDStarted = now;
665 SetNeedsClock(abnode);
/*
 * fs_getstring: write a human-readable state description into abuffer.
 *
 * With goal == 1 the text depends on which processes are running: the
 * file server (with a breakdown of volserver/scanner state when scancmd is
 * set), else the salvager ("salvaging file system"), else "starting file
 * server".  With any other goal the text is "file server shutting down"
 * while the file server, volserver or scanner is still running,
 * "salvager shutting down" while only the salvager is running, and
 * "file server shut down" otherwise.
 *
 * NOTE(review): the messages are copied with strcpy(); alen is not
 * consulted in these lines, so abuffer must be large enough for the
 * longest message -- confirm the caller's buffer size.
 */
670 fs_getstring(struct fsbnode *abnode, char *abuffer, afs_int32 alen)
674 if (abnode->b.goal == 1) {
675 if (abnode->fileRunning) {
677 strcpy(abuffer, "file server shutting down");
678 else if (abnode->scancmd) {
679 if (!abnode->volRunning && !abnode->scanRunning)
681 "file server up; volser and scanner down");
682 else if (abnode->volRunning && !abnode->scanRunning)
684 "file server up; volser up; scanner down");
685 else if (!abnode->volRunning && abnode->scanRunning)
687 "file server up; volser down; scanner up");
690 strcpy(abuffer, "file server running");
691 } else if (!abnode->volRunning)
692 strcpy(abuffer, "file server up; volser down");
694 strcpy(abuffer, "file server running");
695 } else if (abnode->salRunning) {
696 strcpy(abuffer, "salvaging file system");
698 strcpy(abuffer, "starting file server");
701 if (abnode->fileRunning || abnode->volRunning || abnode->scanRunning) {
702 strcpy(abuffer, "file server shutting down");
703 } else if (abnode->salRunning)
704 strcpy(abuffer, "salvager shutting down");
706 strcpy(abuffer, "file server shut down");
/*
 * fs_getparm: copy one of the bnode's command strings into abuffer,
 * selected by aindex:
 *   1 -> volcmd, 2 -> salcmd, 3 -> scancmd (only when scancmd is set);
 *   filecmd is copied by the first branch (presumably aindex == 0).
 *
 * NOTE(review): the copies use strcpy() with no length bound in these
 * lines -- confirm abuffer is always large enough for the command.
 */
712 fs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer,
716 strcpy(abuffer, abnode->filecmd);
717 else if (aindex == 1)
718 strcpy(abuffer, abnode->volcmd);
719 else if (aindex == 2)
720 strcpy(abuffer, abnode->salcmd);
721 else if (aindex == 3 && abnode->scancmd)
722 strcpy(abuffer, abnode->scancmd);