2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
37 #endif /* AFS_NT40_ENV */
39 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
40 #include <afs/afsutil.h>
/* Forward declarations; most of these routines are defined further down in
 * this file. */
43 static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
44 static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
45 static int fs_hascore();
46 struct bnode *fs_create();
47 struct bnode *fsmr_create();
/* SetNeedsClock() is called by fs_create() and fs_timeout() before its
 * definition appears, hence the declaration here. */
49 static SetNeedsClock();
52 static int emergency = 0;
54 /* if this file exists, then we have to salvage the file system;
 * the bnode's name is appended to this prefix to form the file name */
55 #define SALFILE "SALVAGE."
57 #define POLLTIME 20 /* interval handed to bnode_SetTimeout() below */
58 #define SDTIME 60 /* time in seconds given to a process to evaporate */
61 Normal operation involves having the file server and the vol server both running.
63 If the vol server terminates, it can simply be restarted.
65 If the file server terminates, the disk must be salvaged before the file server
66 can be restarted. In order to restart either the file server or the salvager,
67 the vol server must be shut down.
69 If the file server terminates *normally* (exits after receiving a SIGQUIT)
70 then we don't have to salvage it.
72 The needsSalvage flag is set when the file server is started. It is cleared
73 if the file server exits when fileSDW is true but fileKillSent is false,
74 indicating that it exited after receiving a quit, but before we sent it a kill.
76 The needsSalvage flag is cleared when the salvager exits.
79 struct bnode_ops fsbnode_ops = {
94 afs_int32 timeSDStarted; /* time shutdown operation started */
95 char *filecmd; /* command to start primary file server */
96 char *volcmd; /* command to start secondary vol server */
97 char *salcmd; /* command to start salvager */
98 char *scancmd; /* command to start scanner (MR-AFS) */
99 struct bnode_proc *fileProc; /* process for file server */
100 struct bnode_proc *volProc; /* process for vol server */
101 struct bnode_proc *salProc; /* process for salvager */
102 struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */
103 afs_int32 lastFileStart; /* last start for file */
104 afs_int32 lastVolStart; /* last start for vol */
105 afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */
106 char fileRunning; /* file process is running */
107 char volRunning; /* volser is running */
108 char salRunning; /* salvager is running */
109 char scanRunning; /* scanner is running (MR_AFS) */
110 char fileSDW; /* file shutdown wait */
111 char volSDW; /* vol shutdown wait */
112 char salSDW; /* waiting for the salvager to shutdown */
113 char scanSDW; /* scanner shutdown wait (MR_AFS) */
114 char fileKillSent; /* kill signal has been sent */
117 char scanKillSent; /* kill signal has been sent (MR_AFS) */
118 char needsSalvage; /* salvage before running */
119 char needsClock; /* do we need clock ticks */
122 /* Function to tell whether this bnode has a core file or not. You might
123 * think that this could be in bnode.c, and decide what core files to check
124 * for based on the bnode's coreName property, but that doesn't work because
125 * there may not be an active process for a bnode that dumped core at the
126 * time the query is done.
 * Returns 1 as soon as access() succeeds on one of the names produced by
 * bnode_CoreName() for "file", "vol", "salv" or "scan".
 * NOTE(review): abnode is declared as struct ezbnode * here, while the other
 * routines in this file take struct fsbnode * -- confirm this is intended.
128 static int fs_hascore(abnode)
129 register struct ezbnode *abnode; {
132 /* see if file server has a core file */
133 bnode_CoreName(abnode, "file", tbuffer);
134 if (access(tbuffer, 0) == 0) return 1;
136 /* see if volserver has a core file */
137 bnode_CoreName(abnode, "vol", tbuffer);
138 if (access(tbuffer, 0) == 0) return 1;
140 /* see if salvager left a core file */
141 bnode_CoreName(abnode, "salv", tbuffer);
142 if (access(tbuffer, 0) == 0) return 1;
144 /* see if scanner left a core file (MR-AFS) */
145 bnode_CoreName(abnode, "scan", tbuffer);
146 if (access(tbuffer, 0) == 0) return 1;
148 /* no one left a core file */
/* Check whether any server binary has changed since its process was last
 * started. Each command line (filecmd, volcmd and, when set, scancmd) is
 * parsed with bnode_ParseLine(), the path in tt->key is stat()ed, and code
 * becomes 1 when its st_ctime is later than the start time recorded for the
 * corresponding process. */
152 static int fs_restartp (abnode)
153 register struct fsbnode *abnode; {
154 struct bnode_token *tt;
155 register afs_int32 code;
/* file server binary versus lastFileStart */
158 code = bnode_ParseLine(abnode->filecmd, &tt);
161 code = stat(tt->key, &tstat);
163 bnode_FreeTokens(tt);
166 if (tstat.st_ctime > abnode->lastFileStart) code = 1;
168 bnode_FreeTokens(tt);
/* a nonzero code ends the check here; the remaining commands are skipped */
169 if (code) return code;
171 /* now do same for volcmd */
172 code = bnode_ParseLine(abnode->volcmd, &tt);
175 code = stat(tt->key, &tstat);
177 bnode_FreeTokens(tt);
180 if (tstat.st_ctime > abnode->lastVolStart) code = 1;
182 bnode_FreeTokens(tt);
183 if (code) return code;
185 if (abnode->scancmd) { /* Only in MR-AFS */
186 /* now do same for scancmd (MR-AFS) */
187 code = bnode_ParseLine(abnode->scancmd, &tt);
190 code = stat(tt->key, &tstat);
192 bnode_FreeTokens(tt);
195 if (tstat.st_ctime > abnode->lastScanStart) code = 1;
197 bnode_FreeTokens(tt);
203 /* set needsSalvage flag, creating file SALVAGE.<instancename> if
204 we need to salvage the file system (so we can tell over panic reboots) */
205 static SetSalFlag(abnode, aflag)
206 register struct fsbnode *abnode;
207 register int aflag; {
208 char tbuffer[AFSDIR_PATH_MAX];
/* the in-memory flag always takes the value of aflag */
211 abnode->needsSalvage = aflag;
/* flag file path: AFSDIR_SERVER_LOCAL_DIRPATH "/" SALFILE <bnode name> */
212 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
213 abnode->b.name, NULL);
/* create the flag file, or truncate it if it is already there */
215 fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
224 /* set the needsSalvage flag according to the existence of the salvage file
 * (the same AFSDIR_SERVER_LOCAL_DIRPATH "/" SALFILE <bnode name> path that
 * SetSalFlag() builds) */
225 static RestoreSalFlag(abnode)
226 register struct fsbnode *abnode; {
227 char tbuffer[AFSDIR_PATH_MAX];
229 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
230 abnode->b.name, NULL);
231 if (access(tbuffer, 0) == 0) {
232 /* file exists, so need to salvage */
233 abnode->needsSalvage = 1;
/* flag file not accessible: no salvage pending */
236 abnode->needsSalvage = 0;
244 b = (char *) malloc(strlen(a)+1);
/* Free the command strings stored in the bnode: filecmd, volcmd and salcmd
 * unconditionally, scancmd only when it is set. */
249 static int fs_delete(abnode)
250 struct fsbnode *abnode; {
251 free(abnode->filecmd);
252 free(abnode->volcmd);
253 free(abnode->salcmd);
254 if (abnode->scancmd) free(abnode->scancmd);
/* Look at the filename extension of cmd: _splitpath() copies it into cmdext,
 * and an empty cmdext is the case in which .exe gets appended. */
261 static void AppendExecutableExtension(char *cmd)
263 char cmdext[_MAX_EXT];
/* only the extension component is requested; the other outputs are NULL */
265 _splitpath(cmd, NULL, NULL, NULL, cmdext);
266 if (*cmdext == '\0') {
267 /* no filename extension supplied for cmd; append .exe */
271 #endif /* AFS_NT40_ENV */
/* Create an fs bnode for instance ainstance. The command paths are converted
 * to local form with ConstructLocalBinPath(), the first word of each converted
 * command is checked with stat() to make sure the binary is there, and the
 * converted strings are stored in a freshly allocated struct fsbnode. The
 * scanner command is handled only when ascancmd is a non-empty string.
 * Returns the new bnode cast to struct bnode *. */
274 struct bnode *fs_create(ainstance, afilecmd, avolcmd, asalcmd, ascancmd)
281 register struct fsbnode *te;
282 char cmdname[AFSDIR_PATH_MAX];
283 char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
286 fileCmdpath = volCmdpath = salCmdpath = NULL;
288 /* construct local paths from canonical (wire-format) paths */
289 if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
290 bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
293 if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
294 bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
297 if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
298 bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
302 if (ascancmd && strlen(ascancmd)) {
303 if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
304 bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
/* sscanf "%s" copies the first whitespace-delimited word of the command line,
 * i.e. the program path without its arguments.
 * NOTE(review): the %s conversion carries no field width while cmdname holds
 * AFSDIR_PATH_MAX bytes -- confirm the path length is bounded upstream. */
310 sscanf(fileCmdpath, "%s", cmdname);
312 AppendExecutableExtension(cmdname);
314 if (stat(cmdname, &tstat)) {
315 bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
319 sscanf(volCmdpath, "%s", cmdname);
321 AppendExecutableExtension(cmdname);
323 if (stat(cmdname, &tstat)) {
324 bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
328 sscanf(salCmdpath, "%s", cmdname);
330 AppendExecutableExtension(cmdname);
332 if (stat(cmdname, &tstat)) {
333 bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
337 if (ascancmd && strlen(ascancmd)) {
338 sscanf(scanCmdpath, "%s", cmdname);
340 AppendExecutableExtension(cmdname);
342 if (stat(cmdname, &tstat)) {
343 bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
350 free(fileCmdpath); free(volCmdpath); free(salCmdpath);
/* NOTE(review): the malloc() result is handed to memset() without a NULL
 * check. */
354 te = (struct fsbnode *) malloc(sizeof(struct fsbnode));
355 memset(te, 0, sizeof(struct fsbnode));
/* the bnode keeps the converted path strings themselves, not copies */
356 te->filecmd = fileCmdpath;
357 te->volcmd = volCmdpath;
358 te->salcmd = salCmdpath;
359 if (ascancmd && strlen(ascancmd))
360 te->scancmd = scanCmdpath;
363 bnode_InitBnode(te, &fsbnode_ops, ainstance);
364 bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every POLLTIME seconds */
365 RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
366 SetNeedsClock(te); /* compute needsClock field */
367 return (struct bnode *) te;
370 /* called to SIGKILL a process if it doesn't terminate normally */
/* For each process that is in shutdown-wait and has not been sent a kill yet,
 * send SIGKILL once more than SDTIME seconds (FSSDTIME for the file server)
 * have passed since timeSDStarted, record that the kill was sent and log the
 * failure. SetNeedsClock() is then called to recompute needsClock. */
371 static int fs_timeout(abnode)
372 struct fsbnode *abnode; {
373 register afs_int32 now;
375 now = FT_ApproxTime();
377 if (abnode->volSDW) {
378 if (!abnode->volKillSent && now - abnode->timeSDStarted > SDTIME) {
379 bnode_StopProc(abnode->volProc, SIGKILL);
380 abnode->volKillSent = 1;
381 bozo_Log("bos shutdown: volserver failed to shutdown within %d seconds\n",
385 if (abnode->salSDW) {
386 if (!abnode->salKillSent && now - abnode->timeSDStarted > SDTIME) {
387 bnode_StopProc(abnode->salProc, SIGKILL);
388 abnode->salKillSent = 1;
389 bozo_Log("bos shutdown: salvager failed to shutdown within %d seconds\n",
/* the file server is measured against FSSDTIME rather than SDTIME */
393 if (abnode->fileSDW) {
394 if (!abnode->fileKillSent && now - abnode->timeSDStarted > FSSDTIME) {
395 bnode_StopProc(abnode->fileProc, SIGKILL);
396 abnode->fileKillSent = 1;
397 bozo_Log("bos shutdown: fileserver failed to shutdown within %d seconds\n",
401 if (abnode->scanSDW) {
402 if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
403 bnode_StopProc(abnode->scanProc, SIGKILL);
404 abnode->scanKillSent = 1;
405 bozo_Log("bos shutdown: scanner failed to shutdown within %d seconds\n",
409 SetNeedsClock(abnode);
/* Work out the BSTAT_* status of the bnode from its process flags. The tests
 * are ordered: any shutdown-wait flag set gives BSTAT_SHUTTINGDOWN; a running
 * salvager gives BSTAT_NORMAL; volserver and file server both running (plus
 * the scanner when scancmd is set) gives BSTAT_NORMAL; nothing running gives
 * BSTAT_SHUTDOWN; anything else is BSTAT_STARTINGUP. */
413 static int fs_getstat(abnode, astatus)
414 struct fsbnode *abnode;
415 afs_int32 *astatus; {
416 register afs_int32 temp;
417 if (abnode->volSDW || abnode->fileSDW || abnode->salSDW || abnode->scanSDW)
418 temp = BSTAT_SHUTTINGDOWN;
/* a running salvager is reported as normal operation */
419 else if (abnode->salRunning) temp = BSTAT_NORMAL;
420 else if (abnode->volRunning && abnode->fileRunning && (!abnode->scancmd ||
421 abnode->scanRunning)) temp = BSTAT_NORMAL;
422 else if (!abnode->salRunning && !abnode->volRunning && !abnode->fileRunning
423 && !abnode->scanRunning) temp = BSTAT_SHUTDOWN;
424 else temp = BSTAT_STARTINGUP;
/* Status-change entry point; the value returned is whatever NudgeProcs()
 * returns for this bnode. */
429 static int fs_setstat(abnode, astatus)
430 register struct fsbnode *abnode;
432 return NudgeProcs(abnode);
/* Called with the process aproc that has exited. The matching running and
 * kill-sent fields of the bnode are reset according to which of volProc,
 * fileProc, salProc or scanProc it is; for the file server and the salvager
 * the salvage flag may also be cleared via SetSalFlag(abnode, 0). Finally
 * NudgeProcs() is called and its result returned. */
435 static int fs_procexit(abnode, aproc)
436 struct fsbnode *abnode;
437 struct bnode_proc *aproc; {
438 /* process has exited */
440 if (aproc == abnode->volProc) {
442 abnode->volRunning = 0;
444 abnode->volKillSent = 0;
446 else if (aproc == abnode->fileProc) {
447 /* if we were expecting a shutdown and we didn't send a kill signal
448 * and exited (didn't have a signal termination), then we assume that
449 * the file server exited after putting the appropriate volumes safely
450 * offline, and don't salvage next time.
452 if (abnode->fileSDW && !abnode->fileKillSent && aproc->lastSignal == 0)
453 SetSalFlag(abnode, 0); /* shut down normally */
454 abnode->fileProc = 0;
455 abnode->fileRunning = 0;
457 abnode->fileKillSent = 0;
459 else if (aproc == abnode->salProc) {
460 /* if we didn't shutdown the salvager, then assume it exited ok, and thus
461 that we don't have to salvage again */
463 SetSalFlag(abnode, 0); /* salvage just completed */
465 abnode->salRunning = 0;
467 abnode->salKillSent = 0;
469 else if (aproc == abnode->scanProc) {
470 abnode->scanProc = 0;
471 abnode->scanRunning = 0;
473 abnode->scanKillSent = 0;
476 /* now restart anyone who needs to restart */
477 return NudgeProcs(abnode);
480 /* make sure we're periodically checking the state if we need to */
/* needsClock is cleared in the two settled states (goal 1 with file server,
 * volserver and any configured scanner running; goal 0 with nothing running)
 * and set in every other state. */
481 static SetNeedsClock(ab)
482 register struct fsbnode *ab;
484 if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
485 && (!ab->scancmd || ab->scanRunning))
486 ab->needsClock = 0; /* running normally */
487 else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
488 && !ab->salRunning && !ab->scanRunning)
489 ab->needsClock = 0; /* halted normally */
490 else ab->needsClock = 1; /* other */
/* request POLLTIME activations only when none is pending already */
491 if (ab->needsClock && !bnode_PendingTimeout(ab))
492 bnode_SetTimeout(ab, POLLTIME);
/* no clock needed: timeout set to 0 (presumably this cancels it -- confirm
 * against bnode_SetTimeout) */
493 if (!ab->needsClock) bnode_SetTimeout(ab, 0);
/* Start or stop processes so that the bnode moves toward b.goal.
 * Goal 1: with the file server running, start the volserver and any
 * configured scanner that is not running (a salvager running at the same time
 * triggers an emergency shutdown). With the file server not running, either
 * start everything (needsSalvage clear) or stop the remaining servers and
 * then start the salvager (needsSalvage set).
 * Any other goal: signal every running process that is not already in
 * shutdown-wait. The file server is stopped with SIGQUIT, the others with
 * SIGTERM. */
496 static NudgeProcs(abnode)
497 register struct fsbnode *abnode; {
498 struct bnode_proc *tp; /* not register */
499 register afs_int32 code;
502 now = FT_ApproxTime();
503 if (abnode->b.goal == 1) {
504 /* we're trying to run the system. If the file server is running, then we
505 are trying to start up the system. If it is not running, then needsSalvage
506 tells us if we need to run the salvager or not */
507 if (abnode->fileRunning) {
508 if (abnode->salRunning) {
509 bozo_Log("Salvager running along with file server!\n");
510 bozo_Log("Emergency shutdown\n");
/* change the goal to shutdown and kill the salvager outright */
512 bnode_SetGoal(abnode, BSTAT_SHUTDOWN);
513 bnode_StopProc(abnode->salProc, SIGKILL);
514 SetNeedsClock(abnode);
/* file server is up: start the volserver if it is not running */
517 if (!abnode->volRunning) {
518 abnode->lastVolStart = FT_ApproxTime();
519 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
521 abnode->volProc = tp;
522 abnode->volRunning = 1;
/* likewise for the scanner, when one is configured */
525 if (abnode->scancmd) {
526 if (!abnode->scanRunning) {
527 abnode->lastScanStart = FT_ApproxTime();
528 code = bnode_NewProc(abnode, abnode->scancmd, "scanner", &tp);
530 abnode->scanProc = tp;
531 abnode->scanRunning = 1;
536 else { /* file is not running */
537 /* see how to start */
538 if (!abnode->needsSalvage) {
539 /* no crash apparent, just start up normally */
540 if (!abnode->fileRunning) {
541 abnode->lastFileStart = FT_ApproxTime();
542 code = bnode_NewProc(abnode, abnode->filecmd, "file", &tp);
544 abnode->fileProc = tp;
545 abnode->fileRunning = 1;
/* flag is raised at file server start; fs_procexit() clears it again
 * when the file server exits cleanly */
546 SetSalFlag(abnode, 1);
549 if (!abnode->volRunning) {
550 abnode->lastVolStart = FT_ApproxTime();
551 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
553 abnode->volProc = tp;
554 abnode->volRunning = 1;
557 if (abnode->scancmd && !abnode->scanRunning) {
558 abnode->lastScanStart = FT_ApproxTime();
559 code = bnode_NewProc(abnode, abnode->scancmd, "scanner",
562 abnode->scanProc = tp;
563 abnode->scanRunning = 1;
567 else { /* needs to be salvaged */
568 /* make sure file server and volser are gone */
569 if (abnode->volRunning) {
570 bnode_StopProc(abnode->volProc, SIGTERM);
/* timeSDStarted is stamped only when the shutdown wait begins */
571 if (!abnode->volSDW) abnode->timeSDStarted = now;
574 if (abnode->fileRunning) {
575 bnode_StopProc(abnode->fileProc, SIGQUIT);
576 if (!abnode->fileSDW) abnode->timeSDStarted = now;
579 if (abnode->scanRunning) {
580 bnode_StopProc(abnode->scanProc, SIGTERM);
581 if (!abnode->scanSDW) abnode->timeSDStarted = now;
/* something is still running: return without starting the salvager */
584 if (abnode->volRunning || abnode->fileRunning
585 || abnode->scanRunning) return 0;
586 /* otherwise, it is safe to start salvager */
587 if (!abnode->salRunning) {
588 code = bnode_NewProc(abnode, abnode->salcmd, "salv", &tp);
590 abnode->salProc = tp;
591 abnode->salRunning = 1;
597 else { /* goal is 0, we're shutting down */
598 /* trying to shutdown */
/* only processes not already in shutdown-wait are signalled */
599 if (abnode->salRunning && !abnode->salSDW) {
600 bnode_StopProc(abnode->salProc, SIGTERM);
602 abnode->timeSDStarted = now;
604 if (abnode->fileRunning && !abnode->fileSDW) {
605 bnode_StopProc(abnode->fileProc, SIGQUIT);
607 abnode->timeSDStarted = now;
609 if (abnode->volRunning && !abnode->volSDW) {
610 bnode_StopProc(abnode->volProc, SIGTERM);
612 abnode->timeSDStarted = now;
614 if (abnode->scanRunning && !abnode->scanSDW) {
615 bnode_StopProc(abnode->scanProc, SIGTERM);
617 abnode->timeSDStarted = now;
620 SetNeedsClock(abnode);
/* Copy a human-readable description of the bnode's state into abuffer.
 * Returns -1 without writing anything when alen is below 40. The message is
 * chosen from b.goal and the running / shutdown-wait flags. */
624 static int fs_getstring(abnode, abuffer, alen)
625 struct fsbnode *abnode;
/* the longest message below is 39 characters, so 40 bytes hold it together
 * with its terminating NUL */
628 if (alen < 40) return -1;
629 if (abnode->b.goal == 1) {
630 if (abnode->fileRunning) {
631 if (abnode->fileSDW) strcpy(abuffer, "file server shutting down");
/* with a scanner configured, report volser and scanner state separately */
632 else if (abnode->scancmd) {
633 if (!abnode->volRunning && !abnode->scanRunning)
634 strcpy(abuffer, "file server up; volser and scanner down");
635 else if (abnode->volRunning && !abnode->scanRunning)
636 strcpy(abuffer, "file server up; volser up; scanner down");
637 else if (!abnode->volRunning && abnode->scanRunning)
638 strcpy(abuffer, "file server up; volser down; scanner up");
640 else strcpy(abuffer, "file server running");
642 else if (!abnode->volRunning)
643 strcpy(abuffer, "file server up; volser down");
644 else strcpy(abuffer, "file server running");
646 else if (abnode->salRunning) {
647 strcpy(abuffer, "salvaging file system");
649 else strcpy(abuffer, "starting file server");
/* messages used when b.goal is not 1 */
653 if (abnode->fileRunning || abnode->volRunning || abnode->scanRunning) {
654 strcpy(abuffer, "file server shutting down");
656 else if (abnode->salRunning)
657 strcpy(abuffer, "salvager shutting down");
658 else strcpy(abuffer, "file server shut down");
/* Copy one of the bnode's command strings into abuffer, selected by aindex:
 * filecmd first, then 1 = volcmd, 2 = salcmd, 3 = scancmd (only when set).
 * NOTE(review): the copies use strcpy() and no use of alen is visible in
 * these lines -- confirm the buffer length is checked before copying. */
663 static fs_getparm(abnode, aindex, abuffer, alen)
664 struct fsbnode *abnode;
669 strcpy(abuffer, abnode->filecmd);
670 else if (aindex == 1)
671 strcpy(abuffer, abnode->volcmd);
672 else if (aindex == 2)
673 strcpy(abuffer, abnode->salcmd);
674 else if (aindex == 3 && abnode->scancmd)
675 strcpy(abuffer, abnode->scancmd);