2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 #include <sys/types.h>
27 #endif /* AFS_NT40_ENV */
29 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
30 #include <afs/afsutil.h>
33 static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
34 static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
35 static int fs_hascore();
36 struct bnode *fs_create();
37 struct bnode *fsmr_create();
39 static SetNeedsClock();
42 static int emergency = 0;
44 /* if this file exists, then we have to salvage the file system */
45 #define SALFILE "SALVAGE."
47 #define POLLTIME 20 /* for handling below */
48 #define SDTIME 60 /* time in seconds given to a process to evaporate */
51 Normal operation involves having the file server and the vol server both running.
53 If the vol server terminates, it can simply be restarted.
55 If the file server terminates, the disk must be salvaged before the file server
56 can be restarted. In order to restart either the file server or the salvager,
57 the vol server must be shut down.
59 If the file server terminates *normally* (exits after receiving a SIGQUIT)
60 then we don't have to salvage it.
62 The needsSalvage flag is set when the file server is started. It is cleared
63 if the file server exits when fileSDW is true but fileKillSent is false,
64 indicating that it exited after receiving a quit, but before we sent it a kill.
66 The needsSalvage flag is cleared when the salvager exits.
69 struct bnode_ops fsbnode_ops = {
84 afs_int32 timeSDStarted; /* time shutdown operation started */
85 char *filecmd; /* command to start primary file server */
86 char *volcmd; /* command to start secondary vol server */
87 char *salcmd; /* command to start salvager */
88 char *scancmd; /* command to start scanner (MR-AFS) */
89 struct bnode_proc *fileProc; /* process for file server */
90 struct bnode_proc *volProc; /* process for vol server */
91 struct bnode_proc *salProc; /* process for salvager */
92 struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */
93 afs_int32 lastFileStart; /* last start for file */
94 afs_int32 lastVolStart; /* last start for vol */
95 afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */
96 char fileRunning; /* file process is running */
97 char volRunning; /* volser is running */
98 char salRunning; /* salvager is running */
99 char scanRunning; /* scanner is running (MR_AFS) */
100 char fileSDW; /* file shutdown wait */
101 char volSDW; /* vol shutdown wait */
102 char salSDW; /* waiting for the salvager to shutdown */
103 char scanSDW; /* scanner shutdown wait (MR_AFS) */
104 char fileKillSent; /* kill signal has been sent */
107 char scanKillSent; /* kill signal has been sent (MR_AFS) */
108 char needsSalvage; /* salvage before running */
109 char needsClock; /* do we need clock ticks */
112 /* Function to tell whether this bnode has a core file or not. You might
113 * think that this could be in bnode.c, and decide what core files to check
114 * for based on the bnode's coreName property, but that doesn't work because
115 * there may not be an active process for a bnode that dumped core at the
116 * time the query is done.
118 static int fs_hascore(abnode)
119 register struct ezbnode *abnode; {
122 /* see if file server has a core file */
123 bnode_CoreName(abnode, "file", tbuffer);
124 if (access(tbuffer, 0) == 0) return 1;
126 /* see if volserver has a core file */
127 bnode_CoreName(abnode, "vol", tbuffer);
128 if (access(tbuffer, 0) == 0) return 1;
130 /* see if salvager left a core file */
131 bnode_CoreName(abnode, "salv", tbuffer);
132 if (access(tbuffer, 0) == 0) return 1;
134 /* see if scanner left a core file (MR-AFS) */
135 bnode_CoreName(abnode, "scan", tbuffer);
136 if (access(tbuffer, 0) == 0) return 1;
138 /* no one left a core file */
142 static int fs_restartp (abnode)
143 register struct fsbnode *abnode; {
144 struct bnode_token *tt;
145 register afs_int32 code;
148 code = bnode_ParseLine(abnode->filecmd, &tt);
151 code = stat(tt->key, &tstat);
153 bnode_FreeTokens(tt);
156 if (tstat.st_ctime > abnode->lastFileStart) code = 1;
158 bnode_FreeTokens(tt);
159 if (code) return code;
161 /* now do same for volcmd */
162 code = bnode_ParseLine(abnode->volcmd, &tt);
165 code = stat(tt->key, &tstat);
167 bnode_FreeTokens(tt);
170 if (tstat.st_ctime > abnode->lastVolStart) code = 1;
172 bnode_FreeTokens(tt);
173 if (code) return code;
175 if (abnode->scancmd) { /* Only in MR-AFS */
176 /* now do same for scancmd (MR-AFS) */
177 code = bnode_ParseLine(abnode->scancmd, &tt);
180 code = stat(tt->key, &tstat);
182 bnode_FreeTokens(tt);
185 if (tstat.st_ctime > abnode->lastScanStart) code = 1;
187 bnode_FreeTokens(tt);
193 /* set needsSalvage flag, creating file SALVAGE.<instancename> if
194 we need to salvage the file system (so we can tell across panic reboots) */
195 static SetSalFlag(abnode, aflag)
196 register struct fsbnode *abnode;
197 register int aflag; {
198 char tbuffer[AFSDIR_PATH_MAX];
201 abnode->needsSalvage = aflag;
202 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
203 abnode->b.name, NULL);
205 fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
214 /* set the needsSalvage flag according to the existence of the salvage file */
215 static RestoreSalFlag(abnode)
216 register struct fsbnode *abnode; {
217 char tbuffer[AFSDIR_PATH_MAX];
219 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
220 abnode->b.name, NULL);
221 if (access(tbuffer, 0) == 0) {
222 /* file exists, so need to salvage */
223 abnode->needsSalvage = 1;
226 abnode->needsSalvage = 0;
234 b = (char *) malloc(strlen(a)+1);
239 static int fs_delete(abnode)
240 struct fsbnode *abnode; {
241 free(abnode->filecmd);
242 free(abnode->volcmd);
243 free(abnode->salcmd);
244 if (abnode->scancmd) free(abnode->scancmd);
251 static void AppendExecutableExtension(char *cmd)
253 char cmdext[_MAX_EXT];
255 _splitpath(cmd, NULL, NULL, NULL, cmdext);
256 if (*cmdext == '\0') {
257 /* no filename extension supplied for cmd; append .exe */
261 #endif /* AFS_NT40_ENV */
264 struct bnode *fs_create(ainstance, afilecmd, avolcmd, asalcmd, ascancmd)
271 register struct fsbnode *te;
272 char cmdname[AFSDIR_PATH_MAX];
273 char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
276 fileCmdpath = volCmdpath = salCmdpath = NULL;
278 /* construct local paths from canonical (wire-format) paths */
279 if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
280 bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
283 if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
284 bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
287 if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
288 bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
292 if (ascancmd && strlen(ascancmd)) {
293 if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
294 bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
300 sscanf(fileCmdpath, "%s", cmdname);
302 AppendExecutableExtension(cmdname);
304 if (stat(cmdname, &tstat)) {
305 bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
309 sscanf(volCmdpath, "%s", cmdname);
311 AppendExecutableExtension(cmdname);
313 if (stat(cmdname, &tstat)) {
314 bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
318 sscanf(salCmdpath, "%s", cmdname);
320 AppendExecutableExtension(cmdname);
322 if (stat(cmdname, &tstat)) {
323 bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
327 if (ascancmd && strlen(ascancmd)) {
328 sscanf(scanCmdpath, "%s", cmdname);
330 AppendExecutableExtension(cmdname);
332 if (stat(cmdname, &tstat)) {
333 bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
340 free(fileCmdpath); free(volCmdpath); free(salCmdpath);
341 return (struct bnode *)0;
344 te = (struct fsbnode *) malloc(sizeof(struct fsbnode));
345 memset(te, 0, sizeof(struct fsbnode));
346 te->filecmd = fileCmdpath;
347 te->volcmd = volCmdpath;
348 te->salcmd = salCmdpath;
349 if (ascancmd && strlen(ascancmd))
350 te->scancmd = scanCmdpath;
352 te->scancmd = (char *)0;
353 bnode_InitBnode(te, &fsbnode_ops, ainstance);
354 bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every POLLTIME seconds */
355 RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
356 SetNeedsClock(te); /* compute needsClock field */
357 return (struct bnode *) te;
360 /* called to SIGKILL a process if it doesn't terminate normally */
361 static int fs_timeout(abnode)
362 struct fsbnode *abnode; {
363 register afs_int32 now;
365 now = FT_ApproxTime();
367 if (abnode->volSDW) {
368 if (!abnode->volKillSent && now - abnode->timeSDStarted > SDTIME) {
369 bnode_StopProc(abnode->volProc, SIGKILL);
370 abnode->volKillSent = 1;
371 bozo_Log("bos shutdown: volserver failed to shutdown within %d seconds\n",
375 if (abnode->salSDW) {
376 if (!abnode->salKillSent && now - abnode->timeSDStarted > SDTIME) {
377 bnode_StopProc(abnode->salProc, SIGKILL);
378 abnode->salKillSent = 1;
379 bozo_Log("bos shutdown: salvager failed to shutdown within %d seconds\n",
383 if (abnode->fileSDW) {
384 if (!abnode->fileKillSent && now - abnode->timeSDStarted > FSSDTIME) {
385 bnode_StopProc(abnode->fileProc, SIGKILL);
386 abnode->fileKillSent = 1;
387 bozo_Log("bos shutdown: fileserver failed to shutdown within %d seconds\n",
391 if (abnode->scanSDW) {
392 if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
393 bnode_StopProc(abnode->scanProc, SIGKILL);
394 abnode->scanKillSent = 1;
395 bozo_Log("bos shutdown: scanner failed to shutdown within %d seconds\n",
399 SetNeedsClock(abnode);
403 static int fs_getstat(abnode, astatus)
404 struct fsbnode *abnode;
405 afs_int32 *astatus; {
406 register afs_int32 temp;
407 if (abnode->volSDW || abnode->fileSDW || abnode->salSDW || abnode->scanSDW)
408 temp = BSTAT_SHUTTINGDOWN;
409 else if (abnode->salRunning) temp = BSTAT_NORMAL;
410 else if (abnode->volRunning && abnode->fileRunning && (!abnode->scancmd ||
411 abnode->scanRunning)) temp = BSTAT_NORMAL;
412 else if (!abnode->salRunning && !abnode->volRunning && !abnode->fileRunning
413 && !abnode->scanRunning) temp = BSTAT_SHUTDOWN;
414 else temp = BSTAT_STARTINGUP;
419 static int fs_setstat(abnode, astatus)
420 register struct fsbnode *abnode;
422 return NudgeProcs(abnode);
425 static int fs_procexit(abnode, aproc)
426 struct fsbnode *abnode;
427 struct bnode_proc *aproc; {
428 /* process has exited */
430 if (aproc == abnode->volProc) {
432 abnode->volRunning = 0;
434 abnode->volKillSent = 0;
436 else if (aproc == abnode->fileProc) {
437 /* if we were expecting a shutdown and we didn't send a kill signal
438 * and exited (didn't have a signal termination), then we assume that
439 * the file server exited after putting the appropriate volumes safely
440 * offline, and don't salvage next time.
442 if (abnode->fileSDW && !abnode->fileKillSent && aproc->lastSignal == 0)
443 SetSalFlag(abnode, 0); /* shut down normally */
444 abnode->fileProc = 0;
445 abnode->fileRunning = 0;
447 abnode->fileKillSent = 0;
449 else if (aproc == abnode->salProc) {
450 /* if we didn't shutdown the salvager, then assume it exited ok, and thus
451 that we don't have to salvage again */
453 SetSalFlag(abnode, 0); /* salvage just completed */
455 abnode->salRunning = 0;
457 abnode->salKillSent = 0;
459 else if (aproc == abnode->scanProc) {
460 abnode->scanProc = 0;
461 abnode->scanRunning = 0;
463 abnode->scanKillSent = 0;
466 /* now restart anyone who needs to restart */
467 return NudgeProcs(abnode);
470 /* make sure we're periodically checking the state if we need to */
471 static SetNeedsClock(ab)
472 register struct fsbnode *ab;
474 if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
475 && (!ab->scancmd || ab->scanRunning))
476 ab->needsClock = 0; /* running normally */
477 else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
478 && !ab->salRunning && !ab->scanRunning)
479 ab->needsClock = 0; /* halted normally */
480 else ab->needsClock = 1; /* other */
481 if (ab->needsClock && !bnode_PendingTimeout(ab))
482 bnode_SetTimeout(ab, POLLTIME);
483 if (!ab->needsClock) bnode_SetTimeout(ab, 0);
486 static NudgeProcs(abnode)
487 register struct fsbnode *abnode; {
488 struct bnode_proc *tp; /* not register */
489 register afs_int32 code;
492 now = FT_ApproxTime();
493 if (abnode->b.goal == 1) {
494 /* we're trying to run the system. If the file server is running, then we
495 are trying to start up the system. If it is not running, then needsSalvage
496 tells us if we need to run the salvager or not */
497 if (abnode->fileRunning) {
498 if (abnode->salRunning) {
499 bozo_Log("Salvager running along with file server!\n");
500 bozo_Log("Emergency shutdown\n");
502 bnode_SetGoal(abnode, BSTAT_SHUTDOWN);
503 bnode_StopProc(abnode->salProc, SIGKILL);
504 SetNeedsClock(abnode);
507 if (!abnode->volRunning) {
508 abnode->lastVolStart = FT_ApproxTime();
509 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
511 abnode->volProc = tp;
512 abnode->volRunning = 1;
515 if (abnode->scancmd) {
516 if (!abnode->scanRunning) {
517 abnode->lastScanStart = FT_ApproxTime();
518 code = bnode_NewProc(abnode, abnode->scancmd, "scanner", &tp);
520 abnode->scanProc = tp;
521 abnode->scanRunning = 1;
526 else { /* file is not running */
527 /* see how to start */
528 if (!abnode->needsSalvage) {
529 /* no crash apparent, just start up normally */
530 if (!abnode->fileRunning) {
531 abnode->lastFileStart = FT_ApproxTime();
532 code = bnode_NewProc(abnode, abnode->filecmd, "file", &tp);
534 abnode->fileProc = tp;
535 abnode->fileRunning = 1;
536 SetSalFlag(abnode, 1);
539 if (!abnode->volRunning) {
540 abnode->lastVolStart = FT_ApproxTime();
541 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
543 abnode->volProc = tp;
544 abnode->volRunning = 1;
547 if (abnode->scancmd && !abnode->scanRunning) {
548 abnode->lastScanStart = FT_ApproxTime();
549 code = bnode_NewProc(abnode, abnode->scancmd, "scanner",
552 abnode->scanProc = tp;
553 abnode->scanRunning = 1;
557 else { /* needs to be salvaged */
558 /* make sure file server and volser are gone */
559 if (abnode->volRunning) {
560 bnode_StopProc(abnode->volProc, SIGTERM);
561 if (!abnode->volSDW) abnode->timeSDStarted = now;
564 if (abnode->fileRunning) {
565 bnode_StopProc(abnode->fileProc, SIGQUIT);
566 if (!abnode->fileSDW) abnode->timeSDStarted = now;
569 if (abnode->scanRunning) {
570 bnode_StopProc(abnode->scanProc, SIGTERM);
571 if (!abnode->scanSDW) abnode->timeSDStarted = now;
574 if (abnode->volRunning || abnode->fileRunning
575 || abnode->scanRunning) return 0;
576 /* otherwise, it is safe to start salvager */
577 if (!abnode->salRunning) {
578 code = bnode_NewProc(abnode, abnode->salcmd, "salv", &tp);
580 abnode->salProc = tp;
581 abnode->salRunning = 1;
587 else { /* goal is 0, we're shutting down */
588 /* trying to shutdown */
589 if (abnode->salRunning && !abnode->salSDW) {
590 bnode_StopProc(abnode->salProc, SIGTERM);
592 abnode->timeSDStarted = now;
594 if (abnode->fileRunning && !abnode->fileSDW) {
595 bnode_StopProc(abnode->fileProc, SIGQUIT);
597 abnode->timeSDStarted = now;
599 if (abnode->volRunning && !abnode->volSDW) {
600 bnode_StopProc(abnode->volProc, SIGTERM);
602 abnode->timeSDStarted = now;
604 if (abnode->scanRunning && !abnode->scanSDW) {
605 bnode_StopProc(abnode->scanProc, SIGTERM);
607 abnode->timeSDStarted = now;
610 SetNeedsClock(abnode);
614 static int fs_getstring(abnode, abuffer, alen)
615 struct fsbnode *abnode;
618 if (alen < 40) return -1;
619 if (abnode->b.goal == 1) {
620 if (abnode->fileRunning) {
621 if (abnode->fileSDW) strcpy(abuffer, "file server shutting down");
622 else if (abnode->scancmd) {
623 if (!abnode->volRunning && !abnode->scanRunning)
624 strcpy(abuffer, "file server up; volser and scanner down");
625 else if (abnode->volRunning && !abnode->scanRunning)
626 strcpy(abuffer, "file server up; volser up; scanner down");
627 else if (!abnode->volRunning && abnode->scanRunning)
628 strcpy(abuffer, "file server up; volser down; scanner up");
630 else strcpy(abuffer, "file server running");
632 else if (!abnode->volRunning)
633 strcpy(abuffer, "file server up; volser down");
634 else strcpy(abuffer, "file server running");
636 else if (abnode->salRunning) {
637 strcpy(abuffer, "salvaging file system");
639 else strcpy(abuffer, "starting file server");
643 if (abnode->fileRunning || abnode->volRunning || abnode->scanRunning) {
644 strcpy(abuffer, "file server shutting down");
646 else if (abnode->salRunning)
647 strcpy(abuffer, "salvager shutting down");
648 else strcpy(abuffer, "file server shut down");
653 static fs_getparm(abnode, aindex, abuffer, alen)
654 struct fsbnode *abnode;
659 strcpy(abuffer, abnode->filecmd);
660 else if (aindex == 1)
661 strcpy(abuffer, abnode->volcmd);
662 else if (aindex == 2)
663 strcpy(abuffer, abnode->salcmd);
664 else if (aindex == 3 && abnode->scancmd)
665 strcpy(abuffer, abnode->scancmd);