2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afs/param.h>
11 #include <afsconfig.h>
15 #include <sys/types.h>
27 #endif /* AFS_NT40_ENV */
29 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
30 #include <afs/afsutil.h>
/* Forward declarations (old-style, without parameter lists) for the bnode
 * operations and helpers used in this file. */
33 static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
34 static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
35 static int fs_hascore();
36 struct bnode *fs_create();
37 struct bnode *fsmr_create();
39 static SetNeedsClock();
/* NOTE(review): presumably tied to the "Emergency shutdown" path in
 * NudgeProcs(); confirm where this flag is set and read. */
42 static int emergency = 0;
44 /* if this file exists, then we have to salvage the file system */
/* Only the name prefix: SetSalFlag() and RestoreSalFlag() append the bnode's
 * name (abnode->b.name) to build the full file name. */
45 #define SALFILE "SALVAGE."
47 #define POLLTIME 20 /* interval handed to bnode_SetTimeout() while the bnode needs polling */
48 #define SDTIME 60 /* time in seconds given to a process to evaporate */
51 Normal operation involves having the file server and the vol server both running.
53 If the vol server terminates, it can simply be restarted.
55 If the file server terminates, the disk must be salvaged before the file server
56 can be restarted. In order to restart either the file server or the salvager,
57 the vol server must be shut down.
59 If the file server terminates *normally* (exits after receiving a SIGQUIT)
60 then we don't have to salvage it.
62 The needsSalvage flag is set when the file server is started. It is cleared
63 if the file server exits when fileSDW is true but fileKillSent is false,
64 indicating that it exited after receiving a quit, but before we sent it a kill.
66 The needsSalvage flag is cleared when the salvager exits.
/* Operations table for this bnode type; fs_create() passes its address to
 * bnode_InitBnode() for every instance it builds. */
69 struct bnode_ops fsbnode_ops = {
/* Per-instance state of the fs bnode.  The *cmd strings are installed by
 * fs_create() and released by fs_delete(); the remaining fields are
 * bookkeeping updated as processes are started, signalled and reaped. */
84 afs_int32 timeSDStarted; /* time shutdown operation started; fs_timeout() measures the grace period from here */
85 char *filecmd; /* command to start primary file server */
86 char *volcmd; /* command to start secondary vol server */
87 char *salcmd; /* command to start salvager */
88 char *scancmd; /* command to start scanner (MR-AFS); null when no scanner is configured */
89 struct bnode_proc *fileProc; /* process for file server */
90 struct bnode_proc *volProc; /* process for vol server */
91 struct bnode_proc *salProc; /* process for salvager */
92 struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */
93 afs_int32 lastFileStart; /* last start for file */
94 afs_int32 lastVolStart; /* last start for vol */
95 afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */
96 char fileRunning; /* file process is running */
97 char volRunning; /* volser is running */
98 char salRunning; /* salvager is running */
99 char scanRunning; /* scanner is running (MR_AFS) */
100 char fileSDW; /* file shutdown wait */
101 char volSDW; /* vol shutdown wait */
102 char salSDW; /* waiting for the salvager to shutdown */
103 char scanSDW; /* scanner shutdown wait (MR_AFS) */
104 char fileKillSent; /* SIGKILL has been sent to the file server (set by fs_timeout) */
107 char scanKillSent; /* SIGKILL has been sent to the scanner (MR_AFS; set by fs_timeout) */
108 char needsSalvage; /* salvage before running; mirrored on disk by the SALFILE marker */
109 char needsClock; /* do we need clock ticks (recomputed by SetNeedsClock) */
112 /* Function to tell whether this bnode has a core file or not. You might
113 * think that this could be in bnode.c, and decide what core files to check
114 * for based on the bnode's coreName property, but that doesn't work because
115 * there may not be an active process for a bnode that dumped core at the
116 * time the query is done.
 *
 * Checks, in order, the core file names that bnode_CoreName() produces for
 * "file", "vol", "salv" and "scan", and returns 1 as soon as access()
 * reports that one of them exists.
 *
 * NOTE(review): abnode is declared as struct ezbnode * here although the
 * other functions in this file take struct fsbnode *; it is only passed
 * through to bnode_CoreName(), but confirm the intended type.
118 static int fs_hascore(abnode)
119 register struct ezbnode *abnode; {
122 /* see if file server has a core file */
123 bnode_CoreName(abnode, "file", tbuffer);
/* mode 0 asks access() only whether the path exists */
124 if (access(tbuffer, 0) == 0) return 1;
126 /* see if volserver has a core file */
127 bnode_CoreName(abnode, "vol", tbuffer);
128 if (access(tbuffer, 0) == 0) return 1;
130 /* see if salvager left a core file */
131 bnode_CoreName(abnode, "salv", tbuffer);
132 if (access(tbuffer, 0) == 0) return 1;
134 /* see if scanner left a core file (MR-AFS) */
/* NOTE(review): NudgeProcs() starts the scanner under the name "scanner",
 * while the lookup here uses "scan" -- verify that both lead to the same
 * core file name. */
135 bnode_CoreName(abnode, "scan", tbuffer);
136 if (access(tbuffer, 0) == 0) return 1;
138 /* no one left a core file */
/* Report whether any server binary has changed since its process was last
 * started (presumably used to decide whether a restart is wanted -- confirm
 * against the caller).
 *
 * For filecmd, volcmd and, when configured, scancmd: the command line is
 * parsed with bnode_ParseLine(), tt->key (presumably the executable path) is
 * stat()ed, and a status-change time (st_ctime) newer than the matching
 * last*Start timestamp sets the result to 1.  The token list is released
 * with bnode_FreeTokens().  A non-zero result for filecmd or volcmd is
 * returned immediately, before the later commands are examined.
 */
142 static int fs_restartp (abnode)
143 register struct fsbnode *abnode; {
144 struct bnode_token *tt;
145 register afs_int32 code;
/* file server command first */
148 code = bnode_ParseLine(abnode->filecmd, &tt);
151 code = stat(tt->key, &tstat);
153 bnode_FreeTokens(tt);
/* binary changed after the file server was last started */
156 if (tstat.st_ctime > abnode->lastFileStart) code = 1;
158 bnode_FreeTokens(tt);
159 if (code) return code;
161 /* now do same for volcmd */
162 code = bnode_ParseLine(abnode->volcmd, &tt);
165 code = stat(tt->key, &tstat);
167 bnode_FreeTokens(tt);
170 if (tstat.st_ctime > abnode->lastVolStart) code = 1;
172 bnode_FreeTokens(tt);
173 if (code) return code;
175 if (abnode->scancmd) { /* Only in MR-AFS */
176 /* now do same for scancmd (MR-AFS) */
177 code = bnode_ParseLine(abnode->scancmd, &tt);
180 code = stat(tt->key, &tstat);
182 bnode_FreeTokens(tt);
185 if (tstat.st_ctime > abnode->lastScanStart) code = 1;
187 bnode_FreeTokens(tt);
193 /* set needsSalvage flag, creating file SALVAGE.<instancename> if
194 we need to salvage the file system (so we can tell over panic reboots) */
/* abnode->needsSalvage always receives aflag.  The marker path is built as
 * AFSDIR_SERVER_LOCAL_DIRPATH "/" SALFILE followed by abnode->b.name.
 * Declared without a return type, i.e. implicit int. */
195 static SetSalFlag(abnode, aflag)
196 register struct fsbnode *abnode;
197 register int aflag; {
198 char tbuffer[AFSDIR_PATH_MAX];
201 abnode->needsSalvage = aflag;
202 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
203 abnode->b.name, NULL);
/* O_CREAT | O_TRUNC: create the marker, or empty one that already exists */
205 fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
214 /* set the needsSalvage flag according to the existence of the salvage file */
/* Uses the same path construction as SetSalFlag(): AFSDIR_SERVER_LOCAL_DIRPATH
 * "/" SALFILE followed by abnode->b.name.  Called from fs_create() so that a
 * marker left on disk is picked up when the bnode is built. */
215 static RestoreSalFlag(abnode)
216 register struct fsbnode *abnode; {
217 char tbuffer[AFSDIR_PATH_MAX];
219 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
220 abnode->b.name, NULL);
/* mode 0 asks access() only whether the path exists */
221 if (access(tbuffer, 0) == 0) {
222 /* file exists, so need to salvage */
223 abnode->needsSalvage = 1;
/* no marker file: no salvage pending */
226 abnode->needsSalvage = 0;
/* strlen(a) + 1: room for the string plus its terminating NUL */
234 b = (char *) malloc(strlen(a)+1);
/* Release the command strings held by the bnode.  filecmd, volcmd and salcmd
 * are freed unconditionally; scancmd is optional (fs_create() leaves it null
 * when no scanner command is given), hence the guard before free(). */
239 static int fs_delete(abnode)
240 struct fsbnode *abnode; {
241 free(abnode->filecmd);
242 free(abnode->volcmd);
243 free(abnode->salcmd);
244 if (abnode->scancmd) free(abnode->scancmd);
/* Windows-only helper (the enclosing conditional ends with the
 * AFS_NT40_ENV #endif): make sure the command name in cmd carries a file
 * name extension.  _splitpath() is asked for the extension component only;
 * when that comes back empty, ".exe" is appended to cmd.
 * NOTE(review): cmd is modified in place -- confirm the caller's buffer has
 * room for the extra characters. */
251 static void AppendExecutableExtension(char *cmd)
253 char cmdext[_MAX_EXT];
255 _splitpath(cmd, NULL, NULL, NULL, cmdext);
256 if (*cmdext == '\0') {
257 /* no filename extension supplied for cmd; append .exe */
261 #endif /* AFS_NT40_ENV */
/* Build a new fs bnode for instance ainstance from the file server, volume
 * server, salvager and (optional) scanner command lines.
 *
 * Each command is first converted with ConstructLocalBinPath(); its first
 * whitespace-delimited word is then pulled out with sscanf("%s") and
 * stat()ed to check that the binary exists.  Problems are reported through
 * bozo_Log().  On the failure path the file/vol/salvager path strings are
 * freed and a null bnode pointer is returned; otherwise a zeroed
 * struct fsbnode takes over the path strings, is registered with
 * bnode_InitBnode(), gets a POLLTIME timeout, and has needsSalvage and
 * needsClock initialised before being returned.
 *
 * The scanner command is used only when ascancmd is non-null and non-empty;
 * otherwise te->scancmd is set to a null pointer.
 *
 * NOTE(review): sscanf("%s") into cmdname is not length-limited, the
 * malloc() result is passed to bzero() without a check, and scanCmdpath
 * appears neither in the NULL initialisation nor in the free() calls of the
 * failure path -- confirm whether these are handled elsewhere.
 */
264 struct bnode *fs_create(ainstance, afilecmd, avolcmd, asalcmd, ascancmd)
271 register struct fsbnode *te;
272 char cmdname[AFSDIR_PATH_MAX];
273 char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
276 fileCmdpath = volCmdpath = salCmdpath = NULL;
278 /* construct local paths from canonical (wire-format) paths */
279 if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
280 bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
283 if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
284 bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
287 if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
288 bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
292 if (ascancmd && strlen(ascancmd)) {
293 if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
294 bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
/* check that each binary exists: "%s" stops at the first whitespace, so
 * cmdname holds the command word without its arguments */
300 sscanf(fileCmdpath, "%s", cmdname);
302 AppendExecutableExtension(cmdname);
304 if (stat(cmdname, &tstat)) {
305 bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
309 sscanf(volCmdpath, "%s", cmdname);
311 AppendExecutableExtension(cmdname);
313 if (stat(cmdname, &tstat)) {
314 bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
318 sscanf(salCmdpath, "%s", cmdname);
320 AppendExecutableExtension(cmdname);
322 if (stat(cmdname, &tstat)) {
323 bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
327 if (ascancmd && strlen(ascancmd)) {
328 sscanf(scanCmdpath, "%s", cmdname);
330 AppendExecutableExtension(cmdname);
332 if (stat(cmdname, &tstat)) {
333 bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
/* failure path: give back the converted paths and report no bnode */
340 free(fileCmdpath); free(volCmdpath); free(salCmdpath);
341 return (struct bnode *)0;
344 te = (struct fsbnode *) malloc(sizeof(struct fsbnode));
345 bzero(te, sizeof(struct fsbnode));
/* the bnode now owns the path strings; fs_delete() frees them */
346 te->filecmd = fileCmdpath;
347 te->volcmd = volCmdpath;
348 te->salcmd = salCmdpath;
349 if (ascancmd && strlen(ascancmd))
350 te->scancmd = scanCmdpath;
352 te->scancmd = (char *)0;
353 bnode_InitBnode(te, &fsbnode_ops, ainstance);
354 bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every POLLTIME seconds */
355 RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
356 SetNeedsClock(te); /* compute needsClock field */
357 return (struct bnode *) te;
360 /* called to SIGKILL a process if it doesn't terminate normally */
/* For each process that is in its shutdown-wait state (volSDW, salSDW,
 * fileSDW, scanSDW): if no SIGKILL has been sent yet and more than the grace
 * period has elapsed since timeSDStarted, send SIGKILL through
 * bnode_StopProc(), record that in the matching *KillSent flag and log the
 * failure.  The grace period is SDTIME, except for the file server, which is
 * compared against FSSDTIME.  Ends by recomputing needsClock. */
361 static int fs_timeout(abnode)
362 struct fsbnode *abnode; {
363 register afs_int32 now;
365 now = FT_ApproxTime();
367 if (abnode->volSDW) {
368 if (!abnode->volKillSent && now - abnode->timeSDStarted > SDTIME) {
369 bnode_StopProc(abnode->volProc, SIGKILL);
370 abnode->volKillSent = 1;
371 bozo_Log("bos shutdown: volserver failed to shutdown within %d seconds\n",
375 if (abnode->salSDW) {
376 if (!abnode->salKillSent && now - abnode->timeSDStarted > SDTIME) {
377 bnode_StopProc(abnode->salProc, SIGKILL);
378 abnode->salKillSent = 1;
379 bozo_Log("bos shutdown: salvager failed to shutdown within %d seconds\n",
383 if (abnode->fileSDW) {
/* the file server gets its own limit, FSSDTIME, instead of SDTIME */
384 if (!abnode->fileKillSent && now - abnode->timeSDStarted > FSSDTIME) {
385 bnode_StopProc(abnode->fileProc, SIGKILL);
386 abnode->fileKillSent = 1;
387 bozo_Log("bos shutdown: fileserver failed to shutdown within %d seconds\n",
391 if (abnode->scanSDW) {
392 if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
393 bnode_StopProc(abnode->scanProc, SIGKILL);
394 abnode->scanKillSent = 1;
395 bozo_Log("bos shutdown: scanner failed to shutdown within %d seconds\n",
399 SetNeedsClock(abnode);
/* Work out the bnode's overall status.  The tests are made in this order:
 *   - any process in shutdown wait (*SDW)            -> BSTAT_SHUTTINGDOWN
 *   - salvager running                               -> BSTAT_NORMAL
 *   - volserver and file server running (and the
 *     scanner, when scancmd is configured)           -> BSTAT_NORMAL
 *   - nothing running                                -> BSTAT_SHUTDOWN
 *   - anything else                                  -> BSTAT_STARTINGUP
 * NOTE(review): temp is presumably delivered through *astatus -- confirm. */
402 static int fs_getstat(abnode, astatus)
403 struct fsbnode *abnode;
404 afs_int32 *astatus; {
405 register afs_int32 temp;
406 if (abnode->volSDW || abnode->fileSDW || abnode->salSDW || abnode->scanSDW)
407 temp = BSTAT_SHUTTINGDOWN;
408 else if (abnode->salRunning) temp = BSTAT_NORMAL;
409 else if (abnode->volRunning && abnode->fileRunning && (!abnode->scancmd ||
410 abnode->scanRunning)) temp = BSTAT_NORMAL;
411 else if (!abnode->salRunning && !abnode->volRunning && !abnode->fileRunning
412 && !abnode->scanRunning) temp = BSTAT_SHUTDOWN;
413 else temp = BSTAT_STARTINGUP;
/* Status-change hook: hands the bnode to NudgeProcs(), which starts or stops
 * processes according to abnode->b.goal, and returns its result. */
418 static int fs_setstat(abnode, astatus)
419 register struct fsbnode *abnode;
421 return NudgeProcs(abnode);
/* Called when aproc, one of this bnode's processes, has exited.  aproc is
 * matched against volProc, fileProc, salProc and scanProc; the matching
 * Running and KillSent flags are cleared, and the salvage flag is cleared
 * for a clean file server shutdown or a completed salvage.  Finishes by
 * calling NudgeProcs() so that whatever should run next gets started, and
 * returns its result. */
424 static int fs_procexit(abnode, aproc)
425 struct fsbnode *abnode;
426 struct bnode_proc *aproc; {
427 /* process has exited */
429 if (aproc == abnode->volProc) {
431 abnode->volRunning = 0;
433 abnode->volKillSent = 0;
435 else if (aproc == abnode->fileProc) {
436 /* if we were expecting a shutdown and we didn't send a kill signal
437 * and exited (didn't have a signal termination), then we assume that
438 * the file server exited after putting the appropriate volumes safely
439 * offline, and don't salvage next time.
441 if (abnode->fileSDW && !abnode->fileKillSent && aproc->lastSignal == 0)
442 SetSalFlag(abnode, 0); /* shut down normally */
443 abnode->fileProc = 0;
444 abnode->fileRunning = 0;
446 abnode->fileKillSent = 0;
448 else if (aproc == abnode->salProc) {
449 /* if we didn't shutdown the salvager, then assume it exited ok, and thus
450 that we don't have to salvage again */
452 SetSalFlag(abnode, 0); /* salvage just completed */
454 abnode->salRunning = 0;
456 abnode->salKillSent = 0;
458 else if (aproc == abnode->scanProc) {
459 abnode->scanProc = 0;
460 abnode->scanRunning = 0;
462 abnode->scanKillSent = 0;
465 /* now restart anyone who needs to restart */
466 return NudgeProcs(abnode);
469 /* make sure we're periodically checking the state if we need to */
/* needsClock is cleared in the two steady states -- goal 1 with file server
 * and volserver running (plus the scanner when scancmd is configured), or
 * goal 0 with nothing running -- and set otherwise.  When it is set and no
 * timeout is pending, a POLLTIME timeout is requested; when it is clear,
 * bnode_SetTimeout(ab, 0) is called (presumably cancelling the timeout --
 * confirm against bnode_SetTimeout). */
470 static SetNeedsClock(ab)
471 register struct fsbnode *ab;
473 if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
474 && (!ab->scancmd || ab->scanRunning))
475 ab->needsClock = 0; /* running normally */
476 else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
477 && !ab->salRunning && !ab->scanRunning)
478 ab->needsClock = 0; /* halted normally */
479 else ab->needsClock = 1; /* other */
480 if (ab->needsClock && !bnode_PendingTimeout(ab))
481 bnode_SetTimeout(ab, POLLTIME);
482 if (!ab->needsClock) bnode_SetTimeout(ab, 0);
/* Start or stop processes so that the bnode moves toward abnode->b.goal.
 *
 * goal == 1 (run):
 *   - file server running: a salvager running at the same time is treated
 *     as an emergency (logged, goal set to BSTAT_SHUTDOWN, salvager sent
 *     SIGKILL); otherwise the volserver and, if configured, the scanner are
 *     started when they are not running.
 *   - file server not running, no salvage needed: start the file server
 *     (setting the salvage flag), then the volserver and scanner.
 *   - file server not running, salvage needed: signal volserver (SIGTERM),
 *     file server (SIGQUIT) and scanner (SIGTERM) if they are running,
 *     return 0 while any of them is still running, and only then start the
 *     salvager.
 * goal == 0 (shut down): signal every running process that is not already
 *   in shutdown wait (SIGQUIT for the file server, SIGTERM for the others)
 *   and record the time in timeSDStarted.
 *
 * SetNeedsClock() is called on the way out.  Declared without a return
 * type, i.e. implicit int.
 */
485 static NudgeProcs(abnode)
486 register struct fsbnode *abnode; {
487 struct bnode_proc *tp; /* not register */
488 register afs_int32 code;
491 now = FT_ApproxTime();
492 if (abnode->b.goal == 1) {
493 /* we're trying to run the system. If the file server is running, then we
494 are trying to start up the system. If it is not running, then needsSalvage
495 tells us if we need to run the salvager or not */
496 if (abnode->fileRunning) {
497 if (abnode->salRunning) {
498 bozo_Log("Salvager running along with file server!\n");
499 bozo_Log("Emergency shutdown\n");
/* give up on running: switch the goal to shutdown and kill the salvager */
501 bnode_SetGoal(abnode, BSTAT_SHUTDOWN);
502 bnode_StopProc(abnode->salProc, SIGKILL);
503 SetNeedsClock(abnode);
506 if (!abnode->volRunning) {
507 abnode->lastVolStart = FT_ApproxTime();
508 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
510 abnode->volProc = tp;
511 abnode->volRunning = 1;
514 if (abnode->scancmd) {
515 if (!abnode->scanRunning) {
516 abnode->lastScanStart = FT_ApproxTime();
517 code = bnode_NewProc(abnode, abnode->scancmd, "scanner", &tp);
519 abnode->scanProc = tp;
520 abnode->scanRunning = 1;
525 else { /* file is not running */
526 /* see how to start */
527 if (!abnode->needsSalvage) {
528 /* no crash apparent, just start up normally */
529 if (!abnode->fileRunning) {
530 abnode->lastFileStart = FT_ApproxTime();
531 code = bnode_NewProc(abnode, abnode->filecmd, "file", &tp);
533 abnode->fileProc = tp;
534 abnode->fileRunning = 1;
/* flag stays set while the file server runs; fs_procexit() clears it
 * only for a clean shutdown */
535 SetSalFlag(abnode, 1);
538 if (!abnode->volRunning) {
539 abnode->lastVolStart = FT_ApproxTime();
540 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
542 abnode->volProc = tp;
543 abnode->volRunning = 1;
546 if (abnode->scancmd && !abnode->scanRunning) {
547 abnode->lastScanStart = FT_ApproxTime();
548 code = bnode_NewProc(abnode, abnode->scancmd, "scanner",
551 abnode->scanProc = tp;
552 abnode->scanRunning = 1;
556 else { /* needs to be salvaged */
557 /* make sure file server and volser are gone */
558 if (abnode->volRunning) {
559 bnode_StopProc(abnode->volProc, SIGTERM);
/* stamp the shutdown start only when not already waiting on this process */
560 if (!abnode->volSDW) abnode->timeSDStarted = now;
563 if (abnode->fileRunning) {
564 bnode_StopProc(abnode->fileProc, SIGQUIT);
565 if (!abnode->fileSDW) abnode->timeSDStarted = now;
568 if (abnode->scanRunning) {
569 bnode_StopProc(abnode->scanProc, SIGTERM);
570 if (!abnode->scanSDW) abnode->timeSDStarted = now;
/* something is still running: come back once its exit has been reported */
573 if (abnode->volRunning || abnode->fileRunning
574 || abnode->scanRunning) return 0;
575 /* otherwise, it is safe to start salvager */
576 if (!abnode->salRunning) {
577 code = bnode_NewProc(abnode, abnode->salcmd, "salv", &tp);
579 abnode->salProc = tp;
580 abnode->salRunning = 1;
586 else { /* goal is 0, we're shutting down */
587 /* trying to shutdown */
/* the !*SDW tests keep a process from being signalled again while a
 * shutdown wait is already in progress for it */
588 if (abnode->salRunning && !abnode->salSDW) {
589 bnode_StopProc(abnode->salProc, SIGTERM);
591 abnode->timeSDStarted = now;
593 if (abnode->fileRunning && !abnode->fileSDW) {
594 bnode_StopProc(abnode->fileProc, SIGQUIT);
596 abnode->timeSDStarted = now;
598 if (abnode->volRunning && !abnode->volSDW) {
599 bnode_StopProc(abnode->volProc, SIGTERM);
601 abnode->timeSDStarted = now;
603 if (abnode->scanRunning && !abnode->scanSDW) {
604 bnode_StopProc(abnode->scanProc, SIGTERM);
606 abnode->timeSDStarted = now;
609 SetNeedsClock(abnode);
/* Copy a short human-readable description of the bnode's state into abuffer.
 * Returns -1 when alen is below 40; the longest message written here is 39
 * characters, so 40 bytes hold it together with the terminating NUL.
 *
 * With goal 1 the text reflects which of file server, volserver and (when
 * scancmd is configured) scanner are up, or that a salvage or start-up is in
 * progress; otherwise it reports what is still shutting down, or that the
 * file server is shut down. */
613 static int fs_getstring(abnode, abuffer, alen)
614 struct fsbnode *abnode;
617 if (alen < 40) return -1;
618 if (abnode->b.goal == 1) {
619 if (abnode->fileRunning) {
620 if (abnode->fileSDW) strcpy(abuffer, "file server shutting down");
621 else if (abnode->scancmd) {
622 if (!abnode->volRunning && !abnode->scanRunning)
623 strcpy(abuffer, "file server up; volser and scanner down");
624 else if (abnode->volRunning && !abnode->scanRunning)
625 strcpy(abuffer, "file server up; volser up; scanner down");
626 else if (!abnode->volRunning && abnode->scanRunning)
627 strcpy(abuffer, "file server up; volser down; scanner up");
629 else strcpy(abuffer, "file server running");
631 else if (!abnode->volRunning)
632 strcpy(abuffer, "file server up; volser down");
633 else strcpy(abuffer, "file server running");
635 else if (abnode->salRunning) {
636 strcpy(abuffer, "salvaging file system");
638 else strcpy(abuffer, "starting file server");
/* remaining cases: the bnode is being shut down */
642 if (abnode->fileRunning || abnode->volRunning || abnode->scanRunning) {
643 strcpy(abuffer, "file server shutting down");
645 else if (abnode->salRunning)
646 strcpy(abuffer, "salvager shutting down");
647 else strcpy(abuffer, "file server shut down");
/* Copy one of the bnode's command strings into abuffer, selected by aindex:
 * the file server command first, then 1 = volserver command, 2 = salvager
 * command, 3 = scanner command (only when scancmd is configured).
 * NOTE(review): the strcpy() calls do not consult alen -- confirm the
 * caller's buffer is large enough for the longest command string. */
652 static fs_getparm(abnode, aindex, abuffer, alen)
653 struct fsbnode *abnode;
658 strcpy(abuffer, abnode->filecmd);
659 else if (aindex == 1)
660 strcpy(abuffer, abnode->volcmd);
661 else if (aindex == 2)
662 strcpy(abuffer, abnode->salcmd);
663 else if (aindex == 3 && abnode->scancmd)
664 strcpy(abuffer, abnode->scancmd);