2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afs/param.h>
11 #include <sys/types.h>
23 #endif /* AFS_NT40_ENV */
25 #include <afs/procmgmt.h> /* signal(), kill(), wait(), etc. */
26 #include <afs/afsutil.h>
/* Forward declarations (old-style, no parameter lists) for functions defined later in this file. */
29 static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
30 static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
31 static int fs_hascore();
/* Constructors returning a generic bnode pointer; fs_create() is defined below,
 * fsmr_create() is not visible in this excerpt. */
32 struct bnode *fs_create();
33 struct bnode *fsmr_create();
/* Declared with no explicit return type, so it defaults to int. */
35 static SetNeedsClock();
/* NOTE(review): not referenced in the lines visible here; presumably set by the
 * emergency-shutdown path in NudgeProcs -- confirm against the full file. */
38 static int emergency = 0;
40 /* if this file exists, then we have to salvage the file system */
/* SALFILE is only the file-name prefix: SetSalFlag() and RestoreSalFlag() append the
 * bnode instance name and place the file under AFSDIR_SERVER_LOCAL_DIRPATH. */
41 #define SALFILE "SALVAGE."
43 #define POLLTIME 20 /* interval passed to bnode_SetTimeout() to request periodic timeout callbacks */
44 #define SDTIME 60 /* time in seconds given to a process to evaporate */
47 Normal operation involves having the file server and the vol server both running.
49 If the vol server terminates, it can simply be restarted.
51 If the file server terminates, the disk must be salvaged before the file server
52 can be restarted. In order to restart either the file server or the salvager,
53 the vol server must be shut down.
55 If the file server terminates *normally* (exits after receiving a SIGQUIT)
56 then we don't have to salvage it.
58 The needsSalvage flag is set when the file server is started. It is cleared
59 if the file server exits when fileSDW is true but fileKillSent is false,
60 indicating that it exited after receiving a quit, but before we sent it a kill.
62 The needsSalvage flag is cleared when the salvager exits.
/* Operations table for this bnode type; fs_create() passes its address to
 * bnode_InitBnode(). The initializer entries are not visible in this excerpt. */
65 struct bnode_ops fsbnode_ops = {
/* Per-instance state of a file-server bnode (used throughout this file as
 * struct fsbnode). The struct header and its leading members -- including the
 * embedded member accessed as abnode->b -- are not visible in this excerpt.
 * For each of the four managed processes (file server, vol server, salvager,
 * scanner) there is a command string, a process handle, a "running" flag, a
 * "shutdown wait" flag and a "kill sent" flag. */
80 afs_int32 timeSDStarted; /* time shutdown operation started */
81 char *filecmd; /* command to start primary file server */
82 char *volcmd; /* command to start secondary vol server */
83 char *salcmd; /* command to start salvager */
84 char *scancmd; /* command to start scanner (MR-AFS) */
85 struct bnode_proc *fileProc; /* process for file server */
86 struct bnode_proc *volProc; /* process for vol server */
87 struct bnode_proc *salProc; /* process for salvager */
88 struct bnode_proc *scanProc; /* process for scanner (MR-AFS) */
/* Start times are taken from FT_ApproxTime() in NudgeProcs() and compared with the
 * binaries' st_ctime in fs_restartp(). */
89 afs_int32 lastFileStart; /* last start for file */
90 afs_int32 lastVolStart; /* last start for vol */
91 afs_int32 lastScanStart; /* last start for scanner (MR-AFS) */
92 char fileRunning; /* file process is running */
93 char volRunning; /* volser is running */
94 char salRunning; /* salvager is running */
95 char scanRunning; /* scanner is running (MR_AFS) */
96 char fileSDW; /* file shutdown wait */
97 char volSDW; /* vol shutdown wait */
98 char salSDW; /* waiting for the salvager to shutdown */
99 char scanSDW; /* scanner shutdown wait (MR_AFS) */
100 char fileKillSent; /* kill signal has been sent */
/* NOTE(review): volKillSent and salKillSent are used by fs_timeout() and fs_procexit()
 * but their declarations are not visible in this excerpt. */
103 char scanKillSent; /* kill signal has been sent (MR_AFS) */
104 char needsSalvage; /* salvage before running */
105 char needsClock; /* do we need clock ticks */
/*
 * fs_hascore -- report whether any process of this bnode left a core file.
 * For each of the names "file", "vol", "salv" and "scan" it asks bnode_CoreName()
 * to build a path in tbuffer and returns 1 as soon as access() finds that path.
 * The declaration of tbuffer and the final return are not visible in this excerpt.
 * NOTE(review): the parameter is declared as struct ezbnode * while every other
 * function here takes struct fsbnode * -- confirm this is intended.
 * NOTE(review): NudgeProcs() starts the scanner under the name "scanner", whereas
 * this function checks "scan"; the other three names match -- confirm.
 */
108 /* Function to tell whether this bnode has a core file or not. You might
109 * think that this could be in bnode.c, and decide what core files to check
110 * for based on the bnode's coreName property, but that doesn't work because
111 * there may not be an active process for a bnode that dumped core at the
112 * time the query is done.
114 static int fs_hascore(abnode)
115 register struct ezbnode *abnode; {
118 /* see if file server has a core file */
119 bnode_CoreName(abnode, "file", tbuffer);
120 if (access(tbuffer, 0) == 0) return 1;
122 /* see if volserver has a core file */
123 bnode_CoreName(abnode, "vol", tbuffer);
124 if (access(tbuffer, 0) == 0) return 1;
126 /* see if salvager left a core file */
127 bnode_CoreName(abnode, "salv", tbuffer);
128 if (access(tbuffer, 0) == 0) return 1;
130 /* see if scanner left a core file (MR-AFS) */
131 bnode_CoreName(abnode, "scan", tbuffer);
132 if (access(tbuffer, 0) == 0) return 1;
134 /* no one left a core file */
/*
 * fs_restartp -- check whether a server binary changed after its process was started.
 * For filecmd, volcmd and (only when scancmd is set) scancmd in turn: the command
 * line is parsed with bnode_ParseLine(), stat() is run on the parsed token's key
 * (presumably the executable path -- confirm), and code is set to 1 when the file's
 * st_ctime is later than the matching lastFileStart / lastVolStart / lastScanStart.
 * For filecmd and volcmd a non-zero code is returned immediately.
 * NOTE(review): the checks that follow each bnode_ParseLine()/stat() call, the
 * declaration of tstat and the final return are not visible in this excerpt.
 */
138 static int fs_restartp (abnode)
139 register struct fsbnode *abnode; {
140 struct bnode_token *tt;
141 register afs_int32 code;
/* file server binary */
144 code = bnode_ParseLine(abnode->filecmd, &tt);
147 code = stat(tt->key, &tstat);
/* the token list is released on both the stat-failure path and the normal path */
149 bnode_FreeTokens(tt);
152 if (tstat.st_ctime > abnode->lastFileStart) code = 1;
154 bnode_FreeTokens(tt);
155 if (code) return code;
157 /* now do same for volcmd */
158 code = bnode_ParseLine(abnode->volcmd, &tt);
161 code = stat(tt->key, &tstat);
163 bnode_FreeTokens(tt);
166 if (tstat.st_ctime > abnode->lastVolStart) code = 1;
168 bnode_FreeTokens(tt);
169 if (code) return code;
171 if (abnode->scancmd) { /* Only in MR-AFS */
172 /* now do same for scancmd (MR-AFS) */
173 code = bnode_ParseLine(abnode->scancmd, &tt);
176 code = stat(tt->key, &tstat);
178 bnode_FreeTokens(tt);
181 if (tstat.st_ctime > abnode->lastScanStart) code = 1;
183 bnode_FreeTokens(tt);
189 /* set needsSalvage flag, creating file SALVAGE.<instancename> if
190 we need to salvage the file system (so we can tell over panic reboots) */
/* abnode: bnode whose needsSalvage field is set to aflag.
 * The marker path is AFSDIR_SERVER_LOCAL_DIRPATH "/" SALFILE <abnode->b.name>.
 * NOTE(review): the declaration of fd, the condition guarding the open() call and
 * whatever is done when aflag is 0 are not visible in this excerpt. */
191 static SetSalFlag(abnode, aflag)
192 register struct fsbnode *abnode;
193 register int aflag; {
194 char tbuffer[AFSDIR_PATH_MAX];
197 abnode->needsSalvage = aflag;
198 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
199 abnode->b.name, NULL);
/* create (or truncate) the marker file; only its existence is tested later by RestoreSalFlag() */
201 fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
210 /* set the needsSalvage flag according to the existence of the salvage file */
/* Builds the same marker path as SetSalFlag() and tests it with access(path, 0):
 * needsSalvage becomes 1 when the file exists and 0 otherwise. Called from
 * fs_create() right after the bnode is initialised. */
211 static RestoreSalFlag(abnode)
212 register struct fsbnode *abnode; {
213 char tbuffer[AFSDIR_PATH_MAX];
215 strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/", SALFILE,
216 abnode->b.name, NULL);
217 if (access(tbuffer, 0) == 0) {
218 /* file exists, so need to salvage */
219 abnode->needsSalvage = 1;
222 abnode->needsSalvage = 0;
/* Fragment of a helper whose header is not visible in this excerpt: allocates
 * strlen(a)+1 bytes, i.e. room for a copy of string a including its terminator.
 * NOTE(review): no NULL check on the malloc() result is visible here. */
230 b = (char *) malloc(strlen(a)+1);
/* fs_delete -- release the command strings owned by the bnode.
 * filecmd, volcmd and salcmd are always freed; scancmd only when it is set
 * (fs_create() stores a null pointer there when no scanner is configured).
 * NOTE(review): freeing of the bnode itself and the return value are not visible
 * in this excerpt. */
235 static int fs_delete(abnode)
236 struct fsbnode *abnode; {
237 free(abnode->filecmd);
238 free(abnode->volcmd);
239 free(abnode->salcmd);
240 if (abnode->scancmd) free(abnode->scancmd);
/* AppendExecutableExtension -- (AFS_NT40_ENV builds only, per the #endif below)
 * splits cmd with _splitpath() and, when the extension part comes back empty,
 * adds ".exe" according to the comment below; the statement that does the
 * appending is not visible in this excerpt.
 * NOTE(review): cmd is modified in place, so the caller's buffer must have room
 * for the extra characters -- confirm against the full file. */
247 static void AppendExecutableExtension(char *cmd)
249 char cmdext[_MAX_EXT];
251 _splitpath(cmd, NULL, NULL, NULL, cmdext);
252 if (*cmdext == '\0') {
253 /* no filename extension supplied for cmd; append .exe */
257 #endif /* AFS_NT40_ENV */
/*
 * fs_create -- build a new fs bnode for instance `ainstance`.
 * Each wire-format command path is converted with ConstructLocalBinPath(); the first
 * whitespace-delimited word of each converted command is then stat()ed to check that
 * the binary exists. On success a zeroed struct fsbnode is allocated, given ownership
 * of the converted command strings, initialised with fsbnode_ops and returned as a
 * struct bnode *. On the failure path visible below the three mandatory command
 * strings are freed and a null pointer is returned.
 * The scanner command is optional and is only processed when ascancmd is non-empty.
 * NOTE(review): the parameter declarations, the declaration of tstat and the
 * statements that record a failure are not visible in this excerpt, so the exact
 * bail-out logic should be confirmed against the full file.
 */
260 struct bnode *fs_create(ainstance, afilecmd, avolcmd, asalcmd, ascancmd)
267 register struct fsbnode *te;
268 char cmdname[AFSDIR_PATH_MAX];
269 char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
/* NOTE(review): scanCmdpath is not reset here, unlike the other three pointers. */
272 fileCmdpath = volCmdpath = salCmdpath = NULL;
274 /* construct local paths from canonical (wire-format) paths */
275 if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
276 bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
279 if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
280 bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
283 if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
284 bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
288 if (strlen(ascancmd)) {
289 if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
290 bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
/* Extract the program name (first word) of each command and verify the binary exists.
 * NOTE(review): "%s" has no field width, so nothing limits the copy into cmdname. */
296 sscanf(fileCmdpath, "%s", cmdname);
298 AppendExecutableExtension(cmdname);
300 if (stat(cmdname, &tstat)) {
301 bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
305 sscanf(volCmdpath, "%s", cmdname);
307 AppendExecutableExtension(cmdname);
309 if (stat(cmdname, &tstat)) {
310 bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
314 sscanf(salCmdpath, "%s", cmdname);
316 AppendExecutableExtension(cmdname);
318 if (stat(cmdname, &tstat)) {
319 bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
323 if (strlen(ascancmd)) {
324 sscanf(scanCmdpath, "%s", cmdname);
326 AppendExecutableExtension(cmdname);
328 if (stat(cmdname, &tstat)) {
329 bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
/* Failure exit. NOTE(review): scanCmdpath is not freed on this path. */
336 free(fileCmdpath); free(volCmdpath); free(salCmdpath);
337 return (struct bnode *)0;
/* NOTE(review): the malloc() result is passed to bzero() with no visible NULL check. */
340 te = (struct fsbnode *) malloc(sizeof(struct fsbnode));
341 bzero(te, sizeof(struct fsbnode));
/* the bnode takes ownership of the converted command strings (released in fs_delete) */
342 te->filecmd = fileCmdpath;
343 te->volcmd = volCmdpath;
344 te->salcmd = salCmdpath;
345 if (strlen(ascancmd))
346 te->scancmd = scanCmdpath;
348 te->scancmd = (char *)0;
349 bnode_InitBnode(te, &fsbnode_ops, ainstance);
350 bnode_SetTimeout(te, POLLTIME); /* ask for timeout activations every POLLTIME seconds */
351 RestoreSalFlag(te); /* restore needsSalvage flag based on file's existence */
352 SetNeedsClock(te); /* compute needsClock field */
353 return (struct bnode *) te;
356 /* called to SIGKILL a process if it doesn't terminate normally */
/* For each of vol server, salvager, file server and scanner: when the process is in
 * shutdown-wait, no kill has been sent yet, and more than the allowed time has passed
 * since timeSDStarted, send SIGKILL through bnode_StopProc(), record that the kill
 * was sent and log the failure. The file server is given FSSDTIME, the others SDTIME.
 * Ends by recomputing needsClock via SetNeedsClock().
 * NOTE(review): FSSDTIME is not defined in the lines visible here, and the trailing
 * arguments of the bozo_Log() calls and the return statement are not visible either. */
357 static int fs_timeout(abnode)
358 struct fsbnode *abnode; {
359 register afs_int32 now;
361 now = FT_ApproxTime();
/* vol server */
363 if (abnode->volSDW) {
364 if (!abnode->volKillSent && now - abnode->timeSDStarted > SDTIME) {
365 bnode_StopProc(abnode->volProc, SIGKILL);
366 abnode->volKillSent = 1;
367 bozo_Log("bos shutdown: volserver failed to shutdown within %d seconds\n",
/* salvager */
371 if (abnode->salSDW) {
372 if (!abnode->salKillSent && now - abnode->timeSDStarted > SDTIME) {
373 bnode_StopProc(abnode->salProc, SIGKILL);
374 abnode->salKillSent = 1;
375 bozo_Log("bos shutdown: salvager failed to shutdown within %d seconds\n",
/* file server: uses its own limit, FSSDTIME */
379 if (abnode->fileSDW) {
380 if (!abnode->fileKillSent && now - abnode->timeSDStarted > FSSDTIME) {
381 bnode_StopProc(abnode->fileProc, SIGKILL);
382 abnode->fileKillSent = 1;
383 bozo_Log("bos shutdown: fileserver failed to shutdown within %d seconds\n",
/* scanner (MR-AFS) */
387 if (abnode->scanSDW) {
388 if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
389 bnode_StopProc(abnode->scanProc, SIGKILL);
390 abnode->scanKillSent = 1;
391 bozo_Log("bos shutdown: scanner failed to shutdown within %d seconds\n",
395 SetNeedsClock(abnode);
/* fs_getstat -- derive the overall BSTAT_* status of the bnode into temp.
 * Checked in this order:
 *   any process in shutdown-wait                         -> BSTAT_SHUTTINGDOWN
 *   salvager running                                     -> BSTAT_NORMAL
 *   vol and file running (and scanner, if configured)    -> BSTAT_NORMAL
 *   nothing running at all                               -> BSTAT_SHUTDOWN
 *   anything else                                        -> BSTAT_STARTINGUP
 * NOTE(review): the store of temp through astatus and the return statement are not
 * visible in this excerpt. */
398 static int fs_getstat(abnode, astatus)
399 struct fsbnode *abnode;
400 afs_int32 *astatus; {
401 register afs_int32 temp;
402 if (abnode->volSDW || abnode->fileSDW || abnode->salSDW || abnode->scanSDW)
403 temp = BSTAT_SHUTTINGDOWN;
404 else if (abnode->salRunning) temp = BSTAT_NORMAL;
405 else if (abnode->volRunning && abnode->fileRunning && (!abnode->scancmd ||
406 abnode->scanRunning)) temp = BSTAT_NORMAL;
407 else if (!abnode->salRunning && !abnode->volRunning && !abnode->fileRunning
408 && !abnode->scanRunning) temp = BSTAT_SHUTDOWN;
409 else temp = BSTAT_STARTINGUP;
/* fs_setstat -- status-change handler: simply delegates to NudgeProcs() and returns
 * its result. astatus is not used in the lines visible here; its declaration is not
 * visible in this excerpt. */
414 static int fs_setstat(abnode, astatus)
415 register struct fsbnode *abnode;
417 return NudgeProcs(abnode);
/* fs_procexit -- bookkeeping after one of this bnode's processes (aproc) has exited.
 * aproc is matched against volProc, fileProc, salProc and scanProc in turn; the
 * matching process has its running flag and kill-sent flag cleared. For the file
 * server and the salvager the salvage marker may also be cleared through
 * SetSalFlag(abnode, 0). Finally NudgeProcs() is called and its result returned.
 * NOTE(review): several statements of each branch (for example the clearing of the
 * shutdown-wait flags) are not visible in this excerpt. */
420 static int fs_procexit(abnode, aproc)
421 struct fsbnode *abnode;
422 struct bnode_proc *aproc; {
423 /* process has exited */
425 if (aproc == abnode->volProc) {
427 abnode->volRunning = 0;
429 abnode->volKillSent = 0;
431 else if (aproc == abnode->fileProc) {
432 /* if we were expecting a shutdown and we didn't send a kill signal
433 * and exited (didn't have a signal termination), then we assume that
434 * the file server exited after putting the appropriate volumes safely
435 * offline, and don't salvage next time.
437 if (abnode->fileSDW && !abnode->fileKillSent && aproc->lastSignal == 0)
438 SetSalFlag(abnode, 0); /* shut down normally */
439 abnode->fileProc = 0;
440 abnode->fileRunning = 0;
442 abnode->fileKillSent = 0;
444 else if (aproc == abnode->salProc) {
445 /* if we didn't shutdown the salvager, then assume it exited ok, and thus
446 that we don't have to salvage again */
448 SetSalFlag(abnode, 0); /* salvage just completed */
450 abnode->salRunning = 0;
452 abnode->salKillSent = 0;
454 else if (aproc == abnode->scanProc) {
455 abnode->scanProc = 0;
456 abnode->scanRunning = 0;
458 abnode->scanKillSent = 0;
461 /* now restart anyone who needs to restart */
462 return NudgeProcs(abnode);
465 /* make sure we're periodically checking the state if we need to */
/* needsClock is cleared in the two steady states -- goal 1 with file server, vol
 * server and (if configured) scanner all running, or goal 0 with nothing running --
 * and set otherwise. When it is set and no timeout is pending, a POLLTIME timeout is
 * requested; when it is clear, bnode_SetTimeout(ab, 0) is called (presumably this
 * cancels the timer -- confirm against bnode_SetTimeout). */
466 static SetNeedsClock(ab)
467 register struct fsbnode *ab;
469 if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
470 && (!ab->scancmd || ab->scanRunning))
471 ab->needsClock = 0; /* running normally */
472 else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
473 && !ab->salRunning && !ab->scanRunning)
474 ab->needsClock = 0; /* halted normally */
475 else ab->needsClock = 1; /* other */
476 if (ab->needsClock && !bnode_PendingTimeout(ab))
477 bnode_SetTimeout(ab, POLLTIME);
478 if (!ab->needsClock) bnode_SetTimeout(ab, 0);
/*
 * NudgeProcs -- move the set of processes one step toward the bnode's goal.
 * Goal 1 (run):
 *   - file server already running: a concurrently running salvager is treated as an
 *     emergency (goal forced to BSTAT_SHUTDOWN, salvager SIGKILLed); otherwise the
 *     vol server and, if configured, the scanner are started when not running.
 *   - file server not running, no salvage needed: start file server (and mark the
 *     salvage flag), vol server and scanner as required.
 *   - file server not running, salvage needed: ask vol server, file server and
 *     scanner to stop, return 0 while any is still running, then start the salvager.
 * Goal 0 (shut down): signal every running process that is not already in
 * shutdown-wait and record the time the shutdown started.
 * The file server is stopped with SIGQUIT, the others with SIGTERM.
 * Ends by recomputing needsClock via SetNeedsClock().
 * NOTE(review): the declaration of now, the checks of code after bnode_NewProc(),
 * the setting of the shutdown-wait flags and the final return are not visible in
 * this excerpt.
 */
481 static NudgeProcs(abnode)
482 register struct fsbnode *abnode; {
483 struct bnode_proc *tp; /* not register */
484 register afs_int32 code;
487 now = FT_ApproxTime();
488 if (abnode->b.goal == 1) {
489 /* we're trying to run the system. If the file server is running, then we
490 are trying to start up the system. If it is not running, then needsSalvage
491 tells us if we need to run the salvager or not */
492 if (abnode->fileRunning) {
493 if (abnode->salRunning) {
494 bozo_Log("Salvager running along with file server!\n");
495 bozo_Log("Emergency shutdown\n");
497 bnode_SetGoal(abnode, BSTAT_SHUTDOWN);
498 bnode_StopProc(abnode->salProc, SIGKILL);
499 SetNeedsClock(abnode);
/* file server is up: bring up the vol server if it is missing */
502 if (!abnode->volRunning) {
503 abnode->lastVolStart = FT_ApproxTime();
504 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
506 abnode->volProc = tp;
507 abnode->volRunning = 1;
/* ... and the scanner, when one is configured */
510 if (abnode->scancmd) {
511 if (!abnode->scanRunning) {
512 abnode->lastScanStart = FT_ApproxTime();
513 code = bnode_NewProc(abnode, abnode->scancmd, "scanner", &tp);
515 abnode->scanProc = tp;
516 abnode->scanRunning = 1;
521 else { /* file is not running */
522 /* see how to start */
523 if (!abnode->needsSalvage) {
524 /* no crash apparent, just start up normally */
525 if (!abnode->fileRunning) {
526 abnode->lastFileStart = FT_ApproxTime();
527 code = bnode_NewProc(abnode, abnode->filecmd, "file", &tp);
529 abnode->fileProc = tp;
530 abnode->fileRunning = 1;
/* mark "needs salvage" while the file server runs; fs_procexit() clears it again
 * only after a clean exit */
531 SetSalFlag(abnode, 1);
534 if (!abnode->volRunning) {
535 abnode->lastVolStart = FT_ApproxTime();
536 code = bnode_NewProc(abnode, abnode->volcmd, "vol", &tp);
538 abnode->volProc = tp;
539 abnode->volRunning = 1;
542 if (abnode->scancmd && !abnode->scanRunning) {
543 abnode->lastScanStart = FT_ApproxTime();
544 code = bnode_NewProc(abnode, abnode->scancmd, "scanner",
547 abnode->scanProc = tp;
548 abnode->scanRunning = 1;
552 else { /* needs to be salvaged */
553 /* make sure file server and volser are gone */
554 if (abnode->volRunning) {
555 bnode_StopProc(abnode->volProc, SIGTERM);
556 if (!abnode->volSDW) abnode->timeSDStarted = now;
559 if (abnode->fileRunning) {
560 bnode_StopProc(abnode->fileProc, SIGQUIT);
561 if (!abnode->fileSDW) abnode->timeSDStarted = now;
564 if (abnode->scanRunning) {
565 bnode_StopProc(abnode->scanProc, SIGTERM);
566 if (!abnode->scanSDW) abnode->timeSDStarted = now;
/* wait for a later call (process exit or timeout) before starting the salvager */
569 if (abnode->volRunning || abnode->fileRunning
570 || abnode->scanRunning) return 0;
571 /* otherwise, it is safe to start salvager */
572 if (!abnode->salRunning) {
573 code = bnode_NewProc(abnode, abnode->salcmd, "salv", &tp);
575 abnode->salProc = tp;
576 abnode->salRunning = 1;
582 else { /* goal is 0, we're shutting down */
583 /* trying to shutdown */
584 if (abnode->salRunning && !abnode->salSDW) {
585 bnode_StopProc(abnode->salProc, SIGTERM);
587 abnode->timeSDStarted = now;
589 if (abnode->fileRunning && !abnode->fileSDW) {
590 bnode_StopProc(abnode->fileProc, SIGQUIT);
592 abnode->timeSDStarted = now;
594 if (abnode->volRunning && !abnode->volSDW) {
595 bnode_StopProc(abnode->volProc, SIGTERM);
597 abnode->timeSDStarted = now;
599 if (abnode->scanRunning && !abnode->scanSDW) {
600 bnode_StopProc(abnode->scanProc, SIGTERM);
602 abnode->timeSDStarted = now;
605 SetNeedsClock(abnode);
/* fs_getstring -- write a short human-readable status message into abuffer.
 * Returns -1 when alen is below 40; the longest messages below are 39 characters,
 * so 40 bytes hold any of them including the terminator.
 * With goal 1 the message describes which of file server / volser / scanner are up,
 * or that the salvager is running, or that the file server is starting; with any
 * other goal it describes the shutdown progress.
 * NOTE(review): the declarations of abuffer and alen and the success return are not
 * visible in this excerpt. */
609 static int fs_getstring(abnode, abuffer, alen)
610 struct fsbnode *abnode;
613 if (alen < 40) return -1;
614 if (abnode->b.goal == 1) {
615 if (abnode->fileRunning) {
616 if (abnode->fileSDW) strcpy(abuffer, "file server shutting down");
/* a scanner is configured: report volser and scanner state separately */
617 else if (abnode->scancmd) {
618 if (!abnode->volRunning && !abnode->scanRunning)
619 strcpy(abuffer, "file server up; volser and scanner down");
620 else if (abnode->volRunning && !abnode->scanRunning)
621 strcpy(abuffer, "file server up; volser up; scanner down");
622 else if (!abnode->volRunning && abnode->scanRunning)
623 strcpy(abuffer, "file server up; volser down; scanner up");
625 else strcpy(abuffer, "file server running");
627 else if (!abnode->volRunning)
628 strcpy(abuffer, "file server up; volser down");
629 else strcpy(abuffer, "file server running");
631 else if (abnode->salRunning) {
632 strcpy(abuffer, "salvaging file system");
634 else strcpy(abuffer, "starting file server");
/* goal is not 1: describe the shutdown */
638 if (abnode->fileRunning || abnode->volRunning || abnode->scanRunning) {
639 strcpy(abuffer, "file server shutting down");
641 else if (abnode->salRunning)
642 strcpy(abuffer, "salvager shutting down");
643 else strcpy(abuffer, "file server shut down");
/* fs_getparm -- copy one of the bnode's command strings into abuffer, selected by
 * aindex: 1 -> volcmd, 2 -> salcmd, 3 -> scancmd (only when a scanner is configured);
 * filecmd is copied by the first branch, whose condition is not visible here
 * (presumably aindex == 0 -- confirm).
 * NOTE(review): the copies use strcpy() and no check against alen is visible in this
 * excerpt; the end of the function, including the handling of other index values and
 * the return value, lies beyond the visible lines. */
648 static fs_getparm(abnode, aindex, abuffer, alen)
649 struct fsbnode *abnode;
654 strcpy(abuffer, abnode->filecmd);
655 else if (aindex == 1)
656 strcpy(abuffer, abnode->volcmd);
657 else if (aindex == 2)
658 strcpy(abuffer, abnode->salcmd);
659 else if (aindex == 3 && abnode->scancmd)
660 strcpy(abuffer, abnode->scancmd);