From: Andrew Deason Date: Wed, 25 Nov 2009 21:23:06 +0000 (-0500) Subject: Add a watchdog timer for ShutDownAndCore(PANIC) X-Git-Tag: openafs-devel-1_5_67~11 X-Git-Url: https://git.openafs.org/?p=openafs.git;a=commitdiff_plain;h=9bff2027cb963343778001512d5cdf00cc7094c6;hp=c4780421a41a05683bcec8d1092534c2ad58f341 Add a watchdog timer for ShutDownAndCore(PANIC) Add a watchdog timer that is started when ShutDownAndCore(PANIC) is called, to ensure we actually panic and do not e.g. get stuck in deadlock. Change-Id: I9201fe7d09aeb6819beefaf1755b51129c7bda6b Reviewed-on: http://gerrit.openafs.org/873 Tested-by: Andrew Deason Reviewed-by: Alistair Ferguson Tested-by: Derrick Brashear Reviewed-by: Derrick Brashear --- diff --git a/src/viced/viced.c b/src/viced/viced.c index ad5b134..a7f38a3 100644 --- a/src/viced/viced.c +++ b/src/viced/viced.c @@ -177,6 +177,16 @@ int SawLock; #endif time_t StartTime; +/** + * seconds to wait until forcing a panic during ShutDownAndCore(PANIC) + * in case we get stuck. + */ +#ifdef AFS_DEMAND_ATTACH_FS +static int panic_timeout = 2 * 60; +#else +static int panic_timeout = 30 * 60; +#endif + int rxpackets = 150; /* 100 */ int nSmallVns = 400; /* 200 */ int large = 400; /* 200 */ @@ -772,12 +782,36 @@ CheckSignal(void *unused) return 0; } /*CheckSignal */ +static void * +ShutdownWatchdogLWP(void *unused) +{ + sleep(panic_timeout); + ViceLog(0, ("ShutdownWatchdogLWP: Failed to shutdown and panic " + "within %d seconds; forcing panic\n", panic_timeout)); + assert(0); + return NULL; +} + void ShutDownAndCore(int dopanic) { time_t now = time(0); char tbuffer[32]; + if (dopanic) { +#ifdef AFS_PTHREAD_ENV + pthread_t watchdogPid; + pthread_attr_t tattr; + assert(pthread_attr_init(&tattr) == 0); + assert(pthread_create(&watchdogPid, &tattr, ShutdownWatchdogLWP, NULL) == 0); +#else + PROCESS watchdogPid; + assert(LWP_CreateProcess + (ShutdownWatchdogLWP, stack * 1024, LWP_MAX_PRIORITY - 2, + NULL, "ShutdownWatchdog", &watchdogPid) == LWP_SUCCESS); +#endif + } + /* do not allows new reqests to be served from now on, all new requests * are returned with an error code of RX_RESTARTING ( transient failure ) */ rx_SetRxTranquil(); /* dhruba */