From 660a0855bb9351a72ef45cd72e02503c86bf2cea Mon Sep 17 00:00:00 2001
From: Andrew Deason
Date: Wed, 11 Sep 2019 16:42:47 -0500
Subject: [PATCH 1/1] ubik: Log urecovery_CheckTid-aborted txes

Log when urecovery_CheckTid aborts/ends a running remote transaction.
This is usually a rare event, occurring when some ubik sites get "stuck"
or confused about the state of the quorum. Logging some details when
this happens can be useful when investigating issues post-mortem, or
just to see why a transaction failed.

Change-Id: If0a7cd134aaac3722fe7214a1d8f0efab550ad11
Reviewed-on: https://gerrit.openafs.org/13862
Tested-by: BuildBot
Reviewed-by: Andrew Deason
Reviewed-by: Marcio Brito Barbosa
Reviewed-by: Benjamin Kaduk
---
 src/ubik/recovery.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/ubik/recovery.c b/src/ubik/recovery.c
index 5e42b54..99b9fd8 100644
--- a/src/ubik/recovery.c
+++ b/src/ubik/recovery.c
@@ -159,8 +159,20 @@ urecovery_CheckTid(struct ubik_tid *atid, int abortalways)
 	if (atid->epoch != ubik_currentTrans->tid.epoch
 	    || atid->counter > ubik_currentTrans->tid.counter || abortalways) {
 	    /* don't match, abort it */
+	    int endit = 0;
 	    /* If the thread is not waiting for lock - ok to end it */
 	    if (ubik_currentTrans->locktype != LOCKWAIT) {
+		endit = 1;
+	    }
+
+	    ViceLog(0, ("urecovery_CheckTid: Aborting/ending bad remote "
+			"transaction. (tx %d.%d, atid %d.%d, abortalways %d, "
+			"endit %d)\n",
+			ubik_currentTrans->tid.epoch,
+			ubik_currentTrans->tid.counter,
+			atid->epoch, atid->counter,
+			abortalways, endit));
+	    if (endit) {
 		udisk_end(ubik_currentTrans);
 	    }
 	    ubik_currentTrans = (struct ubik_trans *)0;
-- 
1.9.4