From 9cd983799e622c9acf5dd6e0b9ae3a3a75eaa8ce Mon Sep 17 00:00:00 2001
From: Andrew Deason <adeason@sinenomine.net>
Date: Thu, 2 Aug 2012 11:58:12 -0400
Subject: [PATCH] rx: Process ICMP unreachable errors

When a machine receives ICMP errors, we can detect them in
AFS_RXERRQ_ENV environments. Many of these errors indicate that a
machine is not reachable, so we are guaranteed to not get a response
from it. When we get such an error for a particular peer, mark all
relevant calls with an RX_CALL_DEAD error, since we know we won't get
a response from that peer. This allows some calls to dead/unreachable
hosts to fail much more quickly.

Only kill calls that were started before the current error was
received. Do not immediately kill new calls, since the host may have
come back up since then (or the routing/firewall/etc. was fixed).

Note that a call doesn't actually notice until the next rxi_CheckCall,
since directly killing each of the relevant calls would be rather
slow. So, we don't notice a dead peer immediately, though we notice
much more quickly than we used to.

Reorganize the error queue processing a little bit to make this easier
to do.

Change-Id: I403540e0677fe2d432901e4ecc19f7f385610b7f
Reviewed-on: http://gerrit.openafs.org/7929
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Derrick Brashear <shadow@dementix.org>
---
 src/rx/LINUX/rx_knet.c | 10 +-----
 src/rx/rx.c            | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/rx/rx_call.h       |  3 ++
 src/rx/rx_internal.h   | 12 ++++++++
 src/rx/rx_peer.h       |  3 ++
 src/rx/rx_user.c       |  8 +----
 6 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/src/rx/LINUX/rx_knet.c b/src/rx/LINUX/rx_knet.c
index fdf042e..5d2fc4a 100644
--- a/src/rx/LINUX/rx_knet.c
+++ b/src/rx/LINUX/rx_knet.c
@@ -147,15 +147,7 @@ osi_HandleSocketError(osi_socket so)
 
     memcpy(&addr, offender, sizeof(addr));
 
-# ifdef AFS_ADAPT_PMTU
-    if (err->ee_origin == SO_EE_ORIGIN_ICMP &&
-	err->ee_type == ICMP_DEST_UNREACH &&
-	err->ee_code == ICMP_FRAG_NEEDED) {
-	rxi_SetPeerMtu(NULL, ntohl(addr.sin_addr.s_addr), ntohs(addr.sin_port),
-		       err->ee_info);
-    }
-# endif
-    /* other DEST_UNREACH's and TIME_EXCEEDED should be dealt with too */
+    rxi_ProcessNetError(err, addr.sin_addr.s_addr, addr.sin_port);
 
  out:
     if (controlmsgbuf) {
diff --git a/src/rx/rx.c b/src/rx/rx.c
index 17ab869..b2226a3 100644
--- a/src/rx/rx.c
+++ b/src/rx/rx.c
@@ -1626,6 +1626,13 @@ rx_NewCall(struct rx_connection *conn)
     else
 	call->mode = RX_MODE_SENDING;
 
+#ifdef AFS_RXERRQ_ENV
+    /* remember how many network errors the peer has when we started, so if
+     * more errors are encountered after the call starts, we know the other
+     * endpoint won't be responding to us */
+    call->neterr_gen = rx_atomic_read(&conn->peer->neterrs);
+#endif
+
     /* remember start time for call in case we have hard dead time limit */
     call->queueTime = queueTime;
     clock_GetTime(&call->startTime);
@@ -2910,6 +2917,51 @@ rxi_SetPeerMtu(struct rx_peer *peer, afs_uint32 host, afs_uint32 port, int mtu)
     MUTEX_EXIT(&rx_peerHashTable_lock);
 }
 
+#ifdef AFS_RXERRQ_ENV
+static void
+rxi_SetPeerDead(afs_uint32 host, afs_uint16 port)
+{
+    int hashIndex = PEER_HASH(host, port);
+    struct rx_peer *peer;
+    /* Bump the neterrs count of the peer matching host/port, if any. */
+    MUTEX_ENTER(&rx_peerHashTable_lock);
+    /* Walk the hash chain; peer is left NULL when nothing matches. */
+    for (peer = rx_peerHashTable[hashIndex]; peer; peer = peer->next) {
+	if (peer->host == host && peer->port == port) {
+	    break;
+	}
+    }
+    /* An unknown peer is ignored; no peer structure is created here. */
+    if (peer) {
+	rx_atomic_inc(&peer->neterrs);	/* checked later by rxi_CheckCall */
+    }
+
+    MUTEX_EXIT(&rx_peerHashTable_lock);
+}
+/* Handle one socket error: update the peer MTU or count a peer net error. */
+void
+rxi_ProcessNetError(struct sock_extended_err *err, afs_uint32 addr, afs_uint16 port)
+{   /* addr/port: callers pass sin_addr.s_addr and sin_port unconverted */
+# ifdef AFS_ADAPT_PMTU
+    if (err->ee_errno == EMSGSIZE && err->ee_info >= 68) {
+	rxi_SetPeerMtu(NULL, addr, port, err->ee_info - RX_IPUDP_SIZE);
+	return;	/* an MTU report is not treated as unreachability */
+    }
+# endif /* AFS_ADAPT_PMTU */
+    if (err->ee_origin == SO_EE_ORIGIN_ICMP && err->ee_type == ICMP_DEST_UNREACH) {
+	switch (err->ee_code) {
+	case ICMP_NET_UNREACH:
+	case ICMP_HOST_UNREACH:
+	case ICMP_PORT_UNREACH:
+	case ICMP_NET_ANO:
+	case ICMP_HOST_ANO:
+	    rxi_SetPeerDead(addr, port);
+	    break;
+	}	/* every other unreachable code is ignored */
+    }
+}
+#endif /* AFS_RXERRQ_ENV */
+
 /* Find the peer process represented by the supplied (host,port)
  * combination.  If there is no appropriate active peer structure, a
  * new one will be allocated and initialized
@@ -2933,6 +2985,9 @@ rxi_FindPeer(afs_uint32 host, u_short port,
 	    pp = rxi_AllocPeer();	/* This bzero's *pp */
 	    pp->host = host;	/* set here or in InitPeerParams is zero */
 	    pp->port = port;
+#ifdef AFS_RXERRQ_ENV
+	    rx_atomic_set(&pp->neterrs, 0);
+#endif
 	    MUTEX_INIT(&pp->peer_lock, "peer_lock", MUTEX_DEFAULT, 0);
 	    queue_Init(&pp->rpcStats);
 	    pp->next = rx_peerHashTable[hashIndex];
@@ -3200,6 +3255,11 @@ rxi_ReceivePacket(struct rx_packet *np, osi_socket socket,
 	 */
 
 	if (peer && (peer->refCount > 0)) {
+#ifdef AFS_RXERRQ_ENV
+	    if (rx_atomic_read(&peer->neterrs)) {
+		rx_atomic_set(&peer->neterrs, 0);
+	    }
+#endif
 	    MUTEX_ENTER(&peer->peer_lock);
 	    peer->bytesReceived += np->length;
 	    MUTEX_EXIT(&peer->peer_lock);
@@ -3254,6 +3314,12 @@ rxi_ReceivePacket(struct rx_packet *np, osi_socket socket,
         return np;
     }
 
+#ifdef AFS_RXERRQ_ENV
+    if (rx_atomic_read(&conn->peer->neterrs)) {
+	rx_atomic_set(&conn->peer->neterrs, 0);
+    }
+#endif
+
     /* If we're doing statistics, then account for the incoming packet */
     if (rx_stats_active) {
 	MUTEX_ENTER(&conn->peer->peer_lock);
@@ -6179,6 +6245,23 @@ rxi_CheckCall(struct rx_call *call)
     int idle_timeout = 0;
     afs_int32  clock_diff = 0;
 
+#ifdef AFS_RXERRQ_ENV
+    int peererrs = rx_atomic_read(&call->conn->peer->neterrs);
+    if (call->neterr_gen < peererrs) {
+	/* we have received network errors since this call started; kill
+	 * the call */
+	if (call->state == RX_STATE_ACTIVE) {
+	    rxi_CallError(call, RX_CALL_DEAD);
+	}
+	return -1;
+    }
+    if (call->neterr_gen > peererrs) {
+	/* someone has reset the number of peer errors; set the call error gen
+	 * so we can detect if more errors are encountered */
+	call->neterr_gen = peererrs;
+    }
+#endif
+
     now = clock_Sec();
 
     /* Large swings in the clock can have a significant impact on
diff --git a/src/rx/rx_call.h b/src/rx/rx_call.h
index e873eb4..a1450f1 100644
--- a/src/rx/rx_call.h
+++ b/src/rx/rx_call.h
@@ -157,6 +157,9 @@ struct rx_call {
 #endif
     afs_uint32 call_id;
 #endif
+#ifdef AFS_RXERRQ_ENV
+    int neterr_gen;
+#endif
 };
 
 /* Only include this once, even when re-loading for kdump. */
diff --git a/src/rx/rx_internal.h b/src/rx/rx_internal.h
index e54d10c..83761d3 100644
--- a/src/rx/rx_internal.h
+++ b/src/rx/rx_internal.h
@@ -5,6 +5,14 @@
  * customers of RX belong in rx_private.h, which is installed.
  */
 
+#ifdef AFS_RXERRQ_ENV
+# if defined(AFS_LINUX26_ENV) || defined(AFS_USR_LINUX26_ENV)
+#  include <linux/module.h>
+#  include <linux/types.h>
+#  include <linux/errqueue.h>
+#  include <linux/icmp.h>
+# endif
+#endif
 
 /* Globals that we don't want the world to know about */
 extern rx_atomic_t rx_nWaiting;
@@ -16,6 +24,10 @@ extern rx_atomic_t rx_nWaited;
 extern void rxi_PacketsUnWait(void);
 extern void rxi_SetPeerMtu(struct rx_peer *peer, afs_uint32 host,
 			   afs_uint32 port, int mtu);
+#ifdef AFS_RXERRQ_ENV
+extern void rxi_ProcessNetError(struct sock_extended_err *err,
+                                afs_uint32 addr, afs_uint16 port);
+#endif
 extern struct rx_peer *rxi_FindPeer(afs_uint32 host, u_short port,
 				    struct rx_peer *origPeer, int create);
 extern struct rx_packet *rxi_ReceivePacket(struct rx_packet *np,
diff --git a/src/rx/rx_peer.h b/src/rx/rx_peer.h
index 318cbe8..baf37b0 100644
--- a/src/rx/rx_peer.h
+++ b/src/rx/rx_peer.h
@@ -63,6 +63,9 @@ struct rx_peer {
     struct rx_queue rpcStats;	/* rpc statistic list */
     int lastReachTime;		/* Last time we verified reachability */
     afs_int32 maxPacketSize;    /* peer packetsize hint */
+#ifdef AFS_RXERRQ_ENV
+    rx_atomic_t neterrs;
+#endif
 };
 
 #endif
diff --git a/src/rx/rx_user.c b/src/rx/rx_user.c
index a0427e4..be74a64 100644
--- a/src/rx/rx_user.c
+++ b/src/rx/rx_user.c
@@ -815,13 +815,7 @@ rxi_HandleSocketError(int socket)
     ret = 1;
     err = (struct sock_extended_err *) CMSG_DATA(cmsg);
 
-# ifdef AFS_ADAPT_PMTU
-    if (err->ee_errno == EMSGSIZE && err->ee_info >= 68) {
-	rxi_SetPeerMtu(NULL, addr.sin_addr.s_addr, addr.sin_port,
-                       err->ee_info - RX_IPUDP_SIZE);
-    }
-# endif
-    /* other DEST_UNREACH's and TIME_EXCEEDED should be dealt with too */
+    rxi_ProcessNetError(err, addr.sin_addr.s_addr, addr.sin_port);
 
 out:
     return ret;
-- 
1.9.4