From: Derrick Brashear Date: Wed, 21 May 2008 05:22:27 +0000 (+0000) Subject: DEVEL15-pmtu-and-prefetch-20080520 X-Git-Tag: openafs-devel-1_5_50~121 X-Git-Url: http://git.openafs.org/?p=openafs.git;a=commitdiff_plain;h=149b3fa69e2b02d6c6f7eb301d4cb78d4daff9f1 DEVEL15-pmtu-and-prefetch-20080520 LICENSE IPL10 try hard to optimize using the wire for high latency connections (cherry picked from commit 1206e7538be86f073b21cd289266286b60a95d0a) --- diff --git a/acinclude.m4 b/acinclude.m4 index 65808c0..2c0a252 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -39,6 +39,8 @@ AC_ARG_ENABLE( unix-sockets, [ --enable-unix-sockets enable use of unix domain sockets for fssync],, enable_unix_sockets="yes") AC_ARG_ENABLE( full-vos-listvol-switch, [ --disable-full-vos-listvol-switch disable vos full listvol switch for formatted output],, enable_full_vos_listvol_switch="yes") +AC_ARG_ENABLE( icmp-pmtu-discovery, +[ --enable-icmp-pmtu-discovery enable path MTU discovery by decoding ICMP unreachable replies],, enable_icmp_pmtu_discovery="no") AC_ARG_WITH(dux-kernel-headers, [ --with-dux-kernel-headers=path use the kernel headers found at path(optional, defaults to first match in /usr/sys)] ) @@ -1084,6 +1086,19 @@ else fi +AC_CACHE_VAL(ac_cv_setsockopt_iprecverr, +[ +AC_MSG_CHECKING([for setsockopt(, SOL_IP, IP_RECVERR)]) +AC_TRY_COMPILE( [#include +#include +#include ], +[int on=1; +setsockopt(0, SOL_IP, IP_RECVERR, &on, sizeof(on));], ac_cv_setsockopt_iprecverr=yes, ac_cv_setsockopt_iprecverr=no) +AC_MSG_RESULT($ac_cv_setsockopt_iprecverr)]) +if test "$ac_cv_setsockopt_iprecverr" = "yes"; then + AC_DEFINE(ADAPT_PMTU_RECVERR, 1, [define if asynchronous socket errors can be received]) +fi + PTHREAD_LIBS=error if test "x$MKAFS_OSTYPE" = OBSD; then PTHREAD_LIBS="-pthread" @@ -1178,6 +1193,12 @@ if test "$enable_full_vos_listvol_switch" = "yes"; then AC_DEFINE(FULL_LISTVOL_SWITCH, 1, [define if you want to want listvol switch]) fi +if test "$enable_icmp_pmtu_discovery" = "yes"; then + 
if test "$ac_cv_setsockopt_iprecverr" = "yes"; then + AC_DEFINE(ADAPT_PMTU, 1, [define if you want to decode icmp unreachable packets to discover path mtu]) + fi +fi + if test "$enable_bos_restricted_mode" = "yes"; then AC_DEFINE(BOS_RESTRICTED_MODE, 1, [define if you want to want bos restricted mode]) fi @@ -1230,6 +1251,7 @@ AC_CHECK_HEADERS(mntent.h sys/vfs.h sys/param.h sys/fs_types.h sys/fstyp.h) AC_CHECK_HEADERS(sys/mount.h strings.h termios.h signal.h poll.h) AC_CHECK_HEADERS(windows.h malloc.h winsock2.h direct.h io.h sys/user.h) AC_CHECK_HEADERS(security/pam_modules.h siad.h usersec.h ucontext.h regex.h values.h) +AC_CHECK_HEADERS(linux/errqueue.h,,,[#include ]) if test "$ac_cv_header_security_pam_modules_h" = yes -a "$enable_pam" = yes; then HAVE_PAM="yes" diff --git a/src/afs/VNOPS/afs_vnop_open.c b/src/afs/VNOPS/afs_vnop_open.c index cab75bb..998cf32 100644 --- a/src/afs/VNOPS/afs_vnop_open.c +++ b/src/afs/VNOPS/afs_vnop_open.c @@ -150,6 +150,35 @@ afs_open(struct vcache **avcp, afs_int32 aflags, struct AFS_UCRED *acred) } #endif ReleaseReadLock(&tvc->lock); + if ((afs_preCache != 0) && (writing == 0) && (vType(tvc) != VDIR) && + (!afs_BBusy())) { + register struct dcache *tdc; + afs_size_t offset, len, totallen = 0; + + tdc = afs_GetDCache(tvc, 0, &treq, &offset, &len, 1); + + ObtainSharedLock(&tdc->mflock, 865); + if (!(tdc->mflags & DFFetchReq)) { + struct brequest *bp; + + /* start the daemon (may already be running, however) */ + UpgradeSToWLock(&tdc->mflock, 666); + tdc->mflags |= DFFetchReq; /* guaranteed to be cleared by BKG or + GetDCache */ + /* last parm (1) tells bkg daemon to do an afs_PutDCache when it + is done, since we don't want to wait for it to finish before + doing so ourselves. 
+ */ + bp = afs_BQueue(BOP_FETCH, tvc, B_DONTWAIT, 0, acred, + (afs_size_t) 0, (afs_size_t) 1, tdc); + if (!bp) { + tdc->mflags &= ~DFFetchReq; + } + ReleaseWriteLock(&tdc->mflock); + } else { + ReleaseSharedLock(&tdc->mflock); + } + } done: afs_PutFakeStat(&fakestate); code = afs_CheckCode(code, &treq, 4); /* avoid AIX -O bug */ diff --git a/src/afs/VNOPS/afs_vnop_read.c b/src/afs/VNOPS/afs_vnop_read.c index c1cd88b..77af418 100644 --- a/src/afs/VNOPS/afs_vnop_read.c +++ b/src/afs/VNOPS/afs_vnop_read.c @@ -388,12 +388,16 @@ afs_MemRead(register struct vcache *avc, struct uio *auio, */ if (tdc) { ReleaseReadLock(&tdc->lock); -#if !defined(AFS_VM_RDWR_ENV) /* try to queue prefetch, if needed */ - if (!noLock) { + if (!noLock && +#ifndef AFS_VM_RDWR_ENV + afs_preCache +#else + 1 +#endif + ) { afs_PrefetchChunk(avc, tdc, acred, &treq); } -#endif afs_PutDCache(tdc); } if (!noLock) diff --git a/src/afs/afs_daemons.c b/src/afs/afs_daemons.c index 4a2f838..051d017 100644 --- a/src/afs/afs_daemons.c +++ b/src/afs/afs_daemons.c @@ -52,6 +52,7 @@ afs_int32 afs_CheckServerDaemonStarted = 0; afs_int32 afs_probe_interval = DEFAULT_PROBE_INTERVAL; afs_int32 afs_probe_all_interval = 600; afs_int32 afs_nat_probe_interval = 60; +afs_int32 afs_preCache = 0; #define PROBE_WAIT() (1000 * (afs_probe_interval - ((afs_random() & 0x7fffffff) \ % (afs_probe_interval/2)))) @@ -478,17 +479,22 @@ BPrefetch(register struct brequest *ab) { register struct dcache *tdc; register struct vcache *tvc; - afs_size_t offset, len; + afs_size_t offset, len, abyte, totallen = 0; struct vrequest treq; AFS_STATCNT(BPrefetch); if ((len = afs_InitReq(&treq, ab->cred))) return; + abyte = ab->size_parm[0]; tvc = ab->vc; - tdc = afs_GetDCache(tvc, ab->size_parm[0], &treq, &offset, &len, 1); - if (tdc) { - afs_PutDCache(tdc); - } + do { + tdc = afs_GetDCache(tvc, abyte, &treq, &offset, &len, 1); + if (tdc) { + afs_PutDCache(tdc); + } + abyte+=len; + totallen += len; + } while ((totallen < afs_preCache) && tdc && 
(len > 0)); /* now, dude may be waiting for us to clear DFFetchReq bit; do so. Can't * use tdc from GetDCache since afs_GetDCache may fail, but someone may * be waiting for our wakeup anyway. diff --git a/src/afs/afs_pioctl.c b/src/afs/afs_pioctl.c index 346caf9..d35d334 100644 --- a/src/afs/afs_pioctl.c +++ b/src/afs/afs_pioctl.c @@ -92,6 +92,7 @@ DECL_PIOCTL(PResidencyCmd); DECL_PIOCTL(PCallBackAddr); DECL_PIOCTL(PNFSNukeCreds); DECL_PIOCTL(PNewUuid); +DECL_PIOCTL(PPrecache); /* * A macro that says whether we're going to need HandleClientContext(). @@ -200,6 +201,9 @@ static int (*(CpioctlSw[])) () = { PBogus, /* 7 */ PBogus, /* 8 */ PNewUuid, /* 9 */ + PBogus, /* 10 */ + PBogus, /* 11 */ + PPrecache, /* 12 */ }; static int (*(OpioctlSw[])) () = { @@ -2063,6 +2067,18 @@ DECL_PIOCTL(PViceAccess) return EACCES; } +DECL_PIOCTL(PPrecache) +{ + afs_int32 newValue; + + /*AFS_STATCNT(PPrecache);*/ + if (!afs_osi_suser(*acred)) + return EACCES; + memcpy((char *)&newValue, ain, sizeof(afs_int32)); + afs_preCache = newValue*1024; + return 0; +} + DECL_PIOCTL(PSetCacheSize) { afs_int32 newValue; diff --git a/src/afs/afs_prototypes.h b/src/afs/afs_prototypes.h index ba4d5e3..28aa6ef 100644 --- a/src/afs/afs_prototypes.h +++ b/src/afs/afs_prototypes.h @@ -199,6 +199,7 @@ extern afs_int32 afs_gcpags; extern afs_int32 afs_gcpags_procsize; extern afs_int32 afs_CheckServerDaemonStarted; extern afs_int32 afs_probe_interval; +extern afs_int32 afs_preCache; extern void afs_Daemon(void); extern struct brequest *afs_BQueue(register short aopcode, diff --git a/src/config/venus.h b/src/config/venus.h index c940829..2e5317f 100644 --- a/src/config/venus.h +++ b/src/config/venus.h @@ -183,6 +183,7 @@ struct cm_initparams { #define VIOC_CBADDR _CVICEIOCTL(3) /* push callback addr */ #define VIOC_DISCON _CVICEIOCTL(5) /* set/get discon mode */ #define VIOC_NEWUUID _CVICEIOCTL(9) /* new uuid */ +#define VIOCPRECACHE _CVICEIOCTL(12) /* precache size */ /* OpenAFS-specific 'O' pioctl's */ 
#define VIOC_NFS_NUKE_CREDS _OVICEIOCTL(1) /* nuke creds for all PAG's */ diff --git a/src/rx/LINUX/rx_knet.c b/src/rx/LINUX/rx_knet.c index ab1ec2b..054152c 100644 --- a/src/rx/LINUX/rx_knet.c +++ b/src/rx/LINUX/rx_knet.c @@ -25,6 +25,10 @@ RCSID #include "h/smp_lock.h" #endif #include +#ifdef ADAPT_PMTU +#include +#include +#endif /* rxk_NewSocket * open and bind RX socket @@ -36,8 +40,12 @@ rxk_NewSocketHost(afs_uint32 ahost, short aport) struct sockaddr_in myaddr; int code; KERNEL_SPACE_DECL; +#ifdef ADAPT_PMTU + int pmtu = IP_PMTUDISC_WANT; + int do_recverr = 1; +#else int pmtu = IP_PMTUDISC_DONT; - +#endif /* We need a better test for this. if you need it back, tell us * how to detect it. @@ -69,6 +77,10 @@ rxk_NewSocketHost(afs_uint32 ahost, short aport) TO_USER_SPACE(); sockp->ops->setsockopt(sockp, SOL_IP, IP_MTU_DISCOVER, (char *)&pmtu, sizeof(pmtu)); +#ifdef ADAPT_PMTU + sockp->ops->setsockopt(sockp, SOL_IP, IP_RECVERR, (char *)&do_recverr, + sizeof(do_recverr)); +#endif TO_KERNEL_SPACE(); return (osi_socket *)sockp; } @@ -87,6 +99,65 @@ rxk_FreeSocket(register struct socket *asocket) return 0; } +#ifdef ADAPT_PMTU +void +handle_socket_error(osi_socket so) +{ + KERNEL_SPACE_DECL; + struct msghdr msg; + struct cmsghdr *cmsg; + struct sock_extended_err *err; + struct sockaddr_in addr; + struct sockaddr *offender; + char *controlmsgbuf; + int code; + struct socket *sop = (struct socket *)so; + + if (!(controlmsgbuf=rxi_Alloc(256))) + return; + msg.msg_name = &addr; + msg.msg_namelen = sizeof(addr); + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_control = controlmsgbuf; + msg.msg_controllen = 256; + msg.msg_flags = 0; + + TO_USER_SPACE(); + code = sock_recvmsg(sop, &msg, 256, MSG_ERRQUEUE|MSG_DONTWAIT|MSG_TRUNC); + TO_KERNEL_SPACE(); + + if (code < 0 || !(msg.msg_flags & MSG_ERRQUEUE)) + goto out; + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (CMSG_OK(&msg, cmsg) && cmsg->cmsg_level == SOL_IP && + cmsg->cmsg_type == 
IP_RECVERR) + break; + } + if (!cmsg) + goto out; + err = CMSG_DATA(cmsg); + offender = SO_EE_OFFENDER(err); + + if (offender->sa_family != AF_INET) + goto out; + + memcpy(&addr, offender, sizeof(addr)); + + if (err->ee_origin == SO_EE_ORIGIN_ICMP && + err->ee_type == ICMP_DEST_UNREACH && + err->ee_code == ICMP_FRAG_NEEDED) { + rxi_SetPeerMtu(ntohl(addr.sin_addr.s_addr), ntohs(addr.sin_port), + err->ee_info); + } + /* other DEST_UNREACH's and TIME_EXCEEDED should be dealt with too */ + +out: + rxi_Free(controlmsgbuf, 256); + return; +} +#endif /* osi_NetSend * @@ -100,7 +171,22 @@ osi_NetSend(osi_socket sop, struct sockaddr_in *to, struct iovec *iovec, { KERNEL_SPACE_DECL; struct msghdr msg; - int code; + int code, sockerr; + size_t esize; + +#ifdef ADAPT_PMTU + while (1) { + sockerr=0; + esize = sizeof(sockerr); + TO_USER_SPACE(); + sop->ops->getsockopt(sop, SOL_SOCKET, SO_ERROR, (char *)&sockerr, + &esize); + TO_KERNEL_SPACE(); + if (sockerr == 0) + break; + handle_socket_error(sop); + } +#endif msg.msg_iovlen = iovcnt; msg.msg_iov = iovec; @@ -144,13 +230,27 @@ osi_NetReceive(osi_socket so, struct sockaddr_in *from, struct iovec *iov, { KERNEL_SPACE_DECL; struct msghdr msg; - int code; + int code, sockerr; + size_t esize; struct iovec tmpvec[RX_MAXWVECS + 2]; struct socket *sop = (struct socket *)so; if (iovcnt > RX_MAXWVECS + 2) { osi_Panic("Too many (%d) iovecs passed to osi_NetReceive\n", iovcnt); } +#ifdef ADAPT_PMTU + while (1) { + sockerr=0; + esize = sizeof(sockerr); + TO_USER_SPACE(); + sop->ops->getsockopt(sop, SOL_SOCKET, SO_ERROR, (char *)&sockerr, + &esize); + TO_KERNEL_SPACE(); + if (sockerr == 0) + break; + handle_socket_error(so); + } +#endif memcpy(tmpvec, iov, iovcnt * sizeof(struct iovec)); msg.msg_name = from; msg.msg_iov = tmpvec; diff --git a/src/rx/rx.c b/src/rx/rx.c index 995c807..564691f 100644 --- a/src/rx/rx.c +++ b/src/rx/rx.c @@ -33,6 +33,11 @@ RCSID #include "h/socket.h" #endif #include "netinet/in.h" +#ifdef AFS_SUN57_ENV +#include 
"inet/common.h" +#include "inet/ip.h" +#include "inet/ip_ire.h" +#endif #include "afs/afs_args.h" #include "afs/afs_osi.h" #ifdef RX_KERNEL_TRACE @@ -2325,6 +2330,43 @@ rxi_Free(void *addr, register size_t size) osi_Free(addr, size); } +void +rxi_SetPeerMtu(register afs_uint32 host, register afs_uint32 port, int mtu) +{ + struct rx_peer **peer_ptr, **peer_end; + int hashIndex; + + MUTEX_ENTER(&rx_peerHashTable_lock); + if (port == 0) { + for (peer_ptr = &rx_peerHashTable[0], peer_end = + &rx_peerHashTable[rx_hashTableSize]; peer_ptr < peer_end; + peer_ptr++) { + struct rx_peer *peer, *next; + for (peer = *peer_ptr; peer; peer = next) { + next = peer->next; + if (host == peer->host) { + MUTEX_ENTER(&peer->peer_lock); + peer->ifMTU=MIN(mtu, peer->ifMTU); + peer->natMTU = rxi_AdjustIfMTU(peer->ifMTU); + MUTEX_EXIT(&peer->peer_lock); + } + } + } + } else { + struct rx_peer *peer, *next; + hashIndex = PEER_HASH(host, port); + for (peer = rx_peerHashTable[hashIndex]; peer; peer = peer->next) { + if ((peer->host == host) && (peer->port == port)) { + MUTEX_ENTER(&peer->peer_lock); + peer->ifMTU=MIN(mtu, peer->ifMTU); + peer->natMTU = rxi_AdjustIfMTU(peer->ifMTU); + MUTEX_EXIT(&peer->peer_lock); + } + } + } + MUTEX_EXIT(&rx_peerHashTable_lock); +} + /* Find the peer process represented by the supplied (host,port) * combination. If there is no appropriate active peer structure, a * new one will be allocated and initialized @@ -5496,6 +5538,32 @@ rxi_CheckCall(register struct rx_call *call) * number of seconds. 
*/ if (now > (call->lastReceiveTime + deadTime)) { if (call->state == RX_STATE_ACTIVE) { +#ifdef ADAPT_PMTU +#if defined(KERNEL) && defined(AFS_SUN57_ENV) + ire_t *ire; +#if defined(AFS_SUN510_ENV) && defined(GLOBAL_NETSTACKID) + netstack_t *ns = netstack_find_by_stackid(GLOBAL_NETSTACKID); + ip_stack_t *ipst = ns->netstack_ip; +#endif + ire = ire_cache_lookup(call->conn->peer->host +#if defined(AFS_SUN510_ENV) && defined(ALL_ZONES) + , ALL_ZONES +#if defined(AFS_SUN510_ENV) && (defined(ICL_3_ARG) || defined(GLOBAL_NETSTACKID)) + , NULL +#if defined(AFS_SUN510_ENV) && defined(GLOBAL_NETSTACKID) + , ipst +#endif +#endif +#endif + ); + + if (ire && ire->ire_max_frag > 0) + rxi_SetPeerMtu(call->conn->peer->host, 0, ire->ire_max_frag); +#if defined(GLOBAL_NETSTACKID) + netstack_rele(ns); +#endif +#endif +#endif /* ADAPT_PMTU */ rxi_CallError(call, RX_CALL_DEAD); return -1; } else { diff --git a/src/rx/rx_globals.h b/src/rx/rx_globals.h index bfecc28..2e16cb9 100644 --- a/src/rx/rx_globals.h +++ b/src/rx/rx_globals.h @@ -397,7 +397,7 @@ EXT int rx_packetReclaims GLOBALSINIT(0); * This is provided for backward compatibility with peers which may be unable * to swallow anything larger. THIS MUST NEVER DECREASE WHILE AN APPLICATION * IS RUNNING! 
*/ -EXT afs_uint32 rx_maxReceiveSize GLOBALSINIT(OLD_MAX_PACKET_SIZE * RX_MAX_FRAGS + +EXT afs_uint32 rx_maxReceiveSize GLOBALSINIT(_OLD_MAX_PACKET_SIZE * RX_MAX_FRAGS + UDP_HDR_SIZE * (RX_MAX_FRAGS - 1)); /* this is the maximum packet size that the user wants us to receive */ @@ -605,4 +605,5 @@ EXT int rx_max_clones_per_connection GLOBALSINIT(0); EXT int rx_max_clones_per_connection GLOBALSINIT(2); #endif +EXT int RX_IPUDP_SIZE GLOBALSINIT(_RX_IPUDP_SIZE); #endif /* AFS_RX_GLOBALS_H */ diff --git a/src/rx/rx_lwp.c b/src/rx/rx_lwp.c index 8733f4b..2bedf82 100644 --- a/src/rx/rx_lwp.c +++ b/src/rx/rx_lwp.c @@ -428,6 +428,10 @@ rxi_Listen(osi_socket sock) int rxi_Recvmsg(osi_socket socket, struct msghdr *msg_p, int flags) { +#if defined(HAVE_LINUX_ERRQUEUE_H) && defined(ADAPT_PMTU) + while((rxi_HandleSocketError(socket)) > 0) + ; +#endif return recvmsg(socket, msg_p, flags); } @@ -451,6 +455,10 @@ rxi_Sendmsg(osi_socket socket, struct msghdr *msg_p, int flags) } FD_SET(socket, sfds); } +#if defined(HAVE_LINUX_ERRQUEUE_H) && defined(ADAPT_PMTU) + while((rxi_HandleSocketError(socket)) > 0) + ; +#endif #ifdef AFS_NT40_ENV if (WSAGetLastError()) #elif defined(AFS_LINUX22_ENV) diff --git a/src/rx/rx_packet.c b/src/rx/rx_packet.c index 92a0e46..7d62865 100644 --- a/src/rx/rx_packet.c +++ b/src/rx/rx_packet.c @@ -2630,6 +2630,8 @@ rxi_AdjustIfMTU(int mtu) int adjMTU; int frags; + if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1) + return mtu; adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE; if (mtu <= adjMTU) { return mtu; diff --git a/src/rx/rx_packet.h b/src/rx/rx_packet.h index 61f1680..93fa8e9 100644 --- a/src/rx/rx_packet.h +++ b/src/rx/rx_packet.h @@ -51,7 +51,7 @@ #define IPv6_FRAG_HDR_SIZE 8 /* IPv6 Fragment Header */ #define UDP_HDR_SIZE 8 /* UDP Header */ #define RX_IP_SIZE (IPv6_HDR_SIZE + IPv6_FRAG_HDR_SIZE) -#define RX_IPUDP_SIZE (RX_IP_SIZE + UDP_HDR_SIZE) +#define _RX_IPUDP_SIZE (RX_IP_SIZE + UDP_HDR_SIZE) /* REMOTE_PACKET_SIZE is currently 
the same as local. This is because REMOTE * is defined much too generally for my tastes, and includes the case of @@ -102,11 +102,15 @@ /* The minimum MTU for an IP network is 576 bytes including headers */ #define RX_MIN_PACKET_SIZE (576 - RX_IPUDP_SIZE) #define RX_PP_PACKET_SIZE RX_MIN_PACKET_SIZE +#define _RX_MIN_PACKET_SIZE (576 - _RX_IPUDP_SIZE) +#define _RX_PP_PACKET_SIZE _RX_MIN_PACKET_SIZE #define OLD_MAX_PACKET_SIZE (1500 - RX_IPUDP_SIZE) +#define _OLD_MAX_PACKET_SIZE (1500 - _RX_IPUDP_SIZE) /* if the other guy is not on the local net, use this size */ #define RX_REMOTE_PACKET_SIZE (1500 - RX_IPUDP_SIZE) +#define _RX_REMOTE_PACKET_SIZE (1500 - _RX_IPUDP_SIZE) /* for now, never send more data than this */ #define RX_MAX_PACKET_SIZE 16384 diff --git a/src/rx/rx_prototypes.h b/src/rx/rx_prototypes.h index 4dde343..1084ffd 100644 --- a/src/rx/rx_prototypes.h +++ b/src/rx/rx_prototypes.h @@ -83,6 +83,8 @@ extern void rxi_FreeCall(register struct rx_call *call); extern char *rxi_Alloc(register size_t size); extern void rxi_Free(void *addr, register size_t size); +extern void rxi_SetPeerMtu(register afs_uint32 host, register afs_uint32 port, + int mtu); extern struct rx_peer *rxi_FindPeer(register afs_uint32 host, register u_short port, struct rx_peer *origPeer, int create); @@ -584,6 +586,7 @@ extern osi_socket rxi_GetUDPSocket(u_short port); extern void osi_AssertFailU(const char *expr, const char *file, int line); extern int rx_getAllAddr(afs_int32 * buffer, int maxSize); extern void rxi_InitPeerParams(struct rx_peer *pp); +extern int rxi_HandleSocketError(int socket); #if defined(AFS_AIX32_ENV) && !defined(KERNEL) extern void *osi_Alloc(afs_int32 x); diff --git a/src/rx/rx_pthread.c b/src/rx/rx_pthread.c index 1131942..69857ea 100644 --- a/src/rx/rx_pthread.c +++ b/src/rx/rx_pthread.c @@ -412,6 +412,10 @@ int rxi_Recvmsg(osi_socket socket, struct msghdr *msg_p, int flags) { int ret; +#if defined(HAVE_LINUX_ERRQUEUE_H) && defined(ADAPT_PMTU) + 
while((rxi_HandleSocketError(socket)) > 0) + ; +#endif ret = recvmsg(socket, msg_p, flags); return ret; } diff --git a/src/rx/rx_user.c b/src/rx/rx_user.c index b12a44d..a1100ae 100644 --- a/src/rx/rx_user.c +++ b/src/rx/rx_user.c @@ -95,8 +95,20 @@ rxi_GetHostUDPSocket(u_int ahost, u_short port) struct sockaddr_in taddr; char *name = "rxi_GetUDPSocket: "; #ifdef AFS_LINUX22_ENV +#if defined(ADAPT_PMTU) + int pmtu=IP_PMTUDISC_WANT; + int recverr=1; +#else int pmtu=IP_PMTUDISC_DONT; #endif +#endif +#if defined(HAVE_LINUX_ERRQUEUE_H) && defined(ADAPT_PMTU) +#include +#include +#ifndef IP_MTU +#define IP_MTU 14 +#endif +#endif #if !defined(AFS_NT40_ENV) && !defined(AFS_DJGPP_ENV) if (ntohs(port) >= IPPORT_RESERVED && ntohs(port) < IPPORT_USERRESERVED) { @@ -180,8 +192,10 @@ rxi_GetHostUDPSocket(u_int ahost, u_short port) #ifdef AFS_LINUX22_ENV setsockopt(socketFd, SOL_IP, IP_MTU_DISCOVER, &pmtu, sizeof(pmtu)); +#if defined(ADAPT_PMTU) + setsockopt(socketFd, SOL_IP, IP_RECVERR, &recverr, sizeof(recverr)); +#endif #endif - if (rxi_Listen(socketFd) < 0) { goto error; } @@ -614,6 +628,10 @@ rxi_InitPeerParams(struct rx_peer *pp) afs_uint32 ppaddr; u_short rxmtu; int ix; +#if defined(ADAPT_PMTU) && defined(IP_MTU) + int sock; + struct sockaddr_in addr; +#endif @@ -665,6 +683,22 @@ rxi_InitPeerParams(struct rx_peer *pp) pp->timeout.sec = 2; pp->ifMTU = MIN(rx_MyMaxSendSize, OLD_MAX_PACKET_SIZE); #endif /* ADAPT_MTU */ +#if defined(ADAPT_PMTU) && defined(IP_MTU) + sock=socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sock >= 0) { + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = pp->host; + addr.sin_port = pp->port; + if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) == 0) { + int mtu=0; + socklen_t s = sizeof(mtu); + if (getsockopt(sock, SOL_IP, IP_MTU, &mtu, &s)== 0) { + pp->ifMTU = MIN(mtu - RX_IPUDP_SIZE, pp->ifMTU); + } + } + close(sock); + } +#endif pp->ifMTU = rxi_AdjustIfMTU(pp->ifMTU); pp->maxMTU = OLD_MAX_PACKET_SIZE; /* for compatibility with old guys 
*/ pp->natMTU = MIN((int)pp->ifMTU, OLD_MAX_PACKET_SIZE); @@ -697,3 +731,54 @@ rx_SetMaxMTU(int mtu) { rx_MyMaxSendSize = rx_maxReceiveSizeUser = rx_maxReceiveSize = mtu; } + +#if defined(HAVE_LINUX_ERRQUEUE_H) && defined(ADAPT_PMTU) +int +rxi_HandleSocketError(int socket) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct sock_extended_err *err; + struct sockaddr_in addr; + struct sockaddr *offender; + char controlmsgbuf[256]; + int ret=0; + int code; + + msg.msg_name = &addr; + msg.msg_namelen = sizeof(addr); + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_control = controlmsgbuf; + msg.msg_controllen = 256; + msg.msg_flags = 0; + code = recvmsg(socket, &msg, MSG_ERRQUEUE|MSG_DONTWAIT|MSG_TRUNC); + + if (code < 0 || !(msg.msg_flags & MSG_ERRQUEUE)) + goto out; + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if ((char *)cmsg - controlmsgbuf > msg.msg_controllen - CMSG_SPACE(0) || + (char *)cmsg - controlmsgbuf > msg.msg_controllen - CMSG_SPACE(cmsg->cmsg_len) || + cmsg->cmsg_len == 0) { + cmsg = 0; + break; + } + if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_RECVERR) + break; + } + if (!cmsg) + goto out; + ret=1; + err =(struct sock_extended_err *) CMSG_DATA(cmsg); + + if (err->ee_errno == EMSGSIZE && err->ee_info >= 68) { + rxi_SetPeerMtu(addr.sin_addr.s_addr, addr.sin_port, + err->ee_info - RX_IPUDP_SIZE); + } + /* other DEST_UNREACH's and TIME_EXCEEDED should be dealt with too */ + +out: + return ret; +} +#endif diff --git a/src/rx/rxperf.c b/src/rx/rxperf.c index 90aca79..c61dd7a 100644 --- a/src/rx/rxperf.c +++ b/src/rx/rxperf.c @@ -32,9 +32,7 @@ * SUCH DAMAGE. 
*/ -#ifdef HAVE_CONFIG_H -#include -#endif +#include /* nn * We are using getopt since we want it to be possible to link to @@ -57,9 +55,17 @@ RCSID("$Id$"); #include #include #include +#ifdef HAVE_STRING_H +#include +#else +#ifdef HAVE_STRINGS_H #include +#endif +#endif #include +#ifdef HAVE_UNISTD_H #include +#endif #include #ifdef HAVE_ERRX #include /* not stricly right, but if we have a errx() there @@ -311,7 +317,7 @@ rxperf_ExecuteRequest(struct rx_call *call) DBFPRINT(("got a request\n")); - if (rx_Read(call, &version, 4) != 4) { + if (rx_Read32(call, &version) != 4) { warn("rx_Read failed to read version"); return -1; } @@ -321,13 +327,13 @@ rxperf_ExecuteRequest(struct rx_call *call) return -1; } - if (rx_Read(call, &command, 4) != 4) { + if (rx_Read32(call, &command) != 4) { warnx("rx_Read failed to read command"); return -1; } command = ntohl(command); - if (rx_Read(call, &data, 4) != 4) { + if (rx_Read32(call, &data) != 4) { warnx("rx_Read failed to read size"); return -1; } @@ -337,7 +343,7 @@ rxperf_ExecuteRequest(struct rx_call *call) return -1; } - if (rx_Read(call, &data, 4) != 4) { + if (rx_Read32(call, &data) != 4) { warnx("rx_Read failed to write size"); return -1; } @@ -351,7 +357,7 @@ rxperf_ExecuteRequest(struct rx_call *call) case RX_PERF_SEND: DBFPRINT(("got a send request\n")); - if (rx_Read(call, &bytes, 4) != 4) { + if (rx_Read32(call, &bytes) != 4) { warnx("rx_Read failed to read bytes"); return -1; } @@ -361,7 +367,7 @@ rxperf_ExecuteRequest(struct rx_call *call) readbytes(call, bytes); data = htonl(RXPERF_MAGIC_COOKIE); - if (rx_Write(call, &data, 4) != 4) { + if (rx_Write32(call, &data) != 4) { warnx("rx_Write failed when sending back result"); return -1; } @@ -371,12 +377,12 @@ rxperf_ExecuteRequest(struct rx_call *call) case RX_PERF_RPC: DBFPRINT(("got a rpc request, reading commands\n")); - if (rx_Read(call, &recvb, 4) != 4) { + if (rx_Read32(call, &recvb) != 4) { warnx("rx_Read failed to read recvbytes"); return -1; } recvb = 
ntohl(recvb); - if (rx_Read(call, &sendb, 4) != 4) { + if (rx_Read32(call, &sendb) != 4) { warnx("rx_Read failed to read sendbytes"); return -1; } @@ -396,14 +402,14 @@ rxperf_ExecuteRequest(struct rx_call *call) DBFPRINT(("done\n")); data = htonl(RXPERF_MAGIC_COOKIE); - if (rx_Write(call, &data, 4) != 4) { + if (rx_Write32(call, &data) != 4) { warnx("rx_Write failed when sending back magic cookie"); return -1; } break; case RX_PERF_FILE: - if (rx_Read(call, &data, 4) != 4) + if (rx_Read32(call, &data) != 4) errx(1, "failed to read num from client"); num = ntohl(data); @@ -436,7 +442,7 @@ rxperf_ExecuteRequest(struct rx_call *call) case RX_PERF_RECV: DBFPRINT(("got a recv request\n")); - if (rx_Read(call, &bytes, 4) != 4) { + if (rx_Read32(call, &bytes) != 4) { warnx("rx_Read failed to read bytes"); return -1; } @@ -446,7 +452,7 @@ rxperf_ExecuteRequest(struct rx_call *call) sendbytes(call, bytes); data = htonl(RXPERF_MAGIC_COOKIE); - if (rx_Write(call, &data, 4) != 4) { + if (rx_Write32(call, &data) != 4) { warnx("rx_Write failed when sending back result"); return -1; } @@ -467,7 +473,7 @@ rxperf_ExecuteRequest(struct rx_call *call) */ static void -do_server(int port) +do_server(int port, int nojumbo, int maxmtu) { struct rx_service *service; struct rx_securityClass *secureobj; @@ -478,6 +484,10 @@ do_server(int port) if (ret) errx(1, "rx_Init failed"); + if (nojumbo) + rx_SetNoJumbo(); + if (maxmtu) + rx_SetMaxMTU(maxmtu); get_sec(1, &secureobj, &secureindex); service = @@ -547,7 +557,8 @@ readfile(const char *filename, u_int32_t ** readwrite, u_int32_t * size) static void do_client(const char *server, int port, char *filename, int32_t command, - int32_t times, int32_t bytes, int32_t sendtimes, int32_t recvtimes) + int32_t times, int32_t bytes, int32_t sendtimes, int32_t recvtimes, + int dumpstats, int nojumbo, int maxmtu) { struct rx_connection *conn; struct rx_call *call; @@ -568,6 +579,10 @@ do_client(const char *server, int port, char *filename, int32_t 
command, if (ret) errx(1, "rx_Init failed"); + if (nojumbo) + rx_SetNoJumbo(); + if (maxmtu) + rx_SetMaxMTU(maxmtu); get_sec(0, &secureobj, &secureindex); conn = rx_NewConnection(addr, port, RX_SERVER_ID, secureobj, secureindex); @@ -587,19 +602,19 @@ do_client(const char *server, int port, char *filename, int32_t command, errx(1, "rx_NewCall failed"); data = htonl(RX_PERF_VERSION); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send version"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send version (err %d)", rx_Error(call)); data = htonl(command); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send command"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send command (err %d)", rx_Error(call)); data = htonl(rxread_size); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send read size"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send read size (err %d)", rx_Error(call)); data = htonl(rxwrite_size); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send write read"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send write read (err %d)", rx_Error(call)); switch (command) { @@ -607,15 +622,15 @@ do_client(const char *server, int port, char *filename, int32_t command, DBFPRINT(("command ")); data = htonl(bytes); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send size"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send size (err %d)", rx_Error(call)); DBFPRINT(("sending(%d) ", bytes)); if (readbytes(call, bytes)) - errx(1, "sendbytes"); + errx(1, "sendbytes (err %d)", rx_Error(call)); - if (rx_Read(call, &data, 4) != 4) - errx(1, "failed to read result from server"); + if (rx_Read32(call, &data) != 4) + errx(1, "failed to read result from server (err %d)", rx_Error(call)); if (data != htonl(RXPERF_MAGIC_COOKIE)) warn("server send wrong magic cookie in responce"); @@ 
-627,15 +642,15 @@ do_client(const char *server, int port, char *filename, int32_t command, DBFPRINT(("command ")); data = htonl(bytes); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send size"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send size (err %d)", rx_Error(call)); DBFPRINT(("sending(%d) ", bytes)); if (sendbytes(call, bytes)) - errx(1, "sendbytes"); + errx(1, "sendbytes (err %d)", rx_Error(call)); - if (rx_Read(call, &data, 4) != 4) - errx(1, "failed to read result from server"); + if (rx_Read32(call, &data) != 4) + errx(1, "failed to read result from server (err %d)", rx_Error(call)); if (data != htonl(RXPERF_MAGIC_COOKIE)) warn("server send wrong magic cookie in responce"); @@ -647,21 +662,23 @@ do_client(const char *server, int port, char *filename, int32_t command, DBFPRINT(("commands ")); data = htonl(sendtimes); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send command"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send command (err %d)", rx_Error(call)); data = htonl(recvtimes); - if (rx_Write(call, &data, 4) != 4) - errx(1, "rx_Write failed to send command"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send command (err %d)", rx_Error(call)); DBFPRINT(("send(%d) ", sendtimes)); - sendbytes(call, sendtimes); + if (sendbytes(call, sendtimes)) + errx(1, "sendbytes (err %d)", rx_Error(call)); DBFPRINT(("recv(%d) ", recvtimes)); - readbytes(call, recvtimes); + if (readbytes(call, recvtimes)) + errx(1, "sendbytes (err %d)", rx_Error(call)); - if (rx_Read(call, &bytes, 4) != 4) - errx(1, "failed to read result from server"); + if (rx_Read32(call, &bytes) != 4) + errx(1, "failed to read result from server (err %d)", rx_Error(call)); if (bytes != htonl(RXPERF_MAGIC_COOKIE)) warn("server send wrong magic cookie in responce"); @@ -673,12 +690,12 @@ do_client(const char *server, int port, char *filename, int32_t command, readfile(filename, 
&readwrite, &num); data = htonl(num); - if (rx_Write(call, &data, sizeof(data)) != 4) - errx(1, "rx_Write failed to send size"); + if (rx_Write32(call, &data) != 4) + errx(1, "rx_Write failed to send size (err %d)", rx_Error(call)); if (rx_Write(call, readwrite, num * sizeof(u_int32_t)) != num * sizeof(u_int32_t)) - errx(1, "rx_Write failed to send list"); + errx(1, "rx_Write failed to send list (err %d)", rx_Error(call)); for (i = 0; i < num; i++) { if (readwrite[i] == 0) @@ -687,10 +704,12 @@ do_client(const char *server, int port, char *filename, int32_t command, size = ntohl(readwrite[i]) * sizeof(u_int32_t); if (readp) { - readbytes(call, size); + if (readbytes(call, size)) + errx(1, "sendbytes (err %d)", rx_Error(call)); DBFPRINT(("read\n")); } else { - sendbytes(call, size); + if (sendbytes(call, size)) + errx(1, "sendbytes (err %d)", rx_Error(call)); DBFPRINT(("send\n")); } } @@ -705,6 +724,10 @@ do_client(const char *server, int port, char *filename, int32_t command, end_and_print_timer(stamp); DBFPRINT(("done for good\n")); + if (dumpstats) { + rx_PrintStats(stdout); + rx_PrintPeerStats(stdout, conn->peer); + } rx_Finalize(); } @@ -721,7 +744,7 @@ usage() fprintf(stderr, "usage: %s client -c file -f filename\n", __progname); fprintf(stderr, "%s: usage: common option to the client " - "-w -r -T times -p port -s server\n", + "-w -r -T times -p port -s server -D\n", __progname); fprintf(stderr, "usage: %s server -p port\n", __progname); #undef COMMMON @@ -736,10 +759,12 @@ static int rxperf_server(int argc, char **argv) { int port = DEFAULT_PORT; + int nojumbo = 0; + int maxmtu = 0; char *ptr; int ch; - while ((ch = getopt(argc, argv, "r:d:p:w:")) != -1) { + while ((ch = getopt(argc, argv, "r:d:p:w:jm:4")) != -1) { switch (ch) { case 'd': #ifdef RXDEBUG @@ -771,6 +796,17 @@ rxperf_server(int argc, char **argv) errx(1, "%d > sizeof(somebuf) (%d)", rxwrite_size, sizeof(somebuf)); break; + case 'j': + nojumbo=1; + break; + case 'm': + maxmtu = strtol(optarg, 
&ptr, 0); + if (ptr && *ptr != '\0') + errx(1, "can't resolve rx maxmtu to use"); + break; + case '4': + RX_IPUDP_SIZE = 28; + break; default: usage(); } @@ -779,7 +815,7 @@ rxperf_server(int argc, char **argv) if (optind != argc) usage(); - do_server(htons(port)); + do_server(htons(port), nojumbo, maxmtu); return 0; } @@ -799,12 +835,15 @@ rxperf_client(int argc, char **argv) int sendtimes = 3; int recvtimes = 30; int times = 100; + int dumpstats = 0; + int nojumbo = 0; + int maxmtu = 0; char *ptr; int ch; cmd = RX_PERF_UNKNOWN; - while ((ch = getopt(argc, argv, "T:S:R:b:c:d:p:r:s:w:f:")) != -1) { + while ((ch = getopt(argc, argv, "T:S:R:b:c:d:p:r:s:w:f:Djm:4")) != -1) { switch (ch) { case 'b': bytes = strtol(optarg, &ptr, 0); @@ -876,6 +915,24 @@ rxperf_client(int argc, char **argv) case 'f': filename = optarg; break; + case 'D': +#ifdef RXDEBUG + dumpstats = 1; +#else + errx(1, "compiled without RXDEBUG"); +#endif + break; + case 'j': + nojumbo=1; + break; + case 'm': + maxmtu = strtol(optarg, &ptr, 0); + if (ptr && *ptr != '\0') + errx(1, "can't resolve rx maxmtu to use"); + break; + case '4': + RX_IPUDP_SIZE = 28; + break; default: usage(); } @@ -888,7 +945,7 @@ rxperf_client(int argc, char **argv) errx(1, "no command given to the client"); do_client(host, htons(port), filename, cmd, times, bytes, sendtimes, - recvtimes); + recvtimes, dumpstats, nojumbo, maxmtu); return 0; } diff --git a/src/venus/fs.c b/src/venus/fs.c index 924b254..f51b2b0 100644 --- a/src/venus/fs.c +++ b/src/venus/fs.c @@ -1996,6 +1996,39 @@ CheckVolumesCmd(struct cmd_syndesc *as, void *arock) } static int +PreCacheCmd(struct cmd_syndesc *as, char *arock) +{ + afs_int32 code; + struct ViceIoctl blob; + afs_int32 temp; + + if (!as->parms[0].items && !as->parms[1].items) { + fprintf(stderr, "%s: syntax error in precache cmd.\n", pn); + return 1; + } + if (as->parms[0].items) { + code = util_GetInt32(as->parms[0].items->data, &temp); + if (code) { + fprintf(stderr, "%s: bad integer specified 
for precache size.\n", + pn); + return 1; + } + } else + temp = 0; + blob.in = (char *)&temp; + blob.in_size = sizeof(afs_int32); + blob.out_size = 0; + code = pioctl(0, VIOCPRECACHE, &blob, 1); + if (code) { + Die(errno, NULL); + return 1; + } + + printf("New precache size set.\n"); + return 0; +} + +static int SetCacheSizeCmd(struct cmd_syndesc *as, void *arock) { afs_int32 code; @@ -3654,6 +3687,11 @@ defect 3069 ts = cmd_CreateSyntax("uuid", UuidCmd, NULL, "manage the UUID for the cache manager"); cmd_AddParm(ts, "-generate", CMD_FLAG, CMD_REQUIRED, "generate a new UUID"); + ts = cmd_CreateSyntax("precache", PreCacheCmd, 0, + "set precache size"); + cmd_AddParm(ts, "-blocks", CMD_SINGLE, CMD_OPTIONAL, + "size in 1K byte blocks (0 => disable)"); + code = cmd_Dispatch(argc, argv); if (rxInitDone) rx_Finalize();