/*
 * Copyright 2000, International Business Machines Corporation and others.
 * All Rights Reserved.
 *
 * This software has been released under the terms of the IBM Public
 * License.  For details, see the LICENSE file in the top-level source
 * directory or online at http://www.openafs.org/dl/license10.html
 */
#include <afsconfig.h>
#ifdef KERNEL
#include "afs/param.h"
#else
#include <afs/param.h>
#endif

#ifdef KERNEL
#if defined(UKERNEL)
#include "afs/sysincludes.h"
#include "afsincludes.h"
#include "rx/rx_kcommon.h"
#include "rx/rx_clock.h"
#include "rx/rx_queue.h"
#include "rx/rx_packet.h"
#else /* defined(UKERNEL) */
#ifdef RX_KERNEL_TRACE
#include "../rx/rx_kcommon.h"
#endif
#ifndef AFS_LINUX20_ENV
#include "h/systm.h"
#endif
#if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
#include "afs/sysincludes.h"
#endif
#if defined(AFS_OBSD_ENV)
#include "h/proc.h"
#endif
#if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
#if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
#include "sys/mount.h"		/* it gets pulled in by something later anyway */
#endif
#endif
#include "netinet/in.h"
#include "afs/afs_osi.h"
#include "rx_kmutex.h"
#include "rx/rx_clock.h"
#include "rx/rx_queue.h"
#ifdef AFS_SUN5_ENV
#include <sys/sysmacros.h>
#endif
#include "rx/rx_packet.h"
#endif /* defined(UKERNEL) */
#include "rx/rx_globals.h"
#else /* KERNEL */
#include "sys/types.h"
#if defined(AFS_NT40_ENV) || defined(AFS_DJGPP_ENV)
#ifdef AFS_NT40_ENV
#ifndef EWOULDBLOCK
#define EWOULDBLOCK WSAEWOULDBLOCK
#endif
#else /* AFS_NT40_ENV */
#include <sys/socket.h>
#include <netinet/in.h>
#endif /* AFS_NT40_ENV */
#include "rx_xmit_nt.h"
#else /* AFS_NT40_ENV || AFS_DJGPP_ENV */
#include <sys/socket.h>
#include <netinet/in.h>
#endif
#ifdef AFS_SUN5_ENV
#include <sys/sysmacros.h>
#endif
#include "rx_packet.h"
#include "rx_globals.h"
#endif /* KERNEL */
#ifdef RX_LOCKS_DB
/* rxdb_fileID is used to identify the lock location, along with line#. */
static int rxdb_fileID = RXDB_FILE_RX_PACKET;
#endif /* RX_LOCKS_DB */

struct rx_packet *rx_mallocedP = 0;

extern char cml_version_number[];
extern int (*rx_almostSent) ();

static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
				afs_int32 ahost, short aport,
				afs_int32 istack);
/* some rules about packets:
 * 1.  When a packet is allocated, the final iov_buf contains room for
 * a security trailer, but iov_len masks that fact.  If the security
 * package wants to add the trailer, it may do so, and then extend
 * iov_len appropriately.  For this reason, packet's niovecs and
 * iov_len fields should be accurate before calling PreparePacket.
 */

/* Get a 32 bit integer at a specified offset in a packet.  Assumes that
 * all packet buffers (iov_base) are integral multiples of the word size and
 * that offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
{
    unsigned int i;
    size_t l;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    return
		*((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
				 (offset - l)));
	}
	l += packet->wirevec[i].iov_len;
    }

    return 0;
}
/* Set a 32 bit integer at a specified offset in a packet.  Assumes that
 * all packet buffers (iov_base) are integral multiples of the word size and
 * that offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
{
    unsigned int i;
    size_t l;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
			     (offset - l))) = data;
	    return 0;
	}
	l += packet->wirevec[i].iov_len;
    }

    return 0;
}
/* Copy data bytes from a packet into a flat buffer.  Assumes that all
 * packet buffers (iov_base) are integral multiples of the word size, that
 * offset is an integral multiple of the word size, and that all buffers are
 * contiguously arrayed in the iovec from 0..niovecs-1.  Returns the number
 * of bytes actually copied. */
afs_int32
rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
		  char *out)
{
    unsigned int i, j, l, r;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    break;
	}
	l += packet->wirevec[i].iov_len;
    }

    /* i is the iovec which contains the first little bit of data in which we
     * are interested.  l is the total length of everything prior to this iovec.
     * j is the number of bytes we can safely copy out of this iovec.
     * offset only applies to the first iovec.
     */
    r = resid;
    while ((resid > 0) && (i < packet->niovecs)) {
	j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
	memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
	resid -= j;
	out += j;
	l += packet->wirevec[i].iov_len;
	offset = l;
	i++;
    }

    return (resid ? (r - resid) : r);
}
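
/* Illustrative sketch (not part of the original file): pulling the first
 * word of user data out of a packet via the slow path.  The helper name is
 * hypothetical; offsets count from the start of the user data area, so the
 * walk above starts at wirevec[1]. */
#if 0
static afs_int32
example_read_first_word(struct rx_packet *p)
{
    afs_int32 word;

    /* copy 4 bytes starting at offset 0 of the user data */
    if (rx_SlowReadPacket(p, 0, sizeof(word), (char *)&word)
	!= sizeof(word))
	return -1;		/* packet shorter than a word */
    return ntohl(word);		/* assuming the payload is network order */
}
#endif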
/* Copy data bytes from a flat buffer into a packet.  Assumes that all
 * packet buffers (iov_base) are integral multiples of the word size and that
 * offset is an integral multiple of the word size.  Allocates continuation
 * buffers as needed.  Returns the number of bytes actually copied. */
afs_int32
rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
{
    unsigned int i, j, l, r;
    char *b;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    break;
	}
	l += packet->wirevec[i].iov_len;
    }

    /* i is the iovec which contains the first little bit of data in which we
     * are interested.  l is the total length of everything prior to this iovec.
     * j is the number of bytes we can safely copy out of this iovec.
     * offset only applies to the first iovec.
     */
    r = resid;
    while ((resid > 0) && (i < RX_MAXWVECS)) {
	if (i >= packet->niovecs)
	    if (rxi_AllocDataBuf(packet, resid, RX_PACKET_CLASS_SEND_CBUF) > 0)	/* ++niovecs as a side-effect */
		break;

	b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
	j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
	memcpy(b, in, j);
	resid -= j;
	in += j;
	l += packet->wirevec[i].iov_len;
	offset = l;
	i++;
    }

    return (resid ? (r - resid) : r);
}
static struct rx_packet *
allocCBuf(int class)
{
    struct rx_packet *c;

    MUTEX_ENTER(&rx_freePktQ_lock);

#ifdef KERNEL
    if (rxi_OverQuota(class)) {
	c = NULL;
	rxi_NeedMorePackets = TRUE;
	MUTEX_ENTER(&rx_stats_mutex);
	switch (class) {
	case RX_PACKET_CLASS_RECEIVE:
	    rx_stats.receivePktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND:
	    rx_stats.sendPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SPECIAL:
	    rx_stats.specialPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_RECV_CBUF:
	    rx_stats.receiveCbufPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND_CBUF:
	    rx_stats.sendCbufPktAllocFailures++;
	    break;
	}
	MUTEX_EXIT(&rx_stats_mutex);
	goto done;
    }

    if (queue_IsEmpty(&rx_freePacketQueue)) {
	c = NULL;
	rxi_NeedMorePackets = TRUE;
	goto done;
    }
#else /* KERNEL */
    if (queue_IsEmpty(&rx_freePacketQueue)) {
	rxi_MorePacketsNoLock(rx_initSendWindow);
    }
#endif /* KERNEL */

    rx_nFreePackets--;
    c = queue_First(&rx_freePacketQueue, rx_packet);
    queue_Remove(c);
    if (!(c->flags & RX_PKTFLAG_FREE))
	osi_Panic("rxi_AllocPacket: packet not free\n");
    c->flags = 0;		/* clear RX_PKTFLAG_FREE, initialize the rest */

#ifdef KERNEL
  done:
#endif
    MUTEX_EXIT(&rx_freePktQ_lock);
    return c;
}
/*
 * Free a packet currently used as a continuation buffer
 */
void
rxi_freeCBuf(struct rx_packet *c)
{
    MUTEX_ENTER(&rx_freePktQ_lock);

    rxi_FreePacketNoLock(c);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
}
/* this one is kind of awful.
 * In rxkad, the packet has been all shortened, and everything, ready for
 * sending.  All of a sudden, we discover we need some of that space back.
 * This isn't terribly general, because it knows that the packets are only
 * rounded up to the EBS (userdata + security header).
 */
int
rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
{
    int i;

    i = p->niovecs - 1;
    if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
	if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
	    p->wirevec[i].iov_len += nb;
	    return 0;
	}
    } else {
	if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
	    p->wirevec[i].iov_len += nb;
	    return 0;
	}
    }
    return 0;
}
/* get sufficient space to store nb bytes of data (or more), and hook
 * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 * returns the number of bytes >0 which it failed to come up with.
 * Don't need to worry about locking on packet, since only
 * one thread can manipulate one at a time.  Locking on continuation
 * packets is handled by allocCBuf */
/* MTUXXX don't need to go through the for loop if we can trust niovecs */
int
rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
{
    int i;

    for (i = p->niovecs; nb > 0 && i < RX_MAXWVECS; i++) {
	register struct rx_packet *cb;
	if ((cb = allocCBuf(class))) {
	    p->wirevec[i].iov_base = (caddr_t) cb->localdata;
	    p->wirevec[i].iov_len = RX_CBUFFERSIZE;
	    nb -= RX_CBUFFERSIZE;
	    p->length += RX_CBUFFERSIZE;
	    p->niovecs++;
	} else
	    break;
    }

    return nb;
}
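
/* Illustrative sketch (assumption, not in the original file): making sure a
 * packet has room for "want" bytes of user data before a bulk
 * rx_packetwrite().  The helper name is hypothetical. */
#if 0
static int
example_ensure_room(struct rx_packet *p, int want)
{
    /* rxi_AllocDataBuf returns the shortfall (>0) if it ran out of
     * continuation buffers, and <=0 on success */
    if (want > p->length
	&& rxi_AllocDataBuf(p, want - p->length,
			    RX_PACKET_CLASS_SEND_CBUF) > 0)
	return -1;		/* pool exhausted */
    return 0;
}
#endif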
/* Add more packet buffers */
void
rxi_MorePackets(int apackets)
{
    struct rx_packet *p, *e;
    int getme;

    getme = apackets * sizeof(struct rx_packet);
    p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);

    PIN(p, getme);		/* XXXXX */
    memset((char *)p, 0, getme);

    MUTEX_ENTER(&rx_freePktQ_lock);

    for (e = p + apackets; p < e; p++) {
	p->wirevec[0].iov_base = (char *)(p->wirehead);
	p->wirevec[0].iov_len = RX_HEADER_SIZE;
	p->wirevec[1].iov_base = (char *)(p->localdata);
	p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
	p->flags |= RX_PKTFLAG_FREE;
	p->niovecs = 2;

	queue_Append(&rx_freePacketQueue, p);
    }
    rx_nFreePackets += apackets;
    rxi_NeedMorePackets = FALSE;
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
}
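
/* Illustrative usage (assumption, not in the original file): rx
 * initialization primes the free pool along these lines, with its own
 * sizing policy; rx_nPackets is the pool target from rx_globals.h. */
#if 0
rxi_MorePackets(rx_nPackets);	/* prime the pool before any calls start */
#endif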
/* Add more packet buffers */
void
rxi_MorePacketsNoLock(int apackets)
{
    struct rx_packet *p, *e;
    int getme;

    /* allocate enough packets that 1/4 of the packets will be able
     * to hold maximal amounts of data */
    apackets += (apackets / 4)
	* ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
    getme = apackets * sizeof(struct rx_packet);
    p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);

    memset((char *)p, 0, getme);

    for (e = p + apackets; p < e; p++) {
	p->wirevec[0].iov_base = (char *)(p->wirehead);
	p->wirevec[0].iov_len = RX_HEADER_SIZE;
	p->wirevec[1].iov_base = (char *)(p->localdata);
	p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
	p->flags |= RX_PKTFLAG_FREE;
	p->niovecs = 2;

	queue_Append(&rx_freePacketQueue, p);
    }
    rx_nFreePackets += apackets;
    rxi_NeedMorePackets = FALSE;
}
void
rxi_FreeAllPackets(void)
{
    /* must be called at proper interrupt level, etcetera */
    /* MTUXXX need to free all Packets */
    osi_Free(rx_mallocedP,
	     (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
    UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
}

/* Allocate more packets iff we need more continuation buffers */
/* In kernel, can't page in memory with interrupts disabled, so we
 * don't use the event mechanism. */
void
rx_CheckPackets(void)
{
    if (rxi_NeedMorePackets) {
	rxi_MorePackets(rx_initSendWindow);
    }
}
/* In the packet freeing routine below, the assumption is that
   we want all of the packets to be used equally frequently, so that we
   don't get packet buffers paging out.  It would be just as valid to
   assume that we DO want them to page out if not many are being used.
   In any event, we assume the former, and append the packets to the end
   of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page
   out, they must be stored so that packets which are adjacent in memory
   are adjacent in the free list.  An array springs rapidly to mind.
   */

/* Actually free the packet p. */
void
rxi_FreePacketNoLock(struct rx_packet *p)
{
    dpf(("Free %lx\n", (unsigned long)p));

    if (p->flags & RX_PKTFLAG_FREE)
	osi_Panic("rxi_FreePacketNoLock: packet already free\n");
    rx_nFreePackets++;
    p->flags |= RX_PKTFLAG_FREE;
    queue_Append(&rx_freePacketQueue, p);
}
int
rxi_FreeDataBufsNoLock(struct rx_packet *p, int first)
{
    struct iovec *iov, *end;

    if (first != 1)		/* MTUXXX */
	osi_Panic("FreeDataBufs 1: first must be 1");
    iov = &p->wirevec[1];
    end = iov + (p->niovecs - 1);
    if (iov->iov_base != (caddr_t) p->localdata)	/* MTUXXX */
	osi_Panic("FreeDataBufs 2: vec 1 must be localdata");
    for (iov++; iov < end; iov++) {
	if (!iov->iov_base)
	    osi_Panic("FreeDataBufs 3: vecs 2-niovecs must not be NULL");
	rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
    }
    p->length = 0;
    p->niovecs = 0;

    return 0;
}
int rxi_nBadIovecs = 0;

/* rxi_RestoreDataBufs
 *
 * Restore the correct sizes to the iovecs.  Called when reusing a packet
 * for reading off the wire.
 */
void
rxi_RestoreDataBufs(struct rx_packet *p)
{
    int i;
    struct iovec *iov = &p->wirevec[2];

    p->wirevec[0].iov_base = (char *)(p->wirehead);
    p->wirevec[0].iov_len = RX_HEADER_SIZE;
    p->wirevec[1].iov_base = (char *)(p->localdata);
    p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;

    for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
	if (!iov->iov_base) {
	    rxi_nBadIovecs++;
	    p->niovecs = i;
	    break;
	}
	iov->iov_len = RX_CBUFFERSIZE;
    }
}
int
rxi_TrimDataBufs(struct rx_packet *p, int first)
{
    int length;
    struct iovec *iov, *end;

    if (first != 1)
	osi_Panic("TrimDataBufs 1: first must be 1");

    /* Skip over continuation buffers containing message data */
    iov = &p->wirevec[2];
    end = iov + (p->niovecs - 2);
    length = p->length - p->wirevec[1].iov_len;
    for (; iov < end && length > 0; iov++) {
	if (!iov->iov_base)
	    osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
	length -= iov->iov_len;
    }

    /* iov now points to the first empty data buffer. */
    if (iov < end) {
	MUTEX_ENTER(&rx_freePktQ_lock);

	for (; iov < end; iov++) {
	    if (!iov->iov_base)
		osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
	    rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
	    p->niovecs--;
	}
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
    }

    return 0;
}
/* Free the packet p.  P is assumed not to be on any queue, i.e.
 * remove it yourself first if you call this routine. */
void
rxi_FreePacket(struct rx_packet *p)
{
    MUTEX_ENTER(&rx_freePktQ_lock);

    rxi_FreeDataBufsNoLock(p, 1);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
}
/* rxi_AllocPacket sets up p->length so it reflects the number of
 * bytes in the packet at this point, **not including** the header.
 * The header is absolutely necessary, besides, this is the way the
 * length field is usually used */
struct rx_packet *
rxi_AllocPacketNoLock(int class)
{
    register struct rx_packet *p;

#ifdef KERNEL
    if (rxi_OverQuota(class)) {
	rxi_NeedMorePackets = TRUE;
	MUTEX_ENTER(&rx_stats_mutex);
	switch (class) {
	case RX_PACKET_CLASS_RECEIVE:
	    rx_stats.receivePktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND:
	    rx_stats.sendPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SPECIAL:
	    rx_stats.specialPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_RECV_CBUF:
	    rx_stats.receiveCbufPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND_CBUF:
	    rx_stats.sendCbufPktAllocFailures++;
	    break;
	}
	MUTEX_EXIT(&rx_stats_mutex);
	return (struct rx_packet *)0;
    }
#endif /* KERNEL */

    MUTEX_ENTER(&rx_stats_mutex);
    rx_stats.packetRequests++;
    MUTEX_EXIT(&rx_stats_mutex);

#ifdef KERNEL
    if (queue_IsEmpty(&rx_freePacketQueue))
	osi_Panic("rxi_AllocPacket error");
#else /* KERNEL */
    if (queue_IsEmpty(&rx_freePacketQueue))
	rxi_MorePacketsNoLock(rx_initSendWindow);
#endif /* KERNEL */

    rx_nFreePackets--;
    p = queue_First(&rx_freePacketQueue, rx_packet);
    if (!(p->flags & RX_PKTFLAG_FREE))
	osi_Panic("rxi_AllocPacket: packet not free\n");

    dpf(("Alloc %lx, class %d\n", (unsigned long)p, class));

    queue_Remove(p);
    p->flags = 0;		/* clear RX_PKTFLAG_FREE, initialize the rest */

    /* have to do this here because rx_FlushWrite fiddles with the iovs in
     * order to truncate outbound packets.  In the near future, may need
     * to allocate bufs from a static pool here, and/or in AllocSendPacket
     */
    p->wirevec[0].iov_base = (char *)(p->wirehead);
    p->wirevec[0].iov_len = RX_HEADER_SIZE;
    p->wirevec[1].iov_base = (char *)(p->localdata);
    p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
    p->niovecs = 2;
    p->length = RX_FIRSTBUFFERSIZE;
    return p;
}
struct rx_packet *
rxi_AllocPacket(int class)
{
    register struct rx_packet *p;

    MUTEX_ENTER(&rx_freePktQ_lock);
    p = rxi_AllocPacketNoLock(class);
    MUTEX_EXIT(&rx_freePktQ_lock);
    return p;
}
/* This guy comes up with as many buffers as it {takes,can get} given
 * the MTU for this call.  It also sets the packet length before
 * returning.  caution: this is often called at NETPRI
 * Called with call locked.
 */
struct rx_packet *
rxi_AllocSendPacket(register struct rx_call *call, int want)
{
    register struct rx_packet *p = (struct rx_packet *)0;
    register int mud;
    register unsigned delta;

    mud = call->MTU - RX_HEADER_SIZE;
    delta =
	rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
	rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));

    while (!(call->error)) {
	MUTEX_ENTER(&rx_freePktQ_lock);
	/* if an error occurred, or we get the packet we want, we're done */
	if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
	    MUTEX_EXIT(&rx_freePktQ_lock);

	    want += delta;
	    want = MIN(want, mud);

	    if ((unsigned)want > p->length)
		(void)rxi_AllocDataBuf(p, (want - p->length),
				       RX_PACKET_CLASS_SEND_CBUF);

	    if ((unsigned)p->length > mud)
		p->length = mud;

	    if (delta >= p->length) {
		rxi_FreePacket(p);
		p = NULL;
	    } else {
		p->length -= delta;
	    }
	    break;
	}

	/* no error occurred, and we didn't get a packet, so we sleep.
	 * At this point, we assume that packets will be returned
	 * sooner or later, as packets are acknowledged, and so we
	 * just wait.  */
	call->flags |= RX_CALL_WAIT_PACKETS;
	CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
	MUTEX_EXIT(&call->lock);
	rx_waitingForPackets = 1;

#ifdef RX_ENABLE_LOCKS
	CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
#else
	osi_rxSleep(&rx_waitingForPackets);
#endif
	MUTEX_EXIT(&rx_freePktQ_lock);
	MUTEX_ENTER(&call->lock);
	CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
	call->flags &= ~RX_CALL_WAIT_PACKETS;
    }

    return p;
}
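
/* Illustrative arithmetic (assumption, not in the original file): with a
 * call MTU of 1444 (RX_HEADER_SIZE is 28) and example security overheads of
 * an 8-byte header plus a 4-byte max trailer, a caller asking for 1000
 * bytes sees the following. */
#if 0
int mud = 1444 - 28;		/* 1416 bytes of room after the RX header */
int delta = 8 + 4;		/* security header + max trailer (example) */
int want = MIN(1000 + delta, mud);	/* = 1012, still under mud */
/* rxi_AllocDataBuf grows p->length up to want; the caller then gets
 * p->length = want - delta = 1000 usable bytes back */
#endif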
#ifndef KERNEL
/* count the number of used FDs */
static int
CountFDs(register int amax)
{
    struct stat tstat;
    register int i, code;
    register int count;

    count = 0;
    for (i = 0; i < amax; i++) {
	code = fstat(i, &tstat);
	if (code == 0)
	    count++;
    }
    return count;
}
#else /* KERNEL */
#define CountFDs(amax) amax
#endif /* KERNEL */
#if !defined(KERNEL) || defined(UKERNEL)

/* This function reads a single packet from the interface into the
 * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
 * (host,port) of the sender are stored in the supplied variables, and
 * the data length of the packet is stored in the packet structure.
 * The header is decoded. */
int
rxi_ReadPacket(int socket, register struct rx_packet *p, afs_uint32 * host,
	       u_short * port)
{
    struct sockaddr_in from;
    struct msghdr msg;
    int nbytes;
    afs_int32 rlen;
    register afs_int32 tlen, savelen;

    rx_computelen(p, tlen);
    rx_SetDataSize(p, tlen);	/* this is the size of the user data area */

    tlen += RX_HEADER_SIZE;	/* now this is the size of the entire packet */
    rlen = rx_maxJumboRecvSize;	/* this is what I am advertising.  Only check
				 * it once in order to avoid races.  */
    tlen = rlen - tlen;
    if (tlen > 0) {
	tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
	if (tlen > 0) {
	    tlen = rlen - tlen;
	} else
	    tlen = rlen;
    } else
	tlen = rlen;

    /* Extend the last iovec for padding, it's just to make sure that the
     * read doesn't return more data than we expect, and is done to get around
     * our problems caused by the lack of a length field in the rx header.
     * Use the extra buffer that follows the localdata in each packet
     * structure. */
    savelen = p->wirevec[p->niovecs - 1].iov_len;
    p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;

    memset((char *)&msg, 0, sizeof(msg));
    msg.msg_name = (char *)&from;
    msg.msg_namelen = sizeof(struct sockaddr_in);
    msg.msg_iov = p->wirevec;
    msg.msg_iovlen = p->niovecs;
    nbytes = rxi_Recvmsg(socket, &msg, 0);

    /* restore the vec to its correct state */
    p->wirevec[p->niovecs - 1].iov_len = savelen;

    p->length = (nbytes - RX_HEADER_SIZE);
    if ((nbytes > tlen) || (p->length & 0x8000)) {	/* Bogus packet */
	if (nbytes > 0)
	    rxi_MorePackets(rx_initSendWindow);
	else if (nbytes < 0 && errno == EWOULDBLOCK) {
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.noPacketOnRead++;
	    MUTEX_EXIT(&rx_stats_mutex);
	} else {
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.bogusPacketOnRead++;
	    rx_stats.bogusHost = from.sin_addr.s_addr;
	    MUTEX_EXIT(&rx_stats_mutex);
	    dpf(("B: bogus packet from [%x,%d] nb=%d", from.sin_addr.s_addr,
		 from.sin_port, nbytes));
	}
	return 0;
    } else {
	/* Extract packet header. */
	rxi_DecodePacketHeader(p);

	*host = from.sin_addr.s_addr;
	*port = from.sin_port;
	if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
	    struct rx_peer *peer;
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.packetsRead[p->header.type - 1]++;
	    MUTEX_EXIT(&rx_stats_mutex);
	    /*
	     * Try to look up this peer structure.  If it doesn't exist,
	     * don't create a new one -
	     * we don't keep count of the bytes sent/received if a peer
	     * structure doesn't already exist.
	     *
	     * The peer/connection cleanup code assumes that there is 1 peer
	     * per connection.  If we actually created a peer structure here
	     * and this packet was an rxdebug packet, the peer structure would
	     * never be cleaned up.
	     */
	    peer = rxi_FindPeer(*host, *port, 0, 0);
	    /* Since this may not be associated with a connection,
	     * it may have no refCount, meaning we could race with
	     * ReapConnections
	     */
	    if (peer && (peer->refCount > 0)) {
		MUTEX_ENTER(&peer->peer_lock);
		hadd32(peer->bytesReceived, p->length);
		MUTEX_EXIT(&peer->peer_lock);
	    }
	}

	/* Free any empty packet buffers at the end of this packet */
	rxi_TrimDataBufs(p, 1);

	return 1;
    }
}
#endif /* !KERNEL || UKERNEL */
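
/* Illustrative sketch (assumption, not in the original file): the shape of
 * a userland receive loop built on rxi_ReadPacket.  rxi_ReceivePacket is
 * the dispatcher in rx.c; the helper name below is hypothetical and the
 * loop is heavily simplified. */
#if 0
static void
example_listen_once(osi_socket sock)
{
    afs_uint32 host;
    u_short port;
    struct rx_packet *p = rxi_AllocPacket(RX_PACKET_CLASS_RECEIVE);

    if (p && rxi_ReadPacket(sock, p, &host, &port)) {
	/* header already decoded; hand the packet to the dispatcher,
	 * which may consume it and return a replacement buffer */
	p = rxi_ReceivePacket(p, sock, host, port, NULL, NULL);
    }
    if (p)
	rxi_FreePacket(p);
}
#endif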
/* This function splits off the first packet in a jumbo packet.
 * As of AFS 3.5, jumbograms contain more than one fixed size
 * packet, and the RX_JUMBO_PACKET flag is set in all but the
 * last packet header.  All packets (except the last) are padded to
 * fall on RX_CBUFFERSIZE boundaries.
 * HACK: We store the length of the first n-1 packets in the
 * last two pad bytes.  */

struct rx_packet *
rxi_SplitJumboPacket(register struct rx_packet *p, afs_int32 host, short port,
		     int first)
{
    struct rx_packet *np;
    struct rx_jumboHeader *jp;
    int niov, i;
    struct iovec *iov;
    int length;
    afs_uint32 temp;

    /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
     * bytes in length.  All but the first packet are preceded by
     * an abbreviated four byte header.  The length of the last packet
     * is calculated from the size of the jumbogram. */
    length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;

    if ((int)p->length < length) {
	dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
	return NULL;
    }
    niov = p->niovecs - 2;
    if (niov < 1) {
	dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
	return NULL;
    }
    iov = &p->wirevec[2];
    np = RX_CBUF_TO_PACKET(iov->iov_base, p);

    /* Get a pointer to the abbreviated packet header */
    jp = (struct rx_jumboHeader *)
	((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);

    /* Set up the iovecs for the next packet */
    np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
    np->wirevec[0].iov_len = sizeof(struct rx_header);
    np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
    np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
    np->niovecs = niov + 1;
    for (i = 2, iov++; i <= niov; i++, iov++) {
	np->wirevec[i] = *iov;
    }
    np->length = p->length - length;
    p->length = RX_JUMBOBUFFERSIZE;
    p->niovecs = 2;

    /* Convert the jumbo packet header to host byte order */
    temp = ntohl(*(afs_uint32 *) jp);
    jp->flags = (u_char) (temp >> 24);
    jp->cksum = (u_short) (temp);

    /* Fill in the packet header */
    np->header = p->header;
    np->header.serial = p->header.serial + 1;
    np->header.seq = p->header.seq + 1;
    np->header.flags = jp->flags;
    np->header.spare = jp->cksum;

    return np;
}
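
/* Illustrative arithmetic (derived from the function above, not in the
 * original file): on the wire a jumbogram looks like
 *
 *   RX header | RX_JUMBOBUFFERSIZE bytes | 4-byte jumbo header
 *             | RX_JUMBOBUFFERSIZE bytes | 4-byte jumbo header
 *             | remaining bytes (last packet, no jumbo header)
 *
 * so each split peels off one buffer plus one jumbo header and leaves the
 * remainder for the next packet. */
#if 0
int step = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;	/* bytes per split */
int np_length = p->length - step;	/* what the next packet inherits */
#endif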
/* Send a udp datagram */
int
osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
	    int length, int istack)
{
    struct msghdr msg;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = dvec;
    msg.msg_iovlen = nvecs;
    msg.msg_name = addr;
    msg.msg_namelen = sizeof(struct sockaddr_in);

    rxi_Sendmsg(socket, &msg, 0);

    return 0;
}
#elif !defined(UKERNEL)
/*
 * message receipt is done in rxk_input or rx_put.
 */

#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
/*
 * Copy an mblock to the contiguous area pointed to by cp.
 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
 * but it doesn't really.
 * Returns the number of bytes not transferred.
 * The message is NOT changed.
 */
static int
cpytoc(mblk_t * mp, register int off, register int len, register char *cp)
{
    register int n;

    for (; mp && len > 0; mp = mp->b_cont) {
	if (mp->b_datap->db_type != M_DATA) {
	    return -1;
	}
	n = MIN(len, (mp->b_wptr - mp->b_rptr));
	memcpy(cp, (char *)mp->b_rptr, n);
	cp += n;
	len -= n;
    }
    return (len);
}

/* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
 * but it doesn't really.
 * This sucks, anyway, do it like m_cpy.... below
 */
static int
cpytoiovec(mblk_t * mp, int off, int len, register struct iovec *iovs,
	   int niovs)
{
    register int m, n, o, t, i;

    for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
	if (mp->b_datap->db_type != M_DATA) {
	    return -1;
	}
	n = MIN(len, (mp->b_wptr - mp->b_rptr));
	len -= n;
	while (n) {
	    if (!t) {
		o = 0;
		i++;
		t = iovs[i].iov_len;
	    }
	    m = MIN(n, t);
	    memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
	    mp->b_rptr += m;
	    o += m;
	    t -= m;
	    n -= m;
	}
    }
    return (len);
}

#define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
#define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
#else
#if !defined(AFS_LINUX20_ENV)
static int
m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
{
    caddr_t p1, p2;
    unsigned int l1, l2, i, t;

    if (m == NULL || off < 0 || len < 0 || iovs == NULL)
	osi_Panic("m_cpytoiovec");	/* MTUXXX probably don't need this check */

    while (off && m)
	if (m->m_len <= off) {
	    off -= m->m_len;
	    m = m->m_next;
	    continue;
	} else
	    break;

    if (m == NULL)
	return len;

    p1 = mtod(m, caddr_t) + off;
    l1 = m->m_len - off;
    i = 0;
    p2 = iovs[0].iov_base;
    l2 = iovs[0].iov_len;

    while (len) {
	t = MIN(l1, MIN(l2, (unsigned int)len));
	memcpy(p2, p1, t);
	p1 += t;
	p2 += t;
	l1 -= t;
	l2 -= t;
	len -= t;
	if (!l1) {
	    m = m->m_next;
	    if (!m)
		break;
	    p1 = mtod(m, caddr_t);
	    l1 = m->m_len;
	}
	if (!l2) {
	    if (++i >= niovs)
		break;
	    p2 = iovs[i].iov_base;
	    l2 = iovs[i].iov_len;
	}
    }

    return len;
}
#endif /* AFS_LINUX20_ENV */
#endif /* AFS_SUN5_ENV */

#if !defined(AFS_LINUX20_ENV)
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
{
    int code;

    code =
	m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
		     phandle->niovecs);
    (*free) (amb);

    return code;
}
#endif /* AFS_LINUX20_ENV */

#endif /*KERNEL && !UKERNEL */
/* send a response to a debug packet */

struct rx_packet *
rxi_ReceiveDebugPacket(register struct rx_packet *ap, osi_socket asocket,
		       afs_int32 ahost, short aport, int istack)
{
    struct rx_debugIn tin;
    afs_int32 tl;
    struct rx_serverQueueEntry *np, *nqe;

    /*
     * Only respond to client-initiated Rx debug packets,
     * and clear the client flag in the response.
     */
    if (ap->header.flags & RX_CLIENT_INITIATED) {
	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
	rxi_EncodePacketHeader(ap);
    } else {
	return ap;
    }

    rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
    /* all done with packet, now set length to the truth, so we can
     * reuse this packet */
    rx_computelen(ap, ap->length);

    tin.type = ntohl(tin.type);
    tin.index = ntohl(tin.index);
    switch (tin.type) {
    case RX_DEBUGI_GETSTATS:{
	    struct rx_debugStats tstat;

	    /* get basic stats */
	    memset((char *)&tstat, 0, sizeof(tstat));	/* make sure spares are zero */
	    tstat.version = RX_DEBUGI_VERSION;
#ifndef RX_ENABLE_LOCKS
	    tstat.waitingForPackets = rx_waitingForPackets;
#endif
	    MUTEX_ENTER(&rx_serverPool_lock);
	    tstat.nFreePackets = htonl(rx_nFreePackets);
	    tstat.callsExecuted = htonl(rxi_nCalls);
	    tstat.packetReclaims = htonl(rx_packetReclaims);
	    tstat.usedFDs = CountFDs(64);
	    tstat.nWaiting = htonl(rx_nWaiting);
	    tstat.nWaited = htonl(rx_nWaited);
	    queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
			tstat.idleThreads);
	    MUTEX_EXIT(&rx_serverPool_lock);
	    tstat.idleThreads = htonl(tstat.idleThreads);
	    tl = sizeof(struct rx_debugStats) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);

	    if (tl <= 0) {
		rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
			       (char *)&tstat);
		ap->length = sizeof(struct rx_debugStats);
		rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
		rx_computelen(ap, ap->length);
	    }
	    break;
	}
    case RX_DEBUGI_GETALLCONN:
    case RX_DEBUGI_GETCONN:{
	    int i, j;
	    register struct rx_connection *tc;
	    struct rx_call *tcall;
	    struct rx_debugConn tconn;
	    int all = (tin.type == RX_DEBUGI_GETALLCONN);

	    tl = sizeof(struct rx_debugConn) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    memset((char *)&tconn, 0, sizeof(tconn));	/* make sure spares are zero */
	    /* get N'th (maybe) "interesting" connection info */
	    for (i = 0; i < rx_hashTableSize; i++) {
#if !defined(KERNEL)
		/* the time complexity of the algorithm used here
		 * exponentially increases with the number of connections.
		 */
#ifdef AFS_PTHREAD_ENV
		pthread_yield();
#else
		(void)IOMGR_Poll();
#endif
#endif
		MUTEX_ENTER(&rx_connHashTable_lock);
		/* We might be slightly out of step since we are not
		 * locking each call, but this is only debugging output.
		 */
		for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
		    if ((all || rxi_IsConnInteresting(tc))
			&& tin.index-- <= 0) {
			tconn.host = tc->peer->host;
			tconn.port = tc->peer->port;
			tconn.cid = htonl(tc->cid);
			tconn.epoch = htonl(tc->epoch);
			tconn.serial = htonl(tc->serial);
			for (j = 0; j < RX_MAXCALLS; j++) {
			    tconn.callNumber[j] = htonl(tc->callNumber[j]);
			    if ((tcall = tc->call[j])) {
				tconn.callState[j] = tcall->state;
				tconn.callMode[j] = tcall->mode;
				tconn.callFlags[j] = tcall->flags;
				if (queue_IsNotEmpty(&tcall->rq))
				    tconn.callOther[j] |= RX_OTHER_IN;
				if (queue_IsNotEmpty(&tcall->tq))
				    tconn.callOther[j] |= RX_OTHER_OUT;
			    } else
				tconn.callState[j] = RX_STATE_NOTINIT;
			}

			tconn.natMTU = htonl(tc->peer->natMTU);
			tconn.error = htonl(tc->error);
			tconn.flags = tc->flags;
			tconn.type = tc->type;
			tconn.securityIndex = tc->securityIndex;
			if (tc->securityObject) {
			    RXS_GetStats(tc->securityObject, tc,
					 &tconn.secStats);
#define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
#define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
			    DOHTONL(flags);
			    DOHTONL(expires);
			    DOHTONL(packetsReceived);
			    DOHTONL(packetsSent);
			    DOHTONL(bytesReceived);
			    DOHTONL(bytesSent);
			    for (i = 0;
				 i <
				 sizeof(tconn.secStats.spares) /
				 sizeof(short); i++)
				DOHTONS(spares[i]);
			    for (i = 0;
				 i <
				 sizeof(tconn.secStats.sparel) /
				 sizeof(afs_int32); i++)
				DOHTONL(sparel[i]);
			}

			MUTEX_EXIT(&rx_connHashTable_lock);
			rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
				       (char *)&tconn);
			tl = ap->length;
			ap->length = sizeof(struct rx_debugConn);
			rxi_SendDebugPacket(ap, asocket, ahost, aport,
					    istack);
			ap->length = tl;
			return ap;
		    }
		}
		MUTEX_EXIT(&rx_connHashTable_lock);
	    }
	    /* if we make it here, there are no interesting packets */
	    tconn.cid = htonl(0xffffffff);	/* means end */
	    rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
			   (char *)&tconn);
	    tl = ap->length;
	    ap->length = sizeof(struct rx_debugConn);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    ap->length = tl;
	    break;
	}
	/*
	 * Pass back all the peer structures we have available
	 */

    case RX_DEBUGI_GETPEER:{
	    int i;
	    register struct rx_peer *tp;
	    struct rx_debugPeer tpeer;

	    tl = sizeof(struct rx_debugPeer) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    memset((char *)&tpeer, 0, sizeof(tpeer));
	    for (i = 0; i < rx_hashTableSize; i++) {
#if !defined(KERNEL)
		/* the time complexity of the algorithm used here
		 * exponentially increases with the number of peers.
		 *
		 * Yielding after processing each hash table entry
		 * and dropping rx_peerHashTable_lock also increases
		 * the risk that we will miss a new entry - but we are
		 * willing to live with this limitation since this is
		 * meant for debugging only
		 */
#ifdef AFS_PTHREAD_ENV
		pthread_yield();
#else
		(void)IOMGR_Poll();
#endif
#endif
		MUTEX_ENTER(&rx_peerHashTable_lock);
		for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
		    if (tin.index-- <= 0) {
			tpeer.host = tp->host;
			tpeer.port = tp->port;
			tpeer.ifMTU = htons(tp->ifMTU);
			tpeer.idleWhen = htonl(tp->idleWhen);
			tpeer.refCount = htons(tp->refCount);
			tpeer.burstSize = tp->burstSize;
			tpeer.burst = tp->burst;
			tpeer.burstWait.sec = htonl(tp->burstWait.sec);
			tpeer.burstWait.usec = htonl(tp->burstWait.usec);
			tpeer.rtt = htonl(tp->rtt);
			tpeer.rtt_dev = htonl(tp->rtt_dev);
			tpeer.timeout.sec = htonl(tp->timeout.sec);
			tpeer.timeout.usec = htonl(tp->timeout.usec);
			tpeer.nSent = htonl(tp->nSent);
			tpeer.reSends = htonl(tp->reSends);
			tpeer.inPacketSkew = htonl(tp->inPacketSkew);
			tpeer.outPacketSkew = htonl(tp->outPacketSkew);
			tpeer.rateFlag = htonl(tp->rateFlag);
			tpeer.natMTU = htons(tp->natMTU);
			tpeer.maxMTU = htons(tp->maxMTU);
			tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
			tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
			tpeer.MTU = htons(tp->MTU);
			tpeer.cwind = htons(tp->cwind);
			tpeer.nDgramPackets = htons(tp->nDgramPackets);
			tpeer.congestSeq = htons(tp->congestSeq);
			tpeer.bytesSent.high = htonl(tp->bytesSent.high);
			tpeer.bytesSent.low = htonl(tp->bytesSent.low);
			tpeer.bytesReceived.high =
			    htonl(tp->bytesReceived.high);
			tpeer.bytesReceived.low =
			    htonl(tp->bytesReceived.low);

			MUTEX_EXIT(&rx_peerHashTable_lock);
			rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
				       (char *)&tpeer);
			tl = ap->length;
			ap->length = sizeof(struct rx_debugPeer);
			rxi_SendDebugPacket(ap, asocket, ahost, aport,
					    istack);
			ap->length = tl;
			return ap;
		    }
		}
		MUTEX_EXIT(&rx_peerHashTable_lock);
	    }
	    /* if we make it here, there are no interesting packets */
	    tpeer.host = htonl(0xffffffff);	/* means end */
	    rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
			   (char *)&tpeer);
	    tl = ap->length;
	    ap->length = sizeof(struct rx_debugPeer);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    ap->length = tl;
	    break;
	}
    case RX_DEBUGI_RXSTATS:{
	    int i;
	    afs_int32 *s;

	    tl = sizeof(rx_stats) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    /* Since it's all int32s convert to network order with a loop. */
	    MUTEX_ENTER(&rx_stats_mutex);
	    s = (afs_int32 *) & rx_stats;
	    for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
		rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));

	    tl = ap->length;
	    ap->length = sizeof(rx_stats);
	    MUTEX_EXIT(&rx_stats_mutex);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    ap->length = tl;
	    break;
	}

    default:
	/* error response packet */
	tin.type = htonl(RX_DEBUGI_BADTYPE);
	tin.index = tin.type;
	rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
	tl = ap->length;
	ap->length = sizeof(struct rx_debugIn);
	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	ap->length = tl;
	break;
    }
    return ap;
}
struct rx_packet *
rxi_ReceiveVersionPacket(register struct rx_packet *ap, osi_socket asocket,
			 afs_int32 ahost, short aport, int istack)
{
    afs_int32 tl;

    /*
     * Only respond to client-initiated version requests, and
     * clear that flag in the response.
     */
    if (ap->header.flags & RX_CLIENT_INITIATED) {
	char buf[66];

	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
	rxi_EncodePacketHeader(ap);
	memset(buf, 0, sizeof(buf));
	strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
	rx_packetwrite(ap, 0, 65, buf);
	tl = ap->length;
	ap->length = 65;
	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	ap->length = tl;
    }

    return ap;
}
/* send a debug packet back to the sender */
static void
rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
		    afs_int32 ahost, short aport, afs_int32 istack)
{
    struct sockaddr_in taddr;
    int i;
    int nbytes;
    int saven = 0;
    size_t savelen = 0;
#ifdef KERNEL
    int waslocked = ISAFS_GLOCK();
#endif

    taddr.sin_family = AF_INET;
    taddr.sin_port = aport;
    taddr.sin_addr.s_addr = ahost;
#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
    taddr.sin_len = sizeof(struct sockaddr_in);
#endif

    /* We need to trim the niovecs. */
    nbytes = apacket->length;
    for (i = 1; i < apacket->niovecs; i++) {
	if (nbytes <= apacket->wirevec[i].iov_len) {
	    savelen = apacket->wirevec[i].iov_len;
	    saven = apacket->niovecs;
	    apacket->wirevec[i].iov_len = nbytes;
	    apacket->niovecs = i + 1;	/* so condition fails because i == niovecs */
	} else
	    nbytes -= apacket->wirevec[i].iov_len;
    }

#ifdef KERNEL
#ifdef RX_KERNEL_TRACE
    if (ICL_SETACTIVE(afs_iclSetp)) {
	if (!waslocked)
	    AFS_GLOCK();
	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		   "before osi_NetSend()");
	AFS_GUNLOCK();
    }
#else
    if (waslocked)
	AFS_GUNLOCK();
#endif
#endif
    /* debug packets are not reliably delivered, hence the cast below. */
    (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
		      apacket->length + RX_HEADER_SIZE, istack);
#ifdef KERNEL
#ifdef RX_KERNEL_TRACE
    if (ICL_SETACTIVE(afs_iclSetp)) {
	AFS_GLOCK();
	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		   "after osi_NetSend()");
	if (!waslocked)
	    AFS_GUNLOCK();
    }
#else
    if (waslocked)
	AFS_GLOCK();
#endif
#endif
    if (saven) {		/* means we truncated the packet above. */
	apacket->wirevec[i - 1].iov_len = savelen;
	apacket->niovecs = saven;
    }
}
/* Send the packet to appropriate destination for the specified
 * call.  The header is first encoded and placed in the packet.
 */
void
rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
	       struct rx_packet *p, int istack)
{
#if defined(KERNEL)
    int waslocked;
#endif
    int code;
    struct sockaddr_in addr;
    register struct rx_peer *peer = conn->peer;
    osi_socket socket;
#ifdef RXDEBUG
    char deliveryType = 'S';
#endif

    /* The address we're sending the packet to */
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = peer->port;
    addr.sin_addr.s_addr = peer->host;

    /* This stuff should be revamped, I think, so that most, if not
     * all, of the header stuff is always added here.  We could
     * probably do away with the encode/decode routines. XXXXX */

    /* Stamp each packet with a unique serial number.  The serial
     * number is maintained on a connection basis because some types
     * of security may be based on the serial number of the packet,
     * and security is handled on a per authenticated-connection
     * basis. */
    /* Pre-increment, to guarantee no zero serial number; a zero
     * serial number means the packet was never sent. */
    MUTEX_ENTER(&conn->conn_data_lock);
    p->header.serial = ++conn->serial;
    MUTEX_EXIT(&conn->conn_data_lock);
    /* This is so we can adjust retransmit time-outs better in the face of
     * rapidly changing round-trip times.  RTO estimation is not a la Karn.
     */
    if (p->firstSerial == 0) {
	p->firstSerial = p->header.serial;
    }

#ifdef RXDEBUG
    /* If an output tracer function is defined, call it with the packet and
     * network address.  Note this function may modify its arguments. */
    if (rx_almostSent) {
	int drop = (*rx_almostSent) (p, &addr);
	/* drop packet if return value is non-zero? */
	if (drop)
	    deliveryType = 'D';	/* Drop the packet */
    }
#endif

    /* Get network byte order header */
    rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
				 * touch ALL the fields */

    /* Send the packet out on the same socket that related packets are being
     * sent on */
    socket =
	(conn->type ==
	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);

#ifdef RXDEBUG
    /* Possibly drop this packet,  for testing purposes */
    if ((deliveryType == 'D')
	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
	deliveryType = 'D';	/* Drop the packet */
    } else {
	deliveryType = 'S';	/* Send the packet */
#endif /* RXDEBUG */

	/* Loop until the packet is sent.  We'd prefer just to use a
	 * blocking socket, but unfortunately the interface doesn't
	 * allow us to have the socket block in send mode, and not
	 * block in receive mode */
#ifdef KERNEL
	waslocked = ISAFS_GLOCK();
#ifdef RX_KERNEL_TRACE
	if (ICL_SETACTIVE(afs_iclSetp)) {
	    if (!waslocked)
		AFS_GLOCK();
	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		       "before osi_NetSend()");
	    AFS_GUNLOCK();
	}
#else
	if (waslocked)
	    AFS_GUNLOCK();
#endif
#endif
	if ((code =
	     osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
			 p->length + RX_HEADER_SIZE, istack)) != 0) {
	    /* send failed, so let's hurry up the resend, eh? */
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.netSendFailures++;
	    MUTEX_EXIT(&rx_stats_mutex);
	    p->retryTime = p->timeSent;	/* resend it very soon */
	    clock_Addmsec(&(p->retryTime),
			  10 + (((afs_uint32) p->backoff) << 8));

#if defined(KERNEL) && defined(AFS_LINUX20_ENV)
	    /* Linux is nice -- it can tell us right away that we cannot
	     * reach this recipient by returning an ENETUNREACH error
	     * code.  So, when this happens let's "down" the host NOW so
	     * we don't sit around waiting for this host to timeout later.
	     */
	    if (call && code == -ENETUNREACH)
		call->lastReceiveTime = 0;
#endif
	}
#ifdef KERNEL
#ifdef RX_KERNEL_TRACE
	if (ICL_SETACTIVE(afs_iclSetp)) {
	    AFS_GLOCK();
	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		       "after osi_NetSend()");
	    if (!waslocked)
		AFS_GUNLOCK();
	}
#else
	if (waslocked)
	    AFS_GLOCK();
#endif
#endif
#ifdef RXDEBUG
    }
    dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %lx resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (unsigned long)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
#endif
    MUTEX_ENTER(&rx_stats_mutex);
    rx_stats.packetsSent[p->header.type - 1]++;
    MUTEX_EXIT(&rx_stats_mutex);
    MUTEX_ENTER(&peer->peer_lock);
    hadd32(peer->bytesSent, p->length);
    MUTEX_EXIT(&peer->peer_lock);
}
/* Send a list of packets to appropriate destination for the specified
 * connection.  The headers are first encoded and placed in the packets.
 */
void
rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
		   struct rx_packet **list, int len, int istack)
{
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
    int waslocked;
#endif
    struct sockaddr_in addr;
    register struct rx_peer *peer = conn->peer;
    osi_socket socket;
    struct rx_packet *p = NULL;
    struct iovec wirevec[RX_MAXIOVECS];
    int i, length, code;
    afs_uint32 serial;
    afs_uint32 temp;
    struct rx_jumboHeader *jp;
#ifdef RXDEBUG
    char deliveryType = 'S';
#endif

    /* The address we're sending the packet to */
    addr.sin_family = AF_INET;
    addr.sin_port = peer->port;
    addr.sin_addr.s_addr = peer->host;

    if (len + 1 > RX_MAXIOVECS) {
	osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
    }

    /*
     * Stamp the packets in this jumbogram with consecutive serial numbers
     */
    MUTEX_ENTER(&conn->conn_data_lock);
    serial = conn->serial;
    conn->serial += len;
    MUTEX_EXIT(&conn->conn_data_lock);

    /* This stuff should be revamped, I think, so that most, if not
     * all, of the header stuff is always added here.  We could
     * probably do away with the encode/decode routines. XXXXX */

    jp = NULL;
    length = RX_HEADER_SIZE;
    wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
    wirevec[0].iov_len = RX_HEADER_SIZE;
    for (i = 0; i < len; i++) {
	p = list[i];

	/* The whole 3.5 jumbogram scheme relies on packets fitting
	 * in a single packet buffer. */
	if (p->niovecs > 2) {
	    osi_Panic("rxi_SendPacketList, niovecs > 2\n");
	}

	/* Set the RX_JUMBO_PACKET flags in all but the last packets
	 * in this jumbogram */
	if (i < len - 1) {
	    if (p->length != RX_JUMBOBUFFERSIZE) {
		osi_Panic("rxi_SendPacketList, length != jumbo size\n");
	    }
	    p->header.flags |= RX_JUMBO_PACKET;
	    length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
	    wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
	} else {
	    wirevec[i + 1].iov_len = p->length;
	    length += p->length;
	}
	wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
	if (jp != NULL) {
	    /* Convert jumbo packet header to network byte order */
	    temp = (afs_uint32) (p->header.flags) << 24;
	    temp |= (afs_uint32) (p->header.spare);
	    *(afs_uint32 *) jp = htonl(temp);
	}
	jp = (struct rx_jumboHeader *)
	    ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);

	/* Stamp each packet with a unique serial number.  The serial
	 * number is maintained on a connection basis because some types
	 * of security may be based on the serial number of the packet,
	 * and security is handled on a per authenticated-connection
	 * basis. */
	/* Pre-increment, to guarantee no zero serial number; a zero
	 * serial number means the packet was never sent. */
	p->header.serial = ++serial;
	/* This is so we can adjust retransmit time-outs better in the face of
	 * rapidly changing round-trip times.  RTO estimation is not a la Karn.
	 */
	if (p->firstSerial == 0) {
	    p->firstSerial = p->header.serial;
	}

#ifdef RXDEBUG
	/* If an output tracer function is defined, call it with the packet and
	 * network address.  Note this function may modify its arguments. */
	if (rx_almostSent) {
	    int drop = (*rx_almostSent) (p, &addr);
	    /* drop packet if return value is non-zero? */
	    if (drop)
		deliveryType = 'D';	/* Drop the packet */
	}
#endif

	/* Get network byte order header */
	rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
					 * touch ALL the fields */
    }

    /* Send the packet out on the same socket that related packets are being
     * sent on */
    socket =
	(conn->type ==
	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);

#ifdef RXDEBUG
    /* Possibly drop this packet,  for testing purposes */
    if ((deliveryType == 'D')
	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
	deliveryType = 'D';	/* Drop the packet */
    } else {
	deliveryType = 'S';	/* Send the packet */
#endif /* RXDEBUG */

	/* Loop until the packet is sent.  We'd prefer just to use a
	 * blocking socket, but unfortunately the interface doesn't
	 * allow us to have the socket block in send mode, and not
	 * block in receive mode */
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
	waslocked = ISAFS_GLOCK();
	if (!istack && waslocked)
	    AFS_GUNLOCK();
#endif
	if ((code =
	     osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
			 istack)) != 0) {
	    /* send failed, so let's hurry up the resend, eh? */
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.netSendFailures++;
	    MUTEX_EXIT(&rx_stats_mutex);
	    for (i = 0; i < len; i++) {
		p = list[i];
		p->retryTime = p->timeSent;	/* resend it very soon */
		clock_Addmsec(&(p->retryTime),
			      10 + (((afs_uint32) p->backoff) << 8));
	    }
#if defined(KERNEL) && defined(AFS_LINUX20_ENV)
	    /* Linux is nice -- it can tell us right away that we cannot
	     * reach this recipient by returning an ENETUNREACH error
	     * code.  So, when this happens let's "down" the host NOW so
	     * we don't sit around waiting for this host to timeout later.
	     */
	    if (call && code == -ENETUNREACH)
		call->lastReceiveTime = 0;
#endif
	}
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
	if (!istack && waslocked)
	    AFS_GLOCK();
#endif
#ifdef RXDEBUG
    }

    dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %lx resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (unsigned long)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
#endif
    MUTEX_ENTER(&rx_stats_mutex);
    rx_stats.packetsSent[p->header.type - 1]++;
    MUTEX_EXIT(&rx_stats_mutex);
    MUTEX_ENTER(&peer->peer_lock);
    hadd32(peer->bytesSent, p->length);
    MUTEX_EXIT(&peer->peer_lock);
}
/* Send a "special" packet to the peer connection.  If call is
 * specified, then the packet is directed to a specific call channel
 * associated with the connection, otherwise it is directed to the
 * connection only.  Uses optionalPacket if it is supplied, rather than
 * allocating a new packet buffer.  Nbytes is the length of the data
 * portion of the packet.  If data is non-null, nbytes of data are
 * copied into the packet.  Type is the type of the packet, as defined
 * in rx.h.  Bug: there's a lot of duplication between this and other
 * routines.  This needs to be cleaned up. */
struct rx_packet *
rxi_SendSpecial(register struct rx_call *call,
		register struct rx_connection *conn,
		struct rx_packet *optionalPacket, int type, char *data,
		int nbytes, int istack)
{
    /* Some of the following stuff should be common code for all
     * packet sends (it's repeated elsewhere) */
    register struct rx_packet *p;
    unsigned int i = 0;
    int savelen = 0, saven = 0;
    int channel, callNumber;

    if (call) {
	channel = call->channel;
	callNumber = *call->callNumber;
	/* BUSY packets refer to the next call on this connection */
	if (type == RX_PACKET_TYPE_BUSY) {
	    callNumber++;
	}
    } else {
	channel = 0;
	callNumber = 0;
    }
    p = optionalPacket;
    if (!p) {
	p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
	if (!p)
	    osi_Panic("rxi_SendSpecial failure");
    }

    if (nbytes != -1)
	p->length = nbytes;
    else
	nbytes = p->length;
    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;
    p->header.cid = (conn->cid | channel);
    p->header.callNumber = callNumber;
    p->header.seq = 0;
    p->header.epoch = conn->epoch;
    p->header.type = type;
    p->header.flags = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
	p->header.flags |= RX_CLIENT_INITIATED;
    if (data)
	rx_packetwrite(p, 0, nbytes, data);

    for (i = 1; i < p->niovecs; i++) {
	if (nbytes <= p->wirevec[i].iov_len) {
	    savelen = p->wirevec[i].iov_len;
	    saven = p->niovecs;
	    p->wirevec[i].iov_len = nbytes;
	    p->niovecs = i + 1;	/* so condition fails because i == niovecs */
	} else
	    nbytes -= p->wirevec[i].iov_len;
    }

    if (call)
	rxi_Send(call, p, istack);
    else
	rxi_SendPacket((struct rx_call *)0, conn, p, istack);
    if (saven) {		/* means we truncated the packet above.  We probably don't */
	/* really need to do this, but it seems safer this way, given that */
	/* sneaky optionalPacket... */
	p->wirevec[i - 1].iov_len = savelen;
	p->niovecs = saven;
    }
    if (!optionalPacket)
	rxi_FreePacket(p);
    return optionalPacket;
}
/* Encode the packet's header (from the struct header in the packet to
 * the net byte order representation in the wire representation of the
 * packet, which is what is actually sent out on the wire) */
void
rxi_EncodePacketHeader(register struct rx_packet *p)
{
    register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */

    memset((char *)buf, 0, RX_HEADER_SIZE);
    *buf++ = htonl(p->header.epoch);
    *buf++ = htonl(p->header.cid);
    *buf++ = htonl(p->header.callNumber);
    *buf++ = htonl(p->header.seq);
    *buf++ = htonl(p->header.serial);
    *buf++ = htonl((((afs_uint32) p->header.type) << 24)
		   | (((afs_uint32) p->header.flags) << 16)
		   | (p->header.userStatus << 8) | p->header.securityIndex);
    /* Note: top 16 bits of this next word were reserved */
    *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
}
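
/* Illustrative layout (derived from the encoder above, not in the original
 * file): the 28-byte wire header as seven network-order 32-bit words.
 *
 *   word 0: epoch
 *   word 1: cid (connection id | channel)
 *   word 2: callNumber
 *   word 3: seq
 *   word 4: serial
 *   word 5: type<<24 | flags<<16 | userStatus<<8 | securityIndex
 *   word 6: spare<<16 | serviceId   (the top 16 bits double as the
 *                                    security checksum on the wire)
 */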
/* Decode the packet's header (from net byte order to a struct header) */
void
rxi_DecodePacketHeader(register struct rx_packet *p)
{
    register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */
    afs_uint32 temp;

    p->header.epoch = ntohl(*buf);
    buf++;
    p->header.cid = ntohl(*buf);
    buf++;
    p->header.callNumber = ntohl(*buf);
    buf++;
    p->header.seq = ntohl(*buf);
    buf++;
    p->header.serial = ntohl(*buf);
    buf++;

    temp = ntohl(*buf);
    buf++;

    /* C will truncate byte fields to bytes for me */
    p->header.type = temp >> 24;
    p->header.flags = temp >> 16;
    p->header.userStatus = temp >> 8;
    p->header.securityIndex = temp >> 0;

    temp = ntohl(*buf);
    buf++;

    p->header.serviceId = (temp & 0xffff);
    p->header.spare = temp >> 16;
    /* Note: top 16 bits of this last word are the security checksum */
}
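
/* Illustrative sketch (assumption, not in the original file): encode and
 * decode are inverses field-by-field, which a unit test could assert
 * roughly like this (ignoring struct padding concerns). */
#if 0
static int
example_header_roundtrip(struct rx_packet *p)
{
    struct rx_header saved = p->header;

    rxi_EncodePacketHeader(p);	/* struct -> network order wirevec[0] */
    rxi_DecodePacketHeader(p);	/* network order wirevec[0] -> struct */
    return memcmp(&saved, &p->header, sizeof(saved)) == 0;
}
#endif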
void
rxi_PrepareSendPacket(register struct rx_call *call,
		      register struct rx_packet *p, register int last)
{
    register struct rx_connection *conn = call->conn;
    int i, j;
    ssize_t len;		/* len must be a signed type; it can go negative */

    p->flags &= ~RX_PKTFLAG_ACKED;
    p->header.cid = (conn->cid | call->channel);
    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;
    p->header.callNumber = *call->callNumber;
    p->header.seq = call->tnext++;
    p->header.epoch = conn->epoch;
    p->header.type = RX_PACKET_TYPE_DATA;
    p->header.flags = 0;
    p->header.spare = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
	p->header.flags |= RX_CLIENT_INITIATED;

    if (last)
	p->header.flags |= RX_LAST_PACKET;

    clock_Zero(&p->retryTime);	/* Never yet transmitted */
    clock_Zero(&p->firstSent);	/* Never yet transmitted */
    p->header.serial = 0;	/* Another way of saying never transmitted... */
    p->backoff = 0;

    /* Now that we're sure this is the last data on the call, make sure
     * that the "length" and the sum of the iov_lens matches. */
    len = p->length + call->conn->securityHeaderSize;

    for (i = 1; i < p->niovecs && len > 0; i++) {
	len -= p->wirevec[i].iov_len;
    }
    if (len > 0) {
	osi_Panic("PrepareSendPacket 1\n");	/* MTUXXX */
    } else {
	/* Free any extra elements in the wirevec */
	for (j = MAX(2, i); j < p->niovecs; j++) {
	    rxi_freeCBuf(RX_CBUF_TO_PACKET(p->wirevec[j].iov_base, p));
	}
	p->niovecs = i;
	p->wirevec[i - 1].iov_len += len;
    }
    RXS_PreparePacket(conn->securityObject, call, p);
}
/* Given an interface MTU size, calculate an adjusted MTU size that
 * will make efficient use of the RX buffers when the peer is sending
 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
int
rxi_AdjustIfMTU(int mtu)
{
    int adjMTU;
    int frags;

    adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
    if (mtu <= adjMTU) {
	return mtu;
    }
    mtu -= adjMTU;
    if (mtu <= 0) {
	return adjMTU;
    }
    frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
    return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
}
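
/* Worked example (assumption: the usual constants from rx_packet.h of this
 * era, RX_HEADER_SIZE 28, RX_JUMBOBUFFERSIZE 1412, RX_JUMBOHEADERSIZE 4):
 * for an interface MTU of 9000,
 *
 *   adjMTU = 28 + 1412 + 4       = 1444
 *   mtu    = 9000 - 1444         = 7556
 *   frags  = 7556 / (1412 + 4)   = 5
 *   result = 1444 + 5 * 1416     = 8524
 *
 * i.e. the MTU is rounded down so the datagram splits into whole jumbo
 * buffers with nothing left over. */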
/* Given an interface MTU size, and the peer's advertised max receive
 * size, calculate an adjusted maxMTU size that makes efficient use
 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
int
rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
{
    int maxMTU = mtu * rxi_nSendFrags;
    maxMTU = MIN(maxMTU, peerMaxMTU);
    return rxi_AdjustIfMTU(maxMTU);
}
/* Given a packet size, figure out how many datagram packets will fit.
 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
int
rxi_AdjustDgramPackets(int frags, int mtu)
{
    int maxMTU;

    if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
	return 1;
    }
    maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
    maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
    /* subtract the size of the first and last packets */
    maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
    if (maxMTU < 0) {
	return 1;
    }
    return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
}
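
/* Worked example (assumptions: RX_HEADER_SIZE 28, RX_JUMBOBUFFERSIZE 1412,
 * RX_JUMBOHEADERSIZE 4, and UDP_HDR_SIZE 28, i.e. 8-byte UDP plus 20-byte
 * IP header): for frags = 4 and mtu = 1444,
 *
 *   maxMTU  = 4 * (1444 + 28) - 28   = 5860
 *   maxMTU -= 28 + 2 * 1412 + 4      -> 3004
 *   result  = 2 + 3004 / (1412 + 4)  = 2 + 2 = 4
 *
 * so four fixed-size packets fit in the jumbogram. */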