2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
12 #include "afs/param.h"
14 #include <afs/param.h>
22 #include "afs/sysincludes.h"
23 #include "afsincludes.h"
24 #include "rx/rx_kcommon.h"
25 #include "rx/rx_clock.h"
26 #include "rx/rx_queue.h"
27 #include "rx/rx_packet.h"
28 #else /* defined(UKERNEL) */
29 #ifdef RX_KERNEL_TRACE
30 #include "../rx/rx_kcommon.h"
33 #ifndef AFS_LINUX20_ENV
36 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
37 #include "afs/sysincludes.h"
39 #if defined(AFS_OBSD_ENV)
43 #if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
44 #if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
45 #include "sys/mount.h" /* it gets pulled in by something later anyway */
49 #include "netinet/in.h"
50 #include "afs/afs_osi.h"
51 #include "rx_kmutex.h"
52 #include "rx/rx_clock.h"
53 #include "rx/rx_queue.h"
55 #include <sys/sysmacros.h>
57 #include "rx/rx_packet.h"
58 #endif /* defined(UKERNEL) */
59 #include "rx/rx_globals.h"
61 #include "sys/types.h"
64 #if defined(AFS_NT40_ENV) || defined(AFS_DJGPP_ENV)
68 #define EWOULDBLOCK WSAEWOULDBLOCK
71 #include <sys/socket.h>
72 #include <netinet/in.h>
73 #endif /* AFS_NT40_ENV */
74 #include "rx_xmit_nt.h"
77 #include <sys/socket.h>
78 #include <netinet/in.h>
84 #include <sys/sysmacros.h>
86 #include "rx_packet.h"
87 #include "rx_globals.h"
103 /* rxdb_fileID is used to identify the lock location, along with line#. */
104 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
105 #endif /* RX_LOCKS_DB */
/* Base of the packet array most recently obtained from osi_Alloc.  It is
 * overwritten on every call to rxi_MorePackets and rxi_MorePacketsNoLock, so
 * it only ever refers to the latest array; rxi_FreeAllPackets frees it. */
106 struct rx_packet *rx_mallocedP = 0;
/* Version string defined elsewhere; rxi_ReceiveVersionPacket copies from
 * offset 4 of this array into its reply. */
108 extern char cml_version_number[];
/* Function-pointer hook defined elsewhere; it is not referenced in the
 * visible portion of this file. */
109 extern int (*rx_almostSent) ();
111 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
112 afs_int32 ahost, short aport,
115 /* some rules about packets:
116 * 1. When a packet is allocated, the final iov_buf contains room for
117 * a security trailer, but iov_len masks that fact. If the security
118 * package wants to add the trailer, it may do so, and then extend
119 * iov_len appropriately. For this reason, packet's niovecs and
120 * iov_len fields should be accurate before calling PreparePacket.
124 * all packet buffers (iov_base) are integral multiples of
126 * offset is an integral multiple of the word size.
/* rx_SlowGetInt32: read a 32-bit value located offset bytes into the packet
 * data.  The search starts at wirevec[1] (wirevec[0] is not considered) and
 * l accumulates the lengths of the vectors already passed; the value is read
 * from the first vector for which l + iov_len exceeds offset.
 * NOTE(review): the tail of the pointer expression and the return statements
 * are not visible here; presumably the position inside the vector is
 * (offset - l), as in rx_SlowPutInt32 -- confirm. */
129 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
133 for (l = 0, i = 1; i < packet->niovecs; i++) {
134 if (l + packet->wirevec[i].iov_len > offset) {
136 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
139 l += packet->wirevec[i].iov_len;
146 * all packet buffers (iov_base) are integral multiples of the word size.
147 * offset is an integral multiple of the word size.
/* rx_SlowPutInt32: store data as a 32-bit value offset bytes into the packet
 * data.  Vectors are scanned from wirevec[1]; l is the combined length of
 * the vectors already passed, and the store lands at iov_base + (offset - l)
 * in the first vector for which l + iov_len exceeds offset.
 * NOTE(review): the return statements are not visible here, so the result
 * for an offset beyond the last vector cannot be stated -- confirm. */
150 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
154 for (l = 0, i = 1; i < packet->niovecs; i++) {
155 if (l + packet->wirevec[i].iov_len > offset) {
156 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
157 (offset - l))) = data;
160 l += packet->wirevec[i].iov_len;
167 * all packet buffers (iov_base) are integral multiples of the
169 * offset is an integral multiple of the word size.
171 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
/* rx_SlowReadPacket: copy up to resid bytes, starting offset bytes into the
 * packet data, into the caller buffer out.
 * The first loop finds the vector (from wirevec[1] on) that contains offset.
 * The second loop copies with memcpy, one vector at a time, at most
 * MIN(resid, bytes left in that vector) per step, until resid is used up or
 * the vectors run out.
 * Returns r - resid when some bytes could not be copied, otherwise r.
 * NOTE(review): the assignment to r and the per-iteration updates of out,
 * resid, offset and i are not visible here; presumably r holds the
 * original resid -- confirm. */
174 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
177 unsigned int i, j, l, r;
178 for (l = 0, i = 1; i < packet->niovecs; i++) {
179 if (l + packet->wirevec[i].iov_len > offset) {
182 l += packet->wirevec[i].iov_len;
185 /* i is the iovec which contains the first little bit of data in which we
186 * are interested. l is the total length of everything prior to this iovec.
187 * j is the number of bytes we can safely copy out of this iovec.
190 while ((resid > 0) && (i < packet->niovecs)) {
191 j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
192 memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
194 l += packet->wirevec[i].iov_len;
198 return (resid ? (r - resid) : r);
203 * all packet buffers (iov_base) are integral multiples of the
205 * offset is an integral multiple of the word size.
/* rx_SlowWritePacket: copy up to resid bytes from in into the packet data,
 * starting offset bytes into it.
 * The first loop finds the vector (from wirevec[1] on) that contains offset.
 * The second loop may run past the current niovecs, up to RX_MAXWVECS: when
 * i reaches niovecs it asks rxi_AllocDataBuf for resid more bytes of
 * RX_PACKET_CLASS_SEND_CBUF buffers; a positive result from that call means
 * it could not supply everything.
 * Returns r - resid when some bytes could not be written, otherwise r.
 * NOTE(review): the copy itself, the action taken when the allocation falls
 * short, and the assignment to r are not visible here -- confirm. */
208 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
213 for (l = 0, i = 1; i < packet->niovecs; i++) {
214 if (l + packet->wirevec[i].iov_len > offset) {
217 l += packet->wirevec[i].iov_len;
220 /* i is the iovec which contains the first little bit of data in which we
221 * are interested. l is the total length of everything prior to this iovec.
222 * j is the number of bytes we can safely copy into this iovec.
225 while ((resid > 0) && (i < RX_MAXWVECS)) {
226 if (i >= packet->niovecs)
227 if (rxi_AllocDataBuf(packet, resid, RX_PACKET_CLASS_SEND_CBUF) > 0) /* ++niovecs as a side-effect */
230 b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
231 j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
234 l += packet->wirevec[i].iov_len;
238 return (resid ? (r - resid) : r);
/* Allocate one packet from the free queue for use as a continuation buffer.
 * NOTE(review): the line naming this function is not visible; it is
 * presumably allocCBuf(int class), which rxi_AllocDataBuf calls -- confirm.
 * Runs under rx_freePktQ_lock.  If rxi_OverQuota(class) is true it sets
 * rxi_NeedMorePackets and bumps the allocation-failure counter for that
 * class under rx_stats_mutex.  Otherwise it takes the first entry of
 * rx_freePacketQueue, panics if that entry is not marked RX_PKTFLAG_FREE,
 * and clears its flags.
 * NOTE(review): the two empty-queue checks below presumably sit in
 * alternative preprocessor branches (one only flags the shortage, the other
 * grows the pool with rxi_MorePacketsNoLock); the queue removal and the
 * return statements are not visible -- confirm. */
241 static struct rx_packet *
248 MUTEX_ENTER(&rx_freePktQ_lock);
251 if (rxi_OverQuota(class)) {
253 rxi_NeedMorePackets = TRUE;
254 MUTEX_ENTER(&rx_stats_mutex);
256 case RX_PACKET_CLASS_RECEIVE:
257 rx_stats.receivePktAllocFailures++;
259 case RX_PACKET_CLASS_SEND:
260 rx_stats.sendPktAllocFailures++;
262 case RX_PACKET_CLASS_SPECIAL:
263 rx_stats.specialPktAllocFailures++;
265 case RX_PACKET_CLASS_RECV_CBUF:
266 rx_stats.receiveCbufPktAllocFailures++;
268 case RX_PACKET_CLASS_SEND_CBUF:
269 rx_stats.sendCbufPktAllocFailures++;
272 MUTEX_EXIT(&rx_stats_mutex);
276 if (queue_IsEmpty(&rx_freePacketQueue)) {
278 rxi_NeedMorePackets = TRUE;
282 if (queue_IsEmpty(&rx_freePacketQueue)) {
283 rxi_MorePacketsNoLock(rx_initSendWindow);
288 c = queue_First(&rx_freePacketQueue, rx_packet);
290 if (!(c->flags & RX_PKTFLAG_FREE))
291 osi_Panic("rxi_AllocPacket: packet not free\n");
292 c->flags = 0; /* clear RX_PKTFLAG_FREE, initialize the rest */
298 MUTEX_EXIT(&rx_freePktQ_lock);
305 * Free a packet currently used as a continuation buffer
/* rxi_freeCBuf: return the packet c to the free queue.  Takes
 * rx_freePktQ_lock around rxi_FreePacketNoLock(c).
 * NOTE(review): the wakeup announced by the comment below is on a line that
 * is not visible here. */
308 rxi_freeCBuf(struct rx_packet *c)
313 MUTEX_ENTER(&rx_freePktQ_lock);
315 rxi_FreePacketNoLock(c);
316 /* Wakeup anyone waiting for packets */
319 MUTEX_EXIT(&rx_freePktQ_lock);
323 /* this one is kind of awful.
324 * In rxkad, the packet has been all shortened, and everything, ready for
325 * sending. All of a sudden, we discover we need some of that space back.
326 * This isn't terribly general, because it knows that the packets are only
327 * rounded up to the EBS (userdata + security header).
/* rxi_RoundUpPacket: give nb bytes back to vector i of packet p by growing
 * its iov_len, if the underlying buffer has room.  The limit is
 * RX_FIRSTBUFFERSIZE when the vector is the packet localdata area and
 * RX_CBUFFERSIZE otherwise.
 * NOTE(review): the assignment to i and the return statements are not
 * visible here.  nb is unsigned, so if nb could exceed the buffer size the
 * subtraction in the bound would wrap -- confirm callers keep nb small. */
330 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
334 if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
335 if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
336 p->wirevec[i].iov_len += nb;
340 if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
341 p->wirevec[i].iov_len += nb;
349 /* get sufficient space to store nb bytes of data (or more), and hook
350 * it into the supplied packet. Return nbytes<=0 if successful, otherwise
351 * returns the number of bytes >0 which it failed to come up with.
352 * Don't need to worry about locking on packet, since only
353 * one thread can manipulate one at a time. Locking on continuation
354 * packets is handled by allocCBuf */
355 /* MTUXXX don't need to go through the for loop if we can trust niovecs */
/* rxi_AllocDataBuf: append continuation buffers to packet p until nb more
 * bytes are covered or RX_MAXWVECS vectors are in use.  Each buffer comes
 * from allocCBuf(class); its localdata area becomes the next wirevec entry
 * with length RX_CBUFFERSIZE, nb drops by RX_CBUFFERSIZE and p->length
 * grows by the same amount (whole buffers, so p->length may exceed the
 * amount requested).
 * NOTE(review): the handling of a failed allocCBuf, the niovecs update and
 * the return statement are not visible here -- confirm. */
357 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
361 for (i = p->niovecs; nb > 0 && i < RX_MAXWVECS; i++) {
362 register struct rx_packet *cb;
363 if ((cb = allocCBuf(class))) {
364 p->wirevec[i].iov_base = (caddr_t) cb->localdata;
365 p->wirevec[i].iov_len = RX_CBUFFERSIZE;
366 nb -= RX_CBUFFERSIZE;
367 p->length += RX_CBUFFERSIZE;
376 /* Add more packet buffers */
/* rxi_MorePackets: grow the free pool by apackets packets.
 * One array of apackets struct rx_packet is obtained from osi_Alloc, pinned
 * and zeroed.  Then, under rx_freePktQ_lock, every element gets its
 * wirevec[0] pointed at wirehead (RX_HEADER_SIZE) and wirevec[1] at
 * localdata (RX_FIRSTBUFFERSIZE), is marked RX_PKTFLAG_FREE and is appended
 * to rx_freePacketQueue.  rx_nFreePackets is raised by apackets and
 * rxi_NeedMorePackets is cleared.
 * rx_mallocedP is overwritten with the new array, so it no longer refers to
 * arrays from earlier calls.
 * NOTE(review): no check of the osi_Alloc result is visible here -- confirm
 * one exists on a line not shown. */
378 rxi_MorePackets(int apackets)
380 struct rx_packet *p, *e;
384 getme = apackets * sizeof(struct rx_packet);
385 p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);
387 PIN(p, getme); /* XXXXX */
388 memset((char *)p, 0, getme);
391 MUTEX_ENTER(&rx_freePktQ_lock);
393 for (e = p + apackets; p < e; p++) {
394 p->wirevec[0].iov_base = (char *)(p->wirehead);
395 p->wirevec[0].iov_len = RX_HEADER_SIZE;
396 p->wirevec[1].iov_base = (char *)(p->localdata);
397 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
398 p->flags |= RX_PKTFLAG_FREE;
401 queue_Append(&rx_freePacketQueue, p);
403 rx_nFreePackets += apackets;
404 rxi_NeedMorePackets = FALSE;
408 MUTEX_EXIT(&rx_freePktQ_lock);
413 /* Add more packet buffers */
/* rxi_MorePacketsNoLock: grow the free pool without taking
 * rx_freePktQ_lock itself; the callers visible in this file invoke it while
 * they hold that lock.
 * The requested count is first enlarged by
 *   (apackets / 4) * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE)
 * so that extra packets exist to serve as continuation buffers.  The array
 * is then allocated, zeroed, initialised and queued exactly as in
 * rxi_MorePackets, and rx_nFreePackets is raised by the enlarged count.
 * rx_mallocedP is overwritten with the new array.
 * NOTE(review): unlike rxi_MorePackets no PIN call is visible here, and no
 * check of the osi_Alloc result is visible -- confirm. */
415 rxi_MorePacketsNoLock(int apackets)
417 struct rx_packet *p, *e;
420 /* allocate enough packets that 1/4 of the packets will be able
421 * to hold maximal amounts of data */
422 apackets += (apackets / 4)
423 * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
424 getme = apackets * sizeof(struct rx_packet);
425 p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);
427 memset((char *)p, 0, getme);
429 for (e = p + apackets; p < e; p++) {
430 p->wirevec[0].iov_base = (char *)(p->wirehead);
431 p->wirevec[0].iov_len = RX_HEADER_SIZE;
432 p->wirevec[1].iov_base = (char *)(p->localdata);
433 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
434 p->flags |= RX_PKTFLAG_FREE;
437 queue_Append(&rx_freePacketQueue, p);
439 rx_nFreePackets += apackets;
440 rxi_NeedMorePackets = FALSE;
/* rxi_FreeAllPackets: release the array that rx_mallocedP currently points
 * to, then unpin the same range.
 * NOTE(review): three things to confirm against the full file:
 *  - the size used, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet),
 *    is not the size computed by rxi_MorePackets or rxi_MorePacketsNoLock;
 *  - only the most recent array is reachable through rx_mallocedP, so
 *    arrays from earlier calls are not released here;
 *  - UNPIN is issued after osi_Free on the same pointer. */
446 rxi_FreeAllPackets(void)
448 /* must be called at proper interrupt level, etcetera */
449 /* MTUXXX need to free all Packets */
450 osi_Free(rx_mallocedP,
451 (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
452 UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
455 /* Allocate more packets iff we need more continuation buffers */
456 /* In kernel, can't page in memory with interrupts disabled, so we
457 * don't use the event mechanism. */
/* rx_CheckPackets: when rxi_NeedMorePackets has been set by an allocation
 * path, add rx_initSendWindow packets with rxi_MorePackets, which also
 * clears the flag. */
459 rx_CheckPackets(void)
461 if (rxi_NeedMorePackets) {
462 rxi_MorePackets(rx_initSendWindow);
466 /* In the packet freeing routine below, the assumption is that
467 we want all of the packets to be used equally frequently, so that we
468 don't get packet buffers paging out. It would be just as valid to
469 assume that we DO want them to page out if not many are being used.
470 In any event, we assume the former, and append the packets to the end
472 /* This explanation is bogus. The free list doesn't remain in any kind of
473 useful order for long: the packets in use get pretty much randomly scattered
474 across all the pages. In order to permit unused {packets,bufs} to page out, they
475 must be stored so that packets which are adjacent in memory are adjacent in the
476 free list. An array springs rapidly to mind.
479 /* Actually free the packet p. */
/* rxi_FreePacketNoLock: put packet p back on the tail of
 * rx_freePacketQueue.  Panics if p already carries RX_PKTFLAG_FREE (a double
 * free); otherwise sets the flag and appends the packet.
 * No lock is taken here; the callers visible in this file hold
 * rx_freePktQ_lock around the call.
 * NOTE(review): the debug print casts the pointer to int, which drops bits
 * where pointers are wider than int -- confirm this is acceptable. */
481 rxi_FreePacketNoLock(struct rx_packet *p)
483 dpf(("Free %x\n", (int)p));
485 if (p->flags & RX_PKTFLAG_FREE)
486 osi_Panic("rxi_FreePacketNoLock: packet already free\n");
488 p->flags |= RX_PKTFLAG_FREE;
489 queue_Append(&rx_freePacketQueue, p);
/* rxi_FreeDataBufsNoLock: release the continuation buffers of packet p,
 * that is wirevec[2] up to wirevec[niovecs - 1].  Each iov_base is mapped
 * back to its owning packet with RX_CBUF_TO_PACKET and handed to
 * rxi_FreePacketNoLock.
 * Panics unless first is 1 and wirevec[1] is the packet localdata area.
 * No lock is taken here.
 * NOTE(review): the condition guarding the third panic and any reset of
 * niovecs or length are not visible here -- confirm. */
493 rxi_FreeDataBufsNoLock(struct rx_packet *p, int first)
495 struct iovec *iov, *end;
497 if (first != 1) /* MTUXXX */
498 osi_Panic("FreeDataBufs 1: first must be 1");
499 iov = &p->wirevec[1];
500 end = iov + (p->niovecs - 1);
501 if (iov->iov_base != (caddr_t) p->localdata) /* MTUXXX */
502 osi_Panic("FreeDataBufs 2: vec 1 must be localdata");
503 for (iov++; iov < end; iov++) {
505 osi_Panic("FreeDataBufs 3: vecs 2-niovecs must not be NULL");
506 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
514 int rxi_nBadIovecs = 0;
516 /* rxi_RestoreDataBufs
518 * Restore the correct sizes to the iovecs. Called when reusing a packet
519 * for reading off the wire.
/* rxi_RestoreDataBufs: put the vector lengths of packet p back to their
 * full sizes.  wirevec[0] is pointed at wirehead with RX_HEADER_SIZE,
 * wirevec[1] at localdata with RX_FIRSTBUFFERSIZE, and every vector from
 * index 2 up to niovecs - 1 gets iov_len RX_CBUFFERSIZE.
 * NOTE(review): what happens when a continuation vector has a null
 * iov_base is on lines not visible here -- confirm. */
522 rxi_RestoreDataBufs(struct rx_packet *p)
525 struct iovec *iov = &p->wirevec[2];
527 p->wirevec[0].iov_base = (char *)(p->wirehead);
528 p->wirevec[0].iov_len = RX_HEADER_SIZE;
529 p->wirevec[1].iov_base = (char *)(p->localdata);
530 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
532 for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
533 if (!iov->iov_base) {
538 iov->iov_len = RX_CBUFFERSIZE;
/* rxi_TrimDataBufs: free the continuation buffers of packet p that lie
 * beyond the data described by p->length.
 * Starting at wirevec[2], vectors are skipped while the data length left
 * after wirevec[1] is still positive; the vectors after that point are
 * returned to the free queue, under rx_freePktQ_lock, through
 * RX_CBUF_TO_PACKET and rxi_FreePacketNoLock.
 * NOTE(review): the conditions guarding the panics, the early exit when
 * nothing needs freeing, and the niovecs update are not visible here --
 * confirm. */
543 rxi_TrimDataBufs(struct rx_packet *p, int first)
546 struct iovec *iov, *end;
550 osi_Panic("TrimDataBufs 1: first must be 1");
552 /* Skip over continuation buffers containing message data */
553 iov = &p->wirevec[2];
554 end = iov + (p->niovecs - 2);
555 length = p->length - p->wirevec[1].iov_len;
556 for (; iov < end && length > 0; iov++) {
558 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
559 length -= iov->iov_len;
562 /* iov now points to the first empty data buffer. */
567 MUTEX_ENTER(&rx_freePktQ_lock);
569 for (; iov < end; iov++) {
571 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
572 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
577 MUTEX_EXIT(&rx_freePktQ_lock);
583 /* Free the packet p. P is assumed not to be on any queue, i.e.
584 * remove it yourself first if you call this routine. */
/* rxi_FreePacket: release packet p and its continuation buffers.  Under
 * rx_freePktQ_lock it calls rxi_FreeDataBufsNoLock(p, 1) and then
 * rxi_FreePacketNoLock(p).
 * NOTE(review): the wakeup announced by the comment below is on a line that
 * is not visible here. */
586 rxi_FreePacket(struct rx_packet *p)
591 MUTEX_ENTER(&rx_freePktQ_lock);
593 rxi_FreeDataBufsNoLock(p, 1);
594 rxi_FreePacketNoLock(p);
595 /* Wakeup anyone waiting for packets */
598 MUTEX_EXIT(&rx_freePktQ_lock);
603 /* rxi_AllocPacket sets up p->length so it reflects the number of
604 * bytes in the packet at this point, **not including** the header.
605 * The header is absolutely necessary, besides, this is the way the
606 * length field is usually used */
/* rxi_AllocPacketNoLock: take one packet of the given class from
 * rx_freePacketQueue.  It does not take rx_freePktQ_lock itself; the
 * callers visible in this file (rxi_AllocPacket, rxi_AllocSendPacket) hold
 * it around the call.
 * When rxi_OverQuota(class) is true it sets rxi_NeedMorePackets, bumps the
 * allocation-failure counter for the class under rx_stats_mutex and returns
 * a null pointer.  Otherwise it counts the request in
 * rx_stats.packetRequests, takes the first queue entry, panics if that
 * entry is not marked RX_PKTFLAG_FREE, clears its flags, re-points
 * wirevec[0] and wirevec[1] at wirehead and localdata, and sets length to
 * RX_FIRSTBUFFERSIZE.
 * NOTE(review): the two empty-queue checks presumably sit in alternative
 * preprocessor branches (one panics, the other grows the pool); the queue
 * removal, the niovecs reset and the final return are not visible here --
 * confirm. */
608 rxi_AllocPacketNoLock(int class)
610 register struct rx_packet *p;
613 if (rxi_OverQuota(class)) {
614 rxi_NeedMorePackets = TRUE;
615 MUTEX_ENTER(&rx_stats_mutex);
617 case RX_PACKET_CLASS_RECEIVE:
618 rx_stats.receivePktAllocFailures++;
620 case RX_PACKET_CLASS_SEND:
621 rx_stats.sendPktAllocFailures++;
623 case RX_PACKET_CLASS_SPECIAL:
624 rx_stats.specialPktAllocFailures++;
626 case RX_PACKET_CLASS_RECV_CBUF:
627 rx_stats.receiveCbufPktAllocFailures++;
629 case RX_PACKET_CLASS_SEND_CBUF:
630 rx_stats.sendCbufPktAllocFailures++;
633 MUTEX_EXIT(&rx_stats_mutex);
634 return (struct rx_packet *)0;
638 MUTEX_ENTER(&rx_stats_mutex);
639 rx_stats.packetRequests++;
640 MUTEX_EXIT(&rx_stats_mutex);
643 if (queue_IsEmpty(&rx_freePacketQueue))
644 osi_Panic("rxi_AllocPacket error");
646 if (queue_IsEmpty(&rx_freePacketQueue))
647 rxi_MorePacketsNoLock(rx_initSendWindow);
651 p = queue_First(&rx_freePacketQueue, rx_packet);
652 if (!(p->flags & RX_PKTFLAG_FREE))
653 osi_Panic("rxi_AllocPacket: packet not free\n");
655 dpf(("Alloc %x, class %d\n", (int)p, class));
658 p->flags = 0; /* clear RX_PKTFLAG_FREE, initialize the rest */
661 /* have to do this here because rx_FlushWrite fiddles with the iovs in
662 * order to truncate outbound packets. In the near future, may need
663 * to allocate bufs from a static pool here, and/or in AllocSendPacket
665 p->wirevec[0].iov_base = (char *)(p->wirehead);
666 p->wirevec[0].iov_len = RX_HEADER_SIZE;
667 p->wirevec[1].iov_base = (char *)(p->localdata);
668 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
670 p->length = RX_FIRSTBUFFERSIZE;
/* rxi_AllocPacket: locking wrapper that calls rxi_AllocPacketNoLock(class)
 * while holding rx_freePktQ_lock.
 * NOTE(review): the return statement is not visible here; presumably p is
 * returned, which is null when the class is over quota -- confirm. */
675 rxi_AllocPacket(int class)
677 register struct rx_packet *p;
679 MUTEX_ENTER(&rx_freePktQ_lock);
680 p = rxi_AllocPacketNoLock(class);
681 MUTEX_EXIT(&rx_freePktQ_lock);
685 /* This guy comes up with as many buffers as it {takes,can get} given
686 * the MTU for this call. It also sets the packet length before
687 * returning. caution: this is often called at NETPRI
688 * Called with call locked.
/* rxi_AllocSendPacket: obtain a send packet for call, sized for want bytes
 * of data.
 * mud is call->MTU less RX_HEADER_SIZE; the security header size and the
 * maximum security trailer size of the connection are also summed (the
 * variable receiving that sum is on a line not visible here, presumably
 * delta).
 * The loop runs while call->error is clear.  Each pass takes
 * rx_freePktQ_lock and tries rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND):
 *  - on success the lock is dropped, want is capped at mud, and if want
 *    exceeds p->length more buffers are requested with rxi_AllocDataBuf
 *    (its result is deliberately ignored);
 *  - on failure the call is flagged RX_CALL_WAIT_PACKETS, a reference is
 *    held on it, call->lock is released, and the thread waits on
 *    rx_waitingForPackets_cv (RX_ENABLE_LOCKS) or in osi_rxSleep; after
 *    waking it drops rx_freePktQ_lock, retakes call->lock, releases the
 *    reference and clears the flag.
 * NOTE(review): the length adjustments after the mud and delta
 * comparisons, the loop exit and the return are not visible here --
 * confirm. */
691 rxi_AllocSendPacket(register struct rx_call *call, int want)
693 register struct rx_packet *p = (struct rx_packet *)0;
695 register unsigned delta;
698 mud = call->MTU - RX_HEADER_SIZE;
700 rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
701 rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
703 while (!(call->error)) {
704 MUTEX_ENTER(&rx_freePktQ_lock);
705 /* if an error occurred, or we get the packet we want, we're done */
706 if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
707 MUTEX_EXIT(&rx_freePktQ_lock);
710 want = MIN(want, mud);
712 if ((unsigned)want > p->length)
713 (void)rxi_AllocDataBuf(p, (want - p->length),
714 RX_PACKET_CLASS_SEND_CBUF);
716 if ((unsigned)p->length > mud)
719 if (delta >= p->length) {
728 /* no error occurred, and we didn't get a packet, so we sleep.
729 * At this point, we assume that packets will be returned
730 * sooner or later, as packets are acknowledged, and so we
733 call->flags |= RX_CALL_WAIT_PACKETS;
734 CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
735 MUTEX_EXIT(&call->lock);
736 rx_waitingForPackets = 1;
738 #ifdef RX_ENABLE_LOCKS
739 CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
741 osi_rxSleep(&rx_waitingForPackets);
743 MUTEX_EXIT(&rx_freePktQ_lock);
744 MUTEX_ENTER(&call->lock);
745 CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
746 call->flags &= ~RX_CALL_WAIT_PACKETS;
755 /* count the number of used FDs */
/* CountFDs: probe descriptors 0 .. amax - 1 with fstat.
 * NOTE(review): the counting and the return are on lines not visible here;
 * presumably descriptors for which fstat succeeds are counted -- confirm.
 * The macro on the last line is an alternative definition that simply
 * yields amax; the preprocessor condition selecting it is not visible. */
757 CountFDs(register int amax)
760 register int i, code;
764 for (i = 0; i < amax; i++) {
765 code = fstat(i, &tstat);
774 #define CountFDs(amax) amax
778 #if !defined(KERNEL) || defined(UKERNEL)
780 /* This function reads a single packet from the interface into the
781 * supplied packet buffer (*p). Return 0 if the packet is bogus. The
782 * (host,port) of the sender are stored in the supplied variables, and
783 * the data length of the packet is stored in the packet structure.
784 * The header is decoded. */
/* rxi_ReadPacket: receive one datagram from socket into packet p.
 * Steps visible here:
 *  1. rx_computelen gives the current data capacity of p, which is stored
 *     with rx_SetDataSize; RX_HEADER_SIZE is added to get the whole-packet
 *     size, and rx_maxJumboRecvSize is sampled once into rlen.
 *  2. More buffers may be added with rxi_AllocDataBuf (the condition is not
 *     visible).
 *  3. The last vector is temporarily lengthened by RX_EXTRABUFFERSIZE so
 *     that a datagram longer than expected shows up as nbytes > tlen; the
 *     original length is restored right after rxi_Recvmsg.
 *  4. p->length becomes nbytes - RX_HEADER_SIZE.  The packet is treated as
 *     bogus when nbytes > tlen or p->length has bit 0x8000 set; depending
 *     on the case the pool is grown, noPacketOnRead is counted (negative
 *     nbytes with EWOULDBLOCK), or bogusPacketOnRead and bogusHost are
 *     recorded.
 *  5. Otherwise the header is decoded, *host and *port receive the sender
 *     address and port exactly as found in the sockaddr_in (no byte-order
 *     conversion is applied here), the per-type read counter is bumped, and
 *     if rxi_FindPeer returns an existing peer its bytesReceived is
 *     increased by p->length under peer_lock.  Finally unused continuation
 *     buffers are released with rxi_TrimDataBufs.
 * NOTE(review): the return statements are not visible here.  The buffer
 * class passed to rxi_AllocDataBuf is RX_PACKET_CLASS_SEND_CBUF although
 * this is a receive path -- confirm that is intended. */
786 rxi_ReadPacket(int socket, register struct rx_packet *p, afs_uint32 * host,
789 struct sockaddr_in from;
792 register afs_int32 tlen, savelen;
794 rx_computelen(p, tlen);
795 rx_SetDataSize(p, tlen); /* this is the size of the user data area */
797 tlen += RX_HEADER_SIZE; /* now this is the size of the entire packet */
798 rlen = rx_maxJumboRecvSize; /* this is what I am advertising. Only check
799 * it once in order to avoid races. */
802 tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
810 /* Extend the last iovec for padding, it's just to make sure that the
811 * read doesn't return more data than we expect, and is done to get around
812 * our problems caused by the lack of a length field in the rx header.
813 * Use the extra buffer that follows the localdata in each packet
815 savelen = p->wirevec[p->niovecs - 1].iov_len;
816 p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
818 memset((char *)&msg, 0, sizeof(msg));
819 msg.msg_name = (char *)&from;
820 msg.msg_namelen = sizeof(struct sockaddr_in);
821 msg.msg_iov = p->wirevec;
822 msg.msg_iovlen = p->niovecs;
823 nbytes = rxi_Recvmsg(socket, &msg, 0);
825 /* restore the vec to its correct state */
826 p->wirevec[p->niovecs - 1].iov_len = savelen;
828 p->length = (nbytes - RX_HEADER_SIZE);
829 if ((nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
831 rxi_MorePackets(rx_initSendWindow);
832 else if (nbytes < 0 && errno == EWOULDBLOCK) {
833 MUTEX_ENTER(&rx_stats_mutex);
834 rx_stats.noPacketOnRead++;
835 MUTEX_EXIT(&rx_stats_mutex);
837 MUTEX_ENTER(&rx_stats_mutex);
838 rx_stats.bogusPacketOnRead++;
839 rx_stats.bogusHost = from.sin_addr.s_addr;
840 MUTEX_EXIT(&rx_stats_mutex);
841 dpf(("B: bogus packet from [%x,%d] nb=%d", from.sin_addr.s_addr,
842 from.sin_port, nbytes));
846 /* Extract packet header. */
847 rxi_DecodePacketHeader(p);
849 *host = from.sin_addr.s_addr;
850 *port = from.sin_port;
851 if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
852 struct rx_peer *peer;
853 MUTEX_ENTER(&rx_stats_mutex);
854 rx_stats.packetsRead[p->header.type - 1]++;
855 MUTEX_EXIT(&rx_stats_mutex);
857 * Try to look up this peer structure. If it doesn't exist,
858 * don't create a new one -
859 * we don't keep count of the bytes sent/received if a peer
860 * structure doesn't already exist.
862 * The peer/connection cleanup code assumes that there is 1 peer
863 * per connection. If we actually created a peer structure here
864 * and this packet was an rxdebug packet, the peer structure would
865 * never be cleaned up.
867 peer = rxi_FindPeer(*host, *port, 0, 0);
869 MUTEX_ENTER(&peer->peer_lock);
870 hadd32(peer->bytesReceived, p->length);
871 MUTEX_EXIT(&peer->peer_lock);
875 /* Free any empty packet buffers at the end of this packet */
876 rxi_TrimDataBufs(p, 1);
882 #endif /* !KERNEL || UKERNEL */
884 /* This function splits off the first packet in a jumbo packet.
885 * As of AFS 3.5, jumbograms contain more than one fixed size
886 * packet, and the RX_JUMBO_PACKET flag is set in all but the
887 * last packet header. All packets (except the last) are padded to
888 * fall on RX_CBUFFERSIZE boundaries.
889 * HACK: We store the length of the first n-1 packets in the
890 * last two pad bytes. */
/* rxi_SplitJumboPacket: detach the first packet of jumbogram p and build
 * the packet that follows it.
 * The follow-on packet np is the packet that owns the buffer behind
 * wirevec[2] of p (found with RX_CBUF_TO_PACKET).  np gets its own
 * wirevec[0] and wirevec[1], with wirevec[1] covering RX_JUMBOBUFFERSIZE
 * bytes, and inherits the remaining continuation vectors of p.
 * np->length is p->length less RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE, and
 * p->length is cut to RX_JUMBOBUFFERSIZE.
 * The abbreviated header sits RX_JUMBOBUFFERSIZE bytes into wirevec[1] of
 * p; it is read as one 32-bit word in network order, whose top 8 bits
 * become the flags and whose low 16 bits become the checksum.  The header
 * of np is a copy of the header of p with serial and seq advanced by one,
 * flags taken from the abbreviated header and spare set to its checksum.
 * NOTE(review): the results returned for the two bogus cases and for
 * success, and the niovecs update on p, are not visible here -- confirm. */
893 rxi_SplitJumboPacket(register struct rx_packet *p, afs_int32 host, short port,
896 struct rx_packet *np;
897 struct rx_jumboHeader *jp;
903 /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
904 * bytes in length. All but the first packet are preceded by
905 * an abbreviated four byte header. The length of the last packet
906 * is calculated from the size of the jumbogram. */
907 length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
909 if ((int)p->length < length) {
910 dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
913 niov = p->niovecs - 2;
915 dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
918 iov = &p->wirevec[2];
919 np = RX_CBUF_TO_PACKET(iov->iov_base, p);
921 /* Get a pointer to the abbreviated packet header */
922 jp = (struct rx_jumboHeader *)
923 ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
925 /* Set up the iovecs for the next packet */
926 np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
927 np->wirevec[0].iov_len = sizeof(struct rx_header);
928 np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
929 np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
930 np->niovecs = niov + 1;
931 for (i = 2, iov++; i <= niov; i++, iov++) {
932 np->wirevec[i] = *iov;
934 np->length = p->length - length;
935 p->length = RX_JUMBOBUFFERSIZE;
938 /* Convert the jumbo packet header to host byte order */
939 temp = ntohl(*(afs_uint32 *) jp);
940 jp->flags = (u_char) (temp >> 24);
941 jp->cksum = (u_short) (temp);
943 /* Fill in the packet header */
944 np->header = p->header;
945 np->header.serial = p->header.serial + 1;
946 np->header.seq = p->header.seq + 1;
947 np->header.flags = jp->flags;
948 np->header.spare = jp->cksum;
954 /* Send a udp datagram */
/* osi_NetSend: send the nvecs vectors in dvec as one message on socket
 * using rxi_Sendmsg, after zeroing a message header and setting its vector
 * count and a name length of sizeof(struct sockaddr_in).
 * NOTE(review): the assignments of msg_iov and msg_name, any use of length
 * and istack, and the return value are not visible here -- confirm. */
956 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
957 int length, int istack)
961 memset(&msg, 0, sizeof(msg));
963 msg.msg_iovlen = nvecs;
965 msg.msg_namelen = sizeof(struct sockaddr_in);
967 rxi_Sendmsg(socket, &msg, 0);
971 #elif !defined(UKERNEL)
973 * message receipt is done in rxk_input or rx_put.
976 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
978 * Copy an mblock to the contiguous area pointed to by cp.
979 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
980 * but it doesn't really.
981 * Returns the number of bytes not transferred.
982 * The message is NOT changed.
/* cpytoc: walk the b_cont chain starting at mp and copy data into cp,
 * taking MIN(len, b_wptr - b_rptr) bytes from each block, while len is
 * positive.  Blocks whose type is not M_DATA take a separate path that is
 * not visible here.
 * NOTE(review): off is not used on any visible line, and the updates of cp
 * and len plus the return are not visible -- confirm. */
985 cpytoc(mblk_t * mp, register int off, register int len, register char *cp)
989 for (; mp && len > 0; mp = mp->b_cont) {
990 if (mp->b_datap->db_type != M_DATA) {
993 n = MIN(len, (mp->b_wptr - mp->b_rptr));
994 memcpy(cp, (char *)mp->b_rptr, n);
1002 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1003 * but it doesn't really.
1004 * This sucks, anyway, do it like m_cpy.... below
/* cpytoiovec: walk the b_cont chain starting at mp and scatter its data
 * into the vectors of iovs.  For each block up to
 * MIN(len, b_wptr - b_rptr) bytes are considered, and pieces of m bytes are
 * copied to offset o of the current vector; t is loaded with the length of
 * that vector.
 * NOTE(review): the index i starts at -1 and the line that advances it
 * before the first use of iovs[i] is not visible; off is not used on any
 * visible line; the return is not visible -- confirm. */
1007 cpytoiovec(mblk_t * mp, int off, int len, register struct iovec *iovs,
1010 register int m, n, o, t, i;
1012 for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1013 if (mp->b_datap->db_type != M_DATA) {
1016 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1022 t = iovs[i].iov_len;
1025 memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1035 #define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
1036 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1038 #if !defined(AFS_LINUX20_ENV)
/* m_cpytoiovec: copy len bytes, starting off bytes into the mbuf chain m,
 * into the vectors of iovs.
 * Panics when m or iovs is null or when off or len is negative.  Mbufs
 * whose m_len does not exceed the remaining off are passed over.  Copying
 * then proceeds in pieces of t = MIN(l1, MIN(l2, len)) bytes, where l1 is
 * what is left in the current mbuf (source p1) and l2 is what is left in
 * the current vector (destination p2); p1 and l1 are reloaded from the next
 * mbuf and p2 and l2 from the next vector as each runs out.
 * NOTE(review): the loop structure, the copy call, the check against niovs
 * and the return are not visible here -- confirm. */
1040 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1043 unsigned int l1, l2, i, t;
1045 if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1046 osi_Panic("m_cpytoiovec"); /* MTUXXX probably don't need this check */
1049 if (m->m_len <= off) {
1059 p1 = mtod(m, caddr_t) + off;
1060 l1 = m->m_len - off;
1062 p2 = iovs[0].iov_base;
1063 l2 = iovs[0].iov_len;
1066 t = MIN(l1, MIN(l2, (unsigned int)len));
1077 p1 = mtod(m, caddr_t);
1083 p2 = iovs[i].iov_base;
1084 l2 = iovs[i].iov_len;
1092 #endif /* AFS_SUN5_ENV */
1094 #if !defined(AFS_LINUX20_ENV)
/* rx_mb_to_packet: copy data_len bytes, starting hdr_len bytes into the
 * buffer chain amb, into the vectors of packet phandle by calling
 * m_cpytoiovec.  Old-style (K&R) parameter declarations are used.
 * NOTE(review): the declared type of amb depends on the platform test
 * below and is not visible; the use of the free argument, the last
 * argument of the m_cpytoiovec call and the return are not visible either
 * -- confirm. */
1096 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1097 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1103 struct rx_packet *phandle;
1104 int hdr_len, data_len;
1109 m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1116 #endif /*KERNEL && !UKERNEL */
1119 /* send a response to a debug packet */
/* rxi_ReceiveDebugPacket: answer an Rx debug request by rewriting the
 * request packet ap in place and sending it back with rxi_SendDebugPacket
 * to (ahost, aport) on asocket.
 * Only packets carrying RX_CLIENT_INITIATED are answered; that flag is
 * cleared and the header re-encoded first.  The request body is read as a
 * struct rx_debugIn, whose type and index are converted with ntohl, and the
 * type selects the reply:
 *   RX_DEBUGI_GETSTATS     a struct rx_debugStats filled from the global
 *                          counters (idle threads counted under
 *                          rx_serverPool_lock);
 *   RX_DEBUGI_GETALLCONN,
 *   RX_DEBUGI_GETCONN      a struct rx_debugConn for the connection reached
 *                          by counting tin.index down over the hash table
 *                          (GETCONN skips connections for which
 *                          rxi_IsConnInteresting is false); a cid of
 *                          0xffffffff marks the end of the list;
 *   RX_DEBUGI_GETPEER      a struct rx_debugPeer for the peer reached by
 *                          counting tin.index down; a host of 0xffffffff
 *                          marks the end of the list;
 *   RX_DEBUGI_RXSTATS      rx_stats, written word by word with htonl under
 *                          rx_stats_mutex;
 *   anything else          the rx_debugIn echoed back with type and index
 *                          both set to RX_DEBUGI_BADTYPE.
 * Before writing a reply each case computes how many bytes the packet is
 * short of the reply size and may grow it with rxi_AllocDataBuf.
 * NOTE(review): the switch statement itself, the conditions around the
 * rxi_AllocDataBuf calls and their failure handling, the non-kernel
 * yielding between hash buckets, and the return values are on lines not
 * visible here -- confirm. */
1122 rxi_ReceiveDebugPacket(register struct rx_packet *ap, osi_socket asocket,
1123 afs_int32 ahost, short aport, int istack)
1125 struct rx_debugIn tin;
1127 struct rx_serverQueueEntry *np, *nqe;
1130 * Only respond to client-initiated Rx debug packets,
1131 * and clear the client flag in the response.
1133 if (ap->header.flags & RX_CLIENT_INITIATED) {
1134 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1135 rxi_EncodePacketHeader(ap);
1140 rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1141 /* all done with packet, now set length to the truth, so we can
1142 * reuse this packet */
1143 rx_computelen(ap, ap->length);
1145 tin.type = ntohl(tin.type);
1146 tin.index = ntohl(tin.index);
1148 case RX_DEBUGI_GETSTATS:{
1149 struct rx_debugStats tstat;
1151 /* get basic stats */
1152 memset((char *)&tstat, 0, sizeof(tstat)); /* make sure spares are zero */
1153 tstat.version = RX_DEBUGI_VERSION;
1154 #ifndef RX_ENABLE_LOCKS
1155 tstat.waitingForPackets = rx_waitingForPackets;
1157 MUTEX_ENTER(&rx_serverPool_lock);
1158 tstat.nFreePackets = htonl(rx_nFreePackets);
1159 tstat.callsExecuted = htonl(rxi_nCalls);
1160 tstat.packetReclaims = htonl(rx_packetReclaims);
1161 tstat.usedFDs = CountFDs(64);
1162 tstat.nWaiting = htonl(rx_nWaiting);
1163 tstat.nWaited = htonl(rx_nWaited);
1164 queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1166 MUTEX_EXIT(&rx_serverPool_lock);
1167 tstat.idleThreads = htonl(tstat.idleThreads);
1168 tl = sizeof(struct rx_debugStats) - ap->length;
1170 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1173 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1175 ap->length = sizeof(struct rx_debugStats);
1176 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1177 rx_computelen(ap, ap->length);
1182 case RX_DEBUGI_GETALLCONN:
1183 case RX_DEBUGI_GETCONN:{
1185 register struct rx_connection *tc;
1186 struct rx_call *tcall;
1187 struct rx_debugConn tconn;
1188 int all = (tin.type == RX_DEBUGI_GETALLCONN);
1191 tl = sizeof(struct rx_debugConn) - ap->length;
1193 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1197 memset((char *)&tconn, 0, sizeof(tconn)); /* make sure spares are zero */
1198 /* get N'th (maybe) "interesting" connection info */
1199 for (i = 0; i < rx_hashTableSize; i++) {
1200 #if !defined(KERNEL)
1201 /* the time complexity of the algorithm used here
1202 * exponentially increases with the number of connections.
1204 #ifdef AFS_PTHREAD_ENV
1210 MUTEX_ENTER(&rx_connHashTable_lock);
1211 /* We might be slightly out of step since we are not
1212 * locking each call, but this is only debugging output.
1214 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1215 if ((all || rxi_IsConnInteresting(tc))
1216 && tin.index-- <= 0) {
1217 tconn.host = tc->peer->host;
1218 tconn.port = tc->peer->port;
1219 tconn.cid = htonl(tc->cid);
1220 tconn.epoch = htonl(tc->epoch);
1221 tconn.serial = htonl(tc->serial);
1222 for (j = 0; j < RX_MAXCALLS; j++) {
1223 tconn.callNumber[j] = htonl(tc->callNumber[j]);
1224 if ((tcall = tc->call[j])) {
1225 tconn.callState[j] = tcall->state;
1226 tconn.callMode[j] = tcall->mode;
1227 tconn.callFlags[j] = tcall->flags;
1228 if (queue_IsNotEmpty(&tcall->rq))
1229 tconn.callOther[j] |= RX_OTHER_IN;
1230 if (queue_IsNotEmpty(&tcall->tq))
1231 tconn.callOther[j] |= RX_OTHER_OUT;
1233 tconn.callState[j] = RX_STATE_NOTINIT;
1236 tconn.natMTU = htonl(tc->peer->natMTU);
1237 tconn.error = htonl(tc->error);
1238 tconn.flags = tc->flags;
1239 tconn.type = tc->type;
1240 tconn.securityIndex = tc->securityIndex;
1241 if (tc->securityObject) {
1242 RXS_GetStats(tc->securityObject, tc,
1244 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1245 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1248 DOHTONL(packetsReceived);
1249 DOHTONL(packetsSent);
1250 DOHTONL(bytesReceived);
1254 sizeof(tconn.secStats.spares) /
1259 sizeof(tconn.secStats.sparel) /
1260 sizeof(afs_int32); i++)
1264 MUTEX_EXIT(&rx_connHashTable_lock);
1265 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1268 ap->length = sizeof(struct rx_debugConn);
1269 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1275 MUTEX_EXIT(&rx_connHashTable_lock);
1277 /* if we make it here, there are no interesting packets */
1278 tconn.cid = htonl(0xffffffff); /* means end */
1279 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1282 ap->length = sizeof(struct rx_debugConn);
1283 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1289 * Pass back all the peer structures we have available
1292 case RX_DEBUGI_GETPEER:{
1294 register struct rx_peer *tp;
1295 struct rx_debugPeer tpeer;
1298 tl = sizeof(struct rx_debugPeer) - ap->length;
1300 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1304 memset((char *)&tpeer, 0, sizeof(tpeer));
1305 for (i = 0; i < rx_hashTableSize; i++) {
1306 #if !defined(KERNEL)
1307 /* the time complexity of the algorithm used here
1308 * exponentially increases with the number of peers.
1310 * Yielding after processing each hash table entry
1311 * and dropping rx_peerHashTable_lock.
1312 * also increases the risk that we will miss a new
1313 * entry - but we are willing to live with this
1314 * limitation since this is meant for debugging only
1316 #ifdef AFS_PTHREAD_ENV
1322 MUTEX_ENTER(&rx_peerHashTable_lock);
1323 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1324 if (tin.index-- <= 0) {
1325 tpeer.host = tp->host;
1326 tpeer.port = tp->port;
1327 tpeer.ifMTU = htons(tp->ifMTU);
1328 tpeer.idleWhen = htonl(tp->idleWhen);
1329 tpeer.refCount = htons(tp->refCount);
1330 tpeer.burstSize = tp->burstSize;
1331 tpeer.burst = tp->burst;
1332 tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1333 tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1334 tpeer.rtt = htonl(tp->rtt);
1335 tpeer.rtt_dev = htonl(tp->rtt_dev);
1336 tpeer.timeout.sec = htonl(tp->timeout.sec);
1337 tpeer.timeout.usec = htonl(tp->timeout.usec);
1338 tpeer.nSent = htonl(tp->nSent);
1339 tpeer.reSends = htonl(tp->reSends);
1340 tpeer.inPacketSkew = htonl(tp->inPacketSkew);
1341 tpeer.outPacketSkew = htonl(tp->outPacketSkew);
1342 tpeer.rateFlag = htonl(tp->rateFlag);
1343 tpeer.natMTU = htons(tp->natMTU);
1344 tpeer.maxMTU = htons(tp->maxMTU);
1345 tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1346 tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1347 tpeer.MTU = htons(tp->MTU);
1348 tpeer.cwind = htons(tp->cwind);
1349 tpeer.nDgramPackets = htons(tp->nDgramPackets);
1350 tpeer.congestSeq = htons(tp->congestSeq);
1351 tpeer.bytesSent.high = htonl(tp->bytesSent.high);
1352 tpeer.bytesSent.low = htonl(tp->bytesSent.low);
1353 tpeer.bytesReceived.high =
1354 htonl(tp->bytesReceived.high);
1355 tpeer.bytesReceived.low =
1356 htonl(tp->bytesReceived.low);
1358 MUTEX_EXIT(&rx_peerHashTable_lock);
1359 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1362 ap->length = sizeof(struct rx_debugPeer);
1363 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1369 MUTEX_EXIT(&rx_peerHashTable_lock);
1371 /* if we make it here, there are no interesting packets */
1372 tpeer.host = htonl(0xffffffff); /* means end */
1373 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1376 ap->length = sizeof(struct rx_debugPeer);
1377 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1382 case RX_DEBUGI_RXSTATS:{
1386 tl = sizeof(rx_stats) - ap->length;
1388 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1392 /* Since it's all int32s, convert to network order with a loop. */
1393 MUTEX_ENTER(&rx_stats_mutex);
1394 s = (afs_int32 *) & rx_stats;
1395 for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
1396 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
1399 ap->length = sizeof(rx_stats);
1400 MUTEX_EXIT(&rx_stats_mutex);
1401 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1407 /* error response packet */
1408 tin.type = htonl(RX_DEBUGI_BADTYPE);
1409 tin.index = tin.type;
1410 rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1412 ap->length = sizeof(struct rx_debugIn);
1413 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/*
 * rxi_ReceiveVersionPacket -- reply to a version query packet.
 *
 * ap:      the received packet; it is reused for the reply.
 * asocket, ahost, aport, istack: handed unchanged to rxi_SendDebugPacket().
 *
 * When ap has RX_CLIENT_INITIATED set, the flag is cleared, the header is
 * re-encoded, and a zero-padded copy of cml_version_number (skipping its
 * first 4 characters) is written at offset 0 of the packet data before the
 * packet is sent back.  Packets without the flag are not answered.
 */
1421 rxi_ReceiveVersionPacket(register struct rx_packet *ap, osi_socket asocket,
1422 afs_int32 ahost, short aport, int istack)
1427 * Only respond to client-initiated version requests, and
1428 * clear that flag in the response.
1430 if (ap->header.flags & RX_CLIENT_INITIATED) {
1433 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1434 rxi_EncodePacketHeader(ap);
/* buf is zeroed first and at most sizeof(buf) - 1 bytes are copied, so the
 * copy always ends in a NUL byte. */
1435 memset(buf, 0, sizeof(buf));
1436 strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
/* NOTE(review): the payload size 65 is hard-coded; presumably it equals
 * sizeof(buf) - 1 -- confirm against the declaration of buf, which is not
 * visible here. */
1437 rx_packetwrite(ap, 0, 65, buf);
1440 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1448 /* send a debug packet back to the sender */
/*
 * rxi_SendDebugPacket -- transmit apacket to the given host and port.
 *
 * apacket: packet to send; apacket->length is the number of data bytes.
 * asocket: socket handed to osi_NetSend().
 * ahost, aport: destination address and port, stored into the sockaddr
 *     as given (NOTE(review): presumably already in network byte order --
 *     confirm with the callers).
 * istack:  passed through to osi_NetSend().
 *
 * The data iovecs (index 1 and up) are temporarily trimmed so that they
 * add up to apacket->length; the trimmed entry and the iovec count are put
 * back after the send.  The result of osi_NetSend() is deliberately
 * ignored.
 */
1450 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
1451 afs_int32 ahost, short aport, afs_int32 istack)
1453 struct sockaddr_in taddr;
1459 int waslocked = ISAFS_GLOCK();
/* NOTE(review): taddr is filled field by field; no zeroing of the whole
 * structure is visible here, unlike addr in rxi_SendPacket -- confirm that
 * this is intended. */
1462 taddr.sin_family = AF_INET;
1463 taddr.sin_port = aport;
1464 taddr.sin_addr.s_addr = ahost;
1465 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
1466 taddr.sin_len = sizeof(struct sockaddr_in);
1469 /* We need to trim the niovecs. */
1470 nbytes = apacket->length;
1471 for (i = 1; i < apacket->niovecs; i++) {
1472 if (nbytes <= apacket->wirevec[i].iov_len) {
1473 savelen = apacket->wirevec[i].iov_len;
1474 saven = apacket->niovecs;
1475 apacket->wirevec[i].iov_len = nbytes;
1476 apacket->niovecs = i + 1; /* so condition fails because i == niovecs */
1478 nbytes -= apacket->wirevec[i].iov_len;
1482 #ifdef RX_KERNEL_TRACE
1483 if (ICL_SETACTIVE(afs_iclSetp)) {
1486 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1487 "before osi_NetSend()");
1495 /* debug packets are not reliably delivered, hence the cast below. */
1496 (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
1497 apacket->length + RX_HEADER_SIZE, istack);
1499 #ifdef RX_KERNEL_TRACE
1500 if (ICL_SETACTIVE(afs_iclSetp)) {
1502 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1503 "after osi_NetSend()");
/* The trimming loop ended with i one past the trimmed entry, which is why
 * the saved length goes back into wirevec[i - 1]. */
1513 if (saven) { /* means we truncated the packet above. */
1514 apacket->wirevec[i - 1].iov_len = savelen;
1515 apacket->niovecs = saven;
1520 /* Send the packet to appropriate destination for the specified
1521 * call. The header is first encoded and placed in the packet.
/*
 * rxi_SendPacket -- stamp, encode and transmit a single packet.
 *
 * call:   call the packet belongs to; may be null (it is only used, after
 *         a null check, in the ENETUNREACH handling).
 * conn:   connection supplying the peer address, the serial counter and
 *         the socket choice.
 * p:      packet to send.
 * istack: passed through to osi_NetSend().
 *
 * In order: the destination sockaddr is built from conn->peer; the next
 * connection serial number is assigned under conn_data_lock; firstSerial
 * is recorded if it is still 0; the rx_almostSent hook, when set, is
 * called with the packet and address and can request a drop; the header
 * is encoded; the socket is rx_socket or conn->service->socket depending
 * on the RX_CLIENT_CONNECTION test; and p->wirevec is handed to
 * osi_NetSend() with a length of p->length + RX_HEADER_SIZE.
 *
 * A non-zero osi_NetSend() result increments rx_stats.netSendFailures and
 * sets retryTime to timeSent plus 10 + (backoff << 8) milliseconds.
 * rx_stats.packetsSent[type - 1] and peer->bytesSent are updated under
 * their respective locks.
 */
1524 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
1525 struct rx_packet *p, int istack)
1531 struct sockaddr_in addr;
1532 register struct rx_peer *peer = conn->peer;
1535 char deliveryType = 'S';
1537 /* The address we're sending the packet to */
1538 memset(&addr, 0, sizeof(addr));
1539 addr.sin_family = AF_INET;
1540 addr.sin_port = peer->port;
1541 addr.sin_addr.s_addr = peer->host;
1543 /* This stuff should be revamped, I think, so that most, if not
1544 * all, of the header stuff is always added here. We could
1545 * probably do away with the encode/decode routines. XXXXX */
1547 /* Stamp each packet with a unique serial number. The serial
1548 * number is maintained on a connection basis because some types
1549 * of security may be based on the serial number of the packet,
1550 * and security is handled on a per authenticated-connection
1552 /* Pre-increment, to guarantee no zero serial number; a zero
1553 * serial number means the packet was never sent. */
1554 MUTEX_ENTER(&conn->conn_data_lock);
1555 p->header.serial = ++conn->serial;
1556 MUTEX_EXIT(&conn->conn_data_lock);
1557 /* This is so we can adjust retransmit time-outs better in the face of
1558 * rapidly changing round-trip times. RTO estimation is not a la Karn.
1560 if (p->firstSerial == 0) {
1561 p->firstSerial = p->header.serial;
1564 /* If an output tracer function is defined, call it with the packet and
1565 * network address. Note this function may modify its arguments. */
1566 if (rx_almostSent) {
1567 int drop = (*rx_almostSent) (p, &addr);
1568 /* drop packet if return value is non-zero? */
1570 deliveryType = 'D'; /* Drop the packet */
1574 /* Get network byte order header */
1575 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
1576 * touch ALL the fields */
1578 /* Send the packet out on the same socket that related packets are being
1582 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
1585 /* Possibly drop this packet, for testing purposes */
1586 if ((deliveryType == 'D')
1587 || ((rx_intentionallyDroppedPacketsPer100 > 0)
1588 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
1589 deliveryType = 'D'; /* Drop the packet */
1591 deliveryType = 'S'; /* Send the packet */
1592 #endif /* RXDEBUG */
1594 /* Loop until the packet is sent. We'd prefer just to use a
1595 * blocking socket, but unfortunately the interface doesn't
1596 * allow us to have the socket block in send mode, and not
1597 * block in receive mode */
1600 waslocked = ISAFS_GLOCK();
1601 #ifdef RX_KERNEL_TRACE
1602 if (ICL_SETACTIVE(afs_iclSetp)) {
1605 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1606 "before osi_NetSend()");
1615 osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
1616 p->length + RX_HEADER_SIZE, istack)) != 0) {
1617 /* send failed, so let's hurry up the resend, eh? */
1618 MUTEX_ENTER(&rx_stats_mutex);
1619 rx_stats.netSendFailures++;
1620 MUTEX_EXIT(&rx_stats_mutex);
1621 p->retryTime = p->timeSent; /* resend it very soon */
1622 clock_Addmsec(&(p->retryTime),
1623 10 + (((afs_uint32) p->backoff) << 8));
1625 #if defined(KERNEL) && defined(AFS_LINUX20_ENV)
1626 /* Linux is nice -- it can tell us right away that we cannot
1627 * reach this recipient by returning an ENETUNREACH error
1628 * code. So, when this happens let's "down" the host NOW so
1629 * we don't sit around waiting for this host to timeout later.
1631 if (call && code == -ENETUNREACH)
1632 call->lastReceiveTime = 0;
1636 #ifdef RX_KERNEL_TRACE
1637 if (ICL_SETACTIVE(afs_iclSetp)) {
1639 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1640 "after osi_NetSend()");
1652 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (int)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
1654 MUTEX_ENTER(&rx_stats_mutex);
1655 rx_stats.packetsSent[p->header.type - 1]++;
1656 MUTEX_EXIT(&rx_stats_mutex);
1657 MUTEX_ENTER(&peer->peer_lock);
1658 hadd32(peer->bytesSent, p->length);
1659 MUTEX_EXIT(&peer->peer_lock);
1662 /* Send a list of packets to appropriate destination for the specified
1663 * connection. The headers are first encoded and placed in the packets.
/*
 * rxi_SendPacketList -- send a list of packets to the peer in one
 * osi_NetSend() call.
 *
 * call:   may be null; it is only used, after a null check, in the
 *         ENETUNREACH handling.
 * conn:   connection supplying the peer address, the serial counter and
 *         the socket choice.
 * list:   the packets to combine.
 * len:    number of packets in list; the function panics when len + 1
 *         exceeds RX_MAXIOVECS.
 * istack: NOTE(review): presumably forwarded to osi_NetSend() as in
 *         rxi_SendPacket -- the argument is not visible here.
 *
 * conn->serial is advanced by len under conn_data_lock and each packet is
 * stamped with the next value.  A local wirevec is filled: entry 0 is the
 * RX_HEADER_SIZE-byte wire header of list[0], entry i + 1 points at a
 * packet's localdata.  A packet flagged RX_JUMBO_PACKET must be exactly
 * RX_JUMBOBUFFERSIZE bytes long and contributes RX_JUMBOBUFFERSIZE +
 * RX_JUMBOHEADERSIZE bytes; the other branch contributes p->length
 * (presumably only the last packet takes it -- confirm).  A packet with
 * more than 2 iovecs causes a panic.
 *
 * On a non-zero osi_NetSend() result, rx_stats.netSendFailures is
 * incremented and every packet's retryTime is set to timeSent plus
 * 10 + (backoff << 8) milliseconds.
 */
1666 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
1667 struct rx_packet **list, int len, int istack)
1669 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1672 struct sockaddr_in addr;
1673 register struct rx_peer *peer = conn->peer;
1675 struct rx_packet *p = NULL;
1676 struct iovec wirevec[RX_MAXIOVECS];
1677 int i, length, code;
1680 struct rx_jumboHeader *jp;
1682 char deliveryType = 'S';
1684 /* The address we're sending the packet to */
/* NOTE(review): no zeroing of addr is visible here, whereas
 * rxi_SendPacket clears its sockaddr with memset() before filling it --
 * confirm whether the difference is intended. */
1685 addr.sin_family = AF_INET;
1686 addr.sin_port = peer->port;
1687 addr.sin_addr.s_addr = peer->host;
/* wirevec[0] carries the shared wire header, so len packets need len + 1
 * iovec slots. */
1689 if (len + 1 > RX_MAXIOVECS) {
1690 osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
1694 * Stamp the packets in this jumbogram with consecutive serial numbers
1696 MUTEX_ENTER(&conn->conn_data_lock);
1697 serial = conn->serial;
1698 conn->serial += len;
1699 MUTEX_EXIT(&conn->conn_data_lock);
1702 /* This stuff should be revamped, I think, so that most, if not
1703 * all, of the header stuff is always added here. We could
1704 * probably do away with the encode/decode routines. XXXXX */
1707 length = RX_HEADER_SIZE;
1708 wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
1709 wirevec[0].iov_len = RX_HEADER_SIZE;
1710 for (i = 0; i < len; i++) {
1713 /* The whole 3.5 jumbogram scheme relies on packets fitting
1714 * in a single packet buffer. */
1715 if (p->niovecs > 2) {
1716 osi_Panic("rxi_SendPacketList, niovecs > 2\n");
1719 /* Set the RX_JUMBO_PACKET flags in all but the last packets
1722 if (p->length != RX_JUMBOBUFFERSIZE) {
1723 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
1725 p->header.flags |= RX_JUMBO_PACKET;
1726 length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1727 wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1729 wirevec[i + 1].iov_len = p->length;
1730 length += p->length;
1732 wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
1734 /* Convert jumbo packet header to network byte order */
1735 temp = (afs_uint32) (p->header.flags) << 24;
1736 temp |= (afs_uint32) (p->header.spare);
1737 *(afs_uint32 *) jp = htonl(temp);
1739 jp = (struct rx_jumboHeader *)
1740 ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
1742 /* Stamp each packet with a unique serial number. The serial
1743 * number is maintained on a connection basis because some types
1744 * of security may be based on the serial number of the packet,
1745 * and security is handled on a per authenticated-connection
1747 /* Pre-increment, to guarantee no zero serial number; a zero
1748 * serial number means the packet was never sent. */
1749 p->header.serial = ++serial;
1750 /* This is so we can adjust retransmit time-outs better in the face of
1751 * rapidly changing round-trip times. RTO estimation is not a la Karn.
1753 if (p->firstSerial == 0) {
1754 p->firstSerial = p->header.serial;
1757 /* If an output tracer function is defined, call it with the packet and
1758 * network address. Note this function may modify its arguments. */
1759 if (rx_almostSent) {
1760 int drop = (*rx_almostSent) (p, &addr);
1761 /* drop packet if return value is non-zero? */
1763 deliveryType = 'D'; /* Drop the packet */
1767 /* Get network byte order header */
1768 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
1769 * touch ALL the fields */
1772 /* Send the packet out on the same socket that related packets are being
1776 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
1779 /* Possibly drop this packet, for testing purposes */
1780 if ((deliveryType == 'D')
1781 || ((rx_intentionallyDroppedPacketsPer100 > 0)
1782 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
1783 deliveryType = 'D'; /* Drop the packet */
1785 deliveryType = 'S'; /* Send the packet */
1786 #endif /* RXDEBUG */
1788 /* Loop until the packet is sent. We'd prefer just to use a
1789 * blocking socket, but unfortunately the interface doesn't
1790 * allow us to have the socket block in send mode, and not
1791 * block in receive mode */
1793 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1794 waslocked = ISAFS_GLOCK();
1795 if (!istack && waslocked)
1799 osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
1801 /* send failed, so let's hurry up the resend, eh? */
1802 MUTEX_ENTER(&rx_stats_mutex);
1803 rx_stats.netSendFailures++;
1804 MUTEX_EXIT(&rx_stats_mutex);
1805 for (i = 0; i < len; i++) {
1807 p->retryTime = p->timeSent; /* resend it very soon */
1808 clock_Addmsec(&(p->retryTime),
1809 10 + (((afs_uint32) p->backoff) << 8));
1811 #if defined(KERNEL) && defined(AFS_LINUX20_ENV)
1812 /* Linux is nice -- it can tell us right away that we cannot
1813 * reach this recipient by returning an ENETUNREACH error
1814 * code. So, when this happens let's "down" the host NOW so
1815 * we don't sit around waiting for this host to timeout later.
1817 if (call && code == -ENETUNREACH)
1818 call->lastReceiveTime = 0;
1821 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1822 if (!istack && waslocked)
1831 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (int)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
1834 MUTEX_ENTER(&rx_stats_mutex);
1835 rx_stats.packetsSent[p->header.type - 1]++;
1836 MUTEX_EXIT(&rx_stats_mutex);
1837 MUTEX_ENTER(&peer->peer_lock);
1839 hadd32(peer->bytesSent, p->length);
1840 MUTEX_EXIT(&peer->peer_lock);
1844 /* Send a "special" packet to the peer connection. If call is
1845 * specified, then the packet is directed to a specific call channel
1846 * associated with the connection, otherwise it is directed to the
1847 * connection only. Uses optionalPacket if it is supplied, rather than
1848 * allocating a new packet buffer. Nbytes is the length of the data
1849 * portion of the packet. If data is non-null, nbytes of data are
1850 * copied into the packet. Type is the type of the packet, as defined
1851 * in rx.h. Bug: there's a lot of duplication between this and other
1852 * routines. This needs to be cleaned up. */
/*
 * rxi_SendSpecial -- fill in and send a packet of the given type.
 *
 * call:           call the packet is directed to, or null for a
 *                 connection-level packet.
 * conn:           connection supplying serviceId, securityIndex, cid and
 *                 epoch for the header.
 * optionalPacket: caller-supplied packet buffer, or null.
 * type:           value stored in the header's type field.
 * data, nbytes:   payload written at offset 0 of the packet.
 * istack:         passed through to rxi_Send() / rxi_SendPacket().
 *
 * The header flags are reset to 0 and RX_CLIENT_INITIATED is set for
 * client connections.  The data iovecs are trimmed to nbytes for the send
 * and the trimmed iov_len is put back afterwards.
 *
 * Returns optionalPacket, so the result is null when the caller supplied
 * no packet.
 */
1854 rxi_SendSpecial(register struct rx_call *call,
1855 register struct rx_connection *conn,
1856 struct rx_packet *optionalPacket, int type, char *data,
1857 int nbytes, int istack)
1859 /* Some of the following stuff should be common code for all
1860 * packet sends (it's repeated elsewhere) */
1861 register struct rx_packet *p;
1863 int savelen = 0, saven = 0;
1864 int channel, callNumber;
1866 channel = call->channel;
1867 callNumber = *call->callNumber;
1868 /* BUSY packets refer to the next call on this connection */
1869 if (type == RX_PACKET_TYPE_BUSY) {
1878 p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
/* NOTE(review): the condition guarding this panic is not visible here;
 * presumably it fires only when rxi_AllocPacket() returned null --
 * confirm. */
1880 osi_Panic("rxi_SendSpecial failure");
1887 p->header.serviceId = conn->serviceId;
1888 p->header.securityIndex = conn->securityIndex;
1889 p->header.cid = (conn->cid | channel);
1890 p->header.callNumber = callNumber;
1892 p->header.epoch = conn->epoch;
1893 p->header.type = type;
1894 p->header.flags = 0;
1895 if (conn->type == RX_CLIENT_CONNECTION)
1896 p->header.flags |= RX_CLIENT_INITIATED;
1898 rx_packetwrite(p, 0, nbytes, data);
1900 for (i = 1; i < p->niovecs; i++) {
1901 if (nbytes <= p->wirevec[i].iov_len) {
1902 savelen = p->wirevec[i].iov_len;
1904 p->wirevec[i].iov_len = nbytes;
1905 p->niovecs = i + 1; /* so condition fails because i == niovecs */
1907 nbytes -= p->wirevec[i].iov_len;
1911 rxi_Send(call, p, istack);
1913 rxi_SendPacket((struct rx_call *)0, conn, p, istack);
1914 if (saven) { /* means we truncated the packet above. We probably don't */
1915 /* really need to do this, but it seems safer this way, given that */
1916 /* sneaky optionalPacket... */
1917 p->wirevec[i - 1].iov_len = savelen;
1920 if (!optionalPacket)
1922 return optionalPacket;
1926 /* Encode the packet's header (from the struct header in the packet to
1927 * the net byte order representation in the wire representation of the
1928 * packet, which is what is actually sent out on the wire) */
1930 rxi_EncodePacketHeader(register struct rx_packet *p)
1932 register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
1934 memset((char *)buf, 0, RX_HEADER_SIZE);
1935 *buf++ = htonl(p->header.epoch);
1936 *buf++ = htonl(p->header.cid);
1937 *buf++ = htonl(p->header.callNumber);
1938 *buf++ = htonl(p->header.seq);
1939 *buf++ = htonl(p->header.serial);
1940 *buf++ = htonl((((afs_uint32) p->header.type) << 24)
1941 | (((afs_uint32) p->header.flags) << 16)
1942 | (p->header.userStatus << 8) | p->header.securityIndex);
1943 /* Note: top 16 bits of this next word were reserved */
1944 *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
1947 /* Decode the packet's header (from net byte order to a struct header) */
1949 rxi_DecodePacketHeader(register struct rx_packet *p)
1951 register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
1954 p->header.epoch = ntohl(*buf);
1956 p->header.cid = ntohl(*buf);
1958 p->header.callNumber = ntohl(*buf);
1960 p->header.seq = ntohl(*buf);
1962 p->header.serial = ntohl(*buf);
1968 /* C will truncate byte fields to bytes for me */
1969 p->header.type = temp >> 24;
1970 p->header.flags = temp >> 16;
1971 p->header.userStatus = temp >> 8;
1972 p->header.securityIndex = temp >> 0;
1977 p->header.serviceId = (temp & 0xffff);
1978 p->header.spare = temp >> 16;
1979 /* Note: top 16 bits of this last word are the security checksum */
/*
 * rxi_PrepareSendPacket -- initialise a data packet before its first
 * transmission.
 *
 * call: owning call; supplies the connection, the channel, the call
 *       number and the sequence number (call->tnext is post-incremented).
 * p:    packet to prepare.
 * last: NOTE(review): presumably non-zero for the final packet of the
 *       call, gating the RX_LAST_PACKET flag -- the test itself is not
 *       visible here.
 *
 * Clears RX_PKTFLAG_ACKED, fills the header as an RX_PACKET_TYPE_DATA
 * packet (RX_CLIENT_INITIATED is set for client connections), zeroes
 * retryTime, firstSent and the serial number, then walks the data iovecs
 * against p->length plus the connection's securityHeaderSize, releases
 * wirevec entries from index MAX(2, i) upward with rxi_freeCBuf(), and
 * finally calls RXS_PreparePacket().
 */
1983 rxi_PrepareSendPacket(register struct rx_call *call,
1984 register struct rx_packet *p, register int last)
1986 register struct rx_connection *conn = call->conn;
1988 ssize_t len; /* len must be a signed type; it can go negative */
1990 p->flags &= ~RX_PKTFLAG_ACKED;
1991 p->header.cid = (conn->cid | call->channel);
1992 p->header.serviceId = conn->serviceId;
1993 p->header.securityIndex = conn->securityIndex;
1994 p->header.callNumber = *call->callNumber;
1995 p->header.seq = call->tnext++;
1996 p->header.epoch = conn->epoch;
1997 p->header.type = RX_PACKET_TYPE_DATA;
1998 p->header.flags = 0;
1999 p->header.spare = 0;
2000 if (conn->type == RX_CLIENT_CONNECTION)
2001 p->header.flags |= RX_CLIENT_INITIATED;
2004 p->header.flags |= RX_LAST_PACKET;
2006 clock_Zero(&p->retryTime); /* Never yet transmitted */
2007 clock_Zero(&p->firstSent); /* Never yet transmitted */
2008 p->header.serial = 0; /* Another way of saying never transmitted... */
2011 /* Now that we're sure this is the last data on the call, make sure
2012 * that the "length" and the sum of the iov_lens matches. */
2013 len = p->length + call->conn->securityHeaderSize;
2015 for (i = 1; i < p->niovecs && len > 0; i++) {
2016 len -= p->wirevec[i].iov_len;
2019 osi_Panic("PrepareSendPacket 1\n"); /* MTUXXX */
2021 /* Free any extra elements in the wirevec */
2022 for (j = MAX(2, i); j < p->niovecs; j++) {
2023 rxi_freeCBuf(RX_CBUF_TO_PACKET(p->wirevec[j].iov_base, p));
/* Adding len (presumably <= 0 at this point) shortens the last iovec the
 * loop consumed, so that the iov_len sum matches the length computed
 * above -- NOTE(review): confirm. */
2026 p->wirevec[i - 1].iov_len += len;
2028 RXS_PreparePacket(conn->securityObject, call, p);
2031 /* Given an interface MTU size, calculate an adjusted MTU size that
2032 * will make efficient use of the RX buffers when the peer is sending
2033 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
2035 rxi_AdjustIfMTU(int mtu)
2040 adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2041 if (mtu <= adjMTU) {
2048 frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2049 return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2052 /* Given an interface MTU size, and the peer's advertised max receive
2053 * size, calculate an adjisted maxMTU size that makes efficient use
2054 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2056 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2058 int maxMTU = mtu * rxi_nSendFrags;
2059 maxMTU = MIN(maxMTU, peerMaxMTU);
2060 return rxi_AdjustIfMTU(maxMTU);
2063 /* Given a packet size, figure out how many datagram packet will fit.
2064 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2065 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2066 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2068 rxi_AdjustDgramPackets(int frags, int mtu)
2071 if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2074 maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2075 maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2076 /* subtract the size of the first and last packets */
2077 maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2081 return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));