/*
 * Copyright 2000, International Business Machines Corporation and others.
 * This software has been released under the terms of the IBM Public
 * License.  For details, see the LICENSE file in the top-level source
 * directory or online at http://www.openafs.org/dl/license10.html
 */

#include <afsconfig.h>
#include "afs/param.h"
#include <afs/param.h>

#include "afs/sysincludes.h"
#include "afsincludes.h"
#include "rx/rx_kcommon.h"
#include "rx/rx_clock.h"
#include "rx/rx_queue.h"
#include "rx/rx_packet.h"
#else /* defined(UKERNEL) */
#ifdef RX_KERNEL_TRACE
#include "../rx/rx_kcommon.h"
#ifndef AFS_LINUX20_ENV
#if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
#include "afs/sysincludes.h"
#if defined(AFS_OBSD_ENV)
#if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
#if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
#include "sys/mount.h"		/* it gets pulled in by something later anyway */
#include "netinet/in.h"
#include "afs/afs_osi.h"
#include "rx_kmutex.h"
#include "rx/rx_clock.h"
#include "rx/rx_queue.h"
#include <sys/sysmacros.h>
#include "rx/rx_packet.h"
#endif /* defined(UKERNEL) */
#include "rx/rx_globals.h"
#include "sys/types.h"
#if defined(AFS_NT40_ENV) || defined(AFS_DJGPP_ENV)
#define EWOULDBLOCK WSAEWOULDBLOCK
#include <sys/socket.h>
#include <netinet/in.h>
#endif /* AFS_NT40_ENV */
#include "rx_xmit_nt.h"
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/sysmacros.h>
#include "rx_packet.h"
#include "rx_globals.h"
#ifdef RX_LOCKS_DB
/* rxdb_fileID is used to identify the lock location, along with line#. */
static int rxdb_fileID = RXDB_FILE_RX_PACKET;
#endif /* RX_LOCKS_DB */
struct rx_packet *rx_mallocedP = 0;

extern char cml_version_number[];
extern int (*rx_almostSent) ();

static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
				afs_int32 ahost, short aport,
				afs_int32 istack);
/* some rules about packets:
 * 1. When a packet is allocated, the final iov_buf contains room for
 * a security trailer, but iov_len masks that fact.  If the security
 * package wants to add the trailer, it may do so, and then extend
 * iov_len appropriately.  For this reason, packet's niovecs and
 * iov_len fields should be accurate before calling PreparePacket.
 */

/*
 * all packet buffers (iov_base) are integral multiples of the word size.
 * offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
{
    int i, l;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    return *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
				    (offset - l)));
	}
	l += packet->wirevec[i].iov_len;
    }
    return 0;
}
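/* Illustrative sketch (not part of the original source): the scatter/gather
 * walk above maps a flat byte offset to an (iovec, intra-iovec offset) pair.
 * The hypothetical stand-alone helper below shows the same walk over plain
 * iovecs; only <sys/uio.h> is assumed. */
#if 0
#include <sys/uio.h>
#include <stddef.h>

/* Return the index of the iovec containing 'offset' (vec[0] is skipped,
 * as rx skips the wire header), and store the byte count preceding that
 * iovec in *before; return -1 if offset is out of range. */
static int
iov_locate(struct iovec *vec, int nvecs, size_t offset, size_t *before)
{
    size_t l = 0;
    int i;

    for (i = 1; i < nvecs; i++) {
	if (l + vec[i].iov_len > offset) {
	    *before = l;
	    return i;
	}
	l += vec[i].iov_len;
    }
    return -1;
}
#endif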
/*
 * all packet buffers (iov_base) are integral multiples of the word size.
 * offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
{
    int i, l;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
			     (offset - l))) = data;
	    return 0;
	}
	l += packet->wirevec[i].iov_len;
    }
    return 0;
}

/*
 * all packet buffers (iov_base) are integral multiples of the word size.
 * offset is an integral multiple of the word size.
 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 */
int
rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
		  char *out)
{
    unsigned int i, j, l, r;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    break;
	}
	l += packet->wirevec[i].iov_len;
    }

    /* i is the iovec which contains the first little bit of data in which we
     * are interested.  l is the total length of everything prior to this iovec.
     * j is the number of bytes we can safely copy out of this iovec.
     */
    r = resid;
    while ((resid > 0) && (i < packet->niovecs)) {
	j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
	memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
	out += j;
	resid -= j;
	l += packet->wirevec[i].iov_len;
	offset = l;
	i++;
    }

    return (resid ? (r - resid) : r);
}
/*
 * all packet buffers (iov_base) are integral multiples of the word size.
 * offset is an integral multiple of the word size.
 */
int
rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
{
    int i, j, l, r;
    char *b;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    break;
	}
	l += packet->wirevec[i].iov_len;
    }

    /* i is the iovec which contains the first little bit of data in which we
     * are interested.  l is the total length of everything prior to this iovec.
     * j is the number of bytes we can safely copy out of this iovec.
     */
    r = resid;
    while ((resid > 0) && (i < RX_MAXWVECS)) {
	if (i >= packet->niovecs)
	    if (rxi_AllocDataBuf(packet, resid, RX_PACKET_CLASS_SEND_CBUF) > 0)	/* ++niovecs as a side-effect */
		break;

	b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
	j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
	memcpy(b, in, j);
	in += j;
	resid -= j;
	l += packet->wirevec[i].iov_len;
	offset = l;
	i++;
    }

    return (resid ? (r - resid) : r);
}
static struct rx_packet *
allocCBuf(int class)
{
    struct rx_packet *c;

    MUTEX_ENTER(&rx_freePktQ_lock);

    if (rxi_OverQuota(class)) {
	rxi_NeedMorePackets = TRUE;
	MUTEX_ENTER(&rx_stats_mutex);
	switch (class) {
	case RX_PACKET_CLASS_RECEIVE:
	    rx_stats.receivePktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND:
	    rx_stats.sendPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SPECIAL:
	    rx_stats.specialPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_RECV_CBUF:
	    rx_stats.receiveCbufPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND_CBUF:
	    rx_stats.sendCbufPktAllocFailures++;
	    break;
	}
	MUTEX_EXIT(&rx_stats_mutex);
    }

    if (queue_IsEmpty(&rx_freePacketQueue)) {
	rxi_NeedMorePackets = TRUE;
    }
    if (queue_IsEmpty(&rx_freePacketQueue)) {
	rxi_MorePacketsNoLock(rx_initSendWindow);
    }

    c = queue_First(&rx_freePacketQueue, rx_packet);
    if (!(c->flags & RX_PKTFLAG_FREE))
	osi_Panic("rxi_AllocPacket: packet not free\n");
    c->flags = 0;		/* clear RX_PKTFLAG_FREE, initialize the rest */

    MUTEX_EXIT(&rx_freePktQ_lock);
    return c;
}

/*
 * Free a packet currently used as a continuation buffer
 */
void
rxi_freeCBuf(struct rx_packet *c)
{
    MUTEX_ENTER(&rx_freePktQ_lock);
    rxi_FreePacketNoLock(c);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();
    MUTEX_EXIT(&rx_freePktQ_lock);
}
/* this one is kind of awful.
 * In rxkad, the packet has been all shortened, and everything, ready for
 * sending.  All of a sudden, we discover we need some of that space back.
 * This isn't terribly general, because it knows that the packets are only
 * rounded up to the EBS (userdata + security header).
 */
int
rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
{
    int i;

    i = p->niovecs - 1;
    if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
	if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
	    p->wirevec[i].iov_len += nb;
	    return 0;
	}
    } else {
	if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
	    p->wirevec[i].iov_len += nb;
	    return 0;
	}
    }
    return 0;
}
/* get sufficient space to store nb bytes of data (or more), and hook
 * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 * returns the number of bytes >0 which it failed to come up with.
 * Don't need to worry about locking on packet, since only
 * one thread can manipulate one at a time. Locking on continuation
 * packets is handled by allocCBuf */
/* MTUXXX don't need to go through the for loop if we can trust niovecs */
int
rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
{
    int i;

    for (i = p->niovecs; nb > 0 && i < RX_MAXWVECS; i++) {
	register struct rx_packet *cb;
	if ((cb = allocCBuf(class))) {
	    p->wirevec[i].iov_base = (caddr_t) cb->localdata;
	    p->wirevec[i].iov_len = RX_CBUFFERSIZE;
	    nb -= RX_CBUFFERSIZE;
	    p->length += RX_CBUFFERSIZE;
	    p->niovecs++;
	} else
	    break;
    }

    return nb;
}
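/* Usage sketch (hypothetical, not in the original source): a positive
 * return from rxi_AllocDataBuf means "this many bytes still missing",
 * so callers typically grow a packet like this: */
#if 0
static int
ensure_room(struct rx_packet *p, int nbytes)
{
    int missing = nbytes - p->length;

    if (missing > 0)
	missing = rxi_AllocDataBuf(p, missing, RX_PACKET_CLASS_SEND_CBUF);
    return (missing > 0) ? -1 : 0;	/* -1: couldn't get enough cbufs */
}
#endif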
/* Add more packet buffers */
void
rxi_MorePackets(int apackets)
{
    struct rx_packet *p, *e;
    int getme;

    getme = apackets * sizeof(struct rx_packet);
    p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);

    PIN(p, getme);		/* XXXXX */
    memset((char *)p, 0, getme);

    MUTEX_ENTER(&rx_freePktQ_lock);

    for (e = p + apackets; p < e; p++) {
	p->wirevec[0].iov_base = (char *)(p->wirehead);
	p->wirevec[0].iov_len = RX_HEADER_SIZE;
	p->wirevec[1].iov_base = (char *)(p->localdata);
	p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
	p->flags |= RX_PKTFLAG_FREE;

	queue_Append(&rx_freePacketQueue, p);
    }
    rx_nFreePackets += apackets;
    rxi_NeedMorePackets = FALSE;

    MUTEX_EXIT(&rx_freePktQ_lock);
}
/* Add more packet buffers */
void
rxi_MorePacketsNoLock(int apackets)
{
    struct rx_packet *p, *e;
    int getme;

    /* allocate enough packets that 1/4 of the packets will be able
     * to hold maximal amounts of data */
    apackets += (apackets / 4)
	* ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
    getme = apackets * sizeof(struct rx_packet);
    p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);

    memset((char *)p, 0, getme);

    for (e = p + apackets; p < e; p++) {
	p->wirevec[0].iov_base = (char *)(p->wirehead);
	p->wirevec[0].iov_len = RX_HEADER_SIZE;
	p->wirevec[1].iov_base = (char *)(p->localdata);
	p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
	p->flags |= RX_PKTFLAG_FREE;

	queue_Append(&rx_freePacketQueue, p);
    }
    rx_nFreePackets += apackets;
    rxi_NeedMorePackets = FALSE;
}
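/* Worked example of the sizing rule above (illustrative): if a jumbo-capable
 * packet needs k = (rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE
 * extra continuation buffers, then a request for N packets actually allocates
 * N + (N/4)*k packet structures, so that N/4 of them can each claim k cbufs
 * and still leave N usable packets. */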
void
rxi_FreeAllPackets(void)
{
    /* must be called at proper interrupt level, etcetera */
    /* MTUXXX need to free all Packets */
    osi_Free(rx_mallocedP,
	     (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
    UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
}

/* Allocate more packets iff we need more continuation buffers */
/* In kernel, can't page in memory with interrupts disabled, so we
 * don't use the event mechanism. */
void
rx_CheckPackets(void)
{
    if (rxi_NeedMorePackets) {
	rxi_MorePackets(rx_initSendWindow);
    }
}

/* In the packet freeing routine below, the assumption is that
   we want all of the packets to be used equally frequently, so that we
   don't get packet buffers paging out.  It would be just as valid to
   assume that we DO want them to page out if not many are being used.
   In any event, we assume the former, and append the packets to the end
   of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
/* Actually free the packet p. */
void
rxi_FreePacketNoLock(struct rx_packet *p)
{
    dpf(("Free %x\n", (int)p));

    if (p->flags & RX_PKTFLAG_FREE)
	osi_Panic("rxi_FreePacketNoLock: packet already free\n");
    p->flags |= RX_PKTFLAG_FREE;
    queue_Append(&rx_freePacketQueue, p);
}

int
rxi_FreeDataBufsNoLock(struct rx_packet *p, int first)
{
    struct iovec *iov, *end;

    if (first != 1)		/* MTUXXX */
	osi_Panic("FreeDataBufs 1: first must be 1");
    iov = &p->wirevec[1];
    end = iov + (p->niovecs - 1);
    if (iov->iov_base != (caddr_t) p->localdata)	/* MTUXXX */
	osi_Panic("FreeDataBufs 2: vec 1 must be localdata");
    for (iov++; iov < end; iov++) {
	if (!iov->iov_base)
	    osi_Panic("FreeDataBufs 3: vecs 2-niovecs must not be NULL");
	rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
    }
    return 0;
}
int rxi_nBadIovecs = 0;

/* rxi_RestoreDataBufs
 *
 * Restore the correct sizes to the iovecs. Called when reusing a packet
 * for reading off the wire.
 */
void
rxi_RestoreDataBufs(struct rx_packet *p)
{
    int i;
    struct iovec *iov = &p->wirevec[2];

    p->wirevec[0].iov_base = (char *)(p->wirehead);
    p->wirevec[0].iov_len = RX_HEADER_SIZE;
    p->wirevec[1].iov_base = (char *)(p->localdata);
    p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;

    for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
	if (!iov->iov_base) {
	    rxi_nBadIovecs++;
	    p->niovecs = i;
	    break;
	}
	iov->iov_len = RX_CBUFFERSIZE;
    }
}

int
rxi_TrimDataBufs(struct rx_packet *p, int first)
{
    int length;
    struct iovec *iov, *end;

    if (first != 1)
	osi_Panic("TrimDataBufs 1: first must be 1");

    /* Skip over continuation buffers containing message data */
    iov = &p->wirevec[2];
    end = iov + (p->niovecs - 2);
    length = p->length - p->wirevec[1].iov_len;
    for (; iov < end && length > 0; iov++) {
	if (!iov->iov_base)
	    osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
	length -= iov->iov_len;
    }

    /* iov now points to the first empty data buffer. */
    if (iov < end) {
	MUTEX_ENTER(&rx_freePktQ_lock);

	for (; iov < end; iov++) {
	    if (!iov->iov_base)
		osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
	    rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
	    p->niovecs--;
	}

	MUTEX_EXIT(&rx_freePktQ_lock);
    }

    return 0;
}
/* Free the packet p.  P is assumed not to be on any queue, i.e.
 * remove it yourself first if you call this routine. */
void
rxi_FreePacket(struct rx_packet *p)
{
    MUTEX_ENTER(&rx_freePktQ_lock);
    rxi_FreeDataBufsNoLock(p, 1);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();
    MUTEX_EXIT(&rx_freePktQ_lock);
}
/* rxi_AllocPacket sets up p->length so it reflects the number of
 * bytes in the packet at this point, **not including** the header.
 * The header is absolutely necessary, besides, this is the way the
 * length field is usually used */
struct rx_packet *
rxi_AllocPacketNoLock(int class)
{
    register struct rx_packet *p;

#ifdef KERNEL
    if (rxi_OverQuota(class)) {
	rxi_NeedMorePackets = TRUE;
	MUTEX_ENTER(&rx_stats_mutex);
	switch (class) {
	case RX_PACKET_CLASS_RECEIVE:
	    rx_stats.receivePktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND:
	    rx_stats.sendPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SPECIAL:
	    rx_stats.specialPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_RECV_CBUF:
	    rx_stats.receiveCbufPktAllocFailures++;
	    break;
	case RX_PACKET_CLASS_SEND_CBUF:
	    rx_stats.sendCbufPktAllocFailures++;
	    break;
	}
	MUTEX_EXIT(&rx_stats_mutex);
	return (struct rx_packet *)0;
    }
#endif /* KERNEL */

    MUTEX_ENTER(&rx_stats_mutex);
    rx_stats.packetRequests++;
    MUTEX_EXIT(&rx_stats_mutex);

#ifdef KERNEL
    if (queue_IsEmpty(&rx_freePacketQueue))
	osi_Panic("rxi_AllocPacket error");
#else /* KERNEL */
    if (queue_IsEmpty(&rx_freePacketQueue))
	rxi_MorePacketsNoLock(rx_initSendWindow);
#endif

    p = queue_First(&rx_freePacketQueue, rx_packet);
    if (!(p->flags & RX_PKTFLAG_FREE))
	osi_Panic("rxi_AllocPacket: packet not free\n");

    dpf(("Alloc %x, class %d\n", (int)p, class));

    p->flags = 0;		/* clear RX_PKTFLAG_FREE, initialize the rest */

    /* have to do this here because rx_FlushWrite fiddles with the iovs in
     * order to truncate outbound packets.  In the near future, may need
     * to allocate bufs from a static pool here, and/or in AllocSendPacket
     */
    p->wirevec[0].iov_base = (char *)(p->wirehead);
    p->wirevec[0].iov_len = RX_HEADER_SIZE;
    p->wirevec[1].iov_base = (char *)(p->localdata);
    p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
    p->length = RX_FIRSTBUFFERSIZE;
    return p;
}

struct rx_packet *
rxi_AllocPacket(int class)
{
    register struct rx_packet *p;

    MUTEX_ENTER(&rx_freePktQ_lock);
    p = rxi_AllocPacketNoLock(class);
    MUTEX_EXIT(&rx_freePktQ_lock);
    return p;
}
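/* Alloc/free pairing sketch (hypothetical, not in the original source):
 * rxi_AllocPacket returns a packet whose length excludes the rx header,
 * and rxi_FreePacket expects the packet to be off all queues. */
#if 0
static void
example_alloc_free(void)
{
    struct rx_packet *p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);

    if (p) {
	/* ... fill in p with rx_packetwrite(), send it, etc. ... */
	rxi_FreePacket(p);	/* caller removes p from any queue first */
    }
}
#endif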
/* This guy comes up with as many buffers as it {takes,can get} given
 * the MTU for this call. It also sets the packet length before
 * returning.  caution: this is often called at NETPRI
 * Called with call locked.
 */
struct rx_packet *
rxi_AllocSendPacket(register struct rx_call *call, int want)
{
    register struct rx_packet *p = (struct rx_packet *)0;
    register int mud;
    register unsigned delta;

    mud = call->MTU - RX_HEADER_SIZE;
    delta =
	rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
	rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));

    while (!(call->error)) {
	MUTEX_ENTER(&rx_freePktQ_lock);
	/* if an error occurred, or we get the packet we want, we're done */
	if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
	    MUTEX_EXIT(&rx_freePktQ_lock);

	    want += delta;
	    want = MIN(want, mud);

	    if ((unsigned)want > p->length)
		(void)rxi_AllocDataBuf(p, (want - p->length),
				       RX_PACKET_CLASS_SEND_CBUF);

	    if ((unsigned)p->length > mud)
		p->length = mud;

	    if (delta >= p->length) {
		rxi_FreePacket(p);
		p = NULL;
	    } else {
		p->length -= delta;
	    }
	    break;
	}

	/* no error occurred, and we didn't get a packet, so we sleep.
	 * At this point, we assume that packets will be returned
	 * sooner or later, as packets are acknowledged, and so we
	 * just wait.  */
	call->flags |= RX_CALL_WAIT_PACKETS;
	CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
	MUTEX_EXIT(&call->lock);
	rx_waitingForPackets = 1;

#ifdef RX_ENABLE_LOCKS
	CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
#else
	osi_rxSleep(&rx_waitingForPackets);
#endif
	MUTEX_EXIT(&rx_freePktQ_lock);
	MUTEX_ENTER(&call->lock);
	CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
	call->flags &= ~RX_CALL_WAIT_PACKETS;
    }

    return p;
}
#ifndef KERNEL
/* count the number of used FDs */
static int
CountFDs(register int amax)
{
    struct stat tstat;
    register int i, code;
    int count = 0;

    for (i = 0; i < amax; i++) {
	code = fstat(i, &tstat);
	if (code == 0)
	    count++;
    }
    return count;
}
#else /* KERNEL */
#define CountFDs(amax) amax
#endif /* KERNEL */
#if !defined(KERNEL) || defined(UKERNEL)

/* This function reads a single packet from the interface into the
 * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
 * (host,port) of the sender are stored in the supplied variables, and
 * the data length of the packet is stored in the packet structure.
 * The header is decoded. */
int
rxi_ReadPacket(int socket, register struct rx_packet *p, afs_uint32 * host,
	       u_short * port)
{
    struct sockaddr_in from;
    struct msghdr msg;
    int nbytes;
    afs_int32 rlen;
    register afs_int32 tlen, savelen;

    rx_computelen(p, tlen);
    rx_SetDataSize(p, tlen);	/* this is the size of the user data area */

    tlen += RX_HEADER_SIZE;	/* now this is the size of the entire packet */
    rlen = rx_maxJumboRecvSize;	/* this is what I am advertising.  Only check
				 * it once in order to avoid races.  */
    tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);

    /* Extend the last iovec for padding, it's just to make sure that the
     * read doesn't return more data than we expect, and is done to get around
     * our problems caused by the lack of a length field in the rx header.
     * Use the extra buffer that follows the localdata in each packet
     * structure. */
    savelen = p->wirevec[p->niovecs - 1].iov_len;
    p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;

    memset((char *)&msg, 0, sizeof(msg));
    msg.msg_name = (char *)&from;
    msg.msg_namelen = sizeof(struct sockaddr_in);
    msg.msg_iov = p->wirevec;
    msg.msg_iovlen = p->niovecs;
    nbytes = rxi_Recvmsg(socket, &msg, 0);

    /* restore the vec to its correct state */
    p->wirevec[p->niovecs - 1].iov_len = savelen;

    p->length = (nbytes - RX_HEADER_SIZE);
    if ((nbytes > tlen) || (p->length & 0x8000)) {	/* Bogus packet */
	if (nbytes > 0)
	    rxi_MorePackets(rx_initSendWindow);
	else if (nbytes < 0 && errno == EWOULDBLOCK) {
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.noPacketOnRead++;
	    MUTEX_EXIT(&rx_stats_mutex);
	} else {
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.bogusPacketOnRead++;
	    rx_stats.bogusHost = from.sin_addr.s_addr;
	    MUTEX_EXIT(&rx_stats_mutex);
	    dpf(("B: bogus packet from [%x,%d] nb=%d", from.sin_addr.s_addr,
		 from.sin_port, nbytes));
	}
	return 0;
    } else {
	/* Extract packet header. */
	rxi_DecodePacketHeader(p);

	*host = from.sin_addr.s_addr;
	*port = from.sin_port;
	if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
	    struct rx_peer *peer;
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.packetsRead[p->header.type - 1]++;
	    MUTEX_EXIT(&rx_stats_mutex);
	    /*
	     * Try to look up this peer structure.  If it doesn't exist,
	     * don't create a new one -
	     * we don't keep count of the bytes sent/received if a peer
	     * structure doesn't already exist.
	     *
	     * The peer/connection cleanup code assumes that there is 1 peer
	     * per connection.  If we actually created a peer structure here
	     * and this packet was an rxdebug packet, the peer structure would
	     * never be cleaned up.
	     */
	    peer = rxi_FindPeer(*host, *port, 0, 0);
	    if (peer) {
		MUTEX_ENTER(&peer->peer_lock);
		hadd32(peer->bytesReceived, p->length);
		MUTEX_EXIT(&peer->peer_lock);
	    }
	}

	/* Free any empty packet buffers at the end of this packet */
	rxi_TrimDataBufs(p, 1);

	return 1;
    }
}

#endif /* !KERNEL || UKERNEL */
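/* Receive-loop sketch (hypothetical): rxi_ReadPacket returns 0 for a bogus
 * or absent packet, in which case the same buffer is simply reused for the
 * next read; on success the header has already been decoded. */
#if 0
static void
example_read_loop(int sock, struct rx_packet *p)
{
    afs_uint32 host;
    u_short port;

    for (;;) {
	if (rxi_ReadPacket(sock, p, &host, &port)) {
	    /* valid packet: p->length set, p->header decoded; hand it off */
	} else {
	    /* bogus packet or EWOULDBLOCK; reuse p on the next iteration */
	}
    }
}
#endif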
/* This function splits off the first packet in a jumbo packet.
 * As of AFS 3.5, jumbograms contain more than one fixed size
 * packet, and the RX_JUMBO_PACKET flag is set in all but the
 * last packet header. All packets (except the last) are padded to
 * fall on RX_CBUFFERSIZE boundaries.
 * HACK: We store the length of the first n-1 packets in the
 * last two pad bytes. */

struct rx_packet *
rxi_SplitJumboPacket(register struct rx_packet *p, afs_int32 host, short port,
		     int first)
{
    struct rx_packet *np;
    struct rx_jumboHeader *jp;
    int niov, i;
    struct iovec *iov;
    int length;
    afs_uint32 temp;

    /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
     * bytes in length. All but the first packet are preceded by
     * an abbreviated four byte header. The length of the last packet
     * is calculated from the size of the jumbogram. */
    length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;

    if ((int)p->length < length) {
	dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
	return NULL;
    }
    niov = p->niovecs - 2;
    if (niov < 1) {
	dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
	return NULL;
    }
    iov = &p->wirevec[2];
    np = RX_CBUF_TO_PACKET(iov->iov_base, p);

    /* Get a pointer to the abbreviated packet header */
    jp = (struct rx_jumboHeader *)
	((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);

    /* Set up the iovecs for the next packet */
    np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
    np->wirevec[0].iov_len = sizeof(struct rx_header);
    np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
    np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
    np->niovecs = niov + 1;
    for (i = 2, iov++; i <= niov; i++, iov++) {
	np->wirevec[i] = *iov;
    }
    np->length = p->length - length;
    p->length = RX_JUMBOBUFFERSIZE;
    p->niovecs = 2;

    /* Convert the jumbo packet header to host byte order */
    temp = ntohl(*(afs_uint32 *) jp);
    jp->flags = (u_char) (temp >> 24);
    jp->cksum = (u_short) (temp);

    /* Fill in the packet header */
    np->header = p->header;
    np->header.serial = p->header.serial + 1;
    np->header.seq = p->header.seq + 1;
    np->header.flags = jp->flags;
    np->header.spare = jp->cksum;

    return np;
}
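/* Jumbogram layout sketch (illustrative): an n-packet jumbogram is
 *
 *   [rx header][RX_JUMBOBUFFERSIZE data][4-byte jumbo header]   packet 1
 *              [RX_JUMBOBUFFERSIZE data][4-byte jumbo header]   packet 2
 *              ...
 *              [remaining data]                                 packet n
 *
 * so each call to rxi_SplitJumboPacket peels one leading
 * RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE chunk off the front and leaves
 * the tail in np.  The abbreviated header decodes as done above: */
#if 0
/* given 'jp' pointing at the 4 pad bytes (as above): */
afs_uint32 temp = ntohl(*(afs_uint32 *) jp);
u_char flags = (u_char) (temp >> 24);	/* top byte: next packet's flags */
u_short cksum = (u_short) (temp & 0xffff);	/* low 16 bits: checksum */
#endif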
/* Send a udp datagram */
int
osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
	    int length, int istack)
{
    struct msghdr msg;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = dvec;
    msg.msg_iovlen = nvecs;
    msg.msg_name = addr;
    msg.msg_namelen = sizeof(struct sockaddr_in);

    rxi_Sendmsg(socket, &msg, 0);

    return 0;
}

#elif !defined(UKERNEL)
/*
 * message receipt is done in rxk_input or rx_put.
 */

#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
/*
 * Copy an mblock to the contiguous area pointed to by cp.
 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
 * but it doesn't really.
 * Returns the number of bytes not transferred.
 * The message is NOT changed.
 */
static int
cpytoc(mblk_t * mp, register int off, register int len, register char *cp)
{
    register int n;

    for (; mp && len > 0; mp = mp->b_cont) {
	if (mp->b_datap->db_type != M_DATA) {
	    return -1;
	}
	n = MIN(len, (mp->b_wptr - mp->b_rptr));
	memcpy(cp, (char *)mp->b_rptr, n);
	cp += n;
	len -= n;
	mp->b_rptr += n;
    }
    return (len);
}

/* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
 * but it doesn't really.
 * This sucks, anyway, do it like m_cpy.... below
 */
static int
cpytoiovec(mblk_t * mp, int off, int len, register struct iovec *iovs,
	   int niovs)
{
    register int m, n, o, t, i;

    for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
	if (mp->b_datap->db_type != M_DATA) {
	    return -1;
	}
	n = MIN(len, (mp->b_wptr - mp->b_rptr));
	len -= n;
	while (n) {
	    if (!t) {
		o = 0;
		i++;
		t = iovs[i].iov_len;
	    }
	    m = MIN(n, t);
	    memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
	    mp->b_rptr += m;
	    o += m;
	    t -= m;
	    n -= m;
	}
    }
    return (len);
}

#define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
#define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
#endif /* AFS_SUN5_ENV || AFS_HPUX110_ENV */

#if !defined(AFS_LINUX20_ENV)
static int
m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
{
    caddr_t p1, p2;
    unsigned int l1, l2, i, t;

    if (m == NULL || off < 0 || len < 0 || iovs == NULL)
	osi_Panic("m_cpytoiovec");	/* MTUXXX probably don't need this check */

    while (off && m)
	if (m->m_len <= off) {
	    off -= m->m_len;
	    m = m->m_next;
	    continue;
	} else
	    break;

    if (m == NULL)
	return len;

    p1 = mtod(m, caddr_t) + off;
    l1 = m->m_len - off;
    i = 0;
    p2 = iovs[0].iov_base;
    l2 = iovs[0].iov_len;

    while (len) {
	t = MIN(l1, MIN(l2, (unsigned int)len));
	memcpy(p2, p1, t);
	p1 += t;
	p2 += t;
	l1 -= t;
	l2 -= t;
	len -= t;
	if (!l1) {
	    m = m->m_next;
	    if (!m)
		break;
	    p1 = mtod(m, caddr_t);
	    l1 = m->m_len;
	}
	if (!l2) {
	    if (++i >= niovs)
		break;
	    p2 = iovs[i].iov_base;
	    l2 = iovs[i].iov_len;
	}
    }

    return len;
}
#endif /* AFS_SUN5_ENV */

#if !defined(AFS_LINUX20_ENV)
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
{
    int code;

    code =
	m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
		     phandle->niovecs);
    (*free) (amb);

    return code;
}
#endif
#endif /*KERNEL && !UKERNEL */
/* send a response to a debug packet */

struct rx_packet *
rxi_ReceiveDebugPacket(register struct rx_packet *ap, osi_socket asocket,
		       afs_int32 ahost, short aport, int istack)
{
    struct rx_debugIn tin;
    afs_int32 tl;
    struct rx_serverQueueEntry *np, *nqe;

    /*
     * Only respond to client-initiated Rx debug packets,
     * and clear the client flag in the response.
     */
    if (ap->header.flags & RX_CLIENT_INITIATED) {
	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
	rxi_EncodePacketHeader(ap);
    } else
	return ap;

    rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
    /* all done with packet, now set length to the truth, so we can
     * reuse this packet */
    rx_computelen(ap, ap->length);

    tin.type = ntohl(tin.type);
    tin.index = ntohl(tin.index);
    switch (tin.type) {
    case RX_DEBUGI_GETSTATS:{
	    struct rx_debugStats tstat;

	    /* get basic stats */
	    memset((char *)&tstat, 0, sizeof(tstat));	/* make sure spares are zero */
	    tstat.version = RX_DEBUGI_VERSION;
#ifndef RX_ENABLE_LOCKS
	    tstat.waitingForPackets = rx_waitingForPackets;
#endif
	    MUTEX_ENTER(&rx_serverPool_lock);
	    tstat.nFreePackets = htonl(rx_nFreePackets);
	    tstat.callsExecuted = htonl(rxi_nCalls);
	    tstat.packetReclaims = htonl(rx_packetReclaims);
	    tstat.usedFDs = CountFDs(64);
	    tstat.nWaiting = htonl(rx_nWaiting);
	    queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
			tstat.idleThreads);
	    MUTEX_EXIT(&rx_serverPool_lock);
	    tstat.idleThreads = htonl(tstat.idleThreads);
	    tl = sizeof(struct rx_debugStats) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);

	    if (tl <= 0) {
		rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
			       (char *)&tstat);
		ap->length = sizeof(struct rx_debugStats);
		rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
		rx_computelen(ap, ap->length);
	    }
	    break;
	}

    case RX_DEBUGI_GETALLCONN:
    case RX_DEBUGI_GETCONN:{
	    int i, j;
	    register struct rx_connection *tc;
	    struct rx_call *tcall;
	    struct rx_debugConn tconn;
	    int all = (tin.type == RX_DEBUGI_GETALLCONN);

	    tl = sizeof(struct rx_debugConn) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    memset((char *)&tconn, 0, sizeof(tconn));	/* make sure spares are zero */
	    /* get N'th (maybe) "interesting" connection info */
	    for (i = 0; i < rx_hashTableSize; i++) {
#if !defined(KERNEL)
		/* the time complexity of the algorithm used here
		 * exponentially increases with the number of connections.
		 */
#ifdef AFS_PTHREAD_ENV
		pthread_yield();
#endif
#endif
		MUTEX_ENTER(&rx_connHashTable_lock);
		/* We might be slightly out of step since we are not
		 * locking each call, but this is only debugging output.
		 */
		for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
		    if ((all || rxi_IsConnInteresting(tc))
			&& tin.index-- <= 0) {
			tconn.host = tc->peer->host;
			tconn.port = tc->peer->port;
			tconn.cid = htonl(tc->cid);
			tconn.epoch = htonl(tc->epoch);
			tconn.serial = htonl(tc->serial);
			for (j = 0; j < RX_MAXCALLS; j++) {
			    tconn.callNumber[j] = htonl(tc->callNumber[j]);
			    if ((tcall = tc->call[j])) {
				tconn.callState[j] = tcall->state;
				tconn.callMode[j] = tcall->mode;
				tconn.callFlags[j] = tcall->flags;
				if (queue_IsNotEmpty(&tcall->rq))
				    tconn.callOther[j] |= RX_OTHER_IN;
				if (queue_IsNotEmpty(&tcall->tq))
				    tconn.callOther[j] |= RX_OTHER_OUT;
			    } else
				tconn.callState[j] = RX_STATE_NOTINIT;
			}

			tconn.natMTU = htonl(tc->peer->natMTU);
			tconn.error = htonl(tc->error);
			tconn.flags = tc->flags;
			tconn.type = tc->type;
			tconn.securityIndex = tc->securityIndex;
			if (tc->securityObject) {
			    RXS_GetStats(tc->securityObject, tc,
					 &tconn.secStats);
#define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
#define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
			    DOHTONL(flags);
			    DOHTONL(packetsReceived);
			    DOHTONL(packetsSent);
			    DOHTONL(bytesReceived);
			    DOHTONL(bytesSent);
			    for (i = 0;
				 i < sizeof(tconn.secStats.spares) /
				     sizeof(short); i++)
				DOHTONS(spares[i]);
			    for (i = 0;
				 i < sizeof(tconn.secStats.sparel) /
				     sizeof(afs_int32); i++)
				DOHTONL(sparel[i]);
			}

			MUTEX_EXIT(&rx_connHashTable_lock);
			rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
				       (char *)&tconn);
			ap->length = sizeof(struct rx_debugConn);
			rxi_SendDebugPacket(ap, asocket, ahost, aport,
					    istack);
			return ap;
		    }
		}
		MUTEX_EXIT(&rx_connHashTable_lock);
	    }
	    /* if we make it here, there are no interesting packets */
	    tconn.cid = htonl(0xffffffff);	/* means end */
	    rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
			   (char *)&tconn);
	    ap->length = sizeof(struct rx_debugConn);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    break;
	}

	/*
	 * Pass back all the peer structures we have available
	 */

    case RX_DEBUGI_GETPEER:{
	    int i;
	    register struct rx_peer *tp;
	    struct rx_debugPeer tpeer;

	    tl = sizeof(struct rx_debugPeer) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    memset((char *)&tpeer, 0, sizeof(tpeer));
	    for (i = 0; i < rx_hashTableSize; i++) {
#if !defined(KERNEL)
		/* the time complexity of the algorithm used here
		 * exponentially increases with the number of peers.
		 *
		 * Yielding after processing each hash table entry
		 * and dropping rx_peerHashTable_lock also increases
		 * the risk that we will miss a new entry - but we are
		 * willing to live with this limitation since this is
		 * meant for debugging only
		 */
#ifdef AFS_PTHREAD_ENV
		pthread_yield();
#endif
#endif
		MUTEX_ENTER(&rx_peerHashTable_lock);
		for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
		    if (tin.index-- <= 0) {
			tpeer.host = tp->host;
			tpeer.port = tp->port;
			tpeer.ifMTU = htons(tp->ifMTU);
			tpeer.idleWhen = htonl(tp->idleWhen);
			tpeer.refCount = htons(tp->refCount);
			tpeer.burstSize = tp->burstSize;
			tpeer.burst = tp->burst;
			tpeer.burstWait.sec = htonl(tp->burstWait.sec);
			tpeer.burstWait.usec = htonl(tp->burstWait.usec);
			tpeer.rtt = htonl(tp->rtt);
			tpeer.rtt_dev = htonl(tp->rtt_dev);
			tpeer.timeout.sec = htonl(tp->timeout.sec);
			tpeer.timeout.usec = htonl(tp->timeout.usec);
			tpeer.nSent = htonl(tp->nSent);
			tpeer.reSends = htonl(tp->reSends);
			tpeer.inPacketSkew = htonl(tp->inPacketSkew);
			tpeer.outPacketSkew = htonl(tp->outPacketSkew);
			tpeer.rateFlag = htonl(tp->rateFlag);
			tpeer.natMTU = htons(tp->natMTU);
			tpeer.maxMTU = htons(tp->maxMTU);
			tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
			tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
			tpeer.MTU = htons(tp->MTU);
			tpeer.cwind = htons(tp->cwind);
			tpeer.nDgramPackets = htons(tp->nDgramPackets);
			tpeer.congestSeq = htons(tp->congestSeq);
			tpeer.bytesSent.high = htonl(tp->bytesSent.high);
			tpeer.bytesSent.low = htonl(tp->bytesSent.low);
			tpeer.bytesReceived.high =
			    htonl(tp->bytesReceived.high);
			tpeer.bytesReceived.low =
			    htonl(tp->bytesReceived.low);

			MUTEX_EXIT(&rx_peerHashTable_lock);
			rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
				       (char *)&tpeer);
			ap->length = sizeof(struct rx_debugPeer);
			rxi_SendDebugPacket(ap, asocket, ahost, aport,
					    istack);
			return ap;
		    }
		}
		MUTEX_EXIT(&rx_peerHashTable_lock);
	    }
	    /* if we make it here, there are no interesting packets */
	    tpeer.host = htonl(0xffffffff);	/* means end */
	    rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
			   (char *)&tpeer);
	    ap->length = sizeof(struct rx_debugPeer);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    break;
	}

    case RX_DEBUGI_RXSTATS:{
	    int i;
	    afs_int32 *s;

	    tl = sizeof(rx_stats) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    /* Since it's all int32s convert to network order with a loop. */
	    MUTEX_ENTER(&rx_stats_mutex);
	    s = (afs_int32 *) & rx_stats;
	    for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
		rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));

	    ap->length = sizeof(rx_stats);
	    MUTEX_EXIT(&rx_stats_mutex);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    break;
	}

    default:
	/* error response packet */
	tin.type = htonl(RX_DEBUGI_BADTYPE);
	tin.index = tin.type;
	rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
	ap->length = sizeof(struct rx_debugIn);
	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	break;
    }
    return ap;
}
struct rx_packet *
rxi_ReceiveVersionPacket(register struct rx_packet *ap, osi_socket asocket,
			 afs_int32 ahost, short aport, int istack)
{
    char buf[65];

    /*
     * Only respond to client-initiated version requests, and
     * clear that flag in the response.
     */
    if (ap->header.flags & RX_CLIENT_INITIATED) {
	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
	rxi_EncodePacketHeader(ap);
	memset(buf, 0, sizeof(buf));
	strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
	rx_packetwrite(ap, 0, 65, buf);
	ap->length = 65;
	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
    }

    return ap;
}
/* send a debug packet back to the sender */
static void
rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
		    afs_int32 ahost, short aport, afs_int32 istack)
{
    struct sockaddr_in taddr;
    int i;
    int nbytes;
    int saven = 0;
    size_t savelen = 0;
#ifdef KERNEL
    int waslocked = ISAFS_GLOCK();
#endif

    taddr.sin_family = AF_INET;
    taddr.sin_port = aport;
    taddr.sin_addr.s_addr = ahost;
#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
    taddr.sin_len = sizeof(struct sockaddr_in);
#endif

    /* We need to trim the niovecs. */
    nbytes = apacket->length;
    for (i = 1; i < apacket->niovecs; i++) {
	if (nbytes <= apacket->wirevec[i].iov_len) {
	    savelen = apacket->wirevec[i].iov_len;
	    saven = apacket->niovecs;
	    apacket->wirevec[i].iov_len = nbytes;
	    apacket->niovecs = i + 1;	/* so condition fails because i == niovecs */
	} else
	    nbytes -= apacket->wirevec[i].iov_len;
    }

#ifdef RX_KERNEL_TRACE
    if (ICL_SETACTIVE(afs_iclSetp)) {
	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		   "before osi_NetSend()");
    }
#endif

    /* debug packets are not reliably delivered, hence the cast below. */
    (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
		      apacket->length + RX_HEADER_SIZE, istack);

#ifdef RX_KERNEL_TRACE
    if (ICL_SETACTIVE(afs_iclSetp)) {
	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		   "after osi_NetSend()");
    }
#endif

    if (saven) {		/* means we truncated the packet above. */
	apacket->wirevec[i - 1].iov_len = savelen;
	apacket->niovecs = saven;
    }
}
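/* Trimming sketch (illustrative): the loop above shortens the iovec chain so
 * exactly apacket->length payload bytes are handed to osi_NetSend.  E.g. with
 * iov_lens {28, 100, 100} and length = 150, the second data iovec is cut to
 * 50 bytes and niovecs to 3; savelen/saven restore the original lengths
 * afterward. */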
/* Send the packet to appropriate destination for the specified
 * call.  The header is first encoded and placed in the packet.
 */
void
rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
	       struct rx_packet *p, int istack)
{
    int code;
    struct sockaddr_in addr;
    register struct rx_peer *peer = conn->peer;
    osi_socket socket;
#ifdef RXDEBUG
    char deliveryType = 'S';
#endif
    /* The address we're sending the packet to */
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = peer->port;
    addr.sin_addr.s_addr = peer->host;

    /* This stuff should be revamped, I think, so that most, if not
     * all, of the header stuff is always added here.  We could
     * probably do away with the encode/decode routines. XXXXX */

    /* Stamp each packet with a unique serial number.  The serial
     * number is maintained on a connection basis because some types
     * of security may be based on the serial number of the packet,
     * and security is handled on a per authenticated-connection
     * basis. */
    /* Pre-increment, to guarantee no zero serial number; a zero
     * serial number means the packet was never sent. */
    MUTEX_ENTER(&conn->conn_data_lock);
    p->header.serial = ++conn->serial;
    MUTEX_EXIT(&conn->conn_data_lock);
    /* This is so we can adjust retransmit time-outs better in the face of
     * rapidly changing round-trip times.  RTO estimation is not a la Karn.
     */
    if (p->firstSerial == 0) {
	p->firstSerial = p->header.serial;
    }

    /* If an output tracer function is defined, call it with the packet and
     * network address.  Note this function may modify its arguments. */
    if (rx_almostSent) {
	int drop = (*rx_almostSent) (p, &addr);
	/* drop packet if return value is non-zero? */
	if (drop)
	    deliveryType = 'D';	/* Drop the packet */
    }

    /* Get network byte order header */
    rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
				 * touch ALL the fields */

    /* Send the packet out on the same socket that related packets are being
     * sent on */
    socket =
	(conn->type ==
	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);

#ifdef RXDEBUG
    /* Possibly drop this packet, for testing purposes */
    if ((deliveryType == 'D')
	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
	deliveryType = 'D';	/* Drop the packet */
    } else {
	deliveryType = 'S';	/* Send the packet */
#endif /* RXDEBUG */

	/* Loop until the packet is sent.  We'd prefer just to use a
	 * blocking socket, but unfortunately the interface doesn't
	 * allow us to have the socket block in send mode, and not
	 * block in receive mode */
#ifdef RX_KERNEL_TRACE
	if (ICL_SETACTIVE(afs_iclSetp)) {
	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		       "before osi_NetSend()");
	}
#endif
	if ((code =
	     osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
			 p->length + RX_HEADER_SIZE, istack)) != 0) {
	    /* send failed, so let's hurry up the resend, eh? */
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.netSendFailures++;
	    MUTEX_EXIT(&rx_stats_mutex);
	    p->retryTime = p->timeSent;	/* resend it very soon */
	    clock_Addmsec(&(p->retryTime),
			  10 + (((afs_uint32) p->backoff) << 8));

#if defined(KERNEL) && defined(AFS_LINUX20_ENV)
	    /* Linux is nice -- it can tell us right away that we cannot
	     * reach this recipient by returning an ENETUNREACH error
	     * code.  So, when this happens let's "down" the host NOW so
	     * we don't sit around waiting for this host to timeout later.
	     */
	    if (call && code == -ENETUNREACH)
		call->lastReceiveTime = 0;
#endif
	}
#ifdef RX_KERNEL_TRACE
	if (ICL_SETACTIVE(afs_iclSetp)) {
	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		       "after osi_NetSend()");
	}
#endif
#ifdef RXDEBUG
    }
    dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (int)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
#endif
    MUTEX_ENTER(&rx_stats_mutex);
    rx_stats.packetsSent[p->header.type - 1]++;
    MUTEX_EXIT(&rx_stats_mutex);
    MUTEX_ENTER(&peer->peer_lock);
    hadd32(peer->bytesSent, p->length);
    MUTEX_EXIT(&peer->peer_lock);
}
/* Send a list of packets to appropriate destination for the specified
 * connection.  The headers are first encoded and placed in the packets.
 */
void
rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
		   struct rx_packet **list, int len, int istack)
{
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
    int waslocked;
#endif
    struct sockaddr_in addr;
    register struct rx_peer *peer = conn->peer;
    osi_socket socket;
    struct rx_packet *p = NULL;
    struct iovec wirevec[RX_MAXIOVECS];
    int i, length, code;
    afs_uint32 serial;
    afs_uint32 temp;
    struct rx_jumboHeader *jp;
#ifdef RXDEBUG
    char deliveryType = 'S';
#endif
    /* The address we're sending the packet to */
    addr.sin_family = AF_INET;
    addr.sin_port = peer->port;
    addr.sin_addr.s_addr = peer->host;

    if (len + 1 > RX_MAXIOVECS) {
	osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
    }

    /*
     * Stamp the packets in this jumbogram with consecutive serial numbers
     */
    MUTEX_ENTER(&conn->conn_data_lock);
    serial = conn->serial;
    conn->serial += len;
    MUTEX_EXIT(&conn->conn_data_lock);

    /* This stuff should be revamped, I think, so that most, if not
     * all, of the header stuff is always added here.  We could
     * probably do away with the encode/decode routines. XXXXX */

    jp = NULL;
    length = RX_HEADER_SIZE;
    wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
    wirevec[0].iov_len = RX_HEADER_SIZE;
    for (i = 0; i < len; i++) {
	p = list[i];

	/* The whole 3.5 jumbogram scheme relies on packets fitting
	 * in a single packet buffer. */
	if (p->niovecs > 2) {
	    osi_Panic("rxi_SendPacketList, niovecs > 2\n");
	}

	/* Set the RX_JUMBO_PACKET flags in all but the last packets
	 * in this chunk. */
	if (i < len - 1) {
	    if (p->length != RX_JUMBOBUFFERSIZE) {
		osi_Panic("rxi_SendPacketList, length != jumbo size\n");
	    }
	    p->header.flags |= RX_JUMBO_PACKET;
	    length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
	    wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
	} else {
	    wirevec[i + 1].iov_len = p->length;
	    length += p->length;
	}
	wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
	if (jp != NULL) {
	    /* Convert jumbo packet header to network byte order */
	    temp = (afs_uint32) (p->header.flags) << 24;
	    temp |= (afs_uint32) (p->header.spare);
	    *(afs_uint32 *) jp = htonl(temp);
	}
	jp = (struct rx_jumboHeader *)
	    ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);

	/* Stamp each packet with a unique serial number.  The serial
	 * number is maintained on a connection basis because some types
	 * of security may be based on the serial number of the packet,
	 * and security is handled on a per authenticated-connection
	 * basis. */
	/* Pre-increment, to guarantee no zero serial number; a zero
	 * serial number means the packet was never sent. */
	p->header.serial = ++serial;
	/* This is so we can adjust retransmit time-outs better in the face of
	 * rapidly changing round-trip times.  RTO estimation is not a la Karn.
	 */
	if (p->firstSerial == 0) {
	    p->firstSerial = p->header.serial;
	}

	/* If an output tracer function is defined, call it with the packet and
	 * network address.  Note this function may modify its arguments. */
	if (rx_almostSent) {
	    int drop = (*rx_almostSent) (p, &addr);
	    /* drop packet if return value is non-zero? */
	    if (drop)
		deliveryType = 'D';	/* Drop the packet */
	}

	/* Get network byte order header */
	rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
					 * touch ALL the fields */
    }

    /* Send the packet out on the same socket that related packets are being
     * sent on */
    socket =
	(conn->type ==
	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);

#ifdef RXDEBUG
    /* Possibly drop this packet, for testing purposes */
    if ((deliveryType == 'D')
	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
	deliveryType = 'D';	/* Drop the packet */
    } else {
	deliveryType = 'S';	/* Send the packet */
#endif /* RXDEBUG */

	/* Loop until the packet is sent.  We'd prefer just to use a
	 * blocking socket, but unfortunately the interface doesn't
	 * allow us to have the socket block in send mode, and not
	 * block in receive mode */
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
	waslocked = ISAFS_GLOCK();
	if (!istack && waslocked)
	    AFS_GUNLOCK();
#endif
	if ((code =
	     osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
			 istack)) != 0) {
	    /* send failed, so let's hurry up the resend, eh? */
	    MUTEX_ENTER(&rx_stats_mutex);
	    rx_stats.netSendFailures++;
	    MUTEX_EXIT(&rx_stats_mutex);
	    for (i = 0; i < len; i++) {
		p = list[i];
		p->retryTime = p->timeSent;	/* resend it very soon */
		clock_Addmsec(&(p->retryTime),
			      10 + (((afs_uint32) p->backoff) << 8));
	    }
#if defined(KERNEL) && defined(AFS_LINUX20_ENV)
	    /* Linux is nice -- it can tell us right away that we cannot
	     * reach this recipient by returning an ENETUNREACH error
	     * code.  So, when this happens let's "down" the host NOW so
	     * we don't sit around waiting for this host to timeout later.
	     */
	    if (call && code == -ENETUNREACH)
		call->lastReceiveTime = 0;
#endif
	}
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
	if (!istack && waslocked)
	    AFS_GLOCK();
#endif
#ifdef RXDEBUG
    }

    dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (int)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
#endif
    MUTEX_ENTER(&rx_stats_mutex);
    rx_stats.packetsSent[p->header.type - 1]++;
    MUTEX_EXIT(&rx_stats_mutex);
    MUTEX_ENTER(&peer->peer_lock);
    hadd32(peer->bytesSent, p->length);
    MUTEX_EXIT(&peer->peer_lock);
}
/* Send a "special" packet to the peer connection.  If call is
 * specified, then the packet is directed to a specific call channel
 * associated with the connection, otherwise it is directed to the
 * connection only. Uses optionalPacket if it is supplied, rather than
 * allocating a new packet buffer.  Nbytes is the length of the data
 * portion of the packet.  If data is non-null, nbytes of data are
 * copied into the packet.  Type is the type of the packet, as defined
 * in rx.h.  Bug: there's a lot of duplication between this and other
 * routines.  This needs to be cleaned up. */
struct rx_packet *
rxi_SendSpecial(register struct rx_call *call,
		register struct rx_connection *conn,
		struct rx_packet *optionalPacket, int type, char *data,
		int nbytes, int istack)
{
    /* Some of the following stuff should be common code for all
     * packet sends (it's repeated elsewhere) */
    register struct rx_packet *p;
    int i = 0;
    int savelen = 0, saven = 0;
    int channel, callNumber;

    if (call) {
	channel = call->channel;
	callNumber = *call->callNumber;
	/* BUSY packets refer to the next call on this connection */
	if (type == RX_PACKET_TYPE_BUSY) {
	    callNumber++;
	}
    } else {
	channel = 0;
	callNumber = 0;
    }

    if (optionalPacket) {
	p = optionalPacket;	/* use the packet supplied by the caller */
    } else {
	p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
	if (!p)
	    osi_Panic("rxi_SendSpecial failure");
    }
    p->length = nbytes;

    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;
    p->header.cid = (conn->cid | channel);
    p->header.callNumber = callNumber;
    p->header.seq = 0;
    p->header.epoch = conn->epoch;
    p->header.type = type;
    p->header.flags = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
	p->header.flags |= RX_CLIENT_INITIATED;
    if (data)
	rx_packetwrite(p, 0, nbytes, data);

    for (i = 1; i < p->niovecs; i++) {
	if (nbytes <= p->wirevec[i].iov_len) {
	    savelen = p->wirevec[i].iov_len;
	    saven = p->niovecs;
	    p->wirevec[i].iov_len = nbytes;
	    p->niovecs = i + 1;	/* so condition fails because i == niovecs */
	} else
	    nbytes -= p->wirevec[i].iov_len;
    }

    if (call)
	rxi_Send(call, p, istack);
    else
	rxi_SendPacket((struct rx_call *)0, conn, p, istack);
    if (saven) {		/* means we truncated the packet above.  We probably don't  */
	/* really need to do this, but it seems safer this way, given that  */
	/* sneaky optionalPacket... */
	p->wirevec[i - 1].iov_len = savelen;
	p->niovecs = saven;
    }
    if (!optionalPacket)
	rxi_FreePacket(p);
    return optionalPacket;
}
/* Encode the packet's header (from the struct header in the packet to
 * the net byte order representation in the wire representation of the
 * packet, which is what is actually sent out on the wire) */
void
rxi_EncodePacketHeader(register struct rx_packet *p)
{
    register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */

    memset((char *)buf, 0, RX_HEADER_SIZE);
    *buf++ = htonl(p->header.epoch);
    *buf++ = htonl(p->header.cid);
    *buf++ = htonl(p->header.callNumber);
    *buf++ = htonl(p->header.seq);
    *buf++ = htonl(p->header.serial);
    *buf++ = htonl((((afs_uint32) p->header.type) << 24)
		   | (((afs_uint32) p->header.flags) << 16)
		   | (p->header.userStatus << 8) | p->header.securityIndex);
    /* Note: top 16 bits of this next word were reserved */
    *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
}

/* Decode the packet's header (from net byte order to a struct header) */
void
rxi_DecodePacketHeader(register struct rx_packet *p)
{
    register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */
    afs_uint32 temp;

    p->header.epoch = ntohl(*buf);
    buf++;
    p->header.cid = ntohl(*buf);
    buf++;
    p->header.callNumber = ntohl(*buf);
    buf++;
    p->header.seq = ntohl(*buf);
    buf++;
    p->header.serial = ntohl(*buf);
    buf++;

    temp = ntohl(*buf);
    buf++;

    /* C will truncate byte fields to bytes for me */
    p->header.type = temp >> 24;
    p->header.flags = temp >> 16;
    p->header.userStatus = temp >> 8;
    p->header.securityIndex = temp >> 0;

    temp = ntohl(*buf);
    buf++;

    p->header.serviceId = (temp & 0xffff);
    p->header.spare = temp >> 16;
    /* Note: top 16 bits of this last word are the security checksum */
}
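/* Round-trip sketch (illustrative): the fifth header word packs four
 * byte-wide fields, and encode/decode are exact inverses: */
#if 0
afs_uint32 w = (((afs_uint32) p->header.type) << 24)
    | (((afs_uint32) p->header.flags) << 16)
    | (((afs_uint32) p->header.userStatus) << 8)
    | (afs_uint32) p->header.securityIndex;
/* after ntohl(htonl(w)):
 *   (u_char)(w >> 24) == p->header.type
 *   (u_char)(w >> 16) == p->header.flags
 *   (u_char)(w >> 8)  == p->header.userStatus
 *   (u_char)(w)       == p->header.securityIndex */
#endif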
void
rxi_PrepareSendPacket(register struct rx_call *call,
		      register struct rx_packet *p, register int last)
{
    register struct rx_connection *conn = call->conn;
    int i, j;
    ssize_t len;		/* len must be a signed type; it can go negative */

    p->flags &= ~RX_PKTFLAG_ACKED;
    p->header.cid = (conn->cid | call->channel);
    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;
    p->header.callNumber = *call->callNumber;
    p->header.seq = call->tnext++;
    p->header.epoch = conn->epoch;
    p->header.type = RX_PACKET_TYPE_DATA;
    p->header.flags = 0;
    p->header.spare = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
	p->header.flags |= RX_CLIENT_INITIATED;

    if (last)
	p->header.flags |= RX_LAST_PACKET;

    clock_Zero(&p->retryTime);	/* Never yet transmitted */
    clock_Zero(&p->firstSent);	/* Never yet transmitted */
    p->header.serial = 0;	/* Another way of saying never transmitted... */

    /* Now that we're sure this is the last data on the call, make sure
     * that the "length" and the sum of the iov_lens matches. */
    len = p->length + call->conn->securityHeaderSize;

    for (i = 1; i < p->niovecs && len > 0; i++) {
	len -= p->wirevec[i].iov_len;
    }
    if (len > 0) {
	osi_Panic("PrepareSendPacket 1\n");	/* MTUXXX */
    } else {
	/* Free any extra elements in the wirevec */
	for (j = MAX(2, i); j < p->niovecs; j++) {
	    rxi_freeCBuf(RX_CBUF_TO_PACKET(p->wirevec[j].iov_base, p));
	}
	p->niovecs = i;
	p->wirevec[i - 1].iov_len += len;
    }
    RXS_PreparePacket(conn->securityObject, call, p);
}
/* Given an interface MTU size, calculate an adjusted MTU size that
 * will make efficient use of the RX buffers when the peer is sending
 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
int
rxi_AdjustIfMTU(int mtu)
{
    int adjMTU;
    int frags;

    adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
    if (mtu <= adjMTU) {
	return mtu;
    }
    mtu -= adjMTU;
    if (mtu <= 0) {
	return adjMTU;
    }
    frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
    return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
}
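/* Worked example (assuming the customary sizes RX_HEADER_SIZE = 28,
 * RX_JUMBOBUFFERSIZE = 1412, RX_JUMBOHEADERSIZE = 4, so adjMTU = 1444):
 *   rxi_AdjustIfMTU(1500) -> 1500 - 1444 = 56 leftover, frags = 0, result 1444
 *   rxi_AdjustIfMTU(9000) -> 7556 leftover, frags = 7556/1416 = 5,
 *                            result 1444 + 5*1416 = 8524
 * i.e. the MTU is rounded down so jumbogram chunks fill whole cbufs. */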
/* Given an interface MTU size, and the peer's advertised max receive
 * size, calculate an adjusted maxMTU size that makes efficient use
 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
int
rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
{
    int maxMTU = mtu * rxi_nSendFrags;
    maxMTU = MIN(maxMTU, peerMaxMTU);
    return rxi_AdjustIfMTU(maxMTU);
}

/* Given a packet size, figure out how many datagram packets will fit.
 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
int
rxi_AdjustDgramPackets(int frags, int mtu)
{
    int maxMTU;

    if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
	return 1;
    }
    maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
    maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
    /* subtract the size of the first and last packets */
    maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
    if (maxMTU < 0) {
	return 1;
    }
    return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
}
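/* Worked example (illustrative; assumes UDP_HDR_SIZE = 8 and ignores the
 * RX_MAX_PACKET_SIZE clamp): rxi_AdjustDgramPackets(4, 1500) gives
 *   maxMTU = 4*1508 - 8 = 6024
 *   6024 - (28 + 2*1412 + 4) = 3168
 *   2 + 3168/1416 = 2 + 2 = 4 packets per datagram. */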