2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
12 #include "afs/param.h"
14 #include <afs/param.h>
22 #include "afs/sysincludes.h"
23 #include "afsincludes.h"
24 #include "rx/rx_kcommon.h"
25 #include "rx/rx_clock.h"
26 #include "rx/rx_queue.h"
27 #include "rx/rx_packet.h"
28 #else /* defined(UKERNEL) */
29 #ifdef RX_KERNEL_TRACE
30 #include "../rx/rx_kcommon.h"
33 #ifndef AFS_LINUX20_ENV
36 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
37 #include "afs/sysincludes.h"
39 #if defined(AFS_OBSD_ENV)
43 #if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
44 #if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
45 #include "sys/mount.h" /* it gets pulled in by something later anyway */
49 #include "netinet/in.h"
50 #include "afs/afs_osi.h"
51 #include "rx_kmutex.h"
52 #include "rx/rx_clock.h"
53 #include "rx/rx_queue.h"
55 #include <sys/sysmacros.h>
57 #include "rx/rx_packet.h"
58 #endif /* defined(UKERNEL) */
59 #include "rx/rx_globals.h"
61 #include "sys/types.h"
64 #if defined(AFS_NT40_ENV) || defined(AFS_DJGPP_ENV)
68 #include <sys/socket.h>
69 #include <netinet/in.h>
70 #endif /* AFS_NT40_ENV */
71 #include "rx_xmit_nt.h"
74 #include <sys/socket.h>
75 #include <netinet/in.h>
81 #include <sys/sysmacros.h>
83 #include "rx_packet.h"
84 #include "rx_globals.h"
/*
 * File-scope state and forward declarations for the rx packet module.
 * rx_mallocedP tracks the most recent bulk packet allocation (see
 * rxi_MorePackets*); the rxi_SendDebugPacket prototype is used by the
 * debug/version responders below.
 * NOTE(review): fragment — interior lines are missing from this copy
 * (embedded original line numbers are non-contiguous); consult the full
 * source before editing.
 */
100 /* rxdb_fileID is used to identify the lock location, along with line#. */
101 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
102 #endif /* RX_LOCKS_DB */
103 struct rx_packet *rx_mallocedP = 0;
105 extern char cml_version_number[];
106 extern int (*rx_almostSent) ();
108 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
109 afs_int32 ahost, short aport,
/*
 * rx_SlowGetInt32 -- slow-path fetch of a 32-bit word at byte `offset`
 * within a packet whose data may span several iovecs.  Walks
 * wirevec[1..niovecs), accumulating iovec lengths in `l`, until the iovec
 * containing `offset` is reached, then reads the word at (offset - l)
 * within that iovec.  Callers guarantee word-aligned buffers and offsets
 * (see the comment below).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
112 /* some rules about packets:
113 * 1. When a packet is allocated, the final iov_buf contains room for
114 * a security trailer, but iov_len masks that fact. If the security
115 * package wants to add the trailer, it may do so, and then extend
116 * iov_len appropriately. For this reason, packet's niovecs and
117 * iov_len fields should be accurate before calling PreparePacket.
121 * all packet buffers (iov_base) are integral multiples of
123 * offset is an integral multiple of the word size.
126 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
130 for (l = 0, i = 1; i < packet->niovecs; i++) {
131 if (l + packet->wirevec[i].iov_len > offset) {
133 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
136 l += packet->wirevec[i].iov_len;
/*
 * rx_SlowPutInt32 -- slow-path store counterpart of rx_SlowGetInt32:
 * writes `data` as a 32-bit word at byte `offset`, locating the owning
 * iovec by the same linear walk over wirevec[1..niovecs).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
143 * all packet buffers (iov_base) are integral multiples of the word size.
144 * offset is an integral multiple of the word size.
147 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
151 for (l = 0, i = 1; i < packet->niovecs; i++) {
152 if (l + packet->wirevec[i].iov_len > offset) {
153 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
154 (offset - l))) = data;
157 l += packet->wirevec[i].iov_len;
/*
 * rx_SlowReadPacket -- copy up to `resid` bytes starting at byte `offset`
 * of the packet into `out`.  First loop finds the iovec containing
 * `offset`; second loop copies MIN(remaining, space-in-iovec) per iovec
 * until `resid` is exhausted or the packet runs out of iovecs.  Returns
 * the number of bytes actually copied (r - resid when short).
 * NOTE(review): fragment — interior lines are missing from this copy
 * (including the `r = resid` initialization and out/offset updates);
 * consult the full source before editing.
 */
164 * all packet buffers (iov_base) are integral multiples of the
166 * offset is an integral multiple of the word size.
168 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
171 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
174 unsigned int i, j, l, r;
175 for (l = 0, i = 1; i < packet->niovecs; i++) {
176 if (l + packet->wirevec[i].iov_len > offset) {
179 l += packet->wirevec[i].iov_len;
182 /* i is the iovec which contains the first little bit of data in which we
183 * are interested. l is the total length of everything prior to this iovec.
184 * j is the number of bytes we can safely copy out of this iovec.
187 while ((resid > 0) && (i < packet->niovecs)) {
188 j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
189 memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
191 l += packet->wirevec[i].iov_len;
195 return (resid ? (r - resid) : r);
/*
 * rx_SlowWritePacket -- copy up to `resid` bytes from `in` into the packet
 * starting at byte `offset`.  Unlike the read path, this may grow the
 * packet: when the walk runs past niovecs it calls rxi_AllocDataBuf to
 * attach continuation buffers (which bumps niovecs as a side effect) and
 * stops at RX_MAXWVECS.  Returns the number of bytes actually written.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
200 * all packet buffers (iov_base) are integral multiples of the
202 * offset is an integral multiple of the word size.
205 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
210 for (l = 0, i = 1; i < packet->niovecs; i++) {
211 if (l + packet->wirevec[i].iov_len > offset) {
214 l += packet->wirevec[i].iov_len;
217 /* i is the iovec which contains the first little bit of data in which we
218 * are interested. l is the total length of everything prior to this iovec.
219 * j is the number of bytes we can safely copy out of this iovec.
222 while ((resid > 0) && (i < RX_MAXWVECS)) {
223 if (i >= packet->niovecs)
224 if (rxi_AllocDataBuf(packet, resid, RX_PACKET_CLASS_SEND_CBUF) > 0) /* ++niovecs as a side-effect */
227 b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
228 j = MIN(resid, packet->wirevec[i].iov_len - (offset - l));
231 l += packet->wirevec[i].iov_len;
235 return (resid ? (r - resid) : r);
/*
 * Allocate a packet from the free queue for use as a continuation buffer.
 * NOTE(review): the line naming this static function is missing from this
 * copy; from the callers visible below (rxi_AllocDataBuf calls allocCBuf,
 * and the comment on the free routine pairs it with allocCBuf) this is
 * presumably allocCBuf(int class) -- confirm against the full source.
 * Over-quota requests for the given allocation class are refused and
 * counted per class under rx_stats_mutex; otherwise the head of
 * rx_freePacketQueue is claimed (replenishing via rxi_MorePacketsNoLock
 * when empty) and its RX_PKTFLAG_FREE flag cleared.
 */
238 static struct rx_packet *
245 MUTEX_ENTER(&rx_freePktQ_lock);
248 if (rxi_OverQuota(class)) {
250 rxi_NeedMorePackets = TRUE;
251 MUTEX_ENTER(&rx_stats_mutex);
253 case RX_PACKET_CLASS_RECEIVE:
254 rx_stats.receivePktAllocFailures++;
256 case RX_PACKET_CLASS_SEND:
257 rx_stats.sendPktAllocFailures++;
259 case RX_PACKET_CLASS_SPECIAL:
260 rx_stats.specialPktAllocFailures++;
262 case RX_PACKET_CLASS_RECV_CBUF:
263 rx_stats.receiveCbufPktAllocFailures++;
265 case RX_PACKET_CLASS_SEND_CBUF:
266 rx_stats.sendCbufPktAllocFailures++;
269 MUTEX_EXIT(&rx_stats_mutex);
273 if (queue_IsEmpty(&rx_freePacketQueue)) {
275 rxi_NeedMorePackets = TRUE;
279 if (queue_IsEmpty(&rx_freePacketQueue)) {
280 rxi_MorePacketsNoLock(rx_initSendWindow);
285 c = queue_First(&rx_freePacketQueue, rx_packet);
287 if (!(c->flags & RX_PKTFLAG_FREE))
288 osi_Panic("rxi_AllocPacket: packet not free\n");
289 c->flags = 0; /* clear RX_PKTFLAG_FREE, initialize the rest */
295 MUTEX_EXIT(&rx_freePktQ_lock);
/*
 * rxi_freeCBuf -- return a continuation-buffer packet to the free queue
 * under rx_freePktQ_lock, then wake any packet waiters.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
302 * Free a packet currently used as a continuation buffer
305 rxi_freeCBuf(struct rx_packet *c)
310 MUTEX_ENTER(&rx_freePktQ_lock);
312 rxi_FreePacketNoLock(c);
313 /* Wakeup anyone waiting for packets */
316 MUTEX_EXIT(&rx_freePktQ_lock);
/*
 * rxi_RoundUpPacket -- reclaim `nb` bytes of previously-trimmed space at
 * the end of a packet's last buffer.  The last iovec is extended by `nb`
 * only if it still fits within its backing buffer (RX_FIRSTBUFFERSIZE for
 * the packet's own localdata, RX_CBUFFERSIZE for a continuation buffer).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
320 /* this one is kind of awful.
321 * In rxkad, the packet has been all shortened, and everything, ready for
322 * sending. All of a sudden, we discover we need some of that space back.
323 * This isn't terribly general, because it knows that the packets are only
324 * rounded up to the EBS (userdata + security header).
327 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
331 if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
332 if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
333 p->wirevec[i].iov_len += nb;
337 if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
338 p->wirevec[i].iov_len += nb;
346 /* get sufficient space to store nb bytes of data (or more), and hook
347 * it into the supplied packet. Return nbytes<=0 if successful, otherwise
348 * returns the number of bytes >0 which it failed to come up with.
349 * Don't need to worry about locking on packet, since only
350 * one thread can manipulate one at a time. Locking on continuation
351 * packets is handled by allocCBuf */
352 /* MTUXXX don't need to go through the for loop if we can trust niovecs */
/*
 * rxi_AllocDataBuf -- attach continuation buffers to packet `p` until at
 * least `nb` more bytes of space are available or RX_MAXWVECS is reached.
 * Each buffer comes from allocCBuf(class) and contributes RX_CBUFFERSIZE
 * bytes to p->length.  Per the contract above, returns <= 0 on full
 * success, otherwise the number of bytes it could not provide.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
354 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
358 for (i = p->niovecs; nb > 0 && i < RX_MAXWVECS; i++) {
359 register struct rx_packet *cb;
360 if ((cb = allocCBuf(class))) {
361 p->wirevec[i].iov_base = (caddr_t) cb->localdata;
362 p->wirevec[i].iov_len = RX_CBUFFERSIZE;
363 nb -= RX_CBUFFERSIZE;
364 p->length += RX_CBUFFERSIZE;
/*
 * rxi_MorePackets -- bulk-allocate `apackets` new rx_packet structures in
 * one osi_Alloc, zero them, initialize each packet's header/body iovecs,
 * mark them RX_PKTFLAG_FREE, and append them to rx_freePacketQueue under
 * rx_freePktQ_lock.  Also updates rx_nFreePackets and clears
 * rxi_NeedMorePackets.
 * NOTE(review): rx_mallocedP is overwritten with the newest allocation,
 * so only the latest chunk is remembered (see rxi_FreeAllPackets).
 * Fragment — interior lines are missing from this copy; consult the full
 * source before editing.
 */
373 /* Add more packet buffers */
375 rxi_MorePackets(int apackets)
377 struct rx_packet *p, *e;
381 getme = apackets * sizeof(struct rx_packet);
382 p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);
384 PIN(p, getme); /* XXXXX */
385 memset((char *)p, 0, getme);
388 MUTEX_ENTER(&rx_freePktQ_lock);
390 for (e = p + apackets; p < e; p++) {
391 p->wirevec[0].iov_base = (char *)(p->wirehead);
392 p->wirevec[0].iov_len = RX_HEADER_SIZE;
393 p->wirevec[1].iov_base = (char *)(p->localdata);
394 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
395 p->flags |= RX_PKTFLAG_FREE;
398 queue_Append(&rx_freePacketQueue, p);
400 rx_nFreePackets += apackets;
401 rxi_NeedMorePackets = FALSE;
405 MUTEX_EXIT(&rx_freePktQ_lock);
/*
 * rxi_MorePacketsNoLock -- same bulk allocation as rxi_MorePackets but the
 * caller already holds rx_freePktQ_lock.  The request is first inflated so
 * that roughly 1/4 of the new packets can carry a maximal jumbo payload's
 * worth of continuation buffers.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
410 /* Add more packet buffers */
412 rxi_MorePacketsNoLock(int apackets)
414 struct rx_packet *p, *e;
417 /* allocate enough packets that 1/4 of the packets will be able
418 * to hold maximal amounts of data */
419 apackets += (apackets / 4)
420 * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
421 getme = apackets * sizeof(struct rx_packet);
422 p = rx_mallocedP = (struct rx_packet *)osi_Alloc(getme);
424 memset((char *)p, 0, getme);
426 for (e = p + apackets; p < e; p++) {
427 p->wirevec[0].iov_base = (char *)(p->wirehead);
428 p->wirevec[0].iov_len = RX_HEADER_SIZE;
429 p->wirevec[1].iov_base = (char *)(p->localdata);
430 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
431 p->flags |= RX_PKTFLAG_FREE;
434 queue_Append(&rx_freePacketQueue, p);
436 rx_nFreePackets += apackets;
437 rxi_NeedMorePackets = FALSE;
/*
 * rxi_FreeAllPackets -- release the bulk packet allocation tracked by
 * rx_mallocedP.  As the MTUXXX comment notes, this only frees the last
 * chunk (sized from rx_maxReceiveWindow), not every chunk ever allocated.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
443 rxi_FreeAllPackets(void)
445 /* must be called at proper interrupt level, etcetera */
446 /* MTUXXX need to free all Packets */
447 osi_Free(rx_mallocedP,
448 (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
449 UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
/*
 * rx_CheckPackets -- polled replenishment: if a prior allocation failure
 * set rxi_NeedMorePackets, grow the free pool by rx_initSendWindow
 * packets.  Used where the event mechanism cannot be (see comment below).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
452 /* Allocate more packets iff we need more continuation buffers */
453 /* In kernel, can't page in memory with interrupts disabled, so we
454 * don't use the event mechanism. */
456 rx_CheckPackets(void)
458 if (rxi_NeedMorePackets) {
459 rxi_MorePackets(rx_initSendWindow);
/*
 * rxi_FreePacketNoLock -- return packet `p` to the tail of
 * rx_freePacketQueue; caller holds rx_freePktQ_lock.  Panics on double
 * free (RX_PKTFLAG_FREE already set).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
463 /* In the packet freeing routine below, the assumption is that
464 we want all of the packets to be used equally frequently, so that we
465 don't get packet buffers paging out. It would be just as valid to
466 assume that we DO want them to page out if not many are being used.
467 In any event, we assume the former, and append the packets to the end
469 /* This explanation is bogus. The free list doesn't remain in any kind of
470 useful order for afs_int32: the packets in use get pretty much randomly scattered
471 across all the pages. In order to permit unused {packets,bufs} to page out, they
472 must be stored so that packets which are adjacent in memory are adjacent in the
473 free list. An array springs rapidly to mind.
476 /* Actually free the packet p. */
478 rxi_FreePacketNoLock(struct rx_packet *p)
480 dpf(("Free %x\n", (int)p));
482 if (p->flags & RX_PKTFLAG_FREE)
483 osi_Panic("rxi_FreePacketNoLock: packet already free\n");
485 p->flags |= RX_PKTFLAG_FREE;
486 queue_Append(&rx_freePacketQueue, p);
/*
 * rxi_FreeDataBufsNoLock -- free every continuation buffer attached to
 * `p` beyond iovec `first` (which the MTUXXX checks pin to 1, the
 * localdata buffer).  Each continuation iovec base is mapped back to its
 * owning rx_packet via RX_CBUF_TO_PACKET and freed; caller holds
 * rx_freePktQ_lock.
 * NOTE(review): fragment — interior lines are missing from this copy
 * (including the niovecs reset after the loop); consult the full source
 * before editing.
 */
490 rxi_FreeDataBufsNoLock(struct rx_packet *p, int first)
492 struct iovec *iov, *end;
494 if (first != 1) /* MTUXXX */
495 osi_Panic("FreeDataBufs 1: first must be 1");
496 iov = &p->wirevec[1];
497 end = iov + (p->niovecs - 1);
498 if (iov->iov_base != (caddr_t) p->localdata) /* MTUXXX */
499 osi_Panic("FreeDataBufs 2: vec 1 must be localdata");
500 for (iov++; iov < end; iov++) {
502 osi_Panic("FreeDataBufs 3: vecs 2-niovecs must not be NULL");
503 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
/*
 * rxi_RestoreDataBufs -- reset all iovec lengths (and the two fixed base
 * pointers) to their full buffer sizes so the packet can be reused for a
 * wire read.  Continuation iovecs with a NULL base are counted in the
 * rxi_nBadIovecs diagnostic counter.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
511 int rxi_nBadIovecs = 0;
513 /* rxi_RestoreDataBufs
515 * Restore the correct sizes to the iovecs. Called when reusing a packet
516 * for reading off the wire.
519 rxi_RestoreDataBufs(struct rx_packet *p)
522 struct iovec *iov = &p->wirevec[2];
524 p->wirevec[0].iov_base = (char *)(p->wirehead);
525 p->wirevec[0].iov_len = RX_HEADER_SIZE;
526 p->wirevec[1].iov_base = (char *)(p->localdata);
527 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
529 for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
530 if (!iov->iov_base) {
535 iov->iov_len = RX_CBUFFERSIZE;
/*
 * rxi_TrimDataBufs -- free the continuation buffers at the tail of `p`
 * that hold no message data.  Skips forward past iovecs still covered by
 * p->length (measured beyond the localdata buffer), then frees the rest
 * under rx_freePktQ_lock.
 * NOTE(review): fragment — interior lines are missing from this copy
 * (including the niovecs adjustment); consult the full source before
 * editing.
 */
540 rxi_TrimDataBufs(struct rx_packet *p, int first)
543 struct iovec *iov, *end;
547 osi_Panic("TrimDataBufs 1: first must be 1");
549 /* Skip over continuation buffers containing message data */
550 iov = &p->wirevec[2];
551 end = iov + (p->niovecs - 2);
552 length = p->length - p->wirevec[1].iov_len;
553 for (; iov < end && length > 0; iov++) {
555 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
556 length -= iov->iov_len;
559 /* iov now points to the first empty data buffer. */
564 MUTEX_ENTER(&rx_freePktQ_lock);
566 for (; iov < end; iov++) {
568 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
569 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
574 MUTEX_EXIT(&rx_freePktQ_lock);
/*
 * rxi_FreePacket -- locked wrapper: free `p`'s continuation buffers and
 * then `p` itself under rx_freePktQ_lock, waking packet waiters.  Caller
 * must have removed `p` from any queue first (see comment below).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
580 /* Free the packet p. P is assumed not to be on any queue, i.e.
581 * remove it yourself first if you call this routine. */
583 rxi_FreePacket(struct rx_packet *p)
588 MUTEX_ENTER(&rx_freePktQ_lock);
590 rxi_FreeDataBufsNoLock(p, 1);
591 rxi_FreePacketNoLock(p);
592 /* Wakeup anyone waiting for packets */
595 MUTEX_EXIT(&rx_freePktQ_lock);
/*
 * rxi_AllocPacketNoLock -- allocate one packet from the free queue; caller
 * holds rx_freePktQ_lock.  Over-quota requests are refused with per-class
 * failure counters (mirroring allocCBuf); otherwise the head of the free
 * queue is claimed, its flags cleared, and its header/body iovecs reset so
 * p->length == RX_FIRSTBUFFERSIZE (user data only, header excluded -- see
 * comment above).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
600 /* rxi_AllocPacket sets up p->length so it reflects the number of
601 * bytes in the packet at this point, **not including** the header.
602 * The header is absolutely necessary, besides, this is the way the
603 * length field is usually used */
605 rxi_AllocPacketNoLock(int class)
607 register struct rx_packet *p;
610 if (rxi_OverQuota(class)) {
611 rxi_NeedMorePackets = TRUE;
612 MUTEX_ENTER(&rx_stats_mutex);
614 case RX_PACKET_CLASS_RECEIVE:
615 rx_stats.receivePktAllocFailures++;
617 case RX_PACKET_CLASS_SEND:
618 rx_stats.sendPktAllocFailures++;
620 case RX_PACKET_CLASS_SPECIAL:
621 rx_stats.specialPktAllocFailures++;
623 case RX_PACKET_CLASS_RECV_CBUF:
624 rx_stats.receiveCbufPktAllocFailures++;
626 case RX_PACKET_CLASS_SEND_CBUF:
627 rx_stats.sendCbufPktAllocFailures++;
630 MUTEX_EXIT(&rx_stats_mutex);
631 return (struct rx_packet *)0;
635 MUTEX_ENTER(&rx_stats_mutex);
636 rx_stats.packetRequests++;
637 MUTEX_EXIT(&rx_stats_mutex);
640 if (queue_IsEmpty(&rx_freePacketQueue))
641 osi_Panic("rxi_AllocPacket error");
643 if (queue_IsEmpty(&rx_freePacketQueue))
644 rxi_MorePacketsNoLock(rx_initSendWindow);
648 p = queue_First(&rx_freePacketQueue, rx_packet);
649 if (!(p->flags & RX_PKTFLAG_FREE))
650 osi_Panic("rxi_AllocPacket: packet not free\n");
652 dpf(("Alloc %x, class %d\n", (int)p, class));
655 p->flags = 0; /* clear RX_PKTFLAG_FREE, initialize the rest */
658 /* have to do this here because rx_FlushWrite fiddles with the iovs in
659 * order to truncate outbound packets. In the near future, may need
660 * to allocate bufs from a static pool here, and/or in AllocSendPacket
662 p->wirevec[0].iov_base = (char *)(p->wirehead);
663 p->wirevec[0].iov_len = RX_HEADER_SIZE;
664 p->wirevec[1].iov_base = (char *)(p->localdata);
665 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
667 p->length = RX_FIRSTBUFFERSIZE;
/*
 * rxi_AllocPacket -- public wrapper that takes rx_freePktQ_lock around
 * rxi_AllocPacketNoLock and returns its result.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
672 rxi_AllocPacket(int class)
674 register struct rx_packet *p;
676 MUTEX_ENTER(&rx_freePktQ_lock);
677 p = rxi_AllocPacketNoLock(class);
678 MUTEX_EXIT(&rx_freePktQ_lock);
/*
 * rxi_AllocSendPacket -- allocate a send packet for `call` sized toward
 * `want` bytes of user data, capped by the call MTU minus the header and
 * the connection's security header/trailer overhead (`mud`).  Extra space
 * comes from rxi_AllocDataBuf; excess beyond `mud` is trimmed back.  If no
 * packet is available and the call has no error, the caller sleeps on the
 * free queue (CV or osi_rxSleep depending on RX_ENABLE_LOCKS), dropping
 * and re-taking call->lock around the wait with a reference held.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
682 /* This guy comes up with as many buffers as it {takes,can get} given
683 * the MTU for this call. It also sets the packet length before
684 * returning. caution: this is often called at NETPRI
685 * Called with call locked.
688 rxi_AllocSendPacket(register struct rx_call *call, int want)
690 register struct rx_packet *p = (struct rx_packet *)0;
692 register unsigned delta;
695 mud = call->MTU - RX_HEADER_SIZE;
697 rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
698 rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
700 while (!(call->error)) {
701 MUTEX_ENTER(&rx_freePktQ_lock);
702 /* if an error occurred, or we get the packet we want, we're done */
703 if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
704 MUTEX_EXIT(&rx_freePktQ_lock);
707 want = MIN(want, mud);
709 if ((unsigned)want > p->length)
710 (void)rxi_AllocDataBuf(p, (want - p->length),
711 RX_PACKET_CLASS_SEND_CBUF);
713 if ((unsigned)p->length > mud)
716 if (delta >= p->length) {
725 /* no error occurred, and we didn't get a packet, so we sleep.
726 * At this point, we assume that packets will be returned
727 * sooner or later, as packets are acknowledged, and so we
730 call->flags |= RX_CALL_WAIT_PACKETS;
731 CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
732 MUTEX_EXIT(&call->lock);
733 rx_waitingForPackets = 1;
735 #ifdef RX_ENABLE_LOCKS
736 CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
738 osi_rxSleep(&rx_waitingForPackets);
740 MUTEX_EXIT(&rx_freePktQ_lock);
741 MUTEX_ENTER(&call->lock);
742 CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
743 call->flags &= ~RX_CALL_WAIT_PACKETS;
/*
 * CountFDs -- count open file descriptors in [0, amax) by fstat()ing each
 * candidate (userspace build only; the kernel build defines it away
 * below).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
752 /* count the number of used FDs */
754 CountFDs(register int amax)
757 register int i, code;
761 for (i = 0; i < amax; i++) {
762 code = fstat(i, &tstat);
/*
 * rxi_ReadPacket -- receive one datagram from `socket` into packet `p`
 * via recvmsg over p->wirevec.  Sender (host, port) are returned through
 * the out parameters and the header is decoded in place.  Returns 0 for a
 * bogus packet (per the contract comment below).  Details visible here:
 *  - grows the packet toward rx_maxJumboRecvSize (the advertised size)
 *    with rxi_AllocDataBuf before the read;
 *  - temporarily extends the final iovec by RX_EXTRABUFFERSIZE to absorb
 *    overlong datagrams (rx headers carry no length field), restoring it
 *    afterward;
 *  - classifies short/overlong reads and EWOULDBLOCK into the bogus/
 *    noPacketOnRead statistics;
 *  - on success bumps per-type packetsRead and, when a peer struct
 *    already exists (deliberately not created here -- see comment),
 *    credits bytesReceived under the peer lock;
 *  - trims unused trailing continuation buffers before returning.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
771 #define CountFDs(amax) amax
775 #if !defined(KERNEL) || defined(UKERNEL)
777 /* This function reads a single packet from the interface into the
778 * supplied packet buffer (*p). Return 0 if the packet is bogus. The
779 * (host,port) of the sender are stored in the supplied variables, and
780 * the data length of the packet is stored in the packet structure.
781 * The header is decoded. */
783 rxi_ReadPacket(int socket, register struct rx_packet *p, afs_uint32 * host,
786 struct sockaddr_in from;
789 register afs_int32 tlen, savelen;
791 rx_computelen(p, tlen);
792 rx_SetDataSize(p, tlen); /* this is the size of the user data area */
794 tlen += RX_HEADER_SIZE; /* now this is the size of the entire packet */
795 rlen = rx_maxJumboRecvSize; /* this is what I am advertising. Only check
796 * it once in order to avoid races. */
799 tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
807 /* Extend the last iovec for padding, it's just to make sure that the
808 * read doesn't return more data than we expect, and is done to get around
809 * our problems caused by the lack of a length field in the rx header.
810 * Use the extra buffer that follows the localdata in each packet
812 savelen = p->wirevec[p->niovecs - 1].iov_len;
813 p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
815 memset((char *)&msg, 0, sizeof(msg));
816 msg.msg_name = (char *)&from;
817 msg.msg_namelen = sizeof(struct sockaddr_in);
818 msg.msg_iov = p->wirevec;
819 msg.msg_iovlen = p->niovecs;
820 nbytes = rxi_Recvmsg(socket, &msg, 0);
822 /* restore the vec to its correct state */
823 p->wirevec[p->niovecs - 1].iov_len = savelen;
825 p->length = (nbytes - RX_HEADER_SIZE);
826 if ((nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
828 rxi_MorePackets(rx_initSendWindow);
830 else if (nbytes < 0 && errno == EWOULDBLOCK) {
831 MUTEX_ENTER(&rx_stats_mutex);
832 rx_stats.noPacketOnRead++;
833 MUTEX_EXIT(&rx_stats_mutex);
837 MUTEX_ENTER(&rx_stats_mutex);
838 rx_stats.bogusPacketOnRead++;
839 rx_stats.bogusHost = from.sin_addr.s_addr;
840 MUTEX_EXIT(&rx_stats_mutex);
841 dpf(("B: bogus packet from [%x,%d] nb=%d", from.sin_addr.s_addr,
842 from.sin_port, nbytes));
846 /* Extract packet header. */
847 rxi_DecodePacketHeader(p);
849 *host = from.sin_addr.s_addr;
850 *port = from.sin_port;
851 if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
852 struct rx_peer *peer;
853 MUTEX_ENTER(&rx_stats_mutex);
854 rx_stats.packetsRead[p->header.type - 1]++;
855 MUTEX_EXIT(&rx_stats_mutex);
857 * Try to look up this peer structure. If it doesn't exist,
858 * don't create a new one -
859 * we don't keep count of the bytes sent/received if a peer
860 * structure doesn't already exist.
862 * The peer/connection cleanup code assumes that there is 1 peer
863 * per connection. If we actually created a peer structure here
864 * and this packet was an rxdebug packet, the peer structure would
865 * never be cleaned up.
867 peer = rxi_FindPeer(*host, *port, 0, 0);
869 MUTEX_ENTER(&peer->peer_lock);
870 hadd32(peer->bytesReceived, p->length);
871 MUTEX_EXIT(&peer->peer_lock);
875 /* Free any empty packet buffers at the end of this packet */
876 rxi_TrimDataBufs(p, 1);
882 #endif /* !KERNEL || UKERNEL */
/*
 * rxi_SplitJumboPacket -- peel the first RX_JUMBOBUFFERSIZE packet off a
 * jumbogram `p`, building the follow-on packet `np` from p's continuation
 * buffers (RX_CBUF_TO_PACKET on wirevec[2]).  The four-byte abbreviated
 * jumbo header sitting after the first buffer supplies np's flags and
 * spare/cksum fields; np's serial and seq are p's plus one.  After the
 * split, p->length is exactly RX_JUMBOBUFFERSIZE and np gets the
 * remainder.  Validates both p->length and p->niovecs before touching
 * anything.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
884 /* This function splits off the first packet in a jumbo packet.
885 * As of AFS 3.5, jumbograms contain more than one fixed size
886 * packet, and the RX_JUMBO_PACKET flag is set in all but the
887 * last packet header. All packets (except the last) are padded to
888 * fall on RX_CBUFFERSIZE boundaries.
889 * HACK: We store the length of the first n-1 packets in the
890 * last two pad bytes. */
893 rxi_SplitJumboPacket(register struct rx_packet *p, afs_int32 host, short port,
896 struct rx_packet *np;
897 struct rx_jumboHeader *jp;
903 /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
904 * bytes in length. All but the first packet are preceded by
905 * an abbreviated four byte header. The length of the last packet
906 * is calculated from the size of the jumbogram. */
907 length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
909 if ((int)p->length < length) {
910 dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
913 niov = p->niovecs - 2;
915 dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
918 iov = &p->wirevec[2];
919 np = RX_CBUF_TO_PACKET(iov->iov_base, p);
921 /* Get a pointer to the abbreviated packet header */
922 jp = (struct rx_jumboHeader *)
923 ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
925 /* Set up the iovecs for the next packet */
926 np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
927 np->wirevec[0].iov_len = sizeof(struct rx_header);
928 np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
929 np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
930 np->niovecs = niov + 1;
931 for (i = 2, iov++; i <= niov; i++, iov++) {
932 np->wirevec[i] = *iov;
934 np->length = p->length - length;
935 p->length = RX_JUMBOBUFFERSIZE;
938 /* Convert the jumbo packet header to host byte order */
939 temp = ntohl(*(afs_uint32 *) jp);
940 jp->flags = (u_char) (temp >> 24);
941 jp->cksum = (u_short) (temp);
943 /* Fill in the packet header */
944 np->header = p->header;
945 np->header.serial = p->header.serial + 1;
946 np->header.seq = p->header.seq + 1;
947 np->header.flags = jp->flags;
948 np->header.spare = jp->cksum;
/*
 * osi_NetSend -- send one UDP datagram: packs the caller's iovec array
 * and destination address into a msghdr and hands it to rxi_Sendmsg.
 * NOTE(review): fragment — interior lines are missing from this copy
 * (msg_name/msg_iov assignments and the return value are not visible);
 * consult the full source before editing.
 */
954 /* Send a udp datagram */
956 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
957 int length, int istack)
961 memset(&msg, 0, sizeof(msg));
963 msg.msg_iovlen = nvecs;
965 msg.msg_namelen = sizeof(struct sockaddr_in);
967 rxi_Sendmsg(socket, &msg, 0);
/*
 * cpytoc (Solaris/HP-UX STREAMS path) -- copy up to `len` bytes out of an
 * mblk chain into the contiguous buffer `cp`, skipping non-M_DATA blocks.
 * As the comment warns, `off` is accepted but not honored.  Returns the
 * number of bytes NOT transferred.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
973 * message receipt is done in rxk_input or rx_put.
976 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
978 * Copy an mblock to the contiguous area pointed to by cp.
979 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
980 * but it doesn't really.
981 * Returns the number of bytes not transferred.
982 * The message is NOT changed.
985 cpytoc(mblk_t * mp, register int off, register int len, register char *cp)
989 for (; mp && len > 0; mp = mp->b_cont) {
990 if (mp->b_datap->db_type != M_DATA) {
993 n = MIN(len, (mp->b_wptr - mp->b_rptr));
994 memcpy(cp, (char *)mp->b_rptr, n);
/*
 * cpytoiovec (STREAMS path) -- like cpytoc but scatters the mblk chain
 * into an iovec array, advancing to the next iovec (`i`, with `t`/`o`
 * tracking its length and fill offset) as each one fills.  `off` is,
 * again, not honored (see MTUXXX note).
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
1002 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1003 * but it doesn't really.
1004 * This sucks, anyway, do it like m_cpy.... below
1007 cpytoiovec(mblk_t * mp, int off, int len, register struct iovec *iovs,
1010 register int m, n, o, t, i;
1012 for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1013 if (mp->b_datap->db_type != M_DATA) {
1016 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1022 t = iovs[i].iov_len;
1025 memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
/*
 * m_cpytoiovec (BSD mbuf path) -- copy `len` bytes starting `off` bytes
 * into mbuf chain `m` out to the iovec array.  First skips whole mbufs
 * covered by `off`, then streams MIN(l1, l2, len)-sized chunks, advancing
 * the mbuf side (p1/l1) and the iovec side (p2/l2) independently.
 * Panics up front on NULL/negative arguments (MTUXXX suggests the check
 * may be unnecessary).  On Solaris/HP-UX the m_cpyto* names alias the
 * STREAMS versions above instead.
 * NOTE(review): fragment — interior lines are missing from this copy;
 * consult the full source before editing.
 */
1035 #define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
1036 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1038 #if !defined(AFS_LINUX20_ENV)
1040 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1043 unsigned int l1, l2, i, t;
1045 if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1046 osi_Panic("m_cpytoiovec"); /* MTUXXX probably don't need this check */
1049 if (m->m_len <= off) {
1059 p1 = mtod(m, caddr_t) + off;
1060 l1 = m->m_len - off;
1062 p2 = iovs[0].iov_base;
1063 l2 = iovs[0].iov_len;
1066 t = MIN(l1, MIN(l2, (unsigned int)len));
1077 p1 = mtod(m, caddr_t);
1083 p2 = iovs[i].iov_base;
1084 l2 = iovs[i].iov_len;
1092 #endif /* AFS_SUN5_ENV */
/*
 * rx_mb_to_packet (K&R-style definition) -- copy `data_len` bytes of
 * payload, skipping `hdr_len` bytes, from a kernel message buffer (mbuf
 * or mblk depending on platform) into phandle's wirevec via
 * m_cpytoiovec.
 * NOTE(review): fragment — interior lines are missing from this copy
 * (parameter declarations and the free/return logic are not visible);
 * consult the full source before editing.
 */
1094 #if !defined(AFS_LINUX20_ENV)
1096 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1097 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1103 struct rx_packet *phandle;
1104 int hdr_len, data_len;
1109 m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1116 #endif /*KERNEL && !UKERNEL */
/*
 * rxi_ReceiveDebugPacket -- handle an incoming rxdebug request and send
 * the reply in-place through the same packet `ap`.  Only client-initiated
 * requests are answered (the flag is cleared in the reply).  The request
 * type/index are read from the packet, and the switch dispatches:
 *  - RX_DEBUGI_GETSTATS: fill struct rx_debugStats (free packets, calls
 *    executed, reclaims, FDs, waiting count, idle threads), growing the
 *    packet with rxi_AllocDataBuf if the reply is larger than ap;
 *  - RX_DEBUGI_GETALLCONN / GETCONN: walk rx_connHashTable under its lock
 *    for the tin.index'th (interesting) connection and marshal a
 *    struct rx_debugConn, including per-call state and htonl/htons'd
 *    security stats via the DOHTONL/DOHTONS helpers; cid 0xffffffff
 *    marks end-of-list;
 *  - RX_DEBUGI_GETPEER: same walk over rx_peerHashTable marshalling a
 *    struct rx_debugPeer; host 0xffffffff marks end-of-list;
 *  - RX_DEBUGI_RXSTATS: dump rx_stats as a flat array of htonl'd int32s
 *    under rx_stats_mutex;
 *  - default: echo RX_DEBUGI_BADTYPE back.
 * Every arm ends with rxi_SendDebugPacket to the requester.
 * NOTE(review): fragment — interior lines are missing from this copy
 * (break statements, some lock drops, and loop bodies are not all
 * visible); consult the full source before editing.
 */
1119 /* send a response to a debug packet */
1122 rxi_ReceiveDebugPacket(register struct rx_packet *ap, osi_socket asocket,
1123 afs_int32 ahost, short aport, int istack)
1125 struct rx_debugIn tin;
1127 struct rx_serverQueueEntry *np, *nqe;
1130 * Only respond to client-initiated Rx debug packets,
1131 * and clear the client flag in the response.
1133 if (ap->header.flags & RX_CLIENT_INITIATED) {
1134 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1135 rxi_EncodePacketHeader(ap);
1140 rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1141 /* all done with packet, now set length to the truth, so we can
1142 * reuse this packet */
1143 rx_computelen(ap, ap->length);
1145 tin.type = ntohl(tin.type);
1146 tin.index = ntohl(tin.index);
1148 case RX_DEBUGI_GETSTATS:{
1149 struct rx_debugStats tstat;
1151 /* get basic stats */
1152 memset((char *)&tstat, 0, sizeof(tstat)); /* make sure spares are zero */
1153 tstat.version = RX_DEBUGI_VERSION;
1154 #ifndef RX_ENABLE_LOCKS
1155 tstat.waitingForPackets = rx_waitingForPackets;
1157 tstat.nFreePackets = htonl(rx_nFreePackets);
1158 tstat.callsExecuted = htonl(rxi_nCalls);
1159 tstat.packetReclaims = htonl(rx_packetReclaims);
1160 tstat.usedFDs = CountFDs(64);
1161 tstat.nWaiting = htonl(rx_nWaiting);
1162 queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1164 tstat.idleThreads = htonl(tstat.idleThreads);
1165 tl = sizeof(struct rx_debugStats) - ap->length;
1167 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1170 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1172 ap->length = sizeof(struct rx_debugStats);
1173 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1174 rx_computelen(ap, ap->length);
1179 case RX_DEBUGI_GETALLCONN:
1180 case RX_DEBUGI_GETCONN:{
1182 register struct rx_connection *tc;
1183 struct rx_call *tcall;
1184 struct rx_debugConn tconn;
1185 int all = (tin.type == RX_DEBUGI_GETALLCONN);
1188 tl = sizeof(struct rx_debugConn) - ap->length;
1190 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1194 memset((char *)&tconn, 0, sizeof(tconn)); /* make sure spares are zero */
1195 /* get N'th (maybe) "interesting" connection info */
1196 for (i = 0; i < rx_hashTableSize; i++) {
1197 #if !defined(KERNEL)
1198 /* the time complexity of the algorithm used here
1199 * exponentially increases with the number of connections.
1201 #ifdef AFS_PTHREAD_ENV
1207 MUTEX_ENTER(&rx_connHashTable_lock);
1208 /* We might be slightly out of step since we are not
1209 * locking each call, but this is only debugging output.
1211 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1212 if ((all || rxi_IsConnInteresting(tc))
1213 && tin.index-- <= 0) {
1214 tconn.host = tc->peer->host;
1215 tconn.port = tc->peer->port;
1216 tconn.cid = htonl(tc->cid);
1217 tconn.epoch = htonl(tc->epoch);
1218 tconn.serial = htonl(tc->serial);
1219 for (j = 0; j < RX_MAXCALLS; j++) {
1220 tconn.callNumber[j] = htonl(tc->callNumber[j]);
1221 if ((tcall = tc->call[j])) {
1222 tconn.callState[j] = tcall->state;
1223 tconn.callMode[j] = tcall->mode;
1224 tconn.callFlags[j] = tcall->flags;
1225 if (queue_IsNotEmpty(&tcall->rq))
1226 tconn.callOther[j] |= RX_OTHER_IN;
1227 if (queue_IsNotEmpty(&tcall->tq))
1228 tconn.callOther[j] |= RX_OTHER_OUT;
1230 tconn.callState[j] = RX_STATE_NOTINIT;
1233 tconn.natMTU = htonl(tc->peer->natMTU);
1234 tconn.error = htonl(tc->error);
1235 tconn.flags = tc->flags;
1236 tconn.type = tc->type;
1237 tconn.securityIndex = tc->securityIndex;
1238 if (tc->securityObject) {
1239 RXS_GetStats(tc->securityObject, tc,
1241 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1242 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1245 DOHTONL(packetsReceived);
1246 DOHTONL(packetsSent);
1247 DOHTONL(bytesReceived);
1251 sizeof(tconn.secStats.spares) /
1256 sizeof(tconn.secStats.sparel) /
1257 sizeof(afs_int32); i++)
1261 MUTEX_EXIT(&rx_connHashTable_lock);
1262 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1265 ap->length = sizeof(struct rx_debugConn);
1266 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1272 MUTEX_EXIT(&rx_connHashTable_lock);
1274 /* if we make it here, there are no interesting packets */
1275 tconn.cid = htonl(0xffffffff); /* means end */
1276 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1279 ap->length = sizeof(struct rx_debugConn);
1280 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1286 * Pass back all the peer structures we have available
1289 case RX_DEBUGI_GETPEER:{
1291 register struct rx_peer *tp;
1292 struct rx_debugPeer tpeer;
1295 tl = sizeof(struct rx_debugPeer) - ap->length;
1297 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1301 memset((char *)&tpeer, 0, sizeof(tpeer));
1302 for (i = 0; i < rx_hashTableSize; i++) {
1303 #if !defined(KERNEL)
1304 /* the time complexity of the algorithm used here
1305 * exponentially increases with the number of peers.
1307 * Yielding after processing each hash table entry
1308 * and dropping rx_peerHashTable_lock.
1309 * also increases the risk that we will miss a new
1310 * entry - but we are willing to live with this
1311 * limitation since this is meant for debugging only
1313 #ifdef AFS_PTHREAD_ENV
1319 MUTEX_ENTER(&rx_peerHashTable_lock);
1320 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1321 if (tin.index-- <= 0) {
1322 tpeer.host = tp->host;
1323 tpeer.port = tp->port;
1324 tpeer.ifMTU = htons(tp->ifMTU);
1325 tpeer.idleWhen = htonl(tp->idleWhen);
1326 tpeer.refCount = htons(tp->refCount);
1327 tpeer.burstSize = tp->burstSize;
1328 tpeer.burst = tp->burst;
1329 tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1330 tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1331 tpeer.rtt = htonl(tp->rtt);
1332 tpeer.rtt_dev = htonl(tp->rtt_dev);
1333 tpeer.timeout.sec = htonl(tp->timeout.sec);
1334 tpeer.timeout.usec = htonl(tp->timeout.usec);
1335 tpeer.nSent = htonl(tp->nSent);
1336 tpeer.reSends = htonl(tp->reSends);
1337 tpeer.inPacketSkew = htonl(tp->inPacketSkew);
1338 tpeer.outPacketSkew = htonl(tp->outPacketSkew);
1339 tpeer.rateFlag = htonl(tp->rateFlag);
1340 tpeer.natMTU = htons(tp->natMTU);
1341 tpeer.maxMTU = htons(tp->maxMTU);
1342 tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1343 tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1344 tpeer.MTU = htons(tp->MTU);
1345 tpeer.cwind = htons(tp->cwind);
1346 tpeer.nDgramPackets = htons(tp->nDgramPackets);
1347 tpeer.congestSeq = htons(tp->congestSeq);
1348 tpeer.bytesSent.high = htonl(tp->bytesSent.high);
1349 tpeer.bytesSent.low = htonl(tp->bytesSent.low);
1350 tpeer.bytesReceived.high =
1351 htonl(tp->bytesReceived.high);
1352 tpeer.bytesReceived.low =
1353 htonl(tp->bytesReceived.low);
1355 MUTEX_EXIT(&rx_peerHashTable_lock);
1356 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1359 ap->length = sizeof(struct rx_debugPeer);
1360 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1366 MUTEX_EXIT(&rx_peerHashTable_lock);
1368 /* if we make it here, there are no interesting packets */
1369 tpeer.host = htonl(0xffffffff); /* means end */
1370 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1373 ap->length = sizeof(struct rx_debugPeer);
1374 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1379 case RX_DEBUGI_RXSTATS:{
1383 tl = sizeof(rx_stats) - ap->length;
1385 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1389 /* Since its all int32s convert to network order with a loop. */
1390 MUTEX_ENTER(&rx_stats_mutex);
1391 s = (afs_int32 *) & rx_stats;
1392 for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
1393 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
1396 ap->length = sizeof(rx_stats);
1397 MUTEX_EXIT(&rx_stats_mutex);
1398 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1404 /* error response packet */
1405 tin.type = htonl(RX_DEBUGI_BADTYPE);
1406 tin.index = tin.type;
1407 rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1409 ap->length = sizeof(struct rx_debugIn);
1410 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/*
 * rxi_ReceiveVersionPacket -- answer a client-initiated version probe by
 * echoing the packet back with the rx version string as payload.
 *
 * NOTE(review): intermediate lines of this function are elided in this
 * view (the embedded original line numbers jump); comments describe only
 * the visible code.
 */
1418 rxi_ReceiveVersionPacket(register struct rx_packet *ap, osi_socket asocket,
1419 			 afs_int32 ahost, short aport, int istack)
1424  * Only respond to client-initiated version requests, and
1425  * clear that flag in the response.
/* Server side only replies; the reply must not look client-initiated,
 * or the two ends could ping-pong version packets forever. */
1427     if (ap->header.flags & RX_CLIENT_INITIATED) {
1430 	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1431 	rxi_EncodePacketHeader(ap);
/* buf is zero-filled first so the copied version string is always
 * NUL-terminated, even if truncated by strncpy. */
1432 	memset(buf, 0, sizeof(buf));
/* Skip the leading 4 chars of cml_version_number (presumably a "@(#)"
 * sccs-style prefix -- TODO confirm against its definition). */
1433 	strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
/* NOTE(review): 65 is a hard-coded write length; confirm it matches
 * sizeof(buf) where buf is declared (declaration elided here). */
1434 	rx_packetwrite(ap, 0, 65, buf);
1437 	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1445 /* send a debug packet back to the sender */
/*
 * rxi_SendDebugPacket -- transmit a debug/version response to the peer
 * identified by (ahost, aport) over asocket.  Delivery is best-effort:
 * the osi_NetSend return value is deliberately ignored.
 *
 * NOTE(review): intermediate lines (declarations, #endif's, GLOCK
 * drop/reacquire) are elided in this view; comments cover visible code.
 */
1447 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
1448 		    afs_int32 ahost, short aport, afs_int32 istack)
1450     struct sockaddr_in taddr;
1456     int waslocked = ISAFS_GLOCK();
/* Build the destination address.  ahost/aport are already in network
 * byte order (they came straight off a received packet). */
1459     taddr.sin_family = AF_INET;
1460     taddr.sin_port = aport;
1461     taddr.sin_addr.s_addr = ahost;
1462 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
1463     taddr.sin_len = sizeof(struct sockaddr_in);
1466     /* We need to trim the niovecs. */
/* Walk the data iovecs (index 0 is the wire header) and truncate the
 * vector at the iovec containing the last payload byte, remembering the
 * original length/count so they can be restored after the send. */
1467     nbytes = apacket->length;
1468     for (i = 1; i < apacket->niovecs; i++) {
1469 	if (nbytes <= apacket->wirevec[i].iov_len) {
1470 	    savelen = apacket->wirevec[i].iov_len;
1471 	    saven = apacket->niovecs;
1472 	    apacket->wirevec[i].iov_len = nbytes;
1473 	    apacket->niovecs = i + 1;	/* so condition fails because i == niovecs */
1475 	nbytes -= apacket->wirevec[i].iov_len;
1479 #ifdef RX_KERNEL_TRACE
1480     if (ICL_SETACTIVE(afs_iclSetp)) {
1483 	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1484 		   "before osi_NetSend()");
1492     /* debug packets are not reliably delivered, hence the cast below. */
1493     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
1494 		      apacket->length + RX_HEADER_SIZE, istack);
1496 #ifdef RX_KERNEL_TRACE
1497     if (ICL_SETACTIVE(afs_iclSetp)) {
1499 	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1500 		   "after osi_NetSend()");
/* Undo the truncation done above so the packet is usable afterwards.
 * i is one past the truncated iovec when the loop exited. */
1510     if (saven) {		/* means we truncated the packet above. */
1511 	apacket->wirevec[i - 1].iov_len = savelen;
1512 	apacket->niovecs = saven;
1517 /* Send the packet to appropriate destination for the specified
1518  * call. The header is first encoded and placed in the packet.
/*
 * rxi_SendPacket -- stamp, encode and transmit a single rx packet on the
 * connection's peer address.  On send failure the packet's retry time is
 * pulled in so retransmission happens soon.
 *
 * NOTE(review): intermediate lines (declarations, #ifdef RXDEBUG
 * brackets, GLOCK handling, loop structure around osi_NetSend) are
 * elided in this view; comments cover only the visible code.
 */
1521 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
1522 	       struct rx_packet *p, int istack)
1528     struct sockaddr_in addr;
1529     register struct rx_peer *peer = conn->peer;
1532     char deliveryType = 'S';
1534     /* The address we're sending the packet to */
1535     memset(&addr, 0, sizeof(addr));
1536     addr.sin_family = AF_INET;
1537     addr.sin_port = peer->port;
1538     addr.sin_addr.s_addr = peer->host;
1540     /* This stuff should be revamped, I think, so that most, if not
1541      * all, of the header stuff is always added here.  We could
1542      * probably do away with the encode/decode routines. XXXXX */
1544     /* Stamp each packet with a unique serial number.  The serial
1545      * number is maintained on a connection basis because some types
1546      * of security may be based on the serial number of the packet,
1547      * and security is handled on a per authenticated-connection
1549     /* Pre-increment, to guarantee no zero serial number; a zero
1550      * serial number means the packet was never sent. */
1551     MUTEX_ENTER(&conn->conn_data_lock);
1552     p->header.serial = ++conn->serial;
1553     MUTEX_EXIT(&conn->conn_data_lock);
1554     /* This is so we can adjust retransmit time-outs better in the face of
1555      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
/* Remember the serial of the first transmission of this packet. */
1557     if (p->firstSerial == 0) {
1558 	p->firstSerial = p->header.serial;
1561     /* If an output tracer function is defined, call it with the packet and
1562      * network address.  Note this function may modify its arguments. */
1563     if (rx_almostSent) {
1564 	int drop = (*rx_almostSent) (p, &addr);
1565 	/* drop packet if return value is non-zero? */
1567 	    deliveryType = 'D';	/* Drop the packet */
1571     /* Get network byte order header */
1572     rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
1573 				 * touch ALL the fields */
1575     /* Send the packet out on the same socket that related packets are being
/* Client connections use the global rx_socket; server connections use
 * the socket belonging to the service. */
1579 	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
1582     /* Possibly drop this packet, for testing purposes */
1583     if ((deliveryType == 'D')
1584 	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
1585 	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
1586 	deliveryType = 'D';	/* Drop the packet */
1588 	deliveryType = 'S';	/* Send the packet */
1589 #endif /* RXDEBUG */
1591     /* Loop until the packet is sent.  We'd prefer just to use a
1592      * blocking socket, but unfortunately the interface doesn't
1593      * allow us to have the socket block in send mode, and not
1594      * block in receive mode */
1597 	waslocked = ISAFS_GLOCK();
1598 #ifdef RX_KERNEL_TRACE
1599 	if (ICL_SETACTIVE(afs_iclSetp)) {
1602 	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1603 		       "before osi_NetSend()");
1612 	     osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
1613 			 p->length + RX_HEADER_SIZE, istack)) != 0) {
1614 	    /* send failed, so let's hurry up the resend, eh? */
1615 	    MUTEX_ENTER(&rx_stats_mutex);
1616 	    rx_stats.netSendFailures++;
1617 	    MUTEX_EXIT(&rx_stats_mutex);
/* Schedule the retry ~10ms + backoff after the original send time. */
1618 	    p->retryTime = p->timeSent;	/* resend it very soon */
1619 	    clock_Addmsec(&(p->retryTime),
1620 			  10 + (((afs_uint32) p->backoff) << 8));
1622 #if defined(KERNEL) && defined(AFS_LINUX20_ENV)
1623 	    /* Linux is nice -- it can tell us right away that we cannot
1624 	     * reach this recipient by returning an ENETUNREACH error
1625 	     * code.  So, when this happens let's "down" the host NOW so
1626 	     * we don't sit around waiting for this host to timeout later.
/* Zeroing lastReceiveTime marks the call as having heard nothing,
 * which triggers the host-down path elsewhere. */
1628 	    if (call && code == -ENETUNREACH)
1629 		call->lastReceiveTime = 0;
1633 #ifdef RX_KERNEL_TRACE
1634 	if (ICL_SETACTIVE(afs_iclSetp)) {
1636 	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
1637 		       "after osi_NetSend()");
/* NOTE(review): the (int)p cast truncates the pointer on LP64; dpf is
 * debug-only but this should use %p / a pointer-sized cast. */
1649     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d", deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], peer->host, peer->port, p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags, (int)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
1651     MUTEX_ENTER(&rx_stats_mutex);
1652     rx_stats.packetsSent[p->header.type - 1]++;
1653     MUTEX_EXIT(&rx_stats_mutex);
1654     MUTEX_ENTER(&peer->peer_lock);
1655     hadd32(peer->bytesSent, p->length);
1656     MUTEX_EXIT(&peer->peer_lock);
1659 /* Send a list of packets to appropriate destination for the specified
1660  * connection.  The headers are first encoded and placed in the packets.
/*
 * rxi_SendPacketList -- assemble `len` packets into a single AFS 3.5
 * jumbogram (one sendmsg with a shared wire header and per-packet jumbo
 * headers) and transmit it to the connection's peer.
 *
 * NOTE(review): intermediate lines are elided in this view; comments
 * cover only the visible code.
 */
1663 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
1664 		   struct rx_packet **list, int len, int istack)
1666 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1669     struct sockaddr_in addr;
1670     register struct rx_peer *peer = conn->peer;
1672     struct rx_packet *p = NULL;
1673     struct iovec wirevec[RX_MAXIOVECS];
1674     int i, length, code;
1677     struct rx_jumboHeader *jp;
1679     char deliveryType = 'S';
1681     /* The address we're sending the packet to */
/* NOTE(review): unlike rxi_SendPacket, no memset(&addr,...) is visible
 * here -- possibly elided from this view; confirm addr is fully
 * initialized (sin_zero) before osi_NetSend. */
1682     addr.sin_family = AF_INET;
1683     addr.sin_port = peer->port;
1684     addr.sin_addr.s_addr = peer->host;
/* One iovec for the wire header plus one per packet must fit. */
1686     if (len + 1 > RX_MAXIOVECS) {
1687 	osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
1691      * Stamp the packets in this jumbogram with consecutive serial numbers
/* Reserve a contiguous range of serials under the lock, then assign
 * them lock-free below. */
1693     MUTEX_ENTER(&conn->conn_data_lock);
1694     serial = conn->serial;
1695     conn->serial += len;
1696     MUTEX_EXIT(&conn->conn_data_lock);
1699     /* This stuff should be revamped, I think, so that most, if not
1700      * all, of the header stuff is always added here.  We could
1701      * probably do away with the encode/decode routines. XXXXX */
/* iovec 0 is the rx wire header of the first packet; each subsequent
 * iovec carries one packet's data. */
1704     length = RX_HEADER_SIZE;
1705     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
1706     wirevec[0].iov_len = RX_HEADER_SIZE;
1707     for (i = 0; i < len; i++) {
1710 	/* The whole 3.5 jumbogram scheme relies on packets fitting
1711 	 * in a single packet buffer. */
1712 	if (p->niovecs > 2) {
1713 	    osi_Panic("rxi_SendPacketList, niovecs > 2\n");
1716 	/* Set the RX_JUMBO_PACKET flags in all but the last packets
/* Non-final packets must be exactly RX_JUMBOBUFFERSIZE so the receiver
 * can find the next embedded jumbo header. */
1719 	    if (p->length != RX_JUMBOBUFFERSIZE) {
1720 		osi_Panic("rxi_SendPacketList, length != jumbo size\n");
1722 	    p->header.flags |= RX_JUMBO_PACKET;
1723 	    length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1724 	    wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1726 	    wirevec[i + 1].iov_len = p->length;
1727 	    length += p->length;
1729 	wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
1731 	    /* Convert jumbo packet header to network byte order */
/* jp points into the PREVIOUS packet's buffer (set at the bottom of
 * the loop); pack this packet's flags+spare into it. */
1732 	    temp = (afs_uint32) (p->header.flags) << 24;
1733 	    temp |= (afs_uint32) (p->header.spare);
1734 	    *(afs_uint32 *) jp = htonl(temp);
1736 	jp = (struct rx_jumboHeader *)
1737 	    ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
1739 	/* Stamp each packet with a unique serial number.  The serial
1740 	 * number is maintained on a connection basis because some types
1741 	 * of security may be based on the serial number of the packet,
1742 	 * and security is handled on a per authenticated-connection
1744 	/* Pre-increment, to guarantee no zero serial number; a zero
1745 	 * serial number means the packet was never sent. */
1746 	p->header.serial = ++serial;
1747 	/* This is so we can adjust retransmit time-outs better in the face of
1748 	 * rapidly changing round-trip times.  RTO estimation is not a la Karn.
1750 	if (p->firstSerial == 0) {
1751 	    p->firstSerial = p->header.serial;
1754 	/* If an output tracer function is defined, call it with the packet and
1755 	 * network address.  Note this function may modify its arguments. */
1756 	if (rx_almostSent) {
1757 	    int drop = (*rx_almostSent) (p, &addr);
1758 	    /* drop packet if return value is non-zero? */
1760 		deliveryType = 'D';	/* Drop the packet */
1764 	/* Get network byte order header */
1765 	rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
1766 					 * touch ALL the fields */
1769     /* Send the packet out on the same socket that related packets are being
1773 	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
1776     /* Possibly drop this packet, for testing purposes */
1777     if ((deliveryType == 'D')
1778 	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
1779 	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
1780 	deliveryType = 'D';	/* Drop the packet */
1782 	deliveryType = 'S';	/* Send the packet */
1783 #endif /* RXDEBUG */
1785     /* Loop until the packet is sent.  We'd prefer just to use a
1786      * blocking socket, but unfortunately the interface doesn't
1787      * allow us to have the socket block in send mode, and not
1788      * block in receive mode */
/* On Solaris kernel, drop the AFS global lock around the send when
 * called without istack, and reacquire it afterwards. */
1790 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1791     waslocked = ISAFS_GLOCK();
1792     if (!istack && waslocked)
1796 	 osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
1798 	/* send failed, so let's hurry up the resend, eh? */
1799 	MUTEX_ENTER(&rx_stats_mutex);
1800 	rx_stats.netSendFailures++;
1801 	MUTEX_EXIT(&rx_stats_mutex);
/* A failed jumbogram send hurries up retransmission of EVERY packet
 * in the list, not just the first. */
1802 	for (i = 0; i < len; i++) {
1804 	    p->retryTime = p->timeSent;	/* resend it very soon */
1805 	    clock_Addmsec(&(p->retryTime),
1806 			  10 + (((afs_uint32) p->backoff) << 8));
1808 #if defined(KERNEL) && defined(AFS_LINUX20_ENV)
1809 	/* Linux is nice -- it can tell us right away that we cannot
1810 	 * reach this recipient by returning an ENETUNREACH error
1811 	 * code.  So, when this happens let's "down" the host NOW so
1812 	 * we don't sit around waiting for this host to timeout later.
1814 	if (call && code == -ENETUNREACH)
1815 	    call->lastReceiveTime = 0;
1818 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1819     if (!istack && waslocked)
/* NOTE(review): as in rxi_SendPacket, (int)p truncates on LP64. */
1828     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d",
1829 	 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1],
1830 	 peer->host, peer->port, p->header.serial, p->header.epoch,
1831 	 p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1832 	 (int)p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
1835     MUTEX_ENTER(&rx_stats_mutex);
1836     rx_stats.packetsSent[p->header.type - 1]++;
1837     MUTEX_EXIT(&rx_stats_mutex);
1838     MUTEX_ENTER(&peer->peer_lock);
1840     hadd32(peer->bytesSent, p->length);
1841     MUTEX_EXIT(&peer->peer_lock);
1845 /* Send a "special" packet to the peer connection.  If call is
1846  * specified, then the packet is directed to a specific call channel
1847  * associated with the connection, otherwise it is directed to the
1848  * connection only. Uses optionalPacket if it is supplied, rather than
1849  * allocating a new packet buffer.  Nbytes is the length of the data
1850  * portion of the packet.  If data is non-null, nbytes of data are
1851  * copied into the packet.  Type is the type of the packet, as defined
1852  * in rx.h.  Bug: there's a lot of duplication between this and other
1853  * routines.  This needs to be cleaned up. */
/*
 * NOTE(review): intermediate lines of this function are elided in this
 * view; comments cover only the visible code.
 */
1855 rxi_SendSpecial(register struct rx_call *call,
1856 		register struct rx_connection *conn,
1857 		struct rx_packet *optionalPacket, int type, char *data,
1858 		int nbytes, int istack)
1860     /* Some of the following stuff should be common code for all
1861      * packet sends (it's repeated elsewhere) */
1862     register struct rx_packet *p;
1864     int savelen = 0, saven = 0;
1865     int channel, callNumber;
1867 	channel = call->channel;
1868 	callNumber = *call->callNumber;
1869 	/* BUSY packets refer to the next call on this connection */
1870 	if (type == RX_PACKET_TYPE_BUSY) {
/* Allocate a fresh packet only when the caller did not supply one. */
1879 	p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
1881 	    osi_Panic("rxi_SendSpecial failure");
/* Fill in the header from the connection (and call, if given). */
1888     p->header.serviceId = conn->serviceId;
1889     p->header.securityIndex = conn->securityIndex;
1890     p->header.cid = (conn->cid | channel);
1891     p->header.callNumber = callNumber;
1893     p->header.epoch = conn->epoch;
1894     p->header.type = type;
1895     p->header.flags = 0;
1896     if (conn->type == RX_CLIENT_CONNECTION)
1897 	p->header.flags |= RX_CLIENT_INITIATED;
1899 	rx_packetwrite(p, 0, nbytes, data);
/* Trim the iovec chain to the payload length, as in
 * rxi_SendDebugPacket, saving the old values for restoration below. */
1901     for (i = 1; i < p->niovecs; i++) {
1902 	if (nbytes <= p->wirevec[i].iov_len) {
1903 	    savelen = p->wirevec[i].iov_len;
1905 	    p->wirevec[i].iov_len = nbytes;
1906 	    p->niovecs = i + 1;	/* so condition fails because i == niovecs */
1908 	nbytes -= p->wirevec[i].iov_len;
/* With a call, go through the call-level send path; otherwise send
 * directly on the connection. */
1912 	rxi_Send(call, p, istack);
1914 	rxi_SendPacket((struct rx_call *)0, conn, p, istack);
1915     if (saven) {		/* means we truncated the packet above.  We probably don't */
1916 	/* really need to do this, but it seems safer this way, given that */
1917 	/* sneaky optionalPacket... */
1918 	p->wirevec[i - 1].iov_len = savelen;
/* Packets we allocated ourselves are freed; a caller-supplied
 * optionalPacket is handed back for reuse. */
1921     if (!optionalPacket)
1923     return optionalPacket;
1927 /* Encode the packet's header (from the struct header in the packet to
1928  * the net byte order representation in the wire representation of the
1929  * packet, which is what is actually sent out on the wire) */
/*
 * Serializes p->header into the first iovec (wirevec[0]) as seven
 * 32-bit big-endian words.  Must be the exact inverse of
 * rxi_DecodePacketHeader below.
 */
1931 rxi_EncodePacketHeader(register struct rx_packet *p)
1933     register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */
/* Zero the full header first so reserved/unwritten bits go out as 0. */
1935     memset((char *)buf, 0, RX_HEADER_SIZE);
1936     *buf++ = htonl(p->header.epoch);
1937     *buf++ = htonl(p->header.cid);
1938     *buf++ = htonl(p->header.callNumber);
1939     *buf++ = htonl(p->header.seq);
1940     *buf++ = htonl(p->header.serial);
/* Word 6 packs four byte-wide fields: type | flags | userStatus |
 * securityIndex, most significant byte first. */
1941     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
1942 		   | (((afs_uint32) p->header.flags) << 16)
1943 		   | (p->header.userStatus << 8) | p->header.securityIndex);
1944     /* Note: top 16 bits of this next word were reserved */
1945     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
1948 /* Decode the packet's header (from net byte order to a struct header) */
/*
 * Inverse of rxi_EncodePacketHeader: unpacks the seven big-endian
 * 32-bit words in wirevec[0] into p->header.
 * NOTE(review): the buf-advance lines between reads are elided in this
 * view of the source.
 */
1950 rxi_DecodePacketHeader(register struct rx_packet *p)
1952     register afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */
1955     p->header.epoch = ntohl(*buf);
1957     p->header.cid = ntohl(*buf);
1959     p->header.callNumber = ntohl(*buf);
1961     p->header.seq = ntohl(*buf);
1963     p->header.serial = ntohl(*buf);
/* Word 6: type(31..24) | flags(23..16) | userStatus(15..8) |
 * securityIndex(7..0); assignment to the narrow struct fields keeps
 * only the low byte of each shifted value. */
1969     /* C will truncate byte fields to bytes for me */
1970     p->header.type = temp >> 24;
1971     p->header.flags = temp >> 16;
1972     p->header.userStatus = temp >> 8;
1973     p->header.securityIndex = temp >> 0;
/* Word 7: spare(31..16) | serviceId(15..0). */
1978     p->header.serviceId = (temp & 0xffff);
1979     p->header.spare = temp >> 16;
1980     /* Note: top 16 bits of this last word are the security checksum */
/*
 * rxi_PrepareSendPacket -- fill in a DATA packet's header from the call
 * and connection state, reset its transmit bookkeeping, reconcile the
 * iovec chain with p->length, and let the security object prepare it.
 *
 * NOTE(review): intermediate lines are elided in this view; comments
 * cover only the visible code.
 */
1984 rxi_PrepareSendPacket(register struct rx_call *call,
1985 		      register struct rx_packet *p, register int last)
1987     register struct rx_connection *conn = call->conn;
1989     ssize_t len;		/* len must be a signed type; it can go negative */
1991     p->flags &= ~RX_PKTFLAG_ACKED;
1992     p->header.cid = (conn->cid | call->channel);
1993     p->header.serviceId = conn->serviceId;
1994     p->header.securityIndex = conn->securityIndex;
1995     p->header.callNumber = *call->callNumber;
/* tnext is the next sequence number to transmit on this call. */
1996     p->header.seq = call->tnext++;
1997     p->header.epoch = conn->epoch;
1998     p->header.type = RX_PACKET_TYPE_DATA;
1999     p->header.flags = 0;
2000     p->header.spare = 0;
2001     if (conn->type == RX_CLIENT_CONNECTION)
2002 	p->header.flags |= RX_CLIENT_INITIATED;
/* Mark the final DATA packet of the call. */
2005 	p->header.flags |= RX_LAST_PACKET;
2007     clock_Zero(&p->retryTime);	/* Never yet transmitted */
2008     clock_Zero(&p->firstSent);	/* Never yet transmitted */
2009     p->header.serial = 0;	/* Another way of saying never transmitted... */
2012     /* Now that we're sure this is the last data on the call, make sure
2013      * that the "length" and the sum of the iov_lens matches. */
2014     len = p->length + call->conn->securityHeaderSize;
/* Walk data iovecs until the declared length is consumed; afterwards
 * len <= 0 and i is one past the last needed iovec. */
2016     for (i = 1; i < p->niovecs && len > 0; i++) {
2017 	len -= p->wirevec[i].iov_len;
2020 	osi_Panic("PrepareSendPacket 1\n");	/* MTUXXX */
2022 	/* Free any extra elements in the wirevec */
/* MAX(2, i): the first data buffer is part of the packet itself and is
 * never freed as a continuation buffer. */
2023 	for (j = MAX(2, i); j < p->niovecs; j++) {
2024 	    rxi_freeCBuf(RX_CBUF_TO_PACKET(p->wirevec[j].iov_base, p));
/* len is <= 0 here, so this trims the last iovec to the exact length. */
2027     p->wirevec[i - 1].iov_len += len;
/* Give the security layer (e.g. rxkad) its chance to checksum/encrypt. */
2029     RXS_PreparePacket(conn->securityObject, call, p);
2032 /* Given an interface MTU size, calculate an adjusted MTU size that
2033  * will make efficient use of the RX buffers when the peer is sending
2034  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
/*
 * Rounds mtu down to the largest value of the form
 * header + N whole jumbo buffers, so no partial buffer is wasted.
 * NOTE(review): lines between the if and the frags computation are
 * elided in this view (the adjMTU subtraction presumably happens there).
 */
2036 rxi_AdjustIfMTU(int mtu)
/* Smallest useful size: one rx header plus one jumbo buffer+header. */
2041     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2042     if (mtu <= adjMTU) {
2049     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2050     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2053 /* Given an interface MTU size, and the peer's advertised max receive
2054  * size, calculate an adjisted maxMTU size that makes efficient use
2055  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
/*
 * maxMTU = min(local mtu * max send fragments, peer's advertised max),
 * then rounded by rxi_AdjustIfMTU to a whole number of jumbo buffers.
 * (Typo in the original comment: "adjisted" = "adjusted".)
 */
2057 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2059     int maxMTU = mtu * rxi_nSendFrags;
2060     maxMTU = MIN(maxMTU, peerMaxMTU);
2061     return rxi_AdjustIfMTU(maxMTU);
2064 /* Given a packet size, figure out how many datagram packet will fit.
2065  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2066  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2067  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
/*
 * Returns how many rx packets fit in a jumbogram of `frags` UDP
 * fragments of size `mtu`.
 * NOTE(review): intermediate lines are elided in this view (the early
 * return for the too-small case is not visible).
 */
2069 rxi_AdjustDgramPackets(int frags, int mtu)
/* Too small to carry even one jumbo buffer after IPv6 fragmentation
 * overhead. */
2072     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
/* Total payload across frags fragments, minus per-fragment UDP header
 * overhead, capped at the rx maximum packet size. */
2075     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2076     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2077     /* subtract the size of the first and last packets */
2078     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
/* 2 accounts for the first and last packets subtracted above; each
 * remaining middle chunk holds one more packet. */
2082     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));