2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
12 #include "afs/param.h"
14 #include <afs/param.h>
21 #include "afs/sysincludes.h"
22 #include "afsincludes.h"
23 #include "rx/rx_kcommon.h"
24 #include "rx/rx_clock.h"
25 #include "rx/rx_queue.h"
26 #include "rx/rx_packet.h"
27 #else /* defined(UKERNEL) */
29 #ifndef AFS_LINUX20_ENV
32 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
33 #include "afs/sysincludes.h"
35 #if defined(AFS_OBSD_ENV)
39 #if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV)
40 #if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
41 #include "sys/mount.h" /* it gets pulled in by something later anyway */
45 #include "netinet/in.h"
46 #include "afs/afs_osi.h"
47 #include "rx_kmutex.h"
48 #include "rx/rx_clock.h"
49 #include "rx/rx_queue.h"
51 #include <sys/sysmacros.h>
53 #include "rx/rx_packet.h"
54 #endif /* defined(UKERNEL) */
55 #include "rx/rx_globals.h"
57 #include "sys/types.h"
60 #if defined(AFS_NT40_ENV) || defined(AFS_DJGPP_ENV)
64 #include <sys/socket.h>
65 #include <netinet/in.h>
66 #endif /* AFS_NT40_ENV */
67 #include "rx_xmit_nt.h"
70 #include <sys/socket.h>
71 #include <netinet/in.h>
77 #include <sys/sysmacros.h>
79 #include "rx_packet.h"
80 #include "rx_globals.h"
95 /* rxdb_fileID is used to identify the lock location, along with line#. */
96 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
97 #endif /* RX_LOCKS_DB */
98 struct rx_packet *rx_mallocedP = 0;
100 extern char cml_version_number[];
101 extern int (*rx_almostSent)();
103 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
104 afs_int32 ahost, short aport, afs_int32 istack);
106 /* some rules about packets:
107 * 1. When a packet is allocated, the final iov_buf contains room for
108 * a security trailer, but iov_len masks that fact. If the security
109 * package wants to add the trailer, it may do so, and then extend
110 * iov_len appropriately. For this reason, packet's niovecs and
111 * iov_len fields should be accurate before calling PreparePacket.
115 * all packet buffers (iov_base) are integral multiples of
117 * offset is an integral multiple of the word size.
/* Read one 32-bit word from `packet` at byte offset `offset`, walking the
 * data iovecs (wirevec[1..niovecs-1]) to locate the buffer containing it.
 * Per the note above, offset must be word-aligned and each iov_base is
 * word-aligned, so the cast/dereference is safe.
 * NOTE(review): this excerpt is truncated — the fall-through/return path
 * after the loop is not visible here. */
119 afs_int32 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
123 for (l=0, i=1; i< packet->niovecs ; i++ ) {
/* l accumulates the total byte length of data iovecs before wirevec[i];
 * when the running total passes `offset`, the word lives in this iovec. */
124 if (l + packet->wirevec[i].iov_len > offset) {
125 return *((afs_int32 *)((char*)(packet->wirevec[i].iov_base) + (offset-l)));
127 l += packet->wirevec[i].iov_len;
134 * all packet buffers (iov_base) are integral multiples of the word size.
135 * offset is an integral multiple of the word size.
/* Store the 32-bit word `data` into `packet` at byte offset `offset`,
 * scanning the data iovecs to find the buffer that contains that offset.
 * Same alignment assumptions as rx_SlowGetInt32 above.
 * NOTE(review): excerpt is truncated — the assigned value / return path
 * after line 143 is not visible here. */
137 afs_int32 rx_SlowPutInt32(struct rx_packet *packet, size_t offset, afs_int32 data)
141 for (l=0, i=1; i< packet->niovecs ; i++ ) {
/* l = total bytes in the iovecs preceding wirevec[i] */
142 if (l + packet->wirevec[i].iov_len > offset) {
143 *((afs_int32 *)((char*)(packet->wirevec[i].iov_base) + (offset - l))) =
147 l += packet->wirevec[i].iov_len;
154 * all packet buffers (iov_base) are integral multiples of the
156 * offset is an integral multiple of the word size.
158 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
/* Copy up to `resid` bytes out of `packet`, starting at byte offset
 * `offset`, into the caller's buffer `out`.  Returns the number of bytes
 * actually copied (r - resid when the packet ran short, else r).
 * Assumes the data buffers are contiguously arrayed in wirevec[1..]. */
160 afs_int32 rx_SlowReadPacket(struct rx_packet *packet, unsigned int offset,
161 int resid, char *out)
163 unsigned int i, j, l, r;
/* First find the iovec containing `offset`. */
164 for (l=0, i=1; i< packet->niovecs ; i++ ) {
165 if (l + packet->wirevec[i].iov_len > offset) {
168 l += packet->wirevec[i].iov_len;
171 /* i is the iovec which contains the first little bit of data in which we
172 * are interested. l is the total length of everything prior to this iovec.
173 * j is the number of bytes we can safely copy out of this iovec.
176 while ((resid > 0) && (i < packet->niovecs)) {
177 j = MIN (resid, packet->wirevec[i].iov_len - (offset - l));
178 memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
180 l += packet->wirevec[i].iov_len;
/* If resid is still nonzero we ran out of iovecs; report the short count. */
184 return (resid ? (r - resid) : r);
189 * all packet buffers (iov_base) are integral multiples of the
191 * offset is an integral multiple of the word size.
/* Copy `resid` bytes from the caller's buffer into `packet` starting at
 * byte offset `offset`, allocating additional continuation buffers on the
 * fly (rxi_AllocDataBuf) when the packet runs out of iovecs.  Returns the
 * number of bytes actually written. */
193 afs_int32 rx_SlowWritePacket(struct rx_packet *packet, int offset, int resid,
/* Locate the iovec containing `offset`. */
199 for (l=0, i=1; i < packet->niovecs; i++ ) {
200 if (l + packet->wirevec[i].iov_len > offset) {
203 l += packet->wirevec[i].iov_len;
206 /* i is the iovec which contains the first little bit of data in which we
207 * are interested. l is the total length of everything prior to this iovec.
208 * j is the number of bytes we can safely copy into this iovec.
211 while ((resid > 0) && (i < RX_MAXWVECS)) {
/* Ran past the allocated iovecs: grow the packet before writing more. */
212 if (i >= packet->niovecs)
213 if (rxi_AllocDataBuf(packet, resid, RX_PACKET_CLASS_SEND_CBUF) >0) /* ++niovecs as a side-effect */
216 b = (char*)(packet->wirevec[i].iov_base) + (offset - l);
217 j = MIN (resid, packet->wirevec[i].iov_len - (offset - l));
220 l += packet->wirevec[i].iov_len;
/* Short write (allocation failed or vec limit hit) reports r - resid. */
224 return (resid ? (r - resid) : r);
/* Allocate one packet from the free queue for use as a continuation
 * buffer, under rx_freePktQ_lock.  Returns NULL (after bumping the
 * per-class alloc-failure statistic) when the class is over quota;
 * replenishes the free queue via rxi_MorePacketsNoLock when it is empty.
 * NOTE(review): excerpt is truncated — the queue_Remove and final return
 * are not visible here. */
227 static struct rx_packet *allocCBuf(int class)
233 MUTEX_ENTER(&rx_freePktQ_lock);
/* Over-quota: fail the allocation but record which class failed. */
236 if (rxi_OverQuota(class)) {
238 rxi_NeedMorePackets = TRUE;
239 MUTEX_ENTER(&rx_stats_mutex);
241 case RX_PACKET_CLASS_RECEIVE:
242 rx_stats.receivePktAllocFailures++;
244 case RX_PACKET_CLASS_SEND:
245 rx_stats.sendPktAllocFailures++;
247 case RX_PACKET_CLASS_SPECIAL:
248 rx_stats.specialPktAllocFailures++;
250 case RX_PACKET_CLASS_RECV_CBUF:
251 rx_stats.receiveCbufPktAllocFailures++;
253 case RX_PACKET_CLASS_SEND_CBUF:
254 rx_stats.sendCbufPktAllocFailures++;
257 MUTEX_EXIT(&rx_stats_mutex);
261 if (queue_IsEmpty(&rx_freePacketQueue)) {
263 rxi_NeedMorePackets = TRUE;
/* Re-check after (presumably) waiting/allocating; grow the pool if
 * the queue is still empty.  TODO confirm against full source. */
267 if (queue_IsEmpty(&rx_freePacketQueue)) {
268 rxi_MorePacketsNoLock(rx_initSendWindow);
273 c = queue_First(&rx_freePacketQueue, rx_packet);
/* Sanity: a packet on the free queue must carry RX_PKTFLAG_FREE. */
275 if (!(c->flags & RX_PKTFLAG_FREE))
276 osi_Panic("rxi_AllocPacket: packet not free\n");
277 c->flags = 0; /* clear RX_PKTFLAG_FREE, initialize the rest */
283 MUTEX_EXIT(&rx_freePktQ_lock);
290 * Free a packet currently used as a continuation buffer
/* Return a packet used as a continuation buffer to the free queue,
 * taking rx_freePktQ_lock around the unlocked free routine. */
292 void rxi_freeCBuf(struct rx_packet *c)
297 MUTEX_ENTER(&rx_freePktQ_lock);
299 rxi_FreePacketNoLock(c);
300 /* Wakeup anyone waiting for packets */
303 MUTEX_EXIT(&rx_freePktQ_lock);
307 /* this one is kind of awful.
308 * In rxkad, the packet has been all shortened, and everything, ready for
309 * sending. All of a sudden, we discover we need some of that space back.
310 * This isn't terribly general, because it knows that the packets are only
311 * rounded up to the EBS (userdata + security header).
/* Reclaim `nb` bytes of rounding slack at the end of packet `p` by
 * extending the last iovec — RX_FIRSTBUFFERSIZE slack for the localdata
 * buffer, RX_CBUFFERSIZE slack for a continuation buffer (see the
 * explanation above).  NOTE(review): excerpt truncated — loop setup for
 * `i` and the return value are not visible here. */
313 int rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
/* Last vec is the packet's own localdata buffer. */
317 if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
318 if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
319 p->wirevec[i].iov_len += nb;
/* Otherwise it is a continuation buffer with RX_CBUFFERSIZE capacity. */
324 if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
325 p->wirevec[i].iov_len += nb;
332 /* get sufficient space to store nb bytes of data (or more), and hook
333 * it into the supplied packet. Return nbytes<=0 if successful, otherwise
334 * returns the number of bytes >0 which it failed to come up with.
335 * Don't need to worry about locking on packet, since only
336 * one thread can manipulate one at a time. Locking on continuation
337 * packets is handled by allocCBuf */
338 /* MTUXXX don't need to go through the for loop if we can trust niovecs */
/* Attach continuation buffers to `p` until at least `nb` more bytes of
 * space are available (or RX_MAXWVECS is reached / allocCBuf fails).
 * Returns the residual byte count (<=0 on success, >0 bytes short on
 * failure) per the contract described above.
 * NOTE(review): excerpt truncated — niovecs update and return not shown. */
339 int rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
343 for (i=p->niovecs; nb>0 && i<RX_MAXWVECS; i++) {
344 register struct rx_packet *cb;
/* Each continuation packet donates its localdata area as one iovec. */
345 if ((cb = allocCBuf(class))) {
346 p->wirevec[i].iov_base = (caddr_t) cb->localdata;
347 p->wirevec[i].iov_len = RX_CBUFFERSIZE;
348 nb -= RX_CBUFFERSIZE;
349 p->length += RX_CBUFFERSIZE;
358 /* Add more packet buffers */
/* Allocate `apackets` more rx_packet structures in one arena, initialize
 * each one's header/localdata iovecs, mark them free, and append them to
 * the global free queue under rx_freePktQ_lock.
 * NOTE(review): rx_mallocedP is overwritten with the newest arena —
 * earlier arenas are only reachable via the free queue. */
359 void rxi_MorePackets(int apackets)
361 struct rx_packet *p, *e;
365 getme = apackets * sizeof(struct rx_packet);
366 p = rx_mallocedP = (struct rx_packet *) osi_Alloc(getme);
368 PIN(p, getme); /* XXXXX */
369 memset((char *)p, 0, getme);
372 MUTEX_ENTER(&rx_freePktQ_lock);
374 for (e = p + apackets; p<e; p++) {
/* vec 0 = wire header, vec 1 = first data buffer; fixed layout. */
375 p->wirevec[0].iov_base = (char *) (p->wirehead);
376 p->wirevec[0].iov_len = RX_HEADER_SIZE;
377 p->wirevec[1].iov_base = (char *) (p->localdata);
378 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
379 p->flags |= RX_PKTFLAG_FREE;
382 queue_Append(&rx_freePacketQueue, p);
384 rx_nFreePackets += apackets;
385 rxi_NeedMorePackets = FALSE;
389 MUTEX_EXIT(&rx_freePktQ_lock);
394 /* Add more packet buffers */
/* Same as rxi_MorePackets but assumes the caller already holds
 * rx_freePktQ_lock.  Over-allocates by ~25% extra continuation-buffer
 * capacity so a quarter of the packets can carry jumbogram-sized data. */
395 void rxi_MorePacketsNoLock(int apackets)
397 struct rx_packet *p, *e;
400 /* allocate enough packets that 1/4 of the packets will be able
401 * to hold maximal amounts of data */
402 apackets += (apackets/4)
403 * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE)/RX_CBUFFERSIZE);
404 getme = apackets * sizeof(struct rx_packet);
405 p = rx_mallocedP = (struct rx_packet *) osi_Alloc(getme);
407 memset((char *)p, 0, getme);
409 for (e = p + apackets; p<e; p++) {
/* Same fixed two-vec initialization as rxi_MorePackets. */
410 p->wirevec[0].iov_base = (char *) (p->wirehead);
411 p->wirevec[0].iov_len = RX_HEADER_SIZE;
412 p->wirevec[1].iov_base = (char *) (p->localdata);
413 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
414 p->flags |= RX_PKTFLAG_FREE;
417 queue_Append(&rx_freePacketQueue, p);
419 rx_nFreePackets += apackets;
420 rxi_NeedMorePackets = FALSE;
/* Release the packet arena pointed to by rx_mallocedP.
 * NOTE(review): the size expression assumes a single arena of
 * rx_maxReceiveWindow+2 packets; the MTUXXX comment acknowledges this
 * does not free every arena ever allocated. */
425 void rxi_FreeAllPackets(void)
427 /* must be called at proper interrupt level, etcetera */
428 /* MTUXXX need to free all Packets */
429 osi_Free(rx_mallocedP, (rx_maxReceiveWindow+2) * sizeof(struct rx_packet));
430 UNPIN(rx_mallocedP, (rx_maxReceiveWindow+2) * sizeof(struct rx_packet));
433 /* Allocate more packets iff we need more continuation buffers */
434 /* In kernel, can't page in memory with interrupts disabled, so we
435 * don't use the event mechanism. */
/* Grow the packet pool when a prior allocation set rxi_NeedMorePackets.
 * Kernel builds cannot page memory in with interrupts disabled, so this
 * is polled rather than event-driven (see comment above). */
436 void rx_CheckPackets(void)
438 if (rxi_NeedMorePackets) {
439 rxi_MorePackets(rx_initSendWindow);
443 /* In the packet freeing routine below, the assumption is that
444 we want all of the packets to be used equally frequently, so that we
445 don't get packet buffers paging out. It would be just as valid to
446 assume that we DO want them to page out if not many are being used.
447 In any event, we assume the former, and append the packets to the end
449 /* This explanation is bogus. The free list doesn't remain in any kind of
450 useful order for afs_int32: the packets in use get pretty much randomly scattered
451 across all the pages. In order to permit unused {packets,bufs} to page out, they
452 must be stored so that packets which are adjacent in memory are adjacent in the
453 free list. An array springs rapidly to mind.
456 /* Actually free the packet p. */
/* Return packet `p` to the free queue.  Caller must hold
 * rx_freePktQ_lock.  Panics on double-free (RX_PKTFLAG_FREE set). */
457 void rxi_FreePacketNoLock(struct rx_packet *p)
459 dpf(("Free %x\n", p));
461 if (p->flags & RX_PKTFLAG_FREE)
462 osi_Panic("rxi_FreePacketNoLock: packet already free\n");
464 p->flags |= RX_PKTFLAG_FREE;
465 queue_Append(&rx_freePacketQueue, p);
/* Free the continuation buffers attached to `p` (vecs 2..niovecs-1),
 * converting each iov_base back to its owning packet with
 * RX_CBUF_TO_PACKET.  Vec 1 must be the packet's own localdata and is
 * not freed.  Caller holds rx_freePktQ_lock.  `first` must be 1 (MTUXXX:
 * partial frees are not implemented). */
468 int rxi_FreeDataBufsNoLock(struct rx_packet *p, int first)
470 struct iovec *iov, *end;
472 if (first != 1) /* MTUXXX */
473 osi_Panic("FreeDataBufs 1: first must be 1");
474 iov = &p->wirevec[1];
475 end = iov + (p->niovecs-1);
476 if (iov->iov_base != (caddr_t) p->localdata) /* MTUXXX */
477 osi_Panic("FreeDataBufs 2: vec 1 must be localdata");
478 for (iov++ ; iov < end ; iov++) {
480 osi_Panic("FreeDataBufs 3: vecs 2-niovecs must not be NULL");
481 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
489 int rxi_nBadIovecs = 0;
491 /* rxi_RestoreDataBufs
493 * Restore the correct sizes to the iovecs. Called when reusing a packet
494 * for reading off the wire.
/* Reset all of `p`'s iovec lengths to their full buffer sizes so the
 * packet can be reused for reading off the wire (earlier sends may have
 * truncated them).  Vec 0/1 are restored to the fixed header/localdata
 * layout; vecs 2.. are restored to RX_CBUFFERSIZE. */
496 void rxi_RestoreDataBufs(struct rx_packet *p)
499 struct iovec *iov = &p->wirevec[2];
501 p->wirevec[0].iov_base = (char *) (p->wirehead);
502 p->wirevec[0].iov_len = RX_HEADER_SIZE;
503 p->wirevec[1].iov_base = (char *) (p->localdata);
504 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
506 for (i=2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
/* A NULL base here is a corrupt vec; presumably counted via
 * rxi_nBadIovecs above — body not visible in this excerpt. */
507 if (!iov->iov_base) {
512 iov->iov_len = RX_CBUFFERSIZE;
/* Free any continuation buffers of `p` that lie wholly beyond p->length,
 * i.e. buffers holding no message data.  Walks vecs 2.. consuming
 * `length` (data remaining after the localdata vec), then frees the rest
 * under rx_freePktQ_lock.  `first` must be 1. */
516 int rxi_TrimDataBufs(struct rx_packet *p, int first)
519 struct iovec *iov, *end;
523 osi_Panic("TrimDataBufs 1: first must be 1");
525 /* Skip over continuation buffers containing message data */
526 iov = &p->wirevec[2];
527 end = iov + (p->niovecs-2);
528 length = p->length - p->wirevec[1].iov_len;
529 for (; iov < end && length > 0 ; iov++) {
531 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
532 length -= iov->iov_len;
535 /* iov now points to the first empty data buffer. */
540 MUTEX_ENTER(&rx_freePktQ_lock);
542 for (; iov < end ; iov++) {
544 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
545 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
550 MUTEX_EXIT(&rx_freePktQ_lock);
556 /* Free the packet p. P is assumed not to be on any queue, i.e.
557 * remove it yourself first if you call this routine. */
/* Free packet `p` and all of its continuation buffers.  `p` must already
 * be off any queue (see comment above).  Takes rx_freePktQ_lock around
 * the unlocked helpers. */
558 void rxi_FreePacket(struct rx_packet *p)
563 MUTEX_ENTER(&rx_freePktQ_lock);
565 rxi_FreeDataBufsNoLock(p,1);
566 rxi_FreePacketNoLock(p);
567 /* Wakeup anyone waiting for packets */
570 MUTEX_EXIT(&rx_freePktQ_lock);
575 /* rxi_AllocPacket sets up p->length so it reflects the number of
576 * bytes in the packet at this point, **not including** the header.
577 * The header is absolutely necessary, besides, this is the way the
578 * length field is usually used */
/* Allocate a packet from the free queue; caller holds rx_freePktQ_lock.
 * Mirrors allocCBuf's over-quota bookkeeping, then initializes the fixed
 * vec 0 (wire header) / vec 1 (localdata) layout and sets p->length to
 * the data capacity, excluding the header (see comment above).
 * NOTE(review): excerpt truncated — queue_Remove and the final return
 * are not visible here. */
579 struct rx_packet *rxi_AllocPacketNoLock(int class)
581 register struct rx_packet *p;
/* Over-quota: count the failure per class and return NULL. */
584 if (rxi_OverQuota(class)) {
585 rxi_NeedMorePackets = TRUE;
586 MUTEX_ENTER(&rx_stats_mutex);
588 case RX_PACKET_CLASS_RECEIVE:
589 rx_stats.receivePktAllocFailures++;
591 case RX_PACKET_CLASS_SEND:
592 rx_stats.sendPktAllocFailures++;
594 case RX_PACKET_CLASS_SPECIAL:
595 rx_stats.specialPktAllocFailures++;
597 case RX_PACKET_CLASS_RECV_CBUF:
598 rx_stats.receiveCbufPktAllocFailures++;
600 case RX_PACKET_CLASS_SEND_CBUF:
601 rx_stats.sendCbufPktAllocFailures++;
604 MUTEX_EXIT(&rx_stats_mutex);
605 return (struct rx_packet *) 0;
609 MUTEX_ENTER(&rx_stats_mutex);
610 rx_stats.packetRequests++;
611 MUTEX_EXIT(&rx_stats_mutex);
/* One build panics on an empty queue, the other grows the pool —
 * presumably kernel vs. user paths; #ifdefs not visible in excerpt. */
614 if (queue_IsEmpty(&rx_freePacketQueue))
615 osi_Panic("rxi_AllocPacket error");
617 if (queue_IsEmpty(&rx_freePacketQueue))
618 rxi_MorePacketsNoLock(rx_initSendWindow);
622 p = queue_First(&rx_freePacketQueue, rx_packet);
623 if (!(p->flags & RX_PKTFLAG_FREE))
624 osi_Panic("rxi_AllocPacket: packet not free\n");
626 dpf(("Alloc %x, class %d\n", p, class));
629 p->flags = 0; /* clear RX_PKTFLAG_FREE, initialize the rest */
632 /* have to do this here because rx_FlushWrite fiddles with the iovs in
633 * order to truncate outbound packets. In the near future, may need
634 * to allocate bufs from a static pool here, and/or in AllocSendPacket
636 p->wirevec[0].iov_base = (char *) (p->wirehead);
637 p->wirevec[0].iov_len = RX_HEADER_SIZE;
638 p->wirevec[1].iov_base = (char *) (p->localdata);
639 p->wirevec[1].iov_len = RX_FIRSTBUFFERSIZE;
641 p->length = RX_FIRSTBUFFERSIZE;
/* Locked wrapper: allocate a packet of the given class, taking
 * rx_freePktQ_lock around rxi_AllocPacketNoLock. */
645 struct rx_packet *rxi_AllocPacket(int class)
647 register struct rx_packet *p;
649 MUTEX_ENTER(&rx_freePktQ_lock);
650 p = rxi_AllocPacketNoLock(class);
651 MUTEX_EXIT(&rx_freePktQ_lock);
655 /* This guy comes up with as many buffers as it {takes,can get} given
656 * the MTU for this call. It also sets the packet length before
657 * returning. caution: this is often called at NETPRI
658 * Called with call locked.
/* Allocate a send packet for `call` sized for `want` bytes of user data,
 * capped by the call's MTU less header/security overhead.  If no packet
 * is available, blocks (CV or osi sleep) until packets are freed, unless
 * the call has an error.  Called with the call lock held, often at
 * NETPRI (see comment above).
 * NOTE(review): excerpt truncated — loop-exit and return not visible. */
660 struct rx_packet *rxi_AllocSendPacket(register struct rx_call *call, int want)
662 register struct rx_packet *p = (struct rx_packet *) 0;
664 register unsigned delta;
/* mud = max user data per packet; delta = security header + trailer. */
667 mud = call->MTU - RX_HEADER_SIZE;
668 delta = rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
669 rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
671 while (!(call->error)) {
672 MUTEX_ENTER(&rx_freePktQ_lock);
673 /* if an error occurred, or we get the packet we want, we're done */
674 if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
675 MUTEX_EXIT(&rx_freePktQ_lock);
/* Grow the packet toward the requested size, then clamp to MTU. */
678 want = MIN(want, mud);
680 if ((unsigned) want > p->length)
681 (void) rxi_AllocDataBuf(p, (want - p->length),
682 RX_PACKET_CLASS_SEND_CBUF);
684 if ((unsigned) p->length > mud)
687 if (delta >= p->length) {
696 /* no error occurred, and we didn't get a packet, so we sleep.
697 * At this point, we assume that packets will be returned
698 * sooner or later, as packets are acknowledged, and so we
/* Drop the call lock while sleeping so acks can be processed and
 * packets freed; reacquire and clear the wait flag on wakeup. */
701 call->flags |= RX_CALL_WAIT_PACKETS;
702 CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
703 MUTEX_EXIT(&call->lock);
704 rx_waitingForPackets = 1;
706 #ifdef RX_ENABLE_LOCKS
707 CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
709 osi_rxSleep(&rx_waitingForPackets);
711 MUTEX_EXIT(&rx_freePktQ_lock);
712 MUTEX_ENTER(&call->lock);
713 CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
714 call->flags &= ~RX_CALL_WAIT_PACKETS;
723 /* count the number of used FDs */
/* Count how many of the descriptors 0..amax-1 are open, by probing each
 * with fstat (fstat succeeds only on open descriptors).  Used for the
 * debug-stats reply. */
724 static int CountFDs(register int amax)
727 register int i, code;
731 for(i=0;i<amax;i++) {
732 code = fstat(i, &tstat);
733 if (code == 0) count++;
740 #define CountFDs(amax) amax
744 #if !defined(KERNEL) || defined(UKERNEL)
746 /* This function reads a single packet from the interface into the
747 * supplied packet buffer (*p). Return 0 if the packet is bogus. The
748 * (host,port) of the sender are stored in the supplied variables, and
749 * the data length of the packet is stored in the packet structure.
750 * The header is decoded. */
/* Read one UDP datagram from `socket` into packet `p` via scatter-gather
 * recvmsg.  On success decodes the rx header, stores the sender in
 * *host/*port, updates statistics and the peer's byte counters, and trims
 * unused continuation buffers.  Returns 0 for bogus/absent packets (see
 * comment above).  NOTE(review): excerpt truncated — some branch bodies
 * and the final return are not visible. */
751 int rxi_ReadPacket(int socket, register struct rx_packet *p, afs_uint32 *host, u_short *port)
753 struct sockaddr_in from;
756 register afs_int32 tlen, savelen;
758 rx_computelen(p, tlen);
759 rx_SetDataSize(p, tlen); /* this is the size of the user data area */
761 tlen += RX_HEADER_SIZE; /* now this is the size of the entire packet */
762 rlen = rx_maxJumboRecvSize; /* this is what I am advertising. Only check
763 * it once in order to avoid races. */
/* Grow the packet so it can hold a full advertised-size datagram. */
766 tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
774 /* Extend the last iovec for padding, it's just to make sure that the
775 * read doesn't return more data than we expect, and is done to get around
776 * our problems caused by the lack of a length field in the rx header.
777 * Use the extra buffer that follows the localdata in each packet
779 savelen = p->wirevec[p->niovecs-1].iov_len;
780 p->wirevec[p->niovecs-1].iov_len += RX_EXTRABUFFERSIZE;
782 memset((char *)&msg, 0, sizeof(msg));
783 msg.msg_name = (char *) &from;
784 msg.msg_namelen = sizeof(struct sockaddr_in);
785 msg.msg_iov = p->wirevec;
786 msg.msg_iovlen = p->niovecs;
787 nbytes = rxi_Recvmsg(socket, &msg, 0);
789 /* restore the vec to its correct state */
790 p->wirevec[p->niovecs-1].iov_len = savelen;
792 p->length = (nbytes - RX_HEADER_SIZE);
/* Oversized read or negative length (high bit set) => bogus packet. */
793 if ((nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
795 rxi_MorePackets(rx_initSendWindow);
797 else if (nbytes < 0 && errno == EWOULDBLOCK) {
798 MUTEX_ENTER(&rx_stats_mutex);
799 rx_stats.noPacketOnRead++;
800 MUTEX_EXIT(&rx_stats_mutex);
804 MUTEX_ENTER(&rx_stats_mutex);
805 rx_stats.bogusPacketOnRead++;
806 rx_stats.bogusHost = from.sin_addr.s_addr;
807 MUTEX_EXIT(&rx_stats_mutex);
808 dpf(("B: bogus packet from [%x,%d] nb=%d", from.sin_addr.s_addr,
809 from.sin_port,nbytes));
814 /* Extract packet header. */
815 rxi_DecodePacketHeader(p);
817 *host = from.sin_addr.s_addr;
818 *port = from.sin_port;
819 if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
820 struct rx_peer *peer;
821 MUTEX_ENTER(&rx_stats_mutex);
822 rx_stats.packetsRead[p->header.type-1]++;
823 MUTEX_EXIT(&rx_stats_mutex);
825 * Try to look up this peer structure. If it doesn't exist,
826 * don't create a new one -
827 * we don't keep count of the bytes sent/received if a peer
828 * structure doesn't already exist.
830 * The peer/connection cleanup code assumes that there is 1 peer
831 * per connection. If we actually created a peer structure here
832 * and this packet was an rxdebug packet, the peer structure would
833 * never be cleaned up.
835 peer = rxi_FindPeer(*host, *port, 0, 0);
837 MUTEX_ENTER(&peer->peer_lock);
838 hadd32(peer->bytesReceived, p->length);
839 MUTEX_EXIT(&peer->peer_lock);
843 /* Free any empty packet buffers at the end of this packet */
844 rxi_TrimDataBufs(p, 1);
850 #endif /* !KERNEL || UKERNEL */
852 /* This function splits off the first packet in a jumbo packet.
853 * As of AFS 3.5, jumbograms contain more than one fixed size
854 * packet, and the RX_JUMBO_PACKET flag is set in all but the
855 * last packet header. All packets (except the last) are padded to
856 * fall on RX_CBUFFERSIZE boundaries.
857 * HACK: We store the length of the first n-1 packets in the
858 * last two pad bytes. */
/* Split the first fixed-size packet off a jumbogram `p` (see the header
 * comment above).  The next packet `np` is recovered from p's first
 * continuation buffer; its abbreviated 4-byte jumbo header supplies
 * flags and checksum for np's full header.  `p` is shrunk to
 * RX_JUMBOBUFFERSIZE and np inherits the remaining length and vecs.
 * NOTE(review): excerpt truncated — length-check bodies and the return
 * value are not visible here. */
860 struct rx_packet *rxi_SplitJumboPacket(register struct rx_packet *p, afs_int32 host,
861 short port, int first)
863 struct rx_packet *np;
864 struct rx_jumboHeader *jp;
870 /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
871 * bytes in length. All but the first packet are preceded by
872 * an abbreviated four byte header. The length of the last packet
873 * is calculated from the size of the jumbogram. */
874 length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
876 if ((int)p->length < length) {
877 dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
/* Need at least one continuation buffer to hold a second packet. */
880 niov = p->niovecs - 2;
882 dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
885 iov = &p->wirevec[2];
886 np = RX_CBUF_TO_PACKET(iov->iov_base, p);
888 /* Get a pointer to the abbreviated packet header */
889 jp = (struct rx_jumboHeader *)
890 ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
892 /* Set up the iovecs for the next packet */
893 np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
894 np->wirevec[0].iov_len = sizeof(struct rx_header);
895 np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
896 np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
897 np->niovecs = niov+1;
/* Hand the remaining continuation vecs over to np. */
898 for (i = 2 , iov++ ; i <= niov ; i++ , iov++) {
899 np->wirevec[i] = *iov;
901 np->length = p->length - length;
902 p->length = RX_JUMBOBUFFERSIZE;
905 /* Convert the jumbo packet header to host byte order */
906 temp = ntohl(*(afs_uint32 *)jp);
907 jp->flags = (u_char)(temp >> 24);
908 jp->cksum = (u_short)(temp);
910 /* Fill in the packet header */
911 np->header = p->header;
912 np->header.serial = p->header.serial + 1;
913 np->header.seq = p->header.seq + 1;
914 np->header.flags = jp->flags;
915 np->header.spare = jp->cksum;
921 /* Send a udp datagram */
/* Send one UDP datagram described by the iovec array `dvec` to `addr`
 * via sendmsg-style scatter-gather I/O.
 * NOTE(review): excerpt truncated — msg_name/msg_iov assignments and the
 * return value are not visible here; the rxi_Sendmsg result appears to
 * be discarded on this path. */
922 int osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
923 int length, int istack)
927 memset(&msg, 0, sizeof(msg));
929 msg.msg_iovlen = nvecs;
931 msg.msg_namelen = sizeof(struct sockaddr_in);
933 rxi_Sendmsg(socket, &msg, 0);
937 #elif !defined(UKERNEL)
939 * message receipt is done in rxk_input or rx_put.
944 * Copy an mblock to the contiguous area pointed to by cp.
945 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
946 * but it doesn't really.
947 * Returns the number of bytes not transferred.
948 * The message is NOT changed.
/* Copy up to `len` bytes from a STREAMS mblk chain into the flat buffer
 * `cp`, walking b_cont links and skipping non-M_DATA blocks.  As noted
 * above, `off` is accepted but not honored, and the chain itself is not
 * modified.  Returns the number of bytes NOT transferred. */
950 static int cpytoc(mblk_t *mp, register int off, register int len, register char *cp)
954 for (;mp && len > 0; mp = mp->b_cont) {
955 if (mp->b_datap->db_type != M_DATA) {
/* Copy whichever is smaller: remaining request or this block's data. */
958 n = MIN(len, (mp->b_wptr - mp->b_rptr));
959 memcpy(cp, (char *)mp->b_rptr, n);
967 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
968 * but it doesn't really.
969 * This sucks, anyway, do it like m_cpy.... below
/* Copy up to `len` bytes from a STREAMS mblk chain into an iovec array,
 * advancing through both structures.  Like cpytoc above, `off` is not
 * actually honored.  NOTE(review): excerpt truncated — the per-iovec
 * bookkeeping (i, o, t updates) is not visible here. */
971 static int cpytoiovec(mblk_t *mp, int off, int len, register struct iovec *iovs, int niovs)
973 register int m,n,o,t,i;
975 for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
976 if (mp->b_datap->db_type != M_DATA) {
979 n = MIN(len, (mp->b_wptr - mp->b_rptr));
/* o = offset already filled within iovs[i]; m = bytes for this copy. */
988 memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
997 #define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
998 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1000 #if !defined(AFS_LINUX20_ENV)
/* BSD-mbuf analogue of cpytoiovec: copy `len` bytes starting `off` bytes
 * into mbuf chain `m` out to the iovec array.  Walks both the mbuf chain
 * and the iovec list, copying min(l1, l2, len) at each step.
 * NOTE(review): excerpt truncated — chain advancement and the return
 * value are not visible here. */
1001 static int m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1004 unsigned int l1, l2, i, t;
1006 if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1007 osi_Panic("m_cpytoiovec"); /* MTUXXX probably don't need this check */
/* Skip whole mbufs until `off` falls inside the current one. */
1010 if (m->m_len <= off) {
1020 p1 = mtod(m, caddr_t)+off;
1021 l1 = m->m_len - off;
1023 p2 = iovs[0].iov_base;
1024 l2 = iovs[0].iov_len;
/* Copy the largest run available from both sides, bounded by len. */
1027 t = MIN(l1, MIN(l2, (unsigned int)len));
1036 p1 = mtod(m, caddr_t);
1042 p2 = iovs[i].iov_base;
1043 l2 = iovs[i].iov_len;
1051 #endif /* AFS_SUN5_ENV */
1053 #if !defined(AFS_LINUX20_ENV)
/* Copy `data_len` bytes (skipping `hdr_len`) from mbuf chain `amb` into
 * the iovecs of packet `phandle`.  K&R-style definition; `free` is
 * presumably the chain-release routine — declaration lines for amb/free
 * are not visible in this excerpt. */
1054 int rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1061 struct rx_packet *phandle;
1062 int hdr_len, data_len;
1066 code = m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec, phandle->niovecs);
1072 #endif /*KERNEL && !UKERNEL*/
1075 /* send a response to a debug packet */
/* Handle an incoming rxdebug request packet `ap` and send the reply.
 * Dispatches on the request type (stats / connections / peers / rx
 * statistics), serializes the answer into `ap` in network byte order,
 * and transmits it back to (ahost, aport) via rxi_SendDebugPacket.
 * Only client-initiated requests are answered; the client flag is
 * cleared in the reply.  NOTE(review): excerpt is heavily truncated —
 * switch/brace structure, break statements, and the return are not all
 * visible here. */
1077 struct rx_packet *rxi_ReceiveDebugPacket(register struct rx_packet *ap,
1078 osi_socket asocket, afs_int32 ahost, short aport, int istack)
1080 struct rx_debugIn tin;
1082 struct rx_serverQueueEntry *np, *nqe;
1085 * Only respond to client-initiated Rx debug packets,
1086 * and clear the client flag in the response.
1088 if (ap->header.flags & RX_CLIENT_INITIATED) {
1089 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1090 rxi_EncodePacketHeader(ap);
1095 rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1096 /* all done with packet, now set length to the truth, so we can
1097 * reuse this packet */
1098 rx_computelen(ap, ap->length);
/* Request fields arrive in network order. */
1100 tin.type = ntohl(tin.type);
1101 tin.index = ntohl(tin.index);
1103 case RX_DEBUGI_GETSTATS: {
1104 struct rx_debugStats tstat;
1106 /* get basic stats */
1107 memset((char *)&tstat, 0, sizeof(tstat)); /* make sure spares are zero */
1108 tstat.version = RX_DEBUGI_VERSION;
1109 #ifndef RX_ENABLE_LOCKS
1110 tstat.waitingForPackets = rx_waitingForPackets;
1112 tstat.nFreePackets = htonl(rx_nFreePackets);
1113 tstat.callsExecuted = htonl(rxi_nCalls);
1114 tstat.packetReclaims = htonl(rx_packetReclaims);
1115 tstat.usedFDs = CountFDs(64);
1116 tstat.nWaiting = htonl(rx_nWaiting);
1117 queue_Count( &rx_idleServerQueue, np, nqe,
1118 rx_serverQueueEntry, tstat.idleThreads);
1119 tstat.idleThreads = htonl(tstat.idleThreads);
/* Grow the reply packet if the struct doesn't fit yet. */
1120 tl = sizeof(struct rx_debugStats) - ap->length;
1122 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1125 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats), (char *)&tstat);
1126 ap->length = sizeof(struct rx_debugStats);
1127 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1128 rx_computelen(ap, ap->length);
1133 case RX_DEBUGI_GETALLCONN:
1134 case RX_DEBUGI_GETCONN: {
1136 register struct rx_connection *tc;
1137 struct rx_call *tcall;
1138 struct rx_debugConn tconn;
1139 int all = (tin.type == RX_DEBUGI_GETALLCONN);
1142 tl = sizeof(struct rx_debugConn) - ap->length;
1144 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1148 memset((char *)&tconn, 0, sizeof(tconn)); /* make sure spares are zero */
1149 /* get N'th (maybe) "interesting" connection info */
1150 for(i=0;i<rx_hashTableSize;i++) {
1151 #if !defined(KERNEL)
1152 /* the time complexity of the algorithm used here
1153 * increases exponentially with the number of connections.
1155 #ifdef AFS_PTHREAD_ENV
1158 (void) IOMGR_Poll();
1161 MUTEX_ENTER(&rx_connHashTable_lock);
1162 /* We might be slightly out of step since we are not
1163 * locking each call, but this is only debugging output.
1165 for(tc=rx_connHashTable[i]; tc; tc=tc->next) {
/* tin.index selects the N'th matching connection. */
1166 if ((all || rxi_IsConnInteresting(tc)) && tin.index-- <= 0) {
1167 tconn.host = tc->peer->host;
1168 tconn.port = tc->peer->port;
1169 tconn.cid = htonl(tc->cid);
1170 tconn.epoch = htonl(tc->epoch);
1171 tconn.serial = htonl(tc->serial);
1172 for(j=0;j<RX_MAXCALLS;j++) {
1173 tconn.callNumber[j] = htonl(tc->callNumber[j]);
1174 if ((tcall=tc->call[j])) {
1175 tconn.callState[j] = tcall->state;
1176 tconn.callMode[j] = tcall->mode;
1177 tconn.callFlags[j] = tcall->flags;
1178 if (queue_IsNotEmpty(&tcall->rq))
1179 tconn.callOther[j] |= RX_OTHER_IN;
1180 if (queue_IsNotEmpty(&tcall->tq))
1181 tconn.callOther[j] |= RX_OTHER_OUT;
1183 else tconn.callState[j] = RX_STATE_NOTINIT;
1186 tconn.natMTU = htonl(tc->peer->natMTU);
1187 tconn.error = htonl(tc->error);
1188 tconn.flags = tc->flags;
1189 tconn.type = tc->type;
1190 tconn.securityIndex = tc->securityIndex;
1191 if (tc->securityObject) {
1192 RXS_GetStats (tc->securityObject, tc,
/* Convert the security stats to network order field by field. */
1194 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1195 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1198 DOHTONL(packetsReceived);
1199 DOHTONL(packetsSent);
1200 DOHTONL(bytesReceived);
1203 i<sizeof(tconn.secStats.spares)/sizeof(short);
1207 i<sizeof(tconn.secStats.sparel)/sizeof(afs_int32);
1212 MUTEX_EXIT(&rx_connHashTable_lock);
1213 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn), (char*)&tconn);
1215 ap->length = sizeof(struct rx_debugConn);
1216 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1221 MUTEX_EXIT(&rx_connHashTable_lock);
1223 /* if we make it here, there are no more interesting connections */
1224 tconn.cid = htonl(0xffffffff); /* means end */
1225 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn), (char *)&tconn);
1227 ap->length = sizeof(struct rx_debugConn);
1228 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1234 * Pass back all the peer structures we have available
1237 case RX_DEBUGI_GETPEER: {
1239 register struct rx_peer *tp;
1240 struct rx_debugPeer tpeer;
1243 tl = sizeof(struct rx_debugPeer) - ap->length;
1245 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1249 memset((char *)&tpeer, 0, sizeof(tpeer));
1250 for(i=0;i<rx_hashTableSize;i++) {
1251 #if !defined(KERNEL)
1252 /* the time complexity of the algorithm used here
1253 * increases exponentially with the number of peers.
1255 * Yielding after processing each hash table entry
1256 * and dropping rx_peerHashTable_lock.
1257 * also increases the risk that we will miss a new
1258 * entry - but we are willing to live with this
1259 * limitation since this is meant for debugging only
1261 #ifdef AFS_PTHREAD_ENV
1264 (void) IOMGR_Poll();
1267 MUTEX_ENTER(&rx_peerHashTable_lock);
1268 for(tp=rx_peerHashTable[i]; tp; tp=tp->next) {
/* tin.index selects the N'th peer. */
1269 if (tin.index-- <= 0) {
1270 tpeer.host = tp->host;
1271 tpeer.port = tp->port;
1272 tpeer.ifMTU = htons(tp->ifMTU);
1273 tpeer.idleWhen = htonl(tp->idleWhen);
1274 tpeer.refCount = htons(tp->refCount);
1275 tpeer.burstSize = tp->burstSize;
1276 tpeer.burst = tp->burst;
1277 tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1278 tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1279 tpeer.rtt = htonl(tp->rtt);
1280 tpeer.rtt_dev = htonl(tp->rtt_dev);
1281 tpeer.timeout.sec = htonl(tp->timeout.sec);
1282 tpeer.timeout.usec = htonl(tp->timeout.usec);
1283 tpeer.nSent = htonl(tp->nSent);
1284 tpeer.reSends = htonl(tp->reSends);
1285 tpeer.inPacketSkew = htonl(tp->inPacketSkew);
1286 tpeer.outPacketSkew = htonl(tp->outPacketSkew);
1287 tpeer.rateFlag = htonl(tp->rateFlag);
1288 tpeer.natMTU = htons(tp->natMTU);
1289 tpeer.maxMTU = htons(tp->maxMTU);
1290 tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1291 tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1292 tpeer.MTU = htons(tp->MTU);
1293 tpeer.cwind = htons(tp->cwind);
1294 tpeer.nDgramPackets = htons(tp->nDgramPackets);
1295 tpeer.congestSeq = htons(tp->congestSeq);
1296 tpeer.bytesSent.high = htonl(tp->bytesSent.high);
1297 tpeer.bytesSent.low = htonl(tp->bytesSent.low);
1298 tpeer.bytesReceived.high = htonl(tp->bytesReceived.high);
1299 tpeer.bytesReceived.low = htonl(tp->bytesReceived.low);
1301 MUTEX_EXIT(&rx_peerHashTable_lock);
1302 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer), (char*)&tpeer);
1304 ap->length = sizeof(struct rx_debugPeer);
1305 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1310 MUTEX_EXIT(&rx_peerHashTable_lock);
1312 /* if we make it here, there are no more peers to report */
1313 tpeer.host = htonl(0xffffffff); /* means end */
1314 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer), (char *)&tpeer);
1316 ap->length = sizeof(struct rx_debugPeer);
1317 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1322 case RX_DEBUGI_RXSTATS: {
1326 tl = sizeof(rx_stats) - ap->length;
1328 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1332 /* Since its all int32s convert to network order with a loop. */
1333 MUTEX_ENTER(&rx_stats_mutex);
1334 s = (afs_int32 *)&rx_stats;
1335 for (i=0; i<sizeof(rx_stats)/sizeof(afs_int32); i++,s++)
1336 rx_PutInt32(ap, i*sizeof(afs_int32), htonl(*s));
1339 ap->length = sizeof(rx_stats);
1340 MUTEX_EXIT(&rx_stats_mutex);
1341 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1347 /* error response packet */
1348 tin.type = htonl(RX_DEBUGI_BADTYPE);
1349 tin.index = tin.type;
1350 rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1352 ap->length = sizeof(struct rx_debugIn);
1353 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/* Reply to a version-request debug packet: overwrite the payload with the
 * library version string and bounce the packet back to the sender via
 * rxi_SendDebugPacket.  ahost/aport are stored unconverted into the reply
 * sockaddr by rxi_SendDebugPacket, so callers pass them already in network
 * byte order -- confirm against callers. */
1360 struct rx_packet *rxi_ReceiveVersionPacket(register struct rx_packet *ap,
1361 osi_socket asocket, afs_int32 ahost, short aport, int istack)
1366 * Only respond to client-initiated version requests, and
1367 * clear that flag in the response.
1369 if (ap->header.flags & RX_CLIENT_INITIATED) {
1372 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
/* Re-encode the wire header now that the flags word changed. */
1373 rxi_EncodePacketHeader(ap);
/* buf is zeroed first, so the strncpy below always leaves a
 * NUL-terminated string even when the version string is truncated. */
1374 memset(buf, 0, sizeof(buf));
/* cml_version_number+4 skips a 4-byte prefix -- presumably the "@(#)"
 * sccs ident marker; confirm against cml_version_number's definition. */
1375 strncpy(buf, cml_version_number+4, sizeof(buf)-1);
/* NOTE(review): writes a fixed 65 bytes regardless of sizeof(buf);
 * assumes buf is at least 65 bytes -- confirm buf's declaration. */
1376 rx_packetwrite(ap, 0, 65, buf);
1379 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1387 /* send a debug packet back to the sender */
/* The packet's iovec list is temporarily trimmed so that exactly
 * apacket->length payload bytes (plus the wire header) are handed to
 * osi_NetSend, then restored before returning.  Delivery is best-effort:
 * the send result is deliberately discarded. */
1388 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
1389 afs_int32 ahost, short aport, afs_int32 istack)
1391 struct sockaddr_in taddr;
1397 int waslocked = ISAFS_GLOCK();
/* Destination address: aport/ahost are stored unconverted, so they must
 * already be in network byte order. */
1400 taddr.sin_family = AF_INET;
1401 taddr.sin_port = aport;
1402 taddr.sin_addr.s_addr = ahost;
1403 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
1404 taddr.sin_len = sizeof(struct sockaddr_in);
1407 /* We need to trim the niovecs. */
/* Walk the payload iovecs (index 0 is the wire header), counting down
 * the bytes still to send; shorten the iovec where the payload ends and
 * remember the old length/count so the trim can be undone below. */
1408 nbytes = apacket->length;
1409 for (i=1; i < apacket->niovecs; i++) {
1410 if (nbytes <= apacket->wirevec[i].iov_len) {
1411 savelen = apacket->wirevec[i].iov_len;
1412 saven = apacket->niovecs;
1413 apacket->wirevec[i].iov_len = nbytes;
1414 apacket->niovecs = i+1; /* so condition fails because i == niovecs */
1416 else nbytes -= apacket->wirevec[i].iov_len;
/* Drop the AFS global lock (if held) around the network send. */
1420 if (waslocked) AFS_GUNLOCK();
1422 /* debug packets are not reliably delivered, hence the cast below. */
1423 (void) osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
1424 apacket->length+RX_HEADER_SIZE, istack);
1426 if (waslocked) AFS_GLOCK();
1429 if (saven) { /* means we truncated the packet above. */
/* After the loop exits, i is one past the trimmed iovec (see the
 * niovecs = i+1 trick above), so i-1 indexes the iovec to restore. */
1430 apacket->wirevec[i-1].iov_len = savelen;
1431 apacket->niovecs = saven;
1436 /* Send the packet to appropriate destination for the specified
1437 * call. The header is first encoded and placed in the packet.
/* Side effects visible here: stamps p->header.serial (under
 * conn_data_lock), may set p->firstSerial, updates retryTime on send
 * failure, and bumps rx_stats / peer byte counters on success. */
1439 void rxi_SendPacket(struct rx_call * call, struct rx_connection * conn,
1440 struct rx_packet *p, int istack)
1446 struct sockaddr_in addr;
1447 register struct rx_peer *peer = conn->peer;
1450 char deliveryType = 'S';
1452 /* The address we're sending the packet to */
/* peer->host/port are copied unconverted, so they are kept in network
 * byte order in the peer structure -- consistent with the sockaddr use. */
1453 memset(&addr, 0, sizeof (addr));
1454 addr.sin_family = AF_INET;
1455 addr.sin_port = peer->port;
1456 addr.sin_addr.s_addr = peer->host;
1458 /* This stuff should be revamped, I think, so that most, if not
1459 * all, of the header stuff is always added here. We could
1460 * probably do away with the encode/decode routines. XXXXX */
1462 /* Stamp each packet with a unique serial number. The serial
1463 * number is maintained on a connection basis because some types
1464 * of security may be based on the serial number of the packet,
1465 * and security is handled on a per authenticated-connection
1467 /* Pre-increment, to guarantee no zero serial number; a zero
1468 * serial number means the packet was never sent. */
1469 MUTEX_ENTER(&conn->conn_data_lock);
1470 p->header.serial = ++conn->serial;
1471 MUTEX_EXIT(&conn->conn_data_lock);
1472 /* This is so we can adjust retransmit time-outs better in the face of
1473 * rapidly changing round-trip times. RTO estimation is not a la Karn.
1475 if (p->firstSerial == 0) {
1476 p->firstSerial = p->header.serial;
1480 /* If an output tracer function is defined, call it with the packet and
1481 * network address. Note this function may modify its arguments. */
1482 if (rx_almostSent) {
1483 int drop = (*rx_almostSent) (p, &addr);
1484 /* drop packet if return value is non-zero? */
1485 if (drop) deliveryType = 'D'; /* Drop the packet */
1489 /* Get network byte order header */
1490 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
1491 * touch ALL the fields */
1493 /* Send the packet out on the same socket that related packets are being
/* Clients always use the shared rx_socket; servers use the socket
 * belonging to the service that owns this connection. */
1495 socket = (conn->type == RX_CLIENT_CONNECTION
1496 ? rx_socket : conn->service->socket);
1499 /* Possibly drop this packet, for testing purposes */
1500 if ((deliveryType == 'D') ||
1501 ((rx_intentionallyDroppedPacketsPer100 > 0) &&
1502 (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
1503 deliveryType = 'D'; /* Drop the packet */
1506 deliveryType = 'S'; /* Send the packet */
1507 #endif /* RXDEBUG */
1509 /* Loop until the packet is sent. We'd prefer just to use a
1510 * blocking socket, but unfortunately the interface doesn't
1511 * allow us to have the socket block in send mode, and not
1512 * block in receive mode */
1515 waslocked = ISAFS_GLOCK();
1516 if (waslocked) AFS_GUNLOCK();
1518 if ((code = osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
1519 p->length+RX_HEADER_SIZE, istack)) != 0) {
1520 /* send failed, so let's hurry up the resend, eh? */
1521 MUTEX_ENTER(&rx_stats_mutex);
1522 rx_stats.netSendFailures++;
1523 MUTEX_EXIT(&rx_stats_mutex);
/* Retry shortly after the original send time; backoff scales the
 * delay in ~256ms units (backoff << 8 msec). */
1524 p->retryTime = p->timeSent; /* resend it very soon */
1525 clock_Addmsec(&(p->retryTime), 10 + (((afs_uint32) p->backoff) << 8));
1527 #if defined(KERNEL) && defined(AFS_LINUX20_ENV)
1528 /* Linux is nice -- it can tell us right away that we cannot
1529 * reach this recipient by returning an ENETUNREACH error
1530 * code. So, when this happens let's "down" the host NOW so
1531 * we don't sit around waiting for this host to timeout later.
/* Zeroing lastReceiveTime is what marks the call's host as down. */
1533 if (call && code == -ENETUNREACH)
1534 call->lastReceiveTime = 0;
1538 if (waslocked) AFS_GLOCK();
/* NOTE(review): "%0.3d" in the format below likely intends zero-padded
 * milliseconds, i.e. "%03d" -- confirm dpf's printf semantics. */
1543 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d",
1544 deliveryType, p->header.serial, rx_packetTypes[p->header.type-1],
1545 peer->host, peer->port, p->header.serial, p->header.epoch,
1546 p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1547 p, p->retryTime.sec, p->retryTime.usec/1000, p->length));
1549 MUTEX_ENTER(&rx_stats_mutex);
1550 rx_stats.packetsSent[p->header.type-1]++;
1551 MUTEX_EXIT(&rx_stats_mutex);
1552 MUTEX_ENTER(&peer->peer_lock);
1553 hadd32(peer->bytesSent, p->length);
1554 MUTEX_EXIT(&peer->peer_lock);
1557 /* Send a list of packets to appropriate destination for the specified
1558 * connection. The headers are first encoded and placed in the packets.
/* The len packets are assembled into one AFS 3.5 jumbogram: a single
 * wire header (from list[0]) followed by each packet's data buffer,
 * with a jumbo sub-header packed between fragments.  A contiguous
 * range of serial numbers is reserved up front so each fragment gets
 * its own serial. */
1560 void rxi_SendPacketList(struct rx_call * call, struct rx_connection * conn,
1561 struct rx_packet **list, int len, int istack)
1563 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1566 struct sockaddr_in addr;
1567 register struct rx_peer *peer = conn->peer;
1569 struct rx_packet *p = NULL;
1570 struct iovec wirevec[RX_MAXIOVECS];
1571 int i, length, code;
1574 struct rx_jumboHeader *jp;
1576 char deliveryType = 'S';
1578 /* The address we're sending the packet to */
/* NOTE(review): unlike rxi_SendPacket, addr is not memset to zero here,
 * so sin_zero (and sin_len where present) stay uninitialized -- confirm
 * this is harmless for osi_NetSend on all platforms. */
1579 addr.sin_family = AF_INET;
1580 addr.sin_port = peer->port;
1581 addr.sin_addr.s_addr = peer->host;
/* One iovec for the header plus one per packet must fit. */
1583 if (len+1 > RX_MAXIOVECS) {
1584 osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
1588 * Stamp the packets in this jumbogram with consecutive serial numbers
/* Reserve len serials atomically; they are handed out below without
 * re-taking the lock. */
1590 MUTEX_ENTER(&conn->conn_data_lock);
1591 serial = conn->serial;
1592 conn->serial += len;
1593 MUTEX_EXIT(&conn->conn_data_lock);
1596 /* This stuff should be revamped, I think, so that most, if not
1597 * all, of the header stuff is always added here. We could
1598 * probably do away with the encode/decode routines. XXXXX */
/* iovec 0 is the (single) wire header taken from the first packet. */
1601 length = RX_HEADER_SIZE;
1602 wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
1603 wirevec[0].iov_len = RX_HEADER_SIZE;
1604 for (i = 0 ; i < len ; i++) {
1607 /* The whole 3.5 jumbogram scheme relies on packets fitting
1608 * in a single packet buffer. */
1609 if (p->niovecs > 2) {
1610 osi_Panic("rxi_SendPacketList, niovecs > 2\n");
1613 /* Set the RX_JUMBO_PACKET flags in all but the last packets
/* Non-final fragments must be exactly RX_JUMBOBUFFERSIZE so the
 * receiver can locate the embedded jumbo headers. */
1616 if (p->length != RX_JUMBOBUFFERSIZE) {
1617 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
1619 p->header.flags |= RX_JUMBO_PACKET;
1620 length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1621 wirevec[i+1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1623 wirevec[i+1].iov_len = p->length;
1624 length += p->length;
1626 wirevec[i+1].iov_base = (char *)(&p->localdata[0]);
1628 /* Convert jumbo packet header to network byte order */
/* Jumbo sub-header word: flags in the top byte, spare in the low
 * bits (mirrors the main header's packed flags word layout). */
1629 temp = (afs_uint32)(p->header.flags) << 24;
1630 temp |= (afs_uint32)(p->header.spare);
1631 *(afs_uint32 *)jp = htonl(temp);
/* jp points at the jumbo header slot just past this packet's data,
 * to be filled in on the next loop iteration. */
1633 jp = (struct rx_jumboHeader *)
1634 ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
1636 /* Stamp each packet with a unique serial number. The serial
1637 * number is maintained on a connection basis because some types
1638 * of security may be based on the serial number of the packet,
1639 * and security is handled on a per authenticated-connection
1641 /* Pre-increment, to guarantee no zero serial number; a zero
1642 * serial number means the packet was never sent. */
1643 p->header.serial = ++serial;
1644 /* This is so we can adjust retransmit time-outs better in the face of
1645 * rapidly changing round-trip times. RTO estimation is not a la Karn.
1647 if (p->firstSerial == 0) {
1648 p->firstSerial = p->header.serial;
1652 /* If an output tracer function is defined, call it with the packet and
1653 * network address. Note this function may modify its arguments. */
1654 if (rx_almostSent) {
1655 int drop = (*rx_almostSent) (p, &addr);
1656 /* drop packet if return value is non-zero? */
1657 if (drop) deliveryType = 'D'; /* Drop the packet */
1661 /* Get network byte order header */
1662 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
1663 * touch ALL the fields */
1666 /* Send the packet out on the same socket that related packets are being
1668 socket = (conn->type == RX_CLIENT_CONNECTION
1669 ? rx_socket : conn->service->socket);
1672 /* Possibly drop this packet, for testing purposes */
1673 if ((deliveryType == 'D') ||
1674 ((rx_intentionallyDroppedPacketsPer100 > 0) &&
1675 (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
1676 deliveryType = 'D'; /* Drop the packet */
1679 deliveryType = 'S'; /* Send the packet */
1680 #endif /* RXDEBUG */
1682 /* Loop until the packet is sent. We'd prefer just to use a
1683 * blocking socket, but unfortunately the interface doesn't
1684 * allow us to have the socket block in send mode, and not
1685 * block in receive mode */
1687 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1688 waslocked = ISAFS_GLOCK();
1689 if (!istack && waslocked) AFS_GUNLOCK();
/* One send for the whole jumbogram: header iovec plus len data iovecs. */
1691 if ((code = osi_NetSend(socket, &addr, &wirevec[0], len+1, length, istack)) != 0){
1692 /* send failed, so let's hurry up the resend, eh? */
1693 MUTEX_ENTER(&rx_stats_mutex);
1694 rx_stats.netSendFailures++;
1695 MUTEX_EXIT(&rx_stats_mutex);
/* On failure every constituent packet is scheduled for early resend. */
1696 for (i = 0 ; i < len ; i++) {
1698 p->retryTime = p->timeSent; /* resend it very soon */
1699 clock_Addmsec(&(p->retryTime), 10 + (((afs_uint32) p->backoff) << 8));
1701 #if defined(KERNEL) && defined(AFS_LINUX20_ENV)
1702 /* Linux is nice -- it can tell us right away that we cannot
1703 * reach this recipient by returning an ENETUNREACH error
1704 * code. So, when this happens let's "down" the host NOW so
1705 * we don't sit around waiting for this host to timeout later.
1707 if (call && code == -ENETUNREACH)
1708 call->lastReceiveTime = 0;
1711 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
1712 if (!istack && waslocked) AFS_GLOCK();
/* NOTE(review): "%0.3d" below likely intends "%03d" -- confirm. */
1717 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %x resend %d.%0.3d len %d",
1718 deliveryType, p->header.serial, rx_packetTypes[p->header.type-1],
1719 peer->host, peer->port, p->header.serial, p->header.epoch,
1720 p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1721 p, p->retryTime.sec, p->retryTime.usec/1000, p->length));
1723 MUTEX_ENTER(&rx_stats_mutex);
1724 rx_stats.packetsSent[p->header.type-1]++;
1725 MUTEX_EXIT(&rx_stats_mutex);
1726 MUTEX_ENTER(&peer->peer_lock);
1727 hadd32(peer->bytesSent, p->length);
1728 MUTEX_EXIT(&peer->peer_lock);
1732 /* Send a "special" packet to the peer connection. If call is
1733 * specified, then the packet is directed to a specific call channel
1734 * associated with the connection, otherwise it is directed to the
1735 * connection only. Uses optionalPacket if it is supplied, rather than
1736 * allocating a new packet buffer. Nbytes is the length of the data
1737 * portion of the packet. If data is non-null, nbytes of data are
1738 * copied into the packet. Type is the type of the packet, as defined
1739 * in rx.h. Bug: there's a lot of duplication between this and other
1740 * routines. This needs to be cleaned up. */
/* Returns optionalPacket (possibly NULL); a freshly allocated packet is
 * freed before returning, so ownership of optionalPacket stays with the
 * caller. */
1741 struct rx_packet *rxi_SendSpecial(register struct rx_call *call,
1742 register struct rx_connection *conn, struct rx_packet *optionalPacket,
1743 int type, char *data, int nbytes, int istack)
1745 /* Some of the following stuff should be common code for all
1746 * packet sends (it's repeated elsewhere) */
1747 register struct rx_packet *p;
1749 int savelen = 0, saven = 0;
1750 int channel, callNumber;
1752 channel = call->channel;
1753 callNumber = *call->callNumber;
1754 /* BUSY packets refer to the next call on this connection */
1755 if (type == RX_PACKET_TYPE_BUSY) {
/* Allocation failure here is fatal by design. */
1764 p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
1765 if (!p) osi_Panic("rxi_SendSpecial failure");
/* Fill in the header from connection/call state. */
1772 p->header.serviceId = conn->serviceId;
1773 p->header.securityIndex = conn->securityIndex;
1774 p->header.cid = (conn->cid | channel);
1775 p->header.callNumber = callNumber;
1777 p->header.epoch = conn->epoch;
1778 p->header.type = type;
1779 p->header.flags = 0;
1780 if (conn->type == RX_CLIENT_CONNECTION)
1781 p->header.flags |= RX_CLIENT_INITIATED;
1783 rx_packetwrite(p, 0, nbytes, data);
/* Trim the iovec list to exactly nbytes of payload before sending
 * (same technique as rxi_SendDebugPacket). */
1785 for (i=1; i < p->niovecs; i++) {
1786 if (nbytes <= p->wirevec[i].iov_len) {
1787 savelen = p->wirevec[i].iov_len;
1789 p->wirevec[i].iov_len = nbytes;
1790 p->niovecs = i+1; /* so condition fails because i == niovecs */
1792 else nbytes -= p->wirevec[i].iov_len;
/* With a call, go through rxi_Send (per-call path); otherwise send
 * directly on the connection. */
1795 if (call) rxi_Send(call, p, istack);
1796 else rxi_SendPacket((struct rx_call *)0, conn, p, istack);
1797 if (saven) { /* means we truncated the packet above. We probably don't */
1798 /* really need to do this, but it seems safer this way, given that */
1799 /* sneaky optionalPacket... */
1800 p->wirevec[i-1].iov_len = savelen;
1803 if (!optionalPacket) rxi_FreePacket(p);
1804 return optionalPacket;
1808 /* Encode the packet's header (from the struct header in the packet to
1809 * the net byte order representation in the wire representation of the
1810 * packet, which is what is actually sent out on the wire) */
/* The wire header lives in wirevec[0]; it is zeroed and then written as
 * seven 32-bit big-endian words. */
1811 void rxi_EncodePacketHeader(register struct rx_packet *p)
1813 register afs_uint32 *buf = (afs_uint32 *)(p->wirevec[0].iov_base); /* MTUXXX */
1815 memset((char *)buf, 0, RX_HEADER_SIZE);
1816 *buf++ = htonl(p->header.epoch);
1817 *buf++ = htonl(p->header.cid);
1818 *buf++ = htonl(p->header.callNumber);
1819 *buf++ = htonl(p->header.seq);
1820 *buf++ = htonl(p->header.serial);
/* Packed word: type | flags | userStatus | securityIndex, one byte each,
 * type in the most significant byte. */
1821 *buf++ = htonl( (((afs_uint32)p->header.type)<<24)
1822 | (((afs_uint32)p->header.flags)<<16)
1823 | (p->header.userStatus<<8) | p->header.securityIndex);
1824 /* Note: top 16 bits of this next word were reserved */
/* Final word: spare in the high half, serviceId in the low half. */
1825 *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId&0xffff));
1828 /* Decode the packet's header (from net byte order to a struct header) */
/* Exact inverse of rxi_EncodePacketHeader: reads the seven big-endian
 * 32-bit words out of wirevec[0] into p->header. */
1829 void rxi_DecodePacketHeader(register struct rx_packet *p)
1831 register afs_uint32 *buf = (afs_uint32*)(p->wirevec[0].iov_base); /* MTUXXX */
1834 p->header.epoch = ntohl(*buf);
1836 p->header.cid = ntohl(*buf);
1838 p->header.callNumber = ntohl(*buf);
1840 p->header.seq = ntohl(*buf);
1842 p->header.serial = ntohl(*buf);
/* temp holds the host-order packed type/flags/userStatus/securityIndex
 * word (layout per rxi_EncodePacketHeader). */
1848 /* C will truncate byte fields to bytes for me */
1849 p->header.type = temp>>24;
1850 p->header.flags = temp>>16;
1851 p->header.userStatus = temp>>8;
1852 p->header.securityIndex = temp>>0;
/* Final word: low half is serviceId, high half is spare. */
1857 p->header.serviceId = (temp&0xffff);
1858 p->header.spare = temp>>16;
1859 /* Note: top 16 bits of this last word are the security checksum */
/* Prepare a DATA packet for its first transmission on the given call:
 * fill the header from call/connection state, assign the next sequence
 * number, reconcile the iovec lengths with p->length (freeing surplus
 * cluster buffers), and let the security object do its per-packet work. */
1862 void rxi_PrepareSendPacket(register struct rx_call *call, register struct rx_packet *p,
1865 register struct rx_connection *conn = call->conn;
1867 ssize_t len; /* len must be a signed type; it can go negative */
1869 p->flags &= ~RX_PKTFLAG_ACKED;
1870 p->header.cid = (conn->cid | call->channel);
1871 p->header.serviceId = conn->serviceId;
1872 p->header.securityIndex = conn->securityIndex;
1873 p->header.callNumber = *call->callNumber;
/* Sequence numbers are allocated here, one per prepared packet. */
1874 p->header.seq = call->tnext++;
1875 p->header.epoch = conn->epoch;
1876 p->header.type = RX_PACKET_TYPE_DATA;
1877 p->header.flags = 0;
1878 p->header.spare = 0;
1879 if (conn->type == RX_CLIENT_CONNECTION)
1880 p->header.flags |= RX_CLIENT_INITIATED;
1883 p->header.flags |= RX_LAST_PACKET;
1885 clock_Zero(&p->retryTime); /* Never yet transmitted */
1886 clock_Zero(&p->firstSent); /* Never yet transmitted */
1887 p->header.serial = 0; /* Another way of saying never transmitted... */
1890 /* Now that we're sure this is the last data on the call, make sure
1891 * that the "length" and the sum of the iov_lens matches. */
/* Walk the payload iovecs until the (security-padded) length is used
 * up; afterwards len holds the (non-positive) remainder. */
1892 len = p->length + call->conn->securityHeaderSize;
1894 for (i=1; i < p->niovecs && len > 0; i++) {
1895 len -= p->wirevec[i].iov_len;
1898 osi_Panic("PrepareSendPacket 1\n"); /* MTUXXX */
1901 /* Free any extra elements in the wirevec */
/* Iovecs 0 (header) and 1 are part of the packet itself; anything from
 * index MAX(2,i) on is a detachable cluster buffer to be returned. */
1902 for (j = MAX(2,i) ; j < p->niovecs ; j++) {
1903 rxi_freeCBuf(RX_CBUF_TO_PACKET(p->wirevec[j].iov_base, p));
/* Shorten the last used iovec by the overshoot (len <= 0 here). */
1906 p->wirevec[i-1].iov_len += len;
1908 RXS_PreparePacket(conn->securityObject, call, p);
1911 /* Given an interface MTU size, calculate an adjusted MTU size that
1912 * will make efficient use of the RX buffers when the peer is sending
1913 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
/* Rounds the MTU down to a whole number of jumbo fragments on top of
 * the base (header + one jumbo buffer + jumbo header) size. */
1914 int rxi_AdjustIfMTU(int mtu)
1919 adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
/* MTUs at or below the base size need no fragment rounding. */
1920 if (mtu <= adjMTU) {
1927 frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
1928 return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
1931 /* Given an interface MTU size, and the peer's advertised max receive
1932 * size, calculate an adjusted maxMTU size that makes efficient use
1933 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
/* Upper-bounds (per-frame MTU * our max send fragments) by what the
 * peer says it can receive, then rounds via rxi_AdjustIfMTU. */
1934 int rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
1936 int maxMTU = mtu * rxi_nSendFrags;
1937 maxMTU = MIN(maxMTU, peerMaxMTU);
1938 return rxi_AdjustIfMTU(maxMTU);
1941 /* Given a packet size, figure out how many datagram packets will fit.
1942 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
1943 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
1944 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
/* frags is the fragment count, mtu the per-fragment payload size.
 * IPv6_FRAG_HDR_SIZE is added to the minimum-size check so the result
 * is safe even when IPv6 fragmentation headers consume part of the MTU. */
1945 int rxi_AdjustDgramPackets(int frags, int mtu)
1948 if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
/* Total datagram capacity across all fragments, minus the UDP header
 * that is only carried once. */
1951 maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
1952 maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
1953 /* subtract the size of the first and last packets */
1954 maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
/* 2 accounts for the first and last packets; the remainder holds the
 * middle (jumbo buffer + jumbo header) packets. */
1958 return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));