2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 # include "afs/sysincludes.h"
16 # include "afsincludes.h"
17 # include "rx_kcommon.h"
18 # else /* defined(UKERNEL) */
19 # ifdef RX_KERNEL_TRACE
20 # include "rx_kcommon.h"
23 # ifndef AFS_LINUX20_ENV
26 # if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
27 # include "afs/sysincludes.h"
29 # if defined(AFS_OBSD_ENV)
32 # include "h/socket.h"
33 # if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
34 # if !defined(AFS_AIX41_ENV)
35 # include "sys/mount.h" /* it gets pulled in by something later anyway */
39 # include "netinet/in.h"
40 # include "afs/afs_osi.h"
41 # include "rx_kmutex.h"
42 # endif /* defined(UKERNEL) */
47 # if defined(AFS_NT40_ENV)
49 # define EWOULDBLOCK WSAEWOULDBLOCK
52 # include "rx_xmit_nt.h"
58 # include <sys/sysmacros.h>
61 #include <opr/queue.h>
65 #include "rx_packet.h"
66 #include "rx_atomic.h"
67 #include "rx_globals.h"
68 #include "rx_internal.h"
76 * \brief structure used to keep track of allocated packets
78 struct rx_mallocedPacket {
79 struct opr_queue entry; /*!< chained using opr_queue */
80 struct rx_packet *addr; /*!< address of the first element */
81 afs_uint32 size; /*!< array size in bytes */
85 /* rxdb_fileID is used to identify the lock location, along with line#. */
86 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
87 #endif /* RX_LOCKS_DB */
88 static struct rx_packet *rx_mallocedP = 0;
90 static afs_uint32 rx_packet_id = 0;
93 extern char cml_version_number[];
95 static int AllocPacketBufs(int class, int num_pkts, struct opr_queue *q);
97 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
98 afs_uint32 ahost, short aport,
100 static struct rx_packet *rxi_AllocPacketNoLock(int class);
103 static void rxi_MorePacketsNoLock(int apackets);
106 #ifdef RX_ENABLE_TSFPQ
107 static int rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first,
109 static void rxi_AdjustLocalPacketsTSFPQ(int num_keep_local,
110 int allow_overcommit);
112 static void rxi_FreePacketNoLock(struct rx_packet *p);
113 static int rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first);
114 static int rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first,
115 struct opr_queue * q);
118 extern struct opr_queue rx_idleServerQueue;
120 /* some rules about packets:
121 * 1. When a packet is allocated, the final iov_buf contains room for
122 * a security trailer, but iov_len masks that fact. If the security
123 * package wants to add the trailer, it may do so, and then extend
124 * iov_len appropriately. For this reason, the packet's niovecs and
125 * iov_len fields should be accurate before calling PreparePacket.
129 * all packet buffers (iov_base) are integral multiples of
131 * offset is an integral multiple of the word size.
134 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
138 for (l = 0, i = 1; i < packet->niovecs; i++) {
139 if (l + packet->wirevec[i].iov_len > offset) {
141 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
144 l += packet->wirevec[i].iov_len;
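/*
 * Worked example (illustrative, assuming wirevec[1] holds its full
 * RX_FIRSTBUFFERSIZE bytes): the loop above walks the data iovecs
 * starting at wirevec[1], accumulating their lengths in l until it finds
 * the iovec containing `offset'. For offset == RX_FIRSTBUFFERSIZE + 4,
 * the first iteration skips wirevec[1] (l becomes RX_FIRSTBUFFERSIZE)
 * and the word is fetched from byte 4 of wirevec[2].iov_base.
 */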
151 * all packet buffers (iov_base) are integral multiples of the word size.
152 * offset is an integral multiple of the word size.
155 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
159 for (l = 0, i = 1; i < packet->niovecs; i++) {
160 if (l + packet->wirevec[i].iov_len > offset) {
161 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
162 (offset - l))) = data;
165 l += packet->wirevec[i].iov_len;
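/*
 * Usage sketch (hypothetical, for illustration only): the Put/Get pair
 * is symmetric, so a word stored at a word-aligned offset reads back
 * unchanged:
 *
 *   rx_SlowPutInt32(p, 8, some_value);
 *   osi_Assert(rx_SlowGetInt32(p, 8) == some_value);
 *
 * Callers normally go through the fast-path macros in rx.h, which fall
 * back to these Slow variants only when the offset lands outside the
 * first data iovec.
 */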
172 * all packet buffers (iov_base) are integral multiples of the
174 * offset is an integral multiple of the word size.
176 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
179 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
182 unsigned int i, j, l, r;
183 for (l = 0, i = 1; i < packet->niovecs; i++) {
184 if (l + packet->wirevec[i].iov_len > offset) {
187 l += packet->wirevec[i].iov_len;
190 /* i is the iovec which contains the first little bit of data in which we
191 * are interested. l is the total length of everything prior to this iovec.
192 * j is the number of bytes we can safely copy out of this iovec.
193 * offset only applies to the first iovec.
196 while ((r > 0) && (i < packet->niovecs)) {
197 j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
198 memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
201 l += packet->wirevec[i].iov_len;
206 return (r ? (resid - r) : resid);
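/*
 * Usage sketch (illustrative): copy the first 100 bytes of user data out
 * of a packet. The return value is the number of bytes actually copied,
 * which may be short if the packet holds fewer than resid bytes past
 * offset:
 *
 *   char buf[100];
 *   int n = rx_SlowReadPacket(p, 0, sizeof(buf), buf);
 *   if (n < (int)sizeof(buf))
 *       handle_short_packet();   (hypothetical caller-side handling)
 */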
211 * all packet buffers (iov_base) are integral multiples of the
213 * offset is an integral multiple of the word size.
216 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
218 unsigned int i, j, l, o, r;
221 for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
222 if (l + packet->wirevec[i].iov_len > o) {
225 l += packet->wirevec[i].iov_len;
228 /* i is the iovec which contains the first little bit of data in which we
229 * are interested. l is the total length of everything prior to this iovec.
230 * j is the number of bytes we can safely copy into this iovec.
231 * offset only applies to the first iovec.
234 while ((r > 0) && (i <= RX_MAXWVECS)) {
235 if (i >= packet->niovecs)
236 if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0) /* ++niovecs as a side-effect */
239 b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
240 j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
244 l += packet->wirevec[i].iov_len;
249 return (r ? (resid - r) : resid);
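/*
 * Note (illustrative): unlike the read path, the write loop above may
 * grow the packet. When the target offset runs past the last allocated
 * iovec, rxi_AllocDataBuf hooks in more continuation buffers
 * (incrementing niovecs as a side effect), so a sketch like
 *
 *   rx_SlowWritePacket(p, 0, 4000, src);
 *
 * succeeds as long as enough continuation buffers can be allocated and
 * RX_MAXWVECS is not exceeded.
 */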
253 rxi_AllocPackets(int class, int num_pkts, struct opr_queue * q)
257 num_pkts = AllocPacketBufs(class, num_pkts, q);
259 for (opr_queue_Scan(q, c)) {
260 RX_PACKET_IOV_FULLINIT(opr_queue_Entry(c, struct rx_packet, entry));
266 #ifdef RX_ENABLE_TSFPQ
268 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
270 struct rx_ts_info_t * rx_ts_info;
274 RX_TS_INFO_GET(rx_ts_info);
276 transfer = num_pkts - rx_ts_info->_FPQ.len;
279 MUTEX_ENTER(&rx_freePktQ_lock);
280 transfer = MAX(transfer, rx_TSFPQGlobSize);
281 if (transfer > rx_nFreePackets) {
282 /* alloc enough for us, plus a few globs for other threads */
283 rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
286 RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
288 MUTEX_EXIT(&rx_freePktQ_lock);
292 RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
296 #else /* RX_ENABLE_TSFPQ */
298 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
309 MUTEX_ENTER(&rx_freePktQ_lock);
312 for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
313 num_pkts--, overq++);
316 rxi_NeedMorePackets = TRUE;
317 if (rx_stats_active) {
319 case RX_PACKET_CLASS_RECEIVE:
320 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
322 case RX_PACKET_CLASS_SEND:
323 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
325 case RX_PACKET_CLASS_SPECIAL:
326 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
328 case RX_PACKET_CLASS_RECV_CBUF:
329 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
331 case RX_PACKET_CLASS_SEND_CBUF:
332 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
338 if (rx_nFreePackets < num_pkts)
339 num_pkts = rx_nFreePackets;
342 rxi_NeedMorePackets = TRUE;
346 if (rx_nFreePackets < num_pkts) {
347 rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
351 for (i=0, c=opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
353 i++, c=opr_queue_Next(&c->entry, struct rx_packet, entry)) {
357 opr_queue_SplitBeforeAppend(&rx_freePacketQueue, q, &c->entry);
359 rx_nFreePackets -= num_pkts;
364 MUTEX_EXIT(&rx_freePktQ_lock);
369 #endif /* RX_ENABLE_TSFPQ */
372 * Free a packet currently used as a continuation buffer
374 #ifdef RX_ENABLE_TSFPQ
375 /* num_pkts=0 means queue length is unknown */
377 rxi_FreePackets(int num_pkts, struct opr_queue * q)
379 struct rx_ts_info_t * rx_ts_info;
380 struct opr_queue *cursor, *store;
383 osi_Assert(num_pkts >= 0);
384 RX_TS_INFO_GET(rx_ts_info);
387 for (opr_queue_ScanSafe(q, cursor, store)) {
389 rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
393 for (opr_queue_ScanSafe(q, cursor, store)) {
394 rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
400 RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
403 if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
405 MUTEX_ENTER(&rx_freePktQ_lock);
407 RX_TS_FPQ_LTOG(rx_ts_info);
409 /* Wakeup anyone waiting for packets */
412 MUTEX_EXIT(&rx_freePktQ_lock);
418 #else /* RX_ENABLE_TSFPQ */
419 /* num_pkts=0 means queue length is unknown */
421 rxi_FreePackets(int num_pkts, struct opr_queue *q)
423 struct opr_queue cbs;
424 struct opr_queue *cursor, *store;
428 osi_Assert(num_pkts >= 0);
429 opr_queue_Init(&cbs);
432 for (opr_queue_ScanSafe(q, cursor, store)) {
434 = opr_queue_Entry(cursor, struct rx_packet, entry);
435 if (p->niovecs > 2) {
436 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
444 for (opr_queue_ScanSafe(q, cursor, store)) {
446 = opr_queue_Entry(cursor, struct rx_packet, entry);
448 if (p->niovecs > 2) {
449 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
456 opr_queue_SpliceAppend(q, &cbs);
462 MUTEX_ENTER(&rx_freePktQ_lock);
464 opr_queue_SpliceAppend(&rx_freePacketQueue, q);
465 rx_nFreePackets += qlen;
467 /* Wakeup anyone waiting for packets */
470 MUTEX_EXIT(&rx_freePktQ_lock);
475 #endif /* RX_ENABLE_TSFPQ */
477 /* this one is kind of awful.
478 * In rxkad, the packet has already been shortened and made ready for
479 * sending. All of a sudden, we discover we need some of that space back.
480 * This isn't terribly general, because it knows that the packets are only
481 * rounded up to the EBS (userdata + security header).
484 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
488 if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
489 if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
490 p->wirevec[i].iov_len += nb;
494 if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
495 p->wirevec[i].iov_len += nb;
503 /* get sufficient space to store nb bytes of data (or more), and hook
504 * it into the supplied packet. Returns 0 or less if successful, otherwise
505 * returns the number of bytes (>0) which it failed to come up with.
506 * Don't need to worry about locking on the packet, since only
507 * one thread can manipulate one at a time. Locking on continuation
508 * packets is handled by AllocPacketBufs */
509 /* MTUXXX don't need to go through the for loop if we can trust niovecs */
511 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
514 struct opr_queue q, *cursor, *store;
516 /* compute the number of cbuf's we need */
517 nv = nb / RX_CBUFFERSIZE;
518 if ((nv * RX_CBUFFERSIZE) < nb)
520 if ((nv + p->niovecs) > RX_MAXWVECS)
521 nv = RX_MAXWVECS - p->niovecs;
525 /* allocate buffers */
527 nv = AllocPacketBufs(class, nv, &q);
529 /* setup packet iovs */
531 for (opr_queue_ScanSafe(&q, cursor, store)) {
533 = opr_queue_Entry(cursor, struct rx_packet, entry);
535 opr_queue_Remove(&cb->entry);
536 p->wirevec[i].iov_base = (caddr_t) cb->localdata;
537 p->wirevec[i].iov_len = RX_CBUFFERSIZE;
541 nb -= (nv * RX_CBUFFERSIZE);
542 p->length += (nv * RX_CBUFFERSIZE);
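/*
 * Worked example (sizes symbolic): for nb bytes the code above computes
 *
 *   nv = nb / RX_CBUFFERSIZE, rounded up, then
 *   nv = MIN(nv, RX_MAXWVECS - p->niovecs)
 *
 * so nb == 2 * RX_CBUFFERSIZE + 1 asks for nv == 3 continuation buffers,
 * and p->length grows by nv * RX_CBUFFERSIZE. Any shortfall (nb still
 * positive here) is returned to the caller as the byte count we failed
 * to come up with.
 */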
549 * Register allocated packets.
551 * @param[in] addr array of packets
552 * @param[in] npkt number of packets
557 registerPackets(struct rx_packet *addr, afs_uint32 npkt)
559 struct rx_mallocedPacket *mp;
561 mp = osi_Alloc(sizeof(*mp));
563 osi_Assert(mp != NULL);
564 memset(mp, 0, sizeof(*mp));
567 mp->size = npkt * sizeof(struct rx_packet);
568 osi_Assert(npkt <= MAX_AFS_UINT32 / sizeof(struct rx_packet));
570 MUTEX_ENTER(&rx_mallocedPktQ_lock);
571 opr_queue_Append(&rx_mallocedPacketQueue, &mp->entry);
572 MUTEX_EXIT(&rx_mallocedPktQ_lock);
575 /* Add more packet buffers */
576 #ifdef RX_ENABLE_TSFPQ
578 rxi_MorePackets(int apackets)
580 struct rx_packet *p, *e;
581 struct rx_ts_info_t * rx_ts_info;
585 getme = apackets * sizeof(struct rx_packet);
586 p = osi_Alloc(getme);
588 registerPackets(p, apackets);
590 PIN(p, getme); /* XXXXX */
592 RX_TS_INFO_GET(rx_ts_info);
594 RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
595 /* TSFPQ patch also needs to keep track of total packets */
597 MUTEX_ENTER(&rx_packets_mutex);
598 rx_nPackets += apackets;
599 RX_TS_FPQ_COMPUTE_LIMITS;
600 MUTEX_EXIT(&rx_packets_mutex);
602 for (e = p + apackets; p < e; p++) {
603 RX_PACKET_IOV_INIT(p);
606 RX_TS_FPQ_CHECKIN(rx_ts_info,p);
609 MUTEX_ENTER(&rx_freePktQ_lock);
610 #ifdef RXDEBUG_PACKET
611 p->packetId = rx_packet_id++;
612 p->allNextp = rx_mallocedP;
613 #endif /* RXDEBUG_PACKET */
615 MUTEX_EXIT(&rx_freePktQ_lock);
618 rx_ts_info->_FPQ.delta += apackets;
620 if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
622 MUTEX_ENTER(&rx_freePktQ_lock);
624 RX_TS_FPQ_LTOG(rx_ts_info);
625 rxi_NeedMorePackets = FALSE;
628 MUTEX_EXIT(&rx_freePktQ_lock);
632 #else /* RX_ENABLE_TSFPQ */
634 rxi_MorePackets(int apackets)
636 struct rx_packet *p, *e;
640 getme = apackets * sizeof(struct rx_packet);
641 p = osi_Alloc(getme);
643 registerPackets(p, apackets);
645 PIN(p, getme); /* XXXXX */
648 MUTEX_ENTER(&rx_freePktQ_lock);
650 for (e = p + apackets; p < e; p++) {
651 RX_PACKET_IOV_INIT(p);
652 #ifdef RX_TRACK_PACKETS
653 p->flags |= RX_PKTFLAG_FREE;
657 opr_queue_Append(&rx_freePacketQueue, &p->entry);
658 #ifdef RXDEBUG_PACKET
659 p->packetId = rx_packet_id++;
660 p->allNextp = rx_mallocedP;
661 #endif /* RXDEBUG_PACKET */
665 rx_nPackets += apackets;
666 rx_nFreePackets += apackets;
667 rxi_NeedMorePackets = FALSE;
670 MUTEX_EXIT(&rx_freePktQ_lock);
673 #endif /* RX_ENABLE_TSFPQ */
675 #ifdef RX_ENABLE_TSFPQ
677 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
679 struct rx_packet *p, *e;
680 struct rx_ts_info_t * rx_ts_info;
684 getme = apackets * sizeof(struct rx_packet);
685 p = osi_Alloc(getme);
686 registerPackets(p, apackets);
688 PIN(p, getme); /* XXXXX */
690 RX_TS_INFO_GET(rx_ts_info);
692 RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
693 /* TSFPQ patch also needs to keep track of total packets */
694 MUTEX_ENTER(&rx_packets_mutex);
695 rx_nPackets += apackets;
696 RX_TS_FPQ_COMPUTE_LIMITS;
697 MUTEX_EXIT(&rx_packets_mutex);
699 for (e = p + apackets; p < e; p++) {
700 RX_PACKET_IOV_INIT(p);
702 RX_TS_FPQ_CHECKIN(rx_ts_info,p);
705 MUTEX_ENTER(&rx_freePktQ_lock);
706 #ifdef RXDEBUG_PACKET
707 p->packetId = rx_packet_id++;
708 p->allNextp = rx_mallocedP;
709 #endif /* RXDEBUG_PACKET */
711 MUTEX_EXIT(&rx_freePktQ_lock);
714 rx_ts_info->_FPQ.delta += apackets;
717 (num_keep_local < apackets)) {
719 MUTEX_ENTER(&rx_freePktQ_lock);
721 RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
722 rxi_NeedMorePackets = FALSE;
725 MUTEX_EXIT(&rx_freePktQ_lock);
729 #endif /* RX_ENABLE_TSFPQ */
732 /* Add more packet buffers */
734 rxi_MorePacketsNoLock(int apackets)
736 #ifdef RX_ENABLE_TSFPQ
737 struct rx_ts_info_t * rx_ts_info;
738 #endif /* RX_ENABLE_TSFPQ */
739 struct rx_packet *p, *e;
742 /* allocate enough packets that 1/4 of the packets will be able
743 * to hold maximal amounts of data */
744 apackets += (apackets / 4)
745 * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
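/*
 * Worked example (with a made-up ratio): if
 * (rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE == 5,
 * a request for 100 packets is inflated to 100 + (100 / 4) * 5 == 225
 * allocations, so roughly a quarter of the requested packets can be
 * fully provisioned with continuation buffers.
 */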
747 getme = apackets * sizeof(struct rx_packet);
748 p = osi_Alloc(getme);
750 apackets -= apackets / 4;
751 osi_Assert(apackets > 0);
755 registerPackets(p, apackets);
757 #ifdef RX_ENABLE_TSFPQ
758 RX_TS_INFO_GET(rx_ts_info);
759 RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
760 #endif /* RX_ENABLE_TSFPQ */
762 for (e = p + apackets; p < e; p++) {
763 RX_PACKET_IOV_INIT(p);
764 #ifdef RX_TRACK_PACKETS
765 p->flags |= RX_PKTFLAG_FREE;
769 opr_queue_Append(&rx_freePacketQueue, &p->entry);
770 #ifdef RXDEBUG_PACKET
771 p->packetId = rx_packet_id++;
772 p->allNextp = rx_mallocedP;
773 #endif /* RXDEBUG_PACKET */
777 rx_nFreePackets += apackets;
778 MUTEX_ENTER(&rx_packets_mutex);
779 rx_nPackets += apackets;
780 #ifdef RX_ENABLE_TSFPQ
781 RX_TS_FPQ_COMPUTE_LIMITS;
782 #endif /* RX_ENABLE_TSFPQ */
783 MUTEX_EXIT(&rx_packets_mutex);
784 rxi_NeedMorePackets = FALSE;
790 rxi_FreeAllPackets(void)
792 struct rx_mallocedPacket *mp;
794 MUTEX_ENTER(&rx_mallocedPktQ_lock);
796 while (!opr_queue_IsEmpty(&rx_mallocedPacketQueue)) {
797 mp = opr_queue_First(&rx_mallocedPacketQueue,
798 struct rx_mallocedPacket, entry);
799 opr_queue_Remove(&mp->entry);
800 osi_Free(mp->addr, mp->size);
801 UNPIN(mp->addr, mp->size);
802 osi_Free(mp, sizeof(*mp));
804 MUTEX_EXIT(&rx_mallocedPktQ_lock);
807 #ifdef RX_ENABLE_TSFPQ
809 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
811 struct rx_ts_info_t * rx_ts_info;
815 RX_TS_INFO_GET(rx_ts_info);
817 if (num_keep_local != rx_ts_info->_FPQ.len) {
819 MUTEX_ENTER(&rx_freePktQ_lock);
820 if (num_keep_local < rx_ts_info->_FPQ.len) {
821 xfer = rx_ts_info->_FPQ.len - num_keep_local;
822 RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
825 xfer = num_keep_local - rx_ts_info->_FPQ.len;
826 if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
827 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
828 if (rx_nFreePackets < xfer) {
829 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
831 RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
833 MUTEX_EXIT(&rx_freePktQ_lock);
839 rxi_FlushLocalPacketsTSFPQ(void)
841 rxi_AdjustLocalPacketsTSFPQ(0, 0);
843 #endif /* RX_ENABLE_TSFPQ */
845 /* Allocate more packets iff we need more continuation buffers */
846 /* In kernel, can't page in memory with interrupts disabled, so we
847 * don't use the event mechanism. */
849 rx_CheckPackets(void)
851 if (rxi_NeedMorePackets) {
852 rxi_MorePackets(rx_maxSendWindow);
856 /* In the packet freeing routine below, the assumption is that
857 we want all of the packets to be used equally frequently, so that we
858 don't get packet buffers paging out. It would be just as valid to
859 assume that we DO want them to page out if not many are being used.
860 In any event, we assume the former, and append the packets to the end
862 /* This explanation is bogus. The free list doesn't remain in any kind of
863 useful order for long: the packets in use get pretty much randomly scattered
864 across all the pages. In order to permit unused {packets,bufs} to page out, they
865 must be stored so that packets which are adjacent in memory are adjacent in the
866 free list. An array springs rapidly to mind.
869 /* Actually free the packet p. */
870 #ifndef RX_ENABLE_TSFPQ
872 rxi_FreePacketNoLock(struct rx_packet *p)
874 dpf(("Free %"AFS_PTR_FMT"\n", p));
878 opr_queue_Append(&rx_freePacketQueue, &p->entry);
880 #endif /* RX_ENABLE_TSFPQ */
882 #ifdef RX_ENABLE_TSFPQ
884 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
886 struct rx_ts_info_t * rx_ts_info;
887 dpf(("Free %"AFS_PTR_FMT"\n", p));
889 RX_TS_INFO_GET(rx_ts_info);
890 RX_TS_FPQ_CHECKIN(rx_ts_info,p);
892 if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
894 MUTEX_ENTER(&rx_freePktQ_lock);
896 RX_TS_FPQ_LTOG(rx_ts_info);
898 /* Wakeup anyone waiting for packets */
901 MUTEX_EXIT(&rx_freePktQ_lock);
905 #endif /* RX_ENABLE_TSFPQ */
908 * free continuation buffers off a packet into a queue
910 * [IN] p -- packet from which continuation buffers will be freed
911 * [IN] first -- iovec offset of first continuation buffer to free
912 * [IN] q -- queue into which continuation buffers will be chained
915 * number of continuation buffers freed
917 #ifndef RX_ENABLE_TSFPQ
919 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct opr_queue * q)
922 struct rx_packet * cb;
925 for (first = MAX(2, first); first < p->niovecs; first++, count++) {
926 iov = &p->wirevec[first];
928 osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
929 cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
930 RX_FPQ_MARK_FREE(cb);
931 opr_queue_Append(q, &cb->entry);
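/*
 * Note (illustrative): continuation buffers are themselves the localdata
 * areas of other rx_packet structures, so RX_CBUF_TO_PACKET above maps
 * an iov_base back to the rx_packet that owns it before that packet is
 * chained onto the caller's queue.
 */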
940 * free packet continuation buffers into the global free packet pool
942 * [IN] p -- packet from which to free continuation buffers
943 * [IN] first -- iovec offset of first continuation buffer to free
949 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
953 for (first = MAX(2, first); first < p->niovecs; first++) {
954 iov = &p->wirevec[first];
956 osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
957 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
968 * free packet continuation buffers into the thread-local free pool
970 * [IN] p -- packet from which continuation buffers will be freed
971 * [IN] first -- iovec offset of first continuation buffer to free
972 * any value less than 2, the min number of iovecs,
973 * is treated as if it is 2.
974 * [IN] flush_global -- if nonzero, we will flush overquota packets to the
975 * global free pool before returning
981 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
984 struct rx_ts_info_t * rx_ts_info;
986 RX_TS_INFO_GET(rx_ts_info);
988 for (first = MAX(2, first); first < p->niovecs; first++) {
989 iov = &p->wirevec[first];
991 osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
992 RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
997 if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
999 MUTEX_ENTER(&rx_freePktQ_lock);
1001 RX_TS_FPQ_LTOG(rx_ts_info);
1003 /* Wakeup anyone waiting for packets */
1004 rxi_PacketsUnWait();
1006 MUTEX_EXIT(&rx_freePktQ_lock);
1011 #endif /* RX_ENABLE_TSFPQ */
1013 int rxi_nBadIovecs = 0;
1015 /* rxi_RestoreDataBufs
1017 * Restore the correct sizes to the iovecs. Called when reusing a packet
1018 * for reading off the wire.
1021 rxi_RestoreDataBufs(struct rx_packet *p)
1026 RX_PACKET_IOV_INIT(p);
1028 for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
1029 if (!iov->iov_base) {
1034 iov->iov_len = RX_CBUFFERSIZE;
1038 #ifdef RX_ENABLE_TSFPQ
1040 rxi_TrimDataBufs(struct rx_packet *p, int first)
1043 struct iovec *iov, *end;
1044 struct rx_ts_info_t * rx_ts_info;
1048 osi_Panic("TrimDataBufs 1: first must be 1");
1050 /* Skip over continuation buffers containing message data */
1051 iov = &p->wirevec[2];
1052 end = iov + (p->niovecs - 2);
1053 length = p->length - p->wirevec[1].iov_len;
1054 for (; iov < end && length > 0; iov++) {
1056 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1057 length -= iov->iov_len;
1060 /* iov now points to the first empty data buffer. */
1064 RX_TS_INFO_GET(rx_ts_info);
1065 for (; iov < end; iov++) {
1067 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1068 RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1071 if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1073 MUTEX_ENTER(&rx_freePktQ_lock);
1075 RX_TS_FPQ_LTOG(rx_ts_info);
1076 rxi_PacketsUnWait();
1078 MUTEX_EXIT(&rx_freePktQ_lock);
1084 #else /* RX_ENABLE_TSFPQ */
1086 rxi_TrimDataBufs(struct rx_packet *p, int first)
1089 struct iovec *iov, *end;
1093 osi_Panic("TrimDataBufs 1: first must be 1");
1095 /* Skip over continuation buffers containing message data */
1096 iov = &p->wirevec[2];
1097 end = iov + (p->niovecs - 2);
1098 length = p->length - p->wirevec[1].iov_len;
1099 for (; iov < end && length > 0; iov++) {
1101 osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1102 length -= iov->iov_len;
1105 /* iov now points to the first empty data buffer. */
1110 MUTEX_ENTER(&rx_freePktQ_lock);
1112 for (; iov < end; iov++) {
1114 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1115 rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1118 rxi_PacketsUnWait();
1120 MUTEX_EXIT(&rx_freePktQ_lock);
1125 #endif /* RX_ENABLE_TSFPQ */
1127 /* Free the packet p. P is assumed not to be on any queue, i.e.
1128 * remove it yourself first if you call this routine. */
1129 #ifdef RX_ENABLE_TSFPQ
1131 rxi_FreePacket(struct rx_packet *p)
1133 rxi_FreeDataBufsTSFPQ(p, 2, 0);
1134 rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1136 #else /* RX_ENABLE_TSFPQ */
1138 rxi_FreePacket(struct rx_packet *p)
1143 MUTEX_ENTER(&rx_freePktQ_lock);
1145 rxi_FreeDataBufsNoLock(p, 2);
1146 rxi_FreePacketNoLock(p);
1147 /* Wakeup anyone waiting for packets */
1148 rxi_PacketsUnWait();
1150 MUTEX_EXIT(&rx_freePktQ_lock);
1153 #endif /* RX_ENABLE_TSFPQ */
1155 /* rxi_AllocPacket sets up p->length so it reflects the number of
1156 * bytes in the packet at this point, **not including** the header.
1157 * The header is absolutely necessary; besides, this is the way the
1158 * length field is usually used */
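/*
 * Illustrative consequence of this convention: p->length counts only the
 * data area, so the on-wire size of a packet is always computed as
 * p->length + RX_HEADER_SIZE (see the osi_NetSend calls later in this
 * file).
 */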
1159 #ifdef RX_ENABLE_TSFPQ
1160 static struct rx_packet *
1161 rxi_AllocPacketNoLock(int class)
1163 struct rx_packet *p;
1164 struct rx_ts_info_t * rx_ts_info;
1166 RX_TS_INFO_GET(rx_ts_info);
1168 if (rx_stats_active)
1169 rx_atomic_inc(&rx_stats.packetRequests);
1170 if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1173 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1174 osi_Panic("rxi_AllocPacket error");
1176 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1177 rxi_MorePacketsNoLock(rx_maxSendWindow);
1181 RX_TS_FPQ_GTOL(rx_ts_info);
1184 RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1186 dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1189 /* have to do this here because rx_FlushWrite fiddles with the iovs in
1190 * order to truncate outbound packets. In the near future, may need
1191 * to allocate bufs from a static pool here, and/or in AllocSendPacket
1193 RX_PACKET_IOV_FULLINIT(p);
1196 #else /* RX_ENABLE_TSFPQ */
1197 static struct rx_packet *
1198 rxi_AllocPacketNoLock(int class)
1200 struct rx_packet *p;
1203 if (rxi_OverQuota(class)) {
1204 rxi_NeedMorePackets = TRUE;
1205 if (rx_stats_active) {
1207 case RX_PACKET_CLASS_RECEIVE:
1208 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1210 case RX_PACKET_CLASS_SEND:
1211 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1213 case RX_PACKET_CLASS_SPECIAL:
1214 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1216 case RX_PACKET_CLASS_RECV_CBUF:
1217 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1219 case RX_PACKET_CLASS_SEND_CBUF:
1220 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1224 return (struct rx_packet *)0;
1228 if (rx_stats_active)
1229 rx_atomic_inc(&rx_stats.packetRequests);
1232 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1233 osi_Panic("rxi_AllocPacket error");
1235 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1236 rxi_MorePacketsNoLock(rx_maxSendWindow);
1240 p = opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
1241 opr_queue_Remove(&p->entry);
1242 RX_FPQ_MARK_USED(p);
1244 dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1247 /* have to do this here because rx_FlushWrite fiddles with the iovs in
1248 * order to truncate outbound packets. In the near future, may need
1249 * to allocate bufs from a static pool here, and/or in AllocSendPacket
1251 RX_PACKET_IOV_FULLINIT(p);
1254 #endif /* RX_ENABLE_TSFPQ */
1256 #ifdef RX_ENABLE_TSFPQ
1257 static struct rx_packet *
1258 rxi_AllocPacketTSFPQ(int class, int pull_global)
1260 struct rx_packet *p;
1261 struct rx_ts_info_t * rx_ts_info;
1263 RX_TS_INFO_GET(rx_ts_info);
1265 if (rx_stats_active)
1266 rx_atomic_inc(&rx_stats.packetRequests);
1267 if (pull_global && opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1268 MUTEX_ENTER(&rx_freePktQ_lock);
1270 if (opr_queue_IsEmpty(&rx_freePacketQueue))
1271 rxi_MorePacketsNoLock(rx_maxSendWindow);
1273 RX_TS_FPQ_GTOL(rx_ts_info);
1275 MUTEX_EXIT(&rx_freePktQ_lock);
1276 } else if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1280 RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1282 dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1284 /* have to do this here because rx_FlushWrite fiddles with the iovs in
1285 * order to truncate outbound packets. In the near future, may need
1286 * to allocate bufs from a static pool here, and/or in AllocSendPacket
1288 RX_PACKET_IOV_FULLINIT(p);
1291 #endif /* RX_ENABLE_TSFPQ */
1293 #ifdef RX_ENABLE_TSFPQ
1295 rxi_AllocPacket(int class)
1297 struct rx_packet *p;
1299 p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1302 #else /* RX_ENABLE_TSFPQ */
1304 rxi_AllocPacket(int class)
1306 struct rx_packet *p;
1308 MUTEX_ENTER(&rx_freePktQ_lock);
1309 p = rxi_AllocPacketNoLock(class);
1310 MUTEX_EXIT(&rx_freePktQ_lock);
1313 #endif /* RX_ENABLE_TSFPQ */
1315 /* This guy comes up with as many buffers as it {takes,can get} given
1316 * the MTU for this call. It also sets the packet length before
1317 * returning. Caution: this is often called at NETPRI
1318 * Called with call locked.
1321 rxi_AllocSendPacket(struct rx_call *call, int want)
1323 struct rx_packet *p = (struct rx_packet *)0;
1328 mud = call->MTU - RX_HEADER_SIZE;
1330 rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1331 rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1333 #ifdef RX_ENABLE_TSFPQ
1334 if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1336 want = MIN(want, mud);
1338 if ((unsigned)want > p->length)
1339 (void)rxi_AllocDataBuf(p, (want - p->length),
1340 RX_PACKET_CLASS_SEND_CBUF);
1342 if (p->length > mud)
1345 if (delta >= p->length) {
1353 #endif /* RX_ENABLE_TSFPQ */
1355 while (!(call->error)) {
1356 MUTEX_ENTER(&rx_freePktQ_lock);
1357 /* if an error occurred, or we get the packet we want, we're done */
1358 if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1359 MUTEX_EXIT(&rx_freePktQ_lock);
1362 want = MIN(want, mud);
1364 if ((unsigned)want > p->length)
1365 (void)rxi_AllocDataBuf(p, (want - p->length),
1366 RX_PACKET_CLASS_SEND_CBUF);
1368 if (p->length > mud)
1371 if (delta >= p->length) {
1380 /* no error occurred, and we didn't get a packet, so we sleep.
1381 * At this point, we assume that packets will be returned
1382 * sooner or later, as packets are acknowledged, and so we
1385 call->flags |= RX_CALL_WAIT_PACKETS;
1386 CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1387 MUTEX_EXIT(&call->lock);
1388 rx_waitingForPackets = 1;
1390 #ifdef RX_ENABLE_LOCKS
1391 CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1393 osi_rxSleep(&rx_waitingForPackets);
1395 MUTEX_EXIT(&rx_freePktQ_lock);
1396 MUTEX_ENTER(&call->lock);
1397 CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1398 call->flags &= ~RX_CALL_WAIT_PACKETS;
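/*
 * Usage sketch (illustrative; the surrounding fragment omits the return
 * path): a writer needing room for nbytes of user data does roughly
 *
 *   struct rx_packet *p = rxi_AllocSendPacket(call, nbytes);
 *
 * and gets back a packet whose length is capped by the call's MTU budget
 * (mud above). If the free pool is empty, the call sleeps here until
 * packets are returned, unless call->error is already set.
 */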
1407 /* Windows does not use file descriptors. */
1408 #define CountFDs(amax) 0
1410 /* count the number of used FDs */
1419 for (i = 0; i < amax; i++) {
1420 code = fstat(i, &tstat);
1426 #endif /* AFS_NT40_ENV */
1429 #define CountFDs(amax) amax
1433 #if !defined(KERNEL) || defined(UKERNEL)
1435 /* This function reads a single packet from the interface into the
1436 * supplied packet buffer (*p). Return 0 if the packet is bogus. The
1437 * (host,port) of the sender are stored in the supplied variables, and
1438 * the data length of the packet is stored in the packet structure.
1439 * The header is decoded. */
1441 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1444 struct sockaddr_in from;
1447 afs_uint32 tlen, savelen;
1449 rx_computelen(p, tlen);
1450 rx_SetDataSize(p, tlen); /* this is the size of the user data area */
1452 tlen += RX_HEADER_SIZE; /* now this is the size of the entire packet */
1453 rlen = rx_maxJumboRecvSize; /* this is what I am advertising. Only check
1454 * it once in order to avoid races. */
1457 tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1465 /* Extend the last iovec for padding; it's just to make sure that the
1466 * read doesn't return more data than we expect, and is done to get around
1467 * our problems caused by the lack of a length field in the rx header.
1468 * Use the extra buffer that follows the localdata in each packet
1470 savelen = p->wirevec[p->niovecs - 1].iov_len;
1471 p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1473 memset(&msg, 0, sizeof(msg));
1474 msg.msg_name = (char *)&from;
1475 msg.msg_namelen = sizeof(struct sockaddr_in);
1476 msg.msg_iov = p->wirevec;
1477 msg.msg_iovlen = p->niovecs;
1478 nbytes = rxi_Recvmsg(socket, &msg, 0);
1480 /* restore the vec to its correct state */
1481 p->wirevec[p->niovecs - 1].iov_len = savelen;
1483 p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1484 if (nbytes < 0 || (nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
1485 if (nbytes < 0 && errno == EWOULDBLOCK) {
1486 if (rx_stats_active)
1487 rx_atomic_inc(&rx_stats.noPacketOnRead);
1488 } else if (nbytes <= 0) {
1489 if (rx_stats_active) {
1490 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1491 rx_stats.bogusHost = from.sin_addr.s_addr;
1493 dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1494 ntohs(from.sin_port), nbytes));
1499 else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1500 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1501 rxi_DecodePacketHeader(p);
1503 *host = from.sin_addr.s_addr;
1504 *port = from.sin_port;
1506 dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1507 p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1508 p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1510 #ifdef RX_TRIMDATABUFS
1511 rxi_TrimDataBufs(p, 1);
1517 /* Extract packet header. */
1518 rxi_DecodePacketHeader(p);
1520 *host = from.sin_addr.s_addr;
1521 *port = from.sin_port;
1523 && p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1525 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1528 #ifdef RX_TRIMDATABUFS
1529 /* Free any empty packet buffers at the end of this packet */
1530 rxi_TrimDataBufs(p, 1);
1536 #endif /* !KERNEL || UKERNEL */
1538 /* This function splits off the first packet in a jumbo packet.
1539 * As of AFS 3.5, jumbograms contain more than one fixed size
1540 * packet, and the RX_JUMBO_PACKET flag is set in all but the
1541 * last packet header. All packets (except the last) are padded to
1542 * fall on RX_CBUFFERSIZE boundaries.
1543 * HACK: We store the length of the first n-1 packets in the
1544 * last two pad bytes. */
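/*
 * Illustrative wire layout of a jumbogram (sizes symbolic, derived from
 * the splitting code below):
 *
 *   +----------------+--------------------+--------------+-----------+
 *   | rx_header      | RX_JUMBOBUFFERSIZE | 4-byte jumbo | remaining |
 *   | RX_HEADER_SIZE | bytes of data      | header       | data      |
 *   +----------------+--------------------+--------------+-----------+
 *
 * The abbreviated jumbo header carries the next packet's flags in its
 * high byte and the spare/cksum field in the low 16 bits; every header
 * except the last packet's has RX_JUMBO_PACKET set.
 */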
1547 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1550 struct rx_packet *np;
1551 struct rx_jumboHeader *jp;
1557 /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1558 * bytes in length. All but the first packet are preceded by
1559 * an abbreviated four byte header. The length of the last packet
1560 * is calculated from the size of the jumbogram. */
1561 length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1563 if ((int)p->length < length) {
1564 dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1567 niov = p->niovecs - 2;
1569 dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1572 iov = &p->wirevec[2];
1573 np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1575 /* Get a pointer to the abbreviated packet header */
1576 jp = (struct rx_jumboHeader *)
1577 ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1579 /* Set up the iovecs for the next packet */
1580 np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1581 np->wirevec[0].iov_len = sizeof(struct rx_header);
1582 np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1583 np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1584 np->niovecs = niov + 1;
1585 for (i = 2, iov++; i <= niov; i++, iov++) {
1586 np->wirevec[i] = *iov;
1588 np->length = p->length - length;
1589 p->length = RX_JUMBOBUFFERSIZE;
1592 /* Convert the jumbo packet header to host byte order */
1593 temp = ntohl(*(afs_uint32 *) jp);
1594 jp->flags = (u_char) (temp >> 24);
1595 jp->cksum = (u_short) (temp);
1597 /* Fill in the packet header */
1598 np->header = p->header;
1599 np->header.serial = p->header.serial + 1;
1600 np->header.seq = p->header.seq + 1;
1601 np->header.userStatus = 0;
1602 np->header.flags = jp->flags;
1603 np->header.spare = jp->cksum;
1609 /* Send a udp datagram */
1611 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1612 int length, int istack)
1617 memset(&msg, 0, sizeof(msg));
1619 msg.msg_iovlen = nvecs;
1620 msg.msg_name = addr;
1621 msg.msg_namelen = sizeof(struct sockaddr_in);
1623 ret = rxi_Sendmsg(socket, &msg, 0);
1627 #elif !defined(UKERNEL)
1629 * message receipt is done in rxk_input or rx_put.
1632 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1634 * Copy an mblock to the contiguous area pointed to by cp.
1635 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1636 * but it doesn't really.
1637 * Returns the number of bytes not transferred.
1638 * The message is NOT changed.
1641 cpytoc(mblk_t * mp, int off, int len, char *cp)
1645 for (; mp && len > 0; mp = mp->b_cont) {
1646 if (mp->b_datap->db_type != M_DATA) {
1649 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1650 memcpy(cp, (char *)mp->b_rptr, n);
1658 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1659 * but it doesn't really.
1660 * This sucks, anyway, do it like m_cpy.... below
1663 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1668 for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1669 if (mp->b_datap->db_type != M_DATA) {
1672 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1678 t = iovs[i].iov_len;
1681 memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1691 #define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
1692 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1694 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1696 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1699 unsigned int l1, l2, i, t;
1701 if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1702 osi_Panic("m_cpytoiovec"); /* MTUXXX probably don't need this check */
1705 if (m->m_len <= off) {
1715 p1 = mtod(m, caddr_t) + off;
1716 l1 = m->m_len - off;
1718 p2 = iovs[0].iov_base;
1719 l2 = iovs[0].iov_len;
1722 t = MIN(l1, MIN(l2, (unsigned int)len));
1733 p1 = mtod(m, caddr_t);
1739 p2 = iovs[i].iov_base;
1740 l2 = iovs[i].iov_len;
1748 #endif /* AFS_SUN5_ENV */
1750 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1751 #if defined(AFS_NBSD_ENV)
1753 rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
1756 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1757 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1763 struct rx_packet *phandle;
1764 int hdr_len, data_len;
1765 #endif /* AFS_NBSD_ENV */
1770 m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1777 #endif /*KERNEL && !UKERNEL */
1780 /* send a response to a debug packet */
1783 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1784 afs_uint32 ahost, short aport, int istack)
1786 struct rx_debugIn tin;
1790 * Only respond to client-initiated Rx debug packets,
1791 * and clear the client flag in the response.
1793 if (ap->header.flags & RX_CLIENT_INITIATED) {
1794 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1795 rxi_EncodePacketHeader(ap);
1800 rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1801 /* all done with packet, now set length to the truth, so we can
1802 * reuse this packet */
1803 rx_computelen(ap, ap->length);
1805 tin.type = ntohl(tin.type);
1806 tin.index = ntohl(tin.index);
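/*
 * Request format sketch (illustrative): a debugging client such as
 * rxdebug sends a struct rx_debugIn with both fields in network order,
 * e.g.
 *
 *   struct rx_debugIn tin;
 *   tin.type = htonl(RX_DEBUGI_GETSTATS);
 *   tin.index = htonl(0);
 *
 * and the cases below dispatch on the decoded type.
 */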
1808 case RX_DEBUGI_GETSTATS:{
1809 struct rx_debugStats tstat;
1811 /* get basic stats */
1812 memset(&tstat, 0, sizeof(tstat)); /* make sure spares are zero */
1813 tstat.version = RX_DEBUGI_VERSION;
1814 #ifndef RX_ENABLE_LOCKS
1815 tstat.waitingForPackets = rx_waitingForPackets;
1817 MUTEX_ENTER(&rx_serverPool_lock);
1818 tstat.nFreePackets = htonl(rx_nFreePackets);
1819 tstat.nPackets = htonl(rx_nPackets);
1820 tstat.callsExecuted = htonl(rxi_nCalls);
1821 tstat.packetReclaims = htonl(rx_packetReclaims);
1822 tstat.usedFDs = CountFDs(64);
1823 tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1824 tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1825 tstat.idleThreads = opr_queue_Count(&rx_idleServerQueue);
1826 MUTEX_EXIT(&rx_serverPool_lock);
1827 tstat.idleThreads = htonl(tstat.idleThreads);
1828 tl = sizeof(struct rx_debugStats) - ap->length;
1830 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1833 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1835 ap->length = sizeof(struct rx_debugStats);
1836 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1837 rx_computelen(ap, ap->length);
1842 case RX_DEBUGI_GETALLCONN:
1843 case RX_DEBUGI_GETCONN:{
1845 struct rx_connection *tc;
1846 struct rx_call *tcall;
1847 struct rx_debugConn tconn;
1848 int all = (tin.type == RX_DEBUGI_GETALLCONN);
1851 tl = sizeof(struct rx_debugConn) - ap->length;
1853 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1857 memset(&tconn, 0, sizeof(tconn)); /* make sure spares are zero */
1858 /* get N'th (maybe) "interesting" connection info */
1859 for (i = 0; i < rx_hashTableSize; i++) {
1860 #if !defined(KERNEL)
1861 /* the time complexity of the algorithm used here
1862 * exponentially increases with the number of connections.
1864 #ifdef AFS_PTHREAD_ENV
1870 MUTEX_ENTER(&rx_connHashTable_lock);
1871 /* We might be slightly out of step since we are not
1872 * locking each call, but this is only debugging output.
1874 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1875 if ((all || rxi_IsConnInteresting(tc))
1876 && tin.index-- <= 0) {
1877 int do_secstats = 0;
1878 tconn.host = tc->peer->host;
1879 tconn.port = tc->peer->port;
1880 tconn.cid = htonl(tc->cid);
1881 tconn.epoch = htonl(tc->epoch);
1882 tconn.serial = htonl(tc->serial);
1883 for (j = 0; j < RX_MAXCALLS; j++) {
1884 tconn.callNumber[j] = htonl(tc->callNumber[j]);
1885 if ((tcall = tc->call[j])) {
1886 tconn.callState[j] = tcall->state;
1887 tconn.callMode[j] = tcall->app.mode;
1888 tconn.callFlags[j] = tcall->flags;
1889 if (!opr_queue_IsEmpty(&tcall->rq))
1890 tconn.callOther[j] |= RX_OTHER_IN;
1891 if (!opr_queue_IsEmpty(&tcall->tq))
1892 tconn.callOther[j] |= RX_OTHER_OUT;
1894 tconn.callState[j] = RX_STATE_NOTINIT;
1897 tconn.natMTU = htonl(tc->peer->natMTU);
1898 tconn.error = htonl(tc->error);
1899 tconn.flags = tc->flags;
1900 tconn.type = tc->type;
1901 tconn.securityIndex = tc->securityIndex;
1902 if (tc->securityObject) {
1904 code = RXS_GetStats(tc->securityObject, tc,
1911 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1912 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1915 DOHTONL(packetsReceived);
1916 DOHTONL(packetsSent);
1917 DOHTONL(bytesReceived);
1921 sizeof(tconn.secStats.spares) /
1926 sizeof(tconn.secStats.sparel) /
1927 sizeof(afs_int32); i++)
1930 memset(&tconn.secStats, 0, sizeof(tconn.secStats));
1933 MUTEX_EXIT(&rx_connHashTable_lock);
1934 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1937 ap->length = sizeof(struct rx_debugConn);
1938 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1944 MUTEX_EXIT(&rx_connHashTable_lock);
1946 /* if we make it here, there are no more interesting connections */
1947 tconn.cid = htonl(0xffffffff); /* means end */
1948 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1951 ap->length = sizeof(struct rx_debugConn);
1952 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1958 * Pass back all the peer structures we have available
1961 case RX_DEBUGI_GETPEER:{
1964 struct rx_debugPeer tpeer;
1967 tl = sizeof(struct rx_debugPeer) - ap->length;
1969 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1973 memset(&tpeer, 0, sizeof(tpeer));
1974 for (i = 0; i < rx_hashTableSize; i++) {
1975 #if !defined(KERNEL)
1976 /* the time complexity of the algorithm used here
1977 * exponentially increases with the number of peers.
1979 * Yielding after processing each hash table entry
1980 * and dropping rx_peerHashTable_lock
1981 * also increases the risk that we will miss a new
1982 * entry - but we are willing to live with this
1983 * limitation since this is meant for debugging only
1985 #ifdef AFS_PTHREAD_ENV
1991 MUTEX_ENTER(&rx_peerHashTable_lock);
1992 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1993 if (tin.index-- <= 0) {
1995 MUTEX_EXIT(&rx_peerHashTable_lock);
1997 MUTEX_ENTER(&tp->peer_lock);
1998 tpeer.host = tp->host;
1999 tpeer.port = tp->port;
2000 tpeer.ifMTU = htons(tp->ifMTU);
2001 tpeer.idleWhen = htonl(tp->idleWhen);
2002 tpeer.refCount = htons(tp->refCount);
2003 tpeer.burstSize = 0;
2005 tpeer.burstWait.sec = 0;
2006 tpeer.burstWait.usec = 0;
2007 tpeer.rtt = htonl(tp->rtt);
2008 tpeer.rtt_dev = htonl(tp->rtt_dev);
2009 tpeer.nSent = htonl(tp->nSent);
2010 tpeer.reSends = htonl(tp->reSends);
2011 tpeer.natMTU = htons(tp->natMTU);
2012 tpeer.maxMTU = htons(tp->maxMTU);
2013 tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2014 tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2015 tpeer.MTU = htons(tp->MTU);
2016 tpeer.cwind = htons(tp->cwind);
2017 tpeer.nDgramPackets = htons(tp->nDgramPackets);
2018 tpeer.congestSeq = htons(tp->congestSeq);
2019 tpeer.bytesSent.high =
2020 htonl(tp->bytesSent >> 32);
2021 tpeer.bytesSent.low =
2022 htonl(tp->bytesSent & MAX_AFS_UINT32);
2023 tpeer.bytesReceived.high =
2024 htonl(tp->bytesReceived >> 32);
2025 tpeer.bytesReceived.low =
2026 htonl(tp->bytesReceived & MAX_AFS_UINT32);
2027 MUTEX_EXIT(&tp->peer_lock);
2029 MUTEX_ENTER(&rx_peerHashTable_lock);
2031 MUTEX_EXIT(&rx_peerHashTable_lock);
2033 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2036 ap->length = sizeof(struct rx_debugPeer);
2037 rxi_SendDebugPacket(ap, asocket, ahost, aport,
2043 MUTEX_EXIT(&rx_peerHashTable_lock);
2045 /* if we make it here, there are no more peers to report */
2046 tpeer.host = htonl(0xffffffff); /* means end */
2047 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2050 ap->length = sizeof(struct rx_debugPeer);
2051 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2056 case RX_DEBUGI_RXSTATS:{
2060 tl = sizeof(rx_stats) - ap->length;
2062 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2066 /* Since it's all int32s, convert to network order with a loop. */
2067 if (rx_stats_active)
2068 MUTEX_ENTER(&rx_stats_mutex);
2069 s = (afs_int32 *) & rx_stats;
2070 for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2071 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2074 ap->length = sizeof(rx_stats);
2075 if (rx_stats_active)
2076 MUTEX_EXIT(&rx_stats_mutex);
2077 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2083 /* error response packet */
2084 tin.type = htonl(RX_DEBUGI_BADTYPE);
2085 tin.index = tin.type;
2086 rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2088 ap->length = sizeof(struct rx_debugIn);
2089 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2097 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2098 afs_uint32 ahost, short aport, int istack)
2103 * Only respond to client-initiated version requests, and
2104 * clear that flag in the response.
2106 if (ap->header.flags & RX_CLIENT_INITIATED) {
2109 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2110 rxi_EncodePacketHeader(ap);
2111 memset(buf, 0, sizeof(buf));
2112 strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2113 rx_packetwrite(ap, 0, 65, buf);
2116 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2124 /* send a debug packet back to the sender */
2126 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2127 afs_uint32 ahost, short aport, afs_int32 istack)
2129 struct sockaddr_in taddr;
2130 unsigned int i, nbytes, savelen = 0;
2133 int waslocked = ISAFS_GLOCK();
2136 taddr.sin_family = AF_INET;
2137 taddr.sin_port = aport;
2138 taddr.sin_addr.s_addr = ahost;
2139 memset(&taddr.sin_zero, 0, sizeof(taddr.sin_zero));
2140 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2141 taddr.sin_len = sizeof(struct sockaddr_in);
2144 /* We need to trim the niovecs. */
2145 nbytes = apacket->length;
2146 for (i = 1; i < apacket->niovecs; i++) {
2147 if (nbytes <= apacket->wirevec[i].iov_len) {
2148 savelen = apacket->wirevec[i].iov_len;
2149 saven = apacket->niovecs;
2150 apacket->wirevec[i].iov_len = nbytes;
2151 apacket->niovecs = i + 1; /* so condition fails because i == niovecs */
2153 nbytes -= apacket->wirevec[i].iov_len;
2156 #ifdef RX_KERNEL_TRACE
2157 if (ICL_SETACTIVE(afs_iclSetp)) {
2160 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2161 "before osi_NetSend()");
2169 /* debug packets are not reliably delivered, hence the cast below. */
2170 (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2171 apacket->length + RX_HEADER_SIZE, istack);
2173 #ifdef RX_KERNEL_TRACE
2174 if (ICL_SETACTIVE(afs_iclSetp)) {
2176 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2177 "after osi_NetSend()");
2186 if (saven) { /* means we truncated the packet above. */
2187 apacket->wirevec[i - 1].iov_len = savelen;
2188 apacket->niovecs = saven;
2194 rxi_NetSendError(struct rx_call *call, int code)
2198 if (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) {
2201 if (code == -WSAEHOSTUNREACH) {
2204 #elif defined(AFS_LINUX20_ENV)
2205 if (code == -ENETUNREACH) {
2208 #elif defined(AFS_DARWIN_ENV)
2209 if (code == EHOSTUNREACH) {
2214 call->lastReceiveTime = 0;
2218 /* Send the packet to appropriate destination for the specified
2219 * call. The header is first encoded and placed in the packet.
2222 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2223 struct rx_packet *p, int istack)
2229 struct sockaddr_in addr;
2230 struct rx_peer *peer = conn->peer;
2233 char deliveryType = 'S';
2235 /* The address we're sending the packet to */
2236 memset(&addr, 0, sizeof(addr));
2237 addr.sin_family = AF_INET;
2238 addr.sin_port = peer->port;
2239 addr.sin_addr.s_addr = peer->host;
2240 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2242 /* This stuff should be revamped, I think, so that most, if not
2243 * all, of the header stuff is always added here. We could
2244 * probably do away with the encode/decode routines. XXXXX */
2246 /* Stamp each packet with a unique serial number. The serial
2247 * number is maintained on a connection basis because some types
2248 * of security may be based on the serial number of the packet,
2249 * and security is handled on a per authenticated-connection
2251 /* Pre-increment, to guarantee no zero serial number; a zero
2252 * serial number means the packet was never sent. */
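/* Illustrative consequence (assuming no intervening traffic on the
 * connection): if conn->serial is 10 when a packet is first sent, that
 * copy goes out with serial 11 and p->firstSerial is set to 11; a later
 * retransmission is restamped with a fresh serial (say 12). Distinct
 * serials per transmission are what let RTT samples be attributed to a
 * specific send instead of discarding retransmit samples a la Karn. */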
2253 MUTEX_ENTER(&conn->conn_data_lock);
2254 p->header.serial = ++conn->serial;
2255 if (p->length > conn->peer->maxPacketSize) {
2256 if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2257 (p->header.flags & RX_REQUEST_ACK)) {
2258 conn->lastPingSize = p->length;
2259 conn->lastPingSizeSer = p->header.serial;
2260 } else if (p->header.seq != 0) {
2261 conn->lastPacketSize = p->length;
2262 conn->lastPacketSizeSeq = p->header.seq;
2265 MUTEX_EXIT(&conn->conn_data_lock);
2266 /* This is so we can adjust retransmit time-outs better in the face of
2267 * rapidly changing round-trip times. RTO estimation is not a la Karn.
2269 if (p->firstSerial == 0) {
2270 p->firstSerial = p->header.serial;
2273 /* If an output tracer function is defined, call it with the packet and
2274 * network address. Note this function may modify its arguments. */
2275 if (rx_almostSent) {
2276 int drop = (*rx_almostSent) (p, &addr);
2277 /* drop packet if return value is non-zero? */
2279 deliveryType = 'D'; /* Drop the packet */
2283 /* Get network byte order header */
2284 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
2285 * touch ALL the fields */
2287 /* Send the packet out on the same socket that related packets are being
2291 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2294 /* Possibly drop this packet, for testing purposes */
2295 if ((deliveryType == 'D')
2296 || ((rx_intentionallyDroppedPacketsPer100 > 0)
2297 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2298 deliveryType = 'D'; /* Drop the packet */
2300 deliveryType = 'S'; /* Send the packet */
2301 #endif /* RXDEBUG */
2303 /* Loop until the packet is sent. We'd prefer just to use a
2304 * blocking socket, but unfortunately the interface doesn't
2305 * allow us to have the socket block in send mode, and not
2306 * block in receive mode */
2308 waslocked = ISAFS_GLOCK();
2309 #ifdef RX_KERNEL_TRACE
2310 if (ICL_SETACTIVE(afs_iclSetp)) {
2313 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2314 "before osi_NetSend()");
2323 osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2324 p->length + RX_HEADER_SIZE, istack)) != 0) {
2325 /* send failed, so let's hurry up the resend, eh? */
2326 if (rx_stats_active)
2327 rx_atomic_inc(&rx_stats.netSendFailures);
2328 p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2330 /* Some systems are nice and tell us right away that we cannot
2331 * reach this recipient by returning an error code.
2332 * So, when this happens let's "down" the host NOW so
2333 * we don't sit around waiting for this host to timeout later.
2336 rxi_NetSendError(call, code);
2340 #ifdef RX_KERNEL_TRACE
2341 if (ICL_SETACTIVE(afs_iclSetp)) {
2343 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2344 "after osi_NetSend()");
2355 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2356 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2357 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2358 p->header.seq, p->header.flags, p, p->length));
2360 if (rx_stats_active) {
2361 rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2362 MUTEX_ENTER(&peer->peer_lock);
2363 peer->bytesSent += p->length;
2364 MUTEX_EXIT(&peer->peer_lock);
2368 /* Send a list of packets to appropriate destination for the specified
2369 * connection. The headers are first encoded and placed in the packets.
2372 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2373 struct rx_packet **list, int len, int istack)
2375 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2378 struct sockaddr_in addr;
2379 struct rx_peer *peer = conn->peer;
2381 struct rx_packet *p = NULL;
2382 struct iovec wirevec[RX_MAXIOVECS];
2383 int i, length, code;
2386 struct rx_jumboHeader *jp;
2388 char deliveryType = 'S';
2390 /* The address we're sending the packet to */
2391 addr.sin_family = AF_INET;
2392 addr.sin_port = peer->port;
2393 addr.sin_addr.s_addr = peer->host;
2394 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2396 if (len + 1 > RX_MAXIOVECS) {
2397 osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2401 * Stamp the packets in this jumbogram with consecutive serial numbers
2403 MUTEX_ENTER(&conn->conn_data_lock);
2404 serial = conn->serial;
2405 conn->serial += len;
2406 for (i = 0; i < len; i++) {
2408 /* a ping *or* a sequenced packet can count */
2409 if (p->length > conn->peer->maxPacketSize) {
2410 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2411 (p->header.flags & RX_REQUEST_ACK)) &&
2412 ((i == 0) || (p->length >= conn->lastPingSize))) {
2413 conn->lastPingSize = p->length;
2414 conn->lastPingSizeSer = serial + i;
2415 } else if ((p->header.seq != 0) &&
2416 ((i == 0) || (p->length >= conn->lastPacketSize))) {
2417 conn->lastPacketSize = p->length;
2418 conn->lastPacketSizeSeq = p->header.seq;
2422 MUTEX_EXIT(&conn->conn_data_lock);
2425 /* This stuff should be revamped, I think, so that most, if not
2426 * all, of the header stuff is always added here. We could
2427 * probably do away with the encode/decode routines. XXXXX */
2430 length = RX_HEADER_SIZE;
2431 wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2432 wirevec[0].iov_len = RX_HEADER_SIZE;
2433 for (i = 0; i < len; i++) {
2436 /* The whole 3.5 jumbogram scheme relies on packets fitting
2437 * in a single packet buffer. */
2438 if (p->niovecs > 2) {
2439 osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2442 /* Set the RX_JUMBO_PACKET flags in all but the last packets
2445 if (p->length != RX_JUMBOBUFFERSIZE) {
2446 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2448 p->header.flags |= RX_JUMBO_PACKET;
2449 length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2450 wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2452 wirevec[i + 1].iov_len = p->length;
2453 length += p->length;
2455 wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2457 /* Convert jumbo packet header to network byte order */
2458 temp = (afs_uint32) (p->header.flags) << 24;
2459 temp |= (afs_uint32) (p->header.spare);
2460 *(afs_uint32 *) jp = htonl(temp);
2462 jp = (struct rx_jumboHeader *)
2463 ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
	/* Stamp each packet with a unique serial number.  The serial
	 * number is maintained on a connection basis because some types
	 * of security may be based on the serial number of the packet,
	 * and security is handled on a per authenticated-connection
	 * basis. */
	/* Pre-increment, to guarantee no zero serial number; a zero
	 * serial number means the packet was never sent. */
	p->header.serial = ++serial;
	/* This is so we can adjust retransmit time-outs better in the face of
	 * rapidly changing round-trip times.  RTO estimation is not a la Karn.
	 */
	if (p->firstSerial == 0) {
	    p->firstSerial = p->header.serial;
	}
#ifdef RXDEBUG
	/* If an output tracer function is defined, call it with the packet and
	 * network address.  Note this function may modify its arguments. */
	if (rx_almostSent) {
	    int drop = (*rx_almostSent) (p, &addr);
	    /* drop packet if return value is non-zero? */
	    if (drop)
		deliveryType = 'D';	/* Drop the packet */
	}
#endif

	/* Get network byte order header */
	rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
					 * touch ALL the fields */
    }
    /* Send the packet out on the same socket that related packets are being
     * sent on */
    socket =
	(conn->type ==
	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
#ifdef RXDEBUG
    /* Possibly drop this packet, for testing purposes */
    if ((deliveryType == 'D')
	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
	deliveryType = 'D';	/* Drop the packet */
    } else {
	deliveryType = 'S';	/* Send the packet */
#endif /* RXDEBUG */
	/* Loop until the packet is sent.  We'd prefer just to use a
	 * blocking socket, but unfortunately the interface doesn't
	 * allow us to have the socket block in send mode, and not
	 * block in receive mode */
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
	waslocked = ISAFS_GLOCK();
	if (!istack && waslocked)
	    AFS_GUNLOCK();
#endif
	if ((code =
	     osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
			 istack)) != 0) {
	    /* send failed, so let's hurry up the resend, eh? */
	    if (rx_stats_active)
		rx_atomic_inc(&rx_stats.netSendFailures);
	    for (i = 0; i < len; i++) {
		p = list[i];
		p->flags &= ~RX_PKTFLAG_SENT;	/* resend it very soon */
	    }
	    /* Some systems are nice and tell us right away that we cannot
	     * reach this recipient by returning an error code.
	     * So, when this happens let's "down" the host NOW so
	     * we don't sit around waiting for this host to timeout later.
	     */
	    if (call) {
		rxi_NetSendError(call, code);
	    }
	}
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
	if (!istack && waslocked)
	    AFS_GLOCK();
#endif
#ifdef RXDEBUG
    }

    osi_Assert(p != NULL);

    dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
	 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
	 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
	 p->header.seq, p->header.flags, p, p->length));
#endif
    if (rx_stats_active) {
	rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
	MUTEX_ENTER(&peer->peer_lock);
	peer->bytesSent += p->length;
	MUTEX_EXIT(&peer->peer_lock);
    }
}
/* Send a raw abort packet, without any call or connection structures */
void
rxi_SendRawAbort(osi_socket socket, afs_uint32 host, u_short port,
		 afs_uint32 serial, afs_int32 error,
		 struct rx_packet *source, int istack)
{
    struct rx_header theader;
    struct sockaddr_in addr;
    struct iovec iov[2];
    memset(&theader, 0, sizeof(theader));
    theader.epoch = htonl(source->header.epoch);
    theader.callNumber = htonl(source->header.callNumber);
    theader.serial = htonl(serial);
    theader.type = RX_PACKET_TYPE_ABORT;
    theader.serviceId = htons(source->header.serviceId);
    theader.securityIndex = source->header.securityIndex;
    theader.cid = htonl(source->header.cid);
    /*
     * If the abort is being sent in response to a server initiated packet,
     * set client_initiated in the abort to ensure it is not associated by
     * the receiver with a connection in the opposite direction.
     */
    if ((source->header.flags & RX_CLIENT_INITIATED) != RX_CLIENT_INITIATED)
	theader.flags |= RX_CLIENT_INITIATED;
    error = htonl(error);

    iov[0].iov_base = &theader;
    iov[0].iov_len = sizeof(struct rx_header);
    iov[1].iov_base = &error;
    iov[1].iov_len = sizeof(error);
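    /*
     * The abort is thus just a bare wire header followed by the 4-byte
     * error code, both already in network byte order; the two-element
     * iovec lets osi_NetSend gather the pieces without copying them into
     * a single buffer.
     */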
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = host;
    addr.sin_port = port;
    memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
    addr.sin_len = sizeof(struct sockaddr_in);
#endif

    osi_NetSend(socket, &addr, iov, 2,
		sizeof(struct rx_header) + sizeof(error), istack);
}
/* Send a "special" packet to the peer connection.  If call is
 * specified, then the packet is directed to a specific call channel
 * associated with the connection, otherwise it is directed to the
 * connection only.  Uses optionalPacket if it is supplied, rather than
 * allocating a new packet buffer.  Nbytes is the length of the data
 * portion of the packet.  If data is non-null, nbytes of data are
 * copied into the packet.  Type is the type of the packet, as defined
 * in rx.h.  Bug: there's a lot of duplication between this and other
 * routines.  This needs to be cleaned up. */
struct rx_packet *
rxi_SendSpecial(struct rx_call *call,
		struct rx_connection *conn,
		struct rx_packet *optionalPacket, int type, char *data,
		int nbytes, int istack)
{
    /* Some of the following stuff should be common code for all
     * packet sends (it's repeated elsewhere) */
    struct rx_packet *p;
    unsigned int i = 0;
    int savelen = 0, saven = 0;
    int channel, callNumber;
    if (call) {
	channel = call->channel;
	callNumber = *call->callNumber;
	/* BUSY packets refer to the next call on this connection */
	if (type == RX_PACKET_TYPE_BUSY) {
	    callNumber++;
	}
    } else {
	channel = 0;
	callNumber = 0;
    }
    if (!(p = optionalPacket)) {
	p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
	if (!p)
	    osi_Panic("rxi_SendSpecial failure");
    }
    if (nbytes != -1)
	p->length = nbytes;
    else
	nbytes = p->length;
    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;
    p->header.cid = (conn->cid | channel);
    p->header.callNumber = callNumber;
    p->header.seq = 0;
    p->header.epoch = conn->epoch;
    p->header.type = type;
    p->header.userStatus = 0;
    p->header.flags = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
	p->header.flags |= RX_CLIENT_INITIATED;
    if (data)
	rx_packetwrite(p, 0, nbytes, data);

    for (i = 1; i < p->niovecs; i++) {
	if (nbytes <= p->wirevec[i].iov_len) {
	    savelen = p->wirevec[i].iov_len;
	    saven = p->niovecs;
	    p->wirevec[i].iov_len = nbytes;
	    p->niovecs = i + 1;	/* so condition fails because i == niovecs */
	} else
	    nbytes -= p->wirevec[i].iov_len;
    }
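    /* The loop above walks the data iovecs until it reaches the one in
     * which the nbytes of payload end, then temporarily shrinks that
     * iovec (and niovecs) so only the payload is sent; savelen and saven
     * remember the original values so a caller-supplied optionalPacket
     * can be restored after the send below. */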
    if (call)
	rxi_Send(call, p, istack);
    else
	rxi_SendPacket((struct rx_call *)0, conn, p, istack);
    if (saven) {		/* means we truncated the packet above.  We probably don't */
	/* really need to do this, but it seems safer this way, given that */
	/* sneaky optionalPacket... */
	p->wirevec[i - 1].iov_len = savelen;
	p->niovecs = saven;
    }
    if (!optionalPacket)
	rxi_FreePacket(p);
    return optionalPacket;
}
/* Encode the packet's header (from the struct header in the packet to
 * the net byte order representation in the wire representation of the
 * packet, which is what is actually sent out on the wire) */
void
rxi_EncodePacketHeader(struct rx_packet *p)
{
    afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */

    memset(buf, 0, RX_HEADER_SIZE);
    *buf++ = htonl(p->header.epoch);
    *buf++ = htonl(p->header.cid);
    *buf++ = htonl(p->header.callNumber);
    *buf++ = htonl(p->header.seq);
    *buf++ = htonl(p->header.serial);
    *buf++ = htonl((((afs_uint32) p->header.type) << 24)
		   | (((afs_uint32) p->header.flags) << 16)
		   | (p->header.userStatus << 8) | p->header.securityIndex);
    /* Note: top 16 bits of this next word were reserved */
    *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
}
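/*
 * For reference (not in the original source): the encoded header is seven
 * 32-bit words, RX_HEADER_SIZE bytes in all: epoch, cid, callNumber, seq,
 * serial, then type<<24 | flags<<16 | userStatus<<8 | securityIndex, and
 * finally spare<<16 | serviceId.  With the stock definitions, a DATA
 * packet (type 1) carrying RX_LAST_PACKET (flag 0x04) and security index
 * 0 packs the sixth word as 0x01040000.
 */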
/* Decode the packet's header (from net byte order to a struct header) */
void
rxi_DecodePacketHeader(struct rx_packet *p)
{
    afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);	/* MTUXXX */
    afs_uint32 temp;
    p->header.epoch = ntohl(*buf);
    buf++;
    p->header.cid = ntohl(*buf);
    buf++;
    p->header.callNumber = ntohl(*buf);
    buf++;
    p->header.seq = ntohl(*buf);
    buf++;
    p->header.serial = ntohl(*buf);
    buf++;
    temp = ntohl(*buf);
    buf++;

    /* C will truncate byte fields to bytes for me */
    p->header.type = temp >> 24;
    p->header.flags = temp >> 16;
    p->header.userStatus = temp >> 8;
    p->header.securityIndex = temp >> 0;

    temp = ntohl(*buf);
    buf++;

    p->header.serviceId = (temp & 0xffff);
    p->header.spare = temp >> 16;
    /* Note: top 16 bits of this last word are the security checksum */
}
/*
 * LOCKS HELD: called with call->lock held.
 *
 * PrepareSendPacket is the only place in the code that
 * can increment call->tnext.  This could become an atomic
 * in the future.  Beyond that there is nothing in this
 * function that requires the call being locked.  This
 * function can only be called by the application thread.
 */
void
rxi_PrepareSendPacket(struct rx_call *call,
		      struct rx_packet *p, int last)
{
    struct rx_connection *conn = call->conn;
    afs_uint32 seq = call->tnext++;
    unsigned int i;
    afs_int32 len;		/* len must be a signed type; it can go negative */
    int code;
    /* No data packets on call 0. Where do these come from? */
    if (*call->callNumber == 0)
	*call->callNumber = 1;

    MUTEX_EXIT(&call->lock);
    p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
    p->header.cid = (conn->cid | call->channel);
    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;

    p->header.callNumber = *call->callNumber;
    p->header.seq = seq;
    p->header.epoch = conn->epoch;
    p->header.type = RX_PACKET_TYPE_DATA;
    p->header.userStatus = 0;
    p->header.flags = 0;
    p->header.spare = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
	p->header.flags |= RX_CLIENT_INITIATED;

    if (last)
	p->header.flags |= RX_LAST_PACKET;
    clock_Zero(&p->firstSent);	/* Never yet transmitted */
    p->header.serial = 0;	/* Another way of saying never transmitted... */
    /* Now that we're sure this is the last data on the call, make sure
     * that the "length" and the sum of the iov_lens matches. */
    len = p->length + call->conn->securityHeaderSize;

    for (i = 1; i < p->niovecs && len > 0; i++) {
	len -= p->wirevec[i].iov_len;
    }
    if (len > 0) {
	osi_Panic("PrepareSendPacket 1\n");	/* MTUXXX */
    } else if (i < p->niovecs) {
	/* Free any extra elements in the wirevec */
#if defined(RX_ENABLE_TSFPQ)
	rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
#else /* !RX_ENABLE_TSFPQ */
	MUTEX_ENTER(&rx_freePktQ_lock);
	rxi_FreeDataBufsNoLock(p, i);
	MUTEX_EXIT(&rx_freePktQ_lock);
#endif /* !RX_ENABLE_TSFPQ */

	p->niovecs = i;
    }
    if (len)
	p->wirevec[i - 1].iov_len += len;
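    /*
     * Illustrative note (not in the original source): len is <= 0 by the
     * time it is added above, so the final iovec is trimmed until the sum
     * of the iov_lens is exactly p->length plus the security header.  With
     * hypothetical sizes, 1000 bytes total against a single 1448-byte
     * iovec leaves len == -448, shrinking that iovec to 1000 bytes.
     */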
    MUTEX_ENTER(&call->lock);
    code = RXS_PreparePacket(conn->securityObject, call, p);
    if (code) {
	MUTEX_EXIT(&call->lock);
	rxi_ConnectionError(conn, code);
	MUTEX_ENTER(&conn->conn_data_lock);
	p = rxi_SendConnectionAbort(conn, p, 0, 0);
	MUTEX_EXIT(&conn->conn_data_lock);
	MUTEX_ENTER(&call->lock);
	/* setting a connection error means all calls for that conn are also
	 * error'd. if this call does not have an error by now, something is
	 * very wrong, and we risk sending data in the clear that is supposed
	 * to be encrypted. */
	osi_Assert(call->error);
    }
}
/* Given an interface MTU size, calculate an adjusted MTU size that
 * will make efficient use of the RX buffers when the peer is sending
 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
int
rxi_AdjustIfMTU(int mtu)
{
    int adjMTU;
    int frags;

    if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
	return mtu;
    adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
    if (mtu <= adjMTU) {
	return mtu;
    }
    mtu -= adjMTU;
    if (mtu <= 0) {
	return adjMTU;
    }
    frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
    return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
}
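/*
 * Worked example (not in the original source), assuming the stock
 * constants RX_HEADER_SIZE = 28, RX_JUMBOBUFFERSIZE = 1412 and
 * RX_JUMBOHEADERSIZE = 4: adjMTU = 28 + 1412 + 4 = 1444.  For an
 * Ethernet-style mtu of 1500, the 56 bytes left after subtracting adjMTU
 * cannot hold another 1416-byte fragment (frags = 0), so the function
 * returns 1444, trimming the MTU down to a whole jumbogram unit.
 */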
/* Given an interface MTU size, and the peer's advertised max receive
 * size, calculate an adjusted maxMTU size that makes efficient use
 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
int
rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
{
    int maxMTU = mtu * rxi_nSendFrags;
    maxMTU = MIN(maxMTU, peerMaxMTU);
    return rxi_AdjustIfMTU(maxMTU);
}
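/*
 * E.g. (illustrative only): with rxi_nSendFrags = 4 and an interface mtu
 * of 1444, maxMTU starts at 5776, is capped at whatever the peer
 * advertised, and is then re-aligned to a jumbogram boundary by
 * rxi_AdjustIfMTU.
 */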
/* Given a packet size, figure out how many datagram packets will fit.
 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
int
rxi_AdjustDgramPackets(int frags, int mtu)
{
    int maxMTU;
    if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
	return 1;
    }
    maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
    maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
    /* subtract the size of the first and last packets */
    maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
    if (maxMTU < 0) {
	return 1;
    }
    return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
}
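/*
 * Worked example (not in the original source), assuming the stock
 * constants (RX_HEADER_SIZE = 28, RX_JUMBOBUFFERSIZE = 1412,
 * RX_JUMBOHEADERSIZE = 4, UDP_HDR_SIZE = 28): for frags = 3 and
 * mtu = 1444, maxMTU = 3 * (1444 + 28) - 28 = 4388; subtracting the
 * first and last packets (28 + 2*1412 + 4 = 2856) leaves 1532, and
 * 1532 / 1416 = 1, so the function returns 2 + 1 = 3 packets per
 * datagram.
 */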
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    int zilch;
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif
    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    for (p = rx_mallocedP; p; p = p->allNextp) {
	RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
		  cookie, p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
		  p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
		  p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
		  (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
		  (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
	WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }
    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */

    return 0;
}