/*
 * Copyright 2000, International Business Machines Corporation and others.
 * All Rights Reserved.
 *
 * This software has been released under the terms of the IBM Public
 * License.  For details, see the LICENSE file in the top-level source
 * directory or online at http://www.openafs.org/dl/license10.html
 */

#include <afsconfig.h>
#include <afs/param.h>
#ifdef KERNEL
# if defined(UKERNEL)
#  include "afs/sysincludes.h"
#  include "afsincludes.h"
#  include "rx_kcommon.h"
# else /* defined(UKERNEL) */
#  ifdef RX_KERNEL_TRACE
#   include "rx_kcommon.h"
#  endif
#  ifndef AFS_LINUX20_ENV
#   if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
#    include "afs/sysincludes.h"
#   endif
#  endif
#  if defined(AFS_OBSD_ENV)
#   include "h/socket.h"
#  endif
#  if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
#   if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
#    include "sys/mount.h"	/* it gets pulled in by something later anyway */
#   endif
#  endif
#  include "netinet/in.h"
#  include "afs/afs_osi.h"
#  include "rx_kmutex.h"
# endif /* defined(UKERNEL) */
#else /* KERNEL */
# include <sys/types.h>
# include <sys/stat.h>
# if defined(AFS_NT40_ENV)
#  include <winsock2.h>
#  define EWOULDBLOCK WSAEWOULDBLOCK
#  include "rx_xmit_nt.h"
# else
#  include <sys/socket.h>
#  include <netinet/in.h>
# endif
# ifdef AFS_SUN5_ENV
#  include <sys/sysmacros.h>
# endif
#endif /* KERNEL */

#include "rx_packet.h"
#include "rx_atomic.h"
#include "rx_globals.h"
#include "rx_internal.h"
#ifdef RX_LOCKS_DB
/* rxdb_fileID is used to identify the lock location, along with line#. */
static int rxdb_fileID = RXDB_FILE_RX_PACKET;
#endif /* RX_LOCKS_DB */

static struct rx_packet *rx_mallocedP = 0;
#ifdef RXDEBUG_PACKET
static afs_uint32 rx_packet_id = 0;
#endif

extern char cml_version_number[];

static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);

static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
				afs_uint32 ahost, short aport,
				afs_int32 istack);

#ifdef RX_ENABLE_TSFPQ
static void
rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
#else
static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
				   afs_uint32 first,
				   struct rx_queue * q);
#endif
/* some rules about packets:
 * 1.  When a packet is allocated, the final iov_buf contains room for
 * a security trailer, but iov_len masks that fact.  If the security
 * package wants to add the trailer, it may do so, and then extend
 * iov_len appropriately.  For this reason, the packet's niovecs and
 * iov_len fields should be accurate before calling PreparePacket.
 */

/* Preconditions:
 *        all packet buffers (iov_base) are integral multiples of
 *        the word size.
 *        offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
{
    afs_int32 i, l;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    return
		*((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
				 (offset - l)));
	}
	l += packet->wirevec[i].iov_len;
    }

    return 0;
}
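/* A short illustration of the scan above: iovec 0 always holds the wire
 * header, so the walk starts at iovec 1 and accumulates lengths in l until
 * the iovec containing `offset' is found; (offset - l) is then the byte
 * offset within that iovec.  The same scan pattern is used by the put,
 * read, and write routines below. */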
/* Preconditions:
 *        all packet buffers (iov_base) are integral multiples of the word size.
 *        offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
{
    afs_int32 i, l;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
			     (offset - l))) = data;
	    return 0;
	}
	l += packet->wirevec[i].iov_len;
    }

    return 0;
}
/* Preconditions:
 *        all packet buffers (iov_base) are integral multiples of the
 *        word size.
 *        offset is an integral multiple of the word size.
 *        all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 */
afs_int32
rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
		  char *out)
{
    unsigned int i, j, l, r;

    for (l = 0, i = 1; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > offset) {
	    break;
	}
	l += packet->wirevec[i].iov_len;
    }

    /* i is the iovec which contains the first little bit of data in which we
     * are interested.  l is the total length of everything prior to this iovec.
     * j is the number of bytes we can safely copy out of this iovec.
     * offset only applies to the first iovec.
     */
    r = resid;
    while ((r > 0) && (i < packet->niovecs)) {
	j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
	memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
	out += j;
	r -= j;
	l += packet->wirevec[i].iov_len;
	offset = l;
	i++;
    }

    return (r ? (resid - r) : resid);
}
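/* Note the return convention above: if the copy stopped short (r is still
 * nonzero), the number of bytes actually copied (resid - r) is returned;
 * otherwise the full resid is returned.  Callers can therefore compare the
 * result against resid to detect a short transfer. */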
/* Preconditions:
 *        all packet buffers (iov_base) are integral multiples of the
 *        word size.
 *        offset is an integral multiple of the word size.
 */
afs_int32
rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
{
    unsigned int i, j, l, o, r;
    char *b;

    for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
	if (l + packet->wirevec[i].iov_len > o) {
	    break;
	}
	l += packet->wirevec[i].iov_len;
    }

    /* i is the iovec which contains the first little bit of data in which we
     * are interested.  l is the total length of everything prior to this iovec.
     * j is the number of bytes we can safely copy out of this iovec.
     * offset only applies to the first iovec.
     */
    r = resid;
    while ((r > 0) && (i <= RX_MAXWVECS)) {
	if (i >= packet->niovecs)
	    if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)	/* ++niovecs as a side-effect */
		break;

	b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
	j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
	memcpy(b, in, j);
	in += j;
	r -= j;
	l += packet->wirevec[i].iov_len;
	offset = l;
	i++;
    }

    return (r ? (resid - r) : resid);
}
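/* rx_SlowWritePacket mirrors rx_SlowReadPacket, with one addition: when the
 * write runs past the last allocated iovec, rxi_AllocDataBuf is called to
 * hook in more continuation buffers (growing niovecs as a side effect), so
 * a write can extend a packet up to RX_MAXWVECS buffers. */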
int
rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
{
    struct rx_packet *p, *np;

    num_pkts = AllocPacketBufs(class, num_pkts, q);

    for (queue_Scan(q, p, np, rx_packet)) {
	RX_PACKET_IOV_FULLINIT(p);
    }

    return num_pkts;
}
#ifdef RX_ENABLE_TSFPQ
static int
AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
{
    struct rx_ts_info_t * rx_ts_info;
    int transfer;
    SPLVAR;

    RX_TS_INFO_GET(rx_ts_info);

    transfer = num_pkts - rx_ts_info->_FPQ.len;
    if (transfer > 0) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);
	transfer = MAX(transfer, rx_TSFPQGlobSize);
	if (transfer > rx_nFreePackets) {
	    /* alloc enough for us, plus a few globs for other threads */
	    rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
	}

	RX_TS_FPQ_GTOL2(rx_ts_info, transfer);

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }

    RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);

    return num_pkts;
}
#else /* RX_ENABLE_TSFPQ */
static int
AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
{
    struct rx_packet *c;
    int i;
#ifdef KERNEL
    int overq = 0;
#endif
    SPLVAR;

    NETPRI;

    MUTEX_ENTER(&rx_freePktQ_lock);

#ifdef KERNEL
    for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
	 num_pkts--, overq++);

    if (overq) {
	rxi_NeedMorePackets = TRUE;
	if (rx_stats_active) {
	    switch (class) {
	    case RX_PACKET_CLASS_RECEIVE:
		rx_atomic_inc(&rx_stats.receivePktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SEND:
		rx_atomic_inc(&rx_stats.sendPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SPECIAL:
		rx_atomic_inc(&rx_stats.specialPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_RECV_CBUF:
		rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SEND_CBUF:
		rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
		break;
	    }
	}
    }

    if (rx_nFreePackets < num_pkts)
	num_pkts = rx_nFreePackets;

    if (!num_pkts) {
	rxi_NeedMorePackets = TRUE;
	goto done;
    }
#else /* KERNEL */
    if (rx_nFreePackets < num_pkts) {
	rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
    }
#endif /* KERNEL */

    for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
	 i < num_pkts;
	 i++, c=queue_Next(c, rx_packet)) {
	RX_FPQ_MARK_USED(c);
    }

    queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);

    rx_nFreePackets -= num_pkts;

#ifdef KERNEL
  done:
#endif
    MUTEX_EXIT(&rx_freePktQ_lock);

    USERPRI;
    return num_pkts;
}
#endif /* RX_ENABLE_TSFPQ */
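/* The TSFPQ ("thread-specific free packet queue") scheme above keeps a
 * per-thread free list so the common allocation path runs without taking
 * rx_freePktQ_lock; the global queue is consulted only when the local list
 * cannot cover a request, and packets then move between the pools in
 * batches (rx_TSFPQGlobSize at a time) to amortize the locking cost. */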
/*
 * Free a packet currently used as a continuation buffer
 */
#ifdef RX_ENABLE_TSFPQ
/* num_pkts=0 means queue length is unknown */
int
rxi_FreePackets(int num_pkts, struct rx_queue * q)
{
    struct rx_ts_info_t * rx_ts_info;
    struct rx_packet *c, *nc;
    SPLVAR;

    osi_Assert(num_pkts >= 0);
    RX_TS_INFO_GET(rx_ts_info);

    if (!num_pkts) {
	for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
	    rxi_FreeDataBufsTSFPQ(c, 2, 0);
	}
    } else {
	for (queue_Scan(q, c, nc, rx_packet)) {
	    rxi_FreeDataBufsTSFPQ(c, 2, 0);
	}
    }

    if (num_pkts) {
	RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
    }

    if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	RX_TS_FPQ_LTOG(rx_ts_info);

	/* Wakeup anyone waiting for packets */
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }

    return num_pkts;
}
#else /* RX_ENABLE_TSFPQ */
/* num_pkts=0 means queue length is unknown */
int
rxi_FreePackets(int num_pkts, struct rx_queue *q)
{
    struct rx_queue cbs;
    struct rx_packet *p, *np;
    int qlen = 0;
    SPLVAR;

    osi_Assert(num_pkts >= 0);
    queue_Init(&cbs);

    if (!num_pkts) {
	for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
	    if (p->niovecs > 2) {
		qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
	    }
	    RX_FPQ_MARK_FREE(p);
	}
	if (!num_pkts)
	    return 0;
    } else {
	for (queue_Scan(q, p, np, rx_packet)) {
	    if (p->niovecs > 2) {
		qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
	    }
	    RX_FPQ_MARK_FREE(p);
	}
    }

    if (qlen) {
	queue_SpliceAppend(q, &cbs);
	qlen += num_pkts;
    } else
	qlen = num_pkts;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    queue_SpliceAppend(&rx_freePacketQueue, q);
    rx_nFreePackets += qlen;

    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;

    return num_pkts;
}
#endif /* RX_ENABLE_TSFPQ */
/* this one is kind of awful.
 * In rxkad, the packet has been all shortened, and everything, ready for
 * sending.  All of a sudden, we discover we need some of that space back.
 * This isn't terribly general, because it knows that the packets are only
 * rounded up to the EBS (userdata + security header).
 */
int
rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
{
    int i;

    i = p->niovecs - 1;
    if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
	if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
	    p->wirevec[i].iov_len += nb;
	    return 0;
	}
    } else {
	if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
	    p->wirevec[i].iov_len += nb;
	    return 0;
	}
    }

    return 0;
}
/* get sufficient space to store nb bytes of data (or more), and hook
 * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 * returns the number of bytes >0 which it failed to come up with.
 * Don't need to worry about locking on packet, since only
 * one thread can manipulate one at a time.  Locking on continuation
 * packets is handled by AllocPacketBufs */
/* MTUXXX don't need to go through the for loop if we can trust niovecs */
int
rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
{
    int i, nv;
    struct rx_queue q;
    struct rx_packet *cb, *ncb;

    /* compute the number of cbuf's we need */
    nv = nb / RX_CBUFFERSIZE;
    if ((nv * RX_CBUFFERSIZE) < nb)
	nv++;
    if ((nv + p->niovecs) > RX_MAXWVECS)
	nv = RX_MAXWVECS - p->niovecs;
    if (nv < 1)
	return nb;

    /* allocate buffers */
    queue_Init(&q);
    nv = AllocPacketBufs(class, nv, &q);

    /* setup packet iovs */
    for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
	queue_Remove(cb);
	p->wirevec[i].iov_base = (caddr_t) cb->localdata;
	p->wirevec[i].iov_len = RX_CBUFFERSIZE;
    }

    nb -= (nv * RX_CBUFFERSIZE);
    p->length += (nv * RX_CBUFFERSIZE);
    p->niovecs += nv;

    return nb;
}
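/* A worked example of the cbuf arithmetic above: to add nb bytes equal to
 * 2.5 * RX_CBUFFERSIZE, nv rounds up to 3 continuation buffers.  If only 2
 * fit under RX_MAXWVECS (or AllocPacketBufs comes up short), nv is reduced,
 * whatever buffers were obtained are hooked into the iovec array, and the
 * remaining shortfall (nb - nv * RX_CBUFFERSIZE, which is then > 0) is
 * returned to the caller. */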
/* Add more packet buffers */
#ifdef RX_ENABLE_TSFPQ
void
rxi_MorePackets(int apackets)
{
    struct rx_packet *p, *e;
    struct rx_ts_info_t * rx_ts_info;
    int getme;
    SPLVAR;

    getme = apackets * sizeof(struct rx_packet);
    p = (struct rx_packet *)osi_Alloc(getme);
    osi_Assert(p);

    PIN(p, getme);		/* XXXXX */
    memset(p, 0, getme);
    RX_TS_INFO_GET(rx_ts_info);

    RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
    /* TSFPQ patch also needs to keep track of total packets */

    MUTEX_ENTER(&rx_packets_mutex);
    rx_nPackets += apackets;
    RX_TS_FPQ_COMPUTE_LIMITS;
    MUTEX_EXIT(&rx_packets_mutex);

    for (e = p + apackets; p < e; p++) {
	RX_PACKET_IOV_INIT(p);
	p->niovecs = 2;

	RX_TS_FPQ_CHECKIN(rx_ts_info,p);

	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);
#ifdef RXDEBUG_PACKET
	p->packetId = rx_packet_id++;
	p->allNextp = rx_mallocedP;
#endif /* RXDEBUG_PACKET */
	rx_mallocedP = p;
	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
    rx_ts_info->_FPQ.delta += apackets;

    if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	RX_TS_FPQ_LTOG(rx_ts_info);
	rxi_NeedMorePackets = FALSE;
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}
#else /* RX_ENABLE_TSFPQ */
void
rxi_MorePackets(int apackets)
{
    struct rx_packet *p, *e;
    int getme;
    SPLVAR;

    getme = apackets * sizeof(struct rx_packet);
    p = (struct rx_packet *)osi_Alloc(getme);
    osi_Assert(p);

    PIN(p, getme);		/* XXXXX */
    memset(p, 0, getme);
    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    for (e = p + apackets; p < e; p++) {
	RX_PACKET_IOV_INIT(p);
#ifdef RX_TRACK_PACKETS
	p->flags |= RX_PKTFLAG_FREE;
#endif
	p->niovecs = 2;

	queue_Append(&rx_freePacketQueue, p);
#ifdef RXDEBUG_PACKET
	p->packetId = rx_packet_id++;
	p->allNextp = rx_mallocedP;
#endif /* RXDEBUG_PACKET */
	rx_mallocedP = p;
    }

    rx_nPackets += apackets;
    rx_nFreePackets += apackets;
    rxi_NeedMorePackets = FALSE;
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
#endif /* RX_ENABLE_TSFPQ */
#ifdef RX_ENABLE_TSFPQ
void
rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
{
    struct rx_packet *p, *e;
    struct rx_ts_info_t * rx_ts_info;
    int getme;
    SPLVAR;

    getme = apackets * sizeof(struct rx_packet);
    p = (struct rx_packet *)osi_Alloc(getme);

    PIN(p, getme);		/* XXXXX */
    memset(p, 0, getme);
    RX_TS_INFO_GET(rx_ts_info);

    RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
    /* TSFPQ patch also needs to keep track of total packets */
    MUTEX_ENTER(&rx_packets_mutex);
    rx_nPackets += apackets;
    RX_TS_FPQ_COMPUTE_LIMITS;
    MUTEX_EXIT(&rx_packets_mutex);

    for (e = p + apackets; p < e; p++) {
	RX_PACKET_IOV_INIT(p);
	p->niovecs = 2;

	RX_TS_FPQ_CHECKIN(rx_ts_info,p);

	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);
#ifdef RXDEBUG_PACKET
	p->packetId = rx_packet_id++;
	p->allNextp = rx_mallocedP;
#endif /* RXDEBUG_PACKET */
	rx_mallocedP = p;
	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
    rx_ts_info->_FPQ.delta += apackets;

    if (flush_global &&
	(num_keep_local < apackets)) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
	rxi_NeedMorePackets = FALSE;
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}
#endif /* RX_ENABLE_TSFPQ */
/* Add more packet buffers */
void
rxi_MorePacketsNoLock(int apackets)
{
#ifdef RX_ENABLE_TSFPQ
    struct rx_ts_info_t * rx_ts_info;
#endif /* RX_ENABLE_TSFPQ */
    struct rx_packet *p, *e;
    int getme;

    /* allocate enough packets that 1/4 of the packets will be able
     * to hold maximal amounts of data */
    apackets += (apackets / 4)
	* ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
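    /* Rationale for the arithmetic above: a packet holding a maximal
     * jumbogram needs about (rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) /
     * RX_CBUFFERSIZE continuation buffers beyond its first buffer, and each
     * continuation buffer is carved out of another rx_packet.  Growing the
     * request by that factor for a quarter of the packets leaves roughly 1/4
     * of the usable packets able to hold maximal amounts of data, as the
     * comment says. */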
    do {
	getme = apackets * sizeof(struct rx_packet);
	p = (struct rx_packet *)osi_Alloc(getme);
	if (p == NULL) {
	    apackets -= apackets / 4;
	    osi_Assert(apackets > 0);
	}
    } while(p == NULL);
    memset(p, 0, getme);

#ifdef RX_ENABLE_TSFPQ
    RX_TS_INFO_GET(rx_ts_info);
    RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
#endif /* RX_ENABLE_TSFPQ */

    for (e = p + apackets; p < e; p++) {
	RX_PACKET_IOV_INIT(p);
#ifdef RX_TRACK_PACKETS
	p->flags |= RX_PKTFLAG_FREE;
#endif
	p->niovecs = 2;

	queue_Append(&rx_freePacketQueue, p);
#ifdef RXDEBUG_PACKET
	p->packetId = rx_packet_id++;
	p->allNextp = rx_mallocedP;
#endif /* RXDEBUG_PACKET */
	rx_mallocedP = p;
    }

    rx_nFreePackets += apackets;
    MUTEX_ENTER(&rx_packets_mutex);
    rx_nPackets += apackets;
#ifdef RX_ENABLE_TSFPQ
    RX_TS_FPQ_COMPUTE_LIMITS;
#endif /* RX_ENABLE_TSFPQ */
    MUTEX_EXIT(&rx_packets_mutex);
    rxi_NeedMorePackets = FALSE;
}
void
rxi_FreeAllPackets(void)
{
    /* must be called at proper interrupt level, etcetera */
    /* MTUXXX need to free all Packets */
    osi_Free(rx_mallocedP,
	     (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
    UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
}
#ifdef RX_ENABLE_TSFPQ
void
rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
{
    struct rx_ts_info_t * rx_ts_info;
    int xfer;
    SPLVAR;

    RX_TS_INFO_GET(rx_ts_info);

    if (num_keep_local != rx_ts_info->_FPQ.len) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);
	if (num_keep_local < rx_ts_info->_FPQ.len) {
	    xfer = rx_ts_info->_FPQ.len - num_keep_local;
	    RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
	    rxi_PacketsUnWait();
	} else {
	    xfer = num_keep_local - rx_ts_info->_FPQ.len;
	    if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
		xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
	    if (rx_nFreePackets < xfer) {
		rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
	    }
	    RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
	}
	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}

void
rxi_FlushLocalPacketsTSFPQ(void)
{
    rxi_AdjustLocalPacketsTSFPQ(0, 0);
}
#endif /* RX_ENABLE_TSFPQ */
/* Allocate more packets iff we need more continuation buffers */
/* In kernel, can't page in memory with interrupts disabled, so we
 * don't use the event mechanism. */
void
rx_CheckPackets(void)
{
    if (rxi_NeedMorePackets) {
	rxi_MorePackets(rx_maxSendWindow);
    }
}

/* In the packet freeing routine below, the assumption is that
   we want all of the packets to be used equally frequently, so that we
   don't get packet buffers paging out.  It would be just as valid to
   assume that we DO want them to page out if not many are being used.
   In any event, we assume the former, and append the packets to the end
   of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
/* Actually free the packet p. */
#ifdef RX_ENABLE_TSFPQ
static void
rxi_FreePacketNoLock(struct rx_packet *p)
{
    struct rx_ts_info_t * rx_ts_info;
    dpf(("Free %"AFS_PTR_FMT"\n", p));

    RX_TS_INFO_GET(rx_ts_info);
    RX_TS_FPQ_CHECKIN(rx_ts_info,p);
    if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
	RX_TS_FPQ_LTOG(rx_ts_info);
    }
}
#else /* RX_ENABLE_TSFPQ */
static void
rxi_FreePacketNoLock(struct rx_packet *p)
{
    dpf(("Free %"AFS_PTR_FMT"\n", p));

    RX_FPQ_MARK_FREE(p);
    rx_nFreePackets++;
    queue_Append(&rx_freePacketQueue, p);
}
#endif /* RX_ENABLE_TSFPQ */

#ifdef RX_ENABLE_TSFPQ
static void
rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
{
    struct rx_ts_info_t * rx_ts_info;
    dpf(("Free %"AFS_PTR_FMT"\n", p));

    RX_TS_INFO_GET(rx_ts_info);
    RX_TS_FPQ_CHECKIN(rx_ts_info,p);

    if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	RX_TS_FPQ_LTOG(rx_ts_info);

	/* Wakeup anyone waiting for packets */
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}
#endif /* RX_ENABLE_TSFPQ */
/*
 * free continuation buffers off a packet into a queue
 *
 * [IN] p      -- packet from which continuation buffers will be freed
 * [IN] first  -- iovec offset of first continuation buffer to free
 * [IN] q      -- queue into which continuation buffers will be chained
 *
 * returns:
 *   number of continuation buffers freed
 */
#ifndef RX_ENABLE_TSFPQ
static int
rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
{
    struct iovec *iov;
    struct rx_packet * cb;
    int count = 0;

    for (first = MAX(2, first); first < p->niovecs; first++, count++) {
	iov = &p->wirevec[first];
	if (!iov->iov_base)
	    osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
	cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
	RX_FPQ_MARK_FREE(cb);
	queue_Append(q, cb);
    }
    p->length = 0;
    p->niovecs = 0;

    return count;
}
#endif

/*
 * free packet continuation buffers into the global free packet pool
 *
 * [IN] p      -- packet from which to free continuation buffers
 * [IN] first  -- iovec offset of first continuation buffer to free
 *
 * returns:
 *   zero always
 */
static int
rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
{
    struct iovec *iov;

    for (first = MAX(2, first); first < p->niovecs; first++) {
	iov = &p->wirevec[first];
	if (!iov->iov_base)
	    osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
	rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
    }
    p->length = 0;
    p->niovecs = 0;

    return 0;
}
#ifdef RX_ENABLE_TSFPQ
/*
 * free packet continuation buffers into the thread-local free pool
 *
 * [IN] p             -- packet from which continuation buffers will be freed
 * [IN] first         -- iovec offset of first continuation buffer to free
 *                       any value less than 2, the min number of iovecs,
 *                       is treated as if it is 2.
 * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 *                       global free pool before returning
 */
static void
rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
{
    struct iovec *iov;
    struct rx_ts_info_t * rx_ts_info;

    RX_TS_INFO_GET(rx_ts_info);

    for (first = MAX(2, first); first < p->niovecs; first++) {
	iov = &p->wirevec[first];
	if (!iov->iov_base)
	    osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
	RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
    }
    p->length = 0;
    p->niovecs = 0;

    if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	RX_TS_FPQ_LTOG(rx_ts_info);

	/* Wakeup anyone waiting for packets */
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}
#endif /* RX_ENABLE_TSFPQ */
int rxi_nBadIovecs = 0;

/* rxi_RestoreDataBufs
 *
 * Restore the correct sizes to the iovecs.  Called when reusing a packet
 * for reading off the wire.
 */
void
rxi_RestoreDataBufs(struct rx_packet *p)
{
    unsigned int i;
    struct iovec *iov = &p->wirevec[2];

    RX_PACKET_IOV_INIT(p);

    for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
	if (!iov->iov_base) {
	    rxi_nBadIovecs++;
	    p->niovecs = i;
	    break;
	}
	iov->iov_len = RX_CBUFFERSIZE;
    }
}
#ifdef RX_ENABLE_TSFPQ
void
rxi_TrimDataBufs(struct rx_packet *p, int first)
{
    int length;
    struct iovec *iov, *end;
    struct rx_ts_info_t * rx_ts_info;
    SPLVAR;

    if (first != 1)
	osi_Panic("TrimDataBufs 1: first must be 1");

    /* Skip over continuation buffers containing message data */
    iov = &p->wirevec[2];
    end = iov + (p->niovecs - 2);
    length = p->length - p->wirevec[1].iov_len;
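    /* p->length counts user data across all data iovecs, and wirevec[1] is
     * the packet's own first data buffer, so after this subtraction `length'
     * is the number of payload bytes that spill into continuation buffers;
     * the loop below walks past exactly those buffers. */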
    for (; iov < end && length > 0; iov++) {
	if (!iov->iov_base)
	    osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
	length -= iov->iov_len;
    }

    /* iov now points to the first empty data buffer. */
    if (iov >= end)
	return;

    RX_TS_INFO_GET(rx_ts_info);
    for (; iov < end; iov++) {
	if (!iov->iov_base)
	    osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
	RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
	p->niovecs--;
    }
    if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	RX_TS_FPQ_LTOG(rx_ts_info);
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}
#else /* RX_ENABLE_TSFPQ */
void
rxi_TrimDataBufs(struct rx_packet *p, int first)
{
    int length;
    struct iovec *iov, *end;
    SPLVAR;

    if (first != 1)
	osi_Panic("TrimDataBufs 1: first must be 1");

    /* Skip over continuation buffers containing message data */
    iov = &p->wirevec[2];
    end = iov + (p->niovecs - 2);
    length = p->length - p->wirevec[1].iov_len;
    for (; iov < end && length > 0; iov++) {
	if (!iov->iov_base)
	    osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
	length -= iov->iov_len;
    }

    /* iov now points to the first empty data buffer. */
    if (iov < end) {
	NETPRI;
	MUTEX_ENTER(&rx_freePktQ_lock);

	for (; iov < end; iov++) {
	    if (!iov->iov_base)
		osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
	    rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
	    p->niovecs--;
	}
	rxi_PacketsUnWait();

	MUTEX_EXIT(&rx_freePktQ_lock);
	USERPRI;
    }
}
#endif /* RX_ENABLE_TSFPQ */
/* Free the packet p.  P is assumed not to be on any queue, i.e.
 * remove it yourself first if you call this routine. */
#ifdef RX_ENABLE_TSFPQ
void
rxi_FreePacket(struct rx_packet *p)
{
    rxi_FreeDataBufsTSFPQ(p, 2, 0);
    rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
}
#else /* RX_ENABLE_TSFPQ */
void
rxi_FreePacket(struct rx_packet *p)
{
    SPLVAR;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    rxi_FreeDataBufsNoLock(p, 2);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
#endif /* RX_ENABLE_TSFPQ */
/* rxi_AllocPacket sets up p->length so it reflects the number of
 * bytes in the packet at this point, **not including** the header.
 * The header is absolutely necessary, besides, this is the way the
 * length field is usually used */
#ifdef RX_ENABLE_TSFPQ
struct rx_packet *
rxi_AllocPacketNoLock(int class)
{
    struct rx_packet *p;
    struct rx_ts_info_t * rx_ts_info;

    RX_TS_INFO_GET(rx_ts_info);

#ifdef KERNEL
    if (rxi_OverQuota(class)) {
	rxi_NeedMorePackets = TRUE;
	if (rx_stats_active) {
	    switch (class) {
	    case RX_PACKET_CLASS_RECEIVE:
		rx_atomic_inc(&rx_stats.receivePktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SEND:
		rx_atomic_inc(&rx_stats.sendPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SPECIAL:
		rx_atomic_inc(&rx_stats.specialPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_RECV_CBUF:
		rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SEND_CBUF:
		rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
		break;
	    }
	}
	return (struct rx_packet *)0;
    }
#endif /* KERNEL */

    if (rx_stats_active)
	rx_atomic_inc(&rx_stats.packetRequests);
    if (queue_IsEmpty(&rx_ts_info->_FPQ)) {

#ifdef KERNEL
	if (queue_IsEmpty(&rx_freePacketQueue))
	    osi_Panic("rxi_AllocPacket error");
#else /* KERNEL */
	if (queue_IsEmpty(&rx_freePacketQueue))
	    rxi_MorePacketsNoLock(rx_maxSendWindow);
#endif /* KERNEL */

	RX_TS_FPQ_GTOL(rx_ts_info);
    }

    RX_TS_FPQ_CHECKOUT(rx_ts_info,p);

    dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));

    /* have to do this here because rx_FlushWrite fiddles with the iovs in
     * order to truncate outbound packets.  In the near future, may need
     * to allocate bufs from a static pool here, and/or in AllocSendPacket
     */
    RX_PACKET_IOV_FULLINIT(p);
    return p;
}
#else /* RX_ENABLE_TSFPQ */
struct rx_packet *
rxi_AllocPacketNoLock(int class)
{
    struct rx_packet *p;

#ifdef KERNEL
    if (rxi_OverQuota(class)) {
	rxi_NeedMorePackets = TRUE;
	if (rx_stats_active) {
	    switch (class) {
	    case RX_PACKET_CLASS_RECEIVE:
		rx_atomic_inc(&rx_stats.receivePktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SEND:
		rx_atomic_inc(&rx_stats.sendPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SPECIAL:
		rx_atomic_inc(&rx_stats.specialPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_RECV_CBUF:
		rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
		break;
	    case RX_PACKET_CLASS_SEND_CBUF:
		rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
		break;
	    }
	}
	return (struct rx_packet *)0;
    }
#endif /* KERNEL */

    if (rx_stats_active)
	rx_atomic_inc(&rx_stats.packetRequests);

#ifdef KERNEL
    if (queue_IsEmpty(&rx_freePacketQueue))
	osi_Panic("rxi_AllocPacket error");
#else /* KERNEL */
    if (queue_IsEmpty(&rx_freePacketQueue))
	rxi_MorePacketsNoLock(rx_maxSendWindow);
#endif /* KERNEL */

    rx_nFreePackets--;
    p = queue_First(&rx_freePacketQueue, rx_packet);
    queue_Remove(p);
    RX_FPQ_MARK_USED(p);

    dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));

    /* have to do this here because rx_FlushWrite fiddles with the iovs in
     * order to truncate outbound packets.  In the near future, may need
     * to allocate bufs from a static pool here, and/or in AllocSendPacket
     */
    RX_PACKET_IOV_FULLINIT(p);
    return p;
}
#endif /* RX_ENABLE_TSFPQ */
#ifdef RX_ENABLE_TSFPQ
struct rx_packet *
rxi_AllocPacketTSFPQ(int class, int pull_global)
{
    struct rx_packet *p;
    struct rx_ts_info_t * rx_ts_info;

    RX_TS_INFO_GET(rx_ts_info);

    if (rx_stats_active)
	rx_atomic_inc(&rx_stats.packetRequests);
    if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
	MUTEX_ENTER(&rx_freePktQ_lock);

	if (queue_IsEmpty(&rx_freePacketQueue))
	    rxi_MorePacketsNoLock(rx_maxSendWindow);

	RX_TS_FPQ_GTOL(rx_ts_info);

	MUTEX_EXIT(&rx_freePktQ_lock);
    } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
	return NULL;
    }

    RX_TS_FPQ_CHECKOUT(rx_ts_info,p);

    dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));

    /* have to do this here because rx_FlushWrite fiddles with the iovs in
     * order to truncate outbound packets.  In the near future, may need
     * to allocate bufs from a static pool here, and/or in AllocSendPacket
     */
    RX_PACKET_IOV_FULLINIT(p);
    return p;
}
#endif /* RX_ENABLE_TSFPQ */
#ifdef RX_ENABLE_TSFPQ
struct rx_packet *
rxi_AllocPacket(int class)
{
    struct rx_packet *p;

    p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
    return p;
}
#else /* RX_ENABLE_TSFPQ */
struct rx_packet *
rxi_AllocPacket(int class)
{
    struct rx_packet *p;

    MUTEX_ENTER(&rx_freePktQ_lock);
    p = rxi_AllocPacketNoLock(class);
    MUTEX_EXIT(&rx_freePktQ_lock);
    return p;
}
#endif /* RX_ENABLE_TSFPQ */
/* This guy comes up with as many buffers as it {takes,can get} given
 * the MTU for this call. It also sets the packet length before
 * returning.  caution: this is often called at NETPRI
 * Called with call locked.
 */
struct rx_packet *
rxi_AllocSendPacket(struct rx_call *call, int want)
{
    struct rx_packet *p = (struct rx_packet *)0;
    int mud;
    unsigned delta;

    SPLVAR;
    mud = call->MTU - RX_HEADER_SIZE;
    delta =
	rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
	rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
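    /* `mud' is the maximum user data this call's MTU allows in one packet;
     * `delta' is the security overhead (header plus maximum trailer) that
     * the security package may add.  The arithmetic below grows the packet
     * toward MIN(want + delta, mud) and finally shrinks p->length by delta,
     * so the caller sees only usable payload space. */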
#ifdef RX_ENABLE_TSFPQ
    if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
	want += delta;
	want = MIN(want, mud);

	if ((unsigned)want > p->length)
	    (void)rxi_AllocDataBuf(p, (want - p->length),
				   RX_PACKET_CLASS_SEND_CBUF);

	if (p->length > mud)
	    p->length = mud;

	if (delta >= p->length) {
	    rxi_FreePacket(p);
	    p = NULL;
	} else {
	    p->length -= delta;
	}
	return p;
    }
#endif /* RX_ENABLE_TSFPQ */

    while (!(call->error)) {
	MUTEX_ENTER(&rx_freePktQ_lock);
	/* if an error occurred, or we get the packet we want, we're done */
	if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
	    MUTEX_EXIT(&rx_freePktQ_lock);

	    want += delta;
	    want = MIN(want, mud);

	    if ((unsigned)want > p->length)
		(void)rxi_AllocDataBuf(p, (want - p->length),
				       RX_PACKET_CLASS_SEND_CBUF);

	    if (p->length > mud)
		p->length = mud;

	    if (delta >= p->length) {
		rxi_FreePacket(p);
		p = NULL;
	    } else {
		p->length -= delta;
	    }
	    break;
	}

	/* no error occurred, and we didn't get a packet, so we sleep.
	 * At this point, we assume that packets will be returned
	 * sooner or later, as packets are acknowledged, and so we
	 * just wait.  */
	NETPRI;
	call->flags |= RX_CALL_WAIT_PACKETS;
	MUTEX_ENTER(&rx_refcnt_mutex);
	CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
	MUTEX_EXIT(&rx_refcnt_mutex);
	MUTEX_EXIT(&call->lock);
	rx_waitingForPackets = 1;
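	/* This handshake pairs with rxi_PacketsUnWait in the free paths above
	 * ("Wakeup anyone waiting for packets"): rx_waitingForPackets is set
	 * while rx_freePktQ_lock is still held, so a free that happens between
	 * our failed allocation and the wait below cannot slip through without
	 * waking us. */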
#ifdef RX_ENABLE_LOCKS
	CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
#else
	osi_rxSleep(&rx_waitingForPackets);
#endif
	MUTEX_EXIT(&rx_freePktQ_lock);
	MUTEX_ENTER(&call->lock);
	MUTEX_ENTER(&rx_refcnt_mutex);
	CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
	MUTEX_EXIT(&rx_refcnt_mutex);
	call->flags &= ~RX_CALL_WAIT_PACKETS;
	USERPRI;
    }

    return p;
}
#ifndef KERNEL

#ifdef AFS_NT40_ENV
/* Windows does not use file descriptors. */
#define CountFDs(amax) 0
#else
/* count the number of used FDs */
static int
CountFDs(int amax)
{
    struct stat tstat;
    int i, code;
    int count;

    count = 0;
    for (i = 0; i < amax; i++) {
	code = fstat(i, &tstat);
	if (code == 0)
	    count++;
    }
    return count;
}
#endif /* AFS_NT40_ENV */

#else /* KERNEL */

#define CountFDs(amax) amax

#endif /* KERNEL */
#if !defined(KERNEL) || defined(UKERNEL)

/* This function reads a single packet from the interface into the
 * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
 * (host,port) of the sender are stored in the supplied variables, and
 * the data length of the packet is stored in the packet structure.
 * The header is decoded. */
int
rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
	       u_short * port)
{
    struct sockaddr_in from;
    struct msghdr msg;
    int nbytes;
    afs_int32 rlen;
    afs_uint32 tlen, savelen;

    rx_computelen(p, tlen);
    rx_SetDataSize(p, tlen);	/* this is the size of the user data area */

    tlen += RX_HEADER_SIZE;	/* now this is the size of the entire packet */
    rlen = rx_maxJumboRecvSize;	/* this is what I am advertising.  Only check
				 * it once in order to avoid races.  */
    tlen = rlen - tlen;
    if (tlen > 0) {
	tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
	if (tlen > 0) {
	    tlen = rlen - tlen;
	} else
	    tlen = rlen;
    } else
	tlen = rlen;

    /* Extend the last iovec for padding, it's just to make sure that the
     * read doesn't return more data than we expect, and is done to get around
     * our problems caused by the lack of a length field in the rx header.
     * Use the extra buffer that follows the localdata in each packet
     * structure. */
    savelen = p->wirevec[p->niovecs - 1].iov_len;
    p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;

    memset(&msg, 0, sizeof(msg));
    msg.msg_name = (char *)&from;
    msg.msg_namelen = sizeof(struct sockaddr_in);
    msg.msg_iov = p->wirevec;
    msg.msg_iovlen = p->niovecs;
    nbytes = rxi_Recvmsg(socket, &msg, 0);

    /* restore the vec to its correct state */
    p->wirevec[p->niovecs - 1].iov_len = savelen;

    p->length = (u_short)(nbytes - RX_HEADER_SIZE);
    if ((nbytes > tlen) || (p->length & 0x8000)) {	/* Bogus packet */
	if (nbytes < 0 && errno == EWOULDBLOCK) {
	    if (rx_stats_active)
		rx_atomic_inc(&rx_stats.noPacketOnRead);
	} else if (nbytes <= 0) {
	    if (rx_stats_active) {
		rx_atomic_inc(&rx_stats.bogusPacketOnRead);
		rx_stats.bogusHost = from.sin_addr.s_addr;
	    }
	    dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
		 ntohs(from.sin_port), nbytes));
	}
	return 0;
    }
#ifdef RXDEBUG
    else if ((rx_intentionallyDroppedOnReadPer100 > 0)
	     && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
	rxi_DecodePacketHeader(p);

	*host = from.sin_addr.s_addr;
	*port = from.sin_port;

	dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
	     p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
	     p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
	     p->length));
#ifdef RX_TRIMDATABUFS
	rxi_TrimDataBufs(p, 1);
#endif
	return 0;
    }
#endif
    else {
	/* Extract packet header. */
	rxi_DecodePacketHeader(p);

	*host = from.sin_addr.s_addr;
	*port = from.sin_port;
	if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
	    if (rx_stats_active) {
		struct rx_peer *peer;
		rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
		/*
		 * Try to look up this peer structure.  If it doesn't exist,
		 * don't create a new one -
		 * we don't keep count of the bytes sent/received if a peer
		 * structure doesn't already exist.
		 *
		 * The peer/connection cleanup code assumes that there is 1 peer
		 * per connection.  If we actually created a peer structure here
		 * and this packet was an rxdebug packet, the peer structure would
		 * never be cleaned up.
		 */
		peer = rxi_FindPeer(*host, *port, 0, 0);
		/* Since this may not be associated with a connection,
		 * it may have no refCount, meaning we could race with
		 * ReapConnections
		 */
		if (peer && (peer->refCount > 0)) {
		    MUTEX_ENTER(&peer->peer_lock);
		    hadd32(peer->bytesReceived, p->length);
		    MUTEX_EXIT(&peer->peer_lock);
		}
	    }
	}

#ifdef RX_TRIMDATABUFS
	/* Free any empty packet buffers at the end of this packet */
	rxi_TrimDataBufs(p, 1);
#endif
	return 1;
    }
}
#endif /* !KERNEL || UKERNEL */
/* This function splits off the first packet in a jumbo packet.
 * As of AFS 3.5, jumbograms contain more than one fixed size
 * packet, and the RX_JUMBO_PACKET flag is set in all but the
 * last packet header. All packets (except the last) are padded to
 * fall on RX_CBUFFERSIZE boundaries.
 * HACK: We store the length of the first n-1 packets in the
 * last two pad bytes. */
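/*
 * Sketch of the jumbogram layout handled below (sizes per rx_packet.h):
 *
 *   +-----------+--------------------+--------+--------------------+- - -
 *   | rx header | RX_JUMBOBUFFERSIZE | 4-byte | RX_JUMBOBUFFERSIZE |
 *   |           | bytes of packet 1  | jumbo  | bytes of packet 2  |
 *   |           |                    | header |                    |
 *   +-----------+--------------------+--------+--------------------+- - -
 *
 * Each abbreviated jumbo header carries the flags and the cksum/spare field
 * of the packet that follows it; the last packet's length is whatever
 * remains of the datagram.
 */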
struct rx_packet *
rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
		     int first)
{
    struct rx_packet *np;
    struct rx_jumboHeader *jp;
    int niov, i;
    struct iovec *iov;
    int length;
    afs_uint32 temp;

    /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
     * bytes in length. All but the first packet are preceded by
     * an abbreviated four byte header. The length of the last packet
     * is calculated from the size of the jumbogram. */
    length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;

    if ((int)p->length < length) {
	dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
	return NULL;
    }
    niov = p->niovecs - 2;
    if (niov < 1) {
	dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
	return NULL;
    }
    iov = &p->wirevec[2];
    np = RX_CBUF_TO_PACKET(iov->iov_base, p);

    /* Get a pointer to the abbreviated packet header */
    jp = (struct rx_jumboHeader *)
	((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);

    /* Set up the iovecs for the next packet */
    np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
    np->wirevec[0].iov_len = sizeof(struct rx_header);
    np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
    np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
    np->niovecs = niov + 1;
    for (i = 2, iov++; i <= niov; i++, iov++) {
	np->wirevec[i] = *iov;
    }
    np->length = p->length - length;
    p->length = RX_JUMBOBUFFERSIZE;
    p->niovecs = 2;

    /* Convert the jumbo packet header to host byte order */
    temp = ntohl(*(afs_uint32 *) jp);
    jp->flags = (u_char) (temp >> 24);
    jp->cksum = (u_short) (temp);

    /* Fill in the packet header */
    np->header = p->header;
    np->header.serial = p->header.serial + 1;
    np->header.seq = p->header.seq + 1;
    np->header.flags = jp->flags;
    np->header.spare = jp->cksum;

    return np;
}
#ifndef KERNEL
/* Send a udp datagram */
int
osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
	    int length, int istack)
{
    struct msghdr msg;
    int ret;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = dvec;
    msg.msg_iovlen = nvecs;
    msg.msg_name = addr;
    msg.msg_namelen = sizeof(struct sockaddr_in);

    ret = rxi_Sendmsg(socket, &msg, 0);

    return ret;
}
#elif !defined(UKERNEL)
/*
 * message receipt is done in rxk_input or rx_put.
 */
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
/*
 * Copy an mblock to the contiguous area pointed to by cp.
 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
 * but it doesn't really.
 * Returns the number of bytes not transferred.
 * The message is NOT changed.
 */
static int
cpytoc(mblk_t * mp, int off, int len, char *cp)
{
    int n;

    for (; mp && len > 0; mp = mp->b_cont) {
	if (mp->b_datap->db_type != M_DATA) {
	    return -1;
	}
	n = MIN(len, (mp->b_wptr - mp->b_rptr));
	memcpy(cp, (char *)mp->b_rptr, n);
	cp += n;
	len -= n;
    }
    return (len);
}

/* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
 * but it doesn't really.
 * This sucks, anyway, do it like m_cpy.... below
 */
static int
cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
	   int niovs)
{
    int m, n, o, t, i;

    for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
	if (mp->b_datap->db_type != M_DATA) {
	    return -1;
	}
	n = MIN(len, (mp->b_wptr - mp->b_rptr));
	len -= n;
	while (n) {
	    if (!t) {
		o = 0;
		i++;
		t = iovs[i].iov_len;
	    }
	    m = MIN(n, t);
	    memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
	    mp->b_rptr += m;
	    o += m;
	    t -= m;
	    n -= m;
	}
    }
    return (len);
}

#define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
#define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
#else /* AFS_SUN5_ENV */
#if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
static int
m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
{
    caddr_t p1, p2;
    unsigned int l1, l2, i, t;

    if (m == NULL || off < 0 || len < 0 || iovs == NULL)
	osi_Panic("m_cpytoiovec");	/* MTUXXX probably don't need this check */

    /* skip over the leading <off> bytes of mbuf data */
    while (off && m)
	if (m->m_len <= off) {
	    off -= m->m_len;
	    m = m->m_next;
	    continue;
	} else
	    break;

    if (m == NULL)
	return len;

    p1 = mtod(m, caddr_t) + off;
    l1 = m->m_len - off;
    i = 0;
    p2 = iovs[0].iov_base;
    l2 = iovs[0].iov_len;

    while (len) {
	t = MIN(l1, MIN(l2, (unsigned int)len));
	memcpy(p2, p1, t);
	p1 += t;
	p2 += t;
	l1 -= t;
	l2 -= t;
	len -= t;
	if (!l1) {
	    m = m->m_next;
	    if (!m)
		break;
	    p1 = mtod(m, caddr_t);
	    l1 = m->m_len;
	}
	if (!l2) {
	    if (++i >= niovs)
		break;
	    p2 = iovs[i].iov_base;
	    l2 = iovs[i].iov_len;
	}
    }

    return len;
}
#endif /* !AFS_LINUX20_ENV && !AFS_DARWIN80_ENV */
#endif /* AFS_SUN5_ENV */
#if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
{
    int code;

    code =
	m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
		     phandle->niovecs);
    (*free) (amb);

    return code;
}
#endif /* !AFS_LINUX20_ENV && !AFS_DARWIN80_ENV */

#endif /*KERNEL && !UKERNEL */
/* send a response to a debug packet */

struct rx_packet *
rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
		       afs_uint32 ahost, short aport, int istack)
{
    struct rx_debugIn tin;
    afs_int32 tl;
    struct rx_serverQueueEntry *np, *nqe;

    /*
     * Only respond to client-initiated Rx debug packets,
     * and clear the client flag in the response.
     */
    if (ap->header.flags & RX_CLIENT_INITIATED) {
	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
	rxi_EncodePacketHeader(ap);
    } else {
	return ap;
    }

    rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
    /* all done with packet, now set length to the truth, so we can
     * reuse this packet */
    rx_computelen(ap, ap->length);

    tin.type = ntohl(tin.type);
    tin.index = ntohl(tin.index);
    switch (tin.type) {
    case RX_DEBUGI_GETSTATS:{
	    struct rx_debugStats tstat;

	    /* get basic stats */
	    memset(&tstat, 0, sizeof(tstat));	/* make sure spares are zero */
	    tstat.version = RX_DEBUGI_VERSION;
#ifndef RX_ENABLE_LOCKS
	    tstat.waitingForPackets = rx_waitingForPackets;
#endif
	    MUTEX_ENTER(&rx_serverPool_lock);
	    tstat.nFreePackets = htonl(rx_nFreePackets);
	    tstat.nPackets = htonl(rx_nPackets);
	    tstat.callsExecuted = htonl(rxi_nCalls);
	    tstat.packetReclaims = htonl(rx_packetReclaims);
	    tstat.usedFDs = CountFDs(64);
	    tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
	    tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
	    queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
			tstat.idleThreads);
	    MUTEX_EXIT(&rx_serverPool_lock);
	    tstat.idleThreads = htonl(tstat.idleThreads);
	    tl = sizeof(struct rx_debugStats) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);

	    if (tl <= 0) {
		rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
			       (char *)&tstat);
		ap->length = sizeof(struct rx_debugStats);
		rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
		rx_computelen(ap, ap->length);
	    }
	    break;
	}
    case RX_DEBUGI_GETALLCONN:
    case RX_DEBUGI_GETCONN:{
	    unsigned int i, j;
	    struct rx_connection *tc;
	    struct rx_call *tcall;
	    struct rx_debugConn tconn;
	    int all = (tin.type == RX_DEBUGI_GETALLCONN);

	    tl = sizeof(struct rx_debugConn) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    memset(&tconn, 0, sizeof(tconn));	/* make sure spares are zero */
	    /* get N'th (maybe) "interesting" connection info */
	    for (i = 0; i < rx_hashTableSize; i++) {
#if !defined(KERNEL)
		/* the time complexity of the algorithm used here
		 * exponentially increases with the number of connections.
		 */
#ifdef AFS_PTHREAD_ENV
		pthread_yield();
#endif
#endif
		MUTEX_ENTER(&rx_connHashTable_lock);
		/* We might be slightly out of step since we are not
		 * locking each call, but this is only debugging output.
		 */
		for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
		    if ((all || rxi_IsConnInteresting(tc))
			&& tin.index-- <= 0) {
			tconn.host = tc->peer->host;
			tconn.port = tc->peer->port;
			tconn.cid = htonl(tc->cid);
			tconn.epoch = htonl(tc->epoch);
			tconn.serial = htonl(tc->serial);
			for (j = 0; j < RX_MAXCALLS; j++) {
			    tconn.callNumber[j] = htonl(tc->callNumber[j]);
			    if ((tcall = tc->call[j])) {
				tconn.callState[j] = tcall->state;
				tconn.callMode[j] = tcall->mode;
				tconn.callFlags[j] = tcall->flags;
				if (queue_IsNotEmpty(&tcall->rq))
				    tconn.callOther[j] |= RX_OTHER_IN;
				if (queue_IsNotEmpty(&tcall->tq))
				    tconn.callOther[j] |= RX_OTHER_OUT;
			    } else
				tconn.callState[j] = RX_STATE_NOTINIT;
			}

			tconn.natMTU = htonl(tc->peer->natMTU);
			tconn.error = htonl(tc->error);
			tconn.flags = tc->flags;
			tconn.type = tc->type;
			tconn.securityIndex = tc->securityIndex;
			if (tc->securityObject) {
			    RXS_GetStats(tc->securityObject, tc,
					 &tconn.secStats);
#define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
#define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
			    DOHTONL(flags);
			    DOHTONL(expires);
			    DOHTONL(packetsReceived);
			    DOHTONL(packetsSent);
			    DOHTONL(bytesReceived);
			    DOHTONL(bytesSent);
			    for (i = 0;
				 i <
				 sizeof(tconn.secStats.spares) /
				 sizeof(short); i++)
				DOHTONS(spares[i]);
			    for (i = 0;
				 i <
				 sizeof(tconn.secStats.sparel) /
				 sizeof(afs_int32); i++)
				DOHTONL(sparel[i]);
			}

			MUTEX_EXIT(&rx_connHashTable_lock);
			rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
				       (char *)&tconn);
			tl = ap->length;
			ap->length = sizeof(struct rx_debugConn);
			rxi_SendDebugPacket(ap, asocket, ahost, aport,
					    istack);
			ap->length = tl;
			return ap;
		    }
		}
		MUTEX_EXIT(&rx_connHashTable_lock);
	    }
	    /* if we make it here, there are no interesting packets */
	    tconn.cid = htonl(0xffffffff);	/* means end */
	    rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
			   (char *)&tconn);
	    tl = ap->length;
	    ap->length = sizeof(struct rx_debugConn);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    ap->length = tl;
	    break;
	}
	/*
	 * Pass back all the peer structures we have available
	 */
    case RX_DEBUGI_GETPEER:{
	    unsigned int i;
	    struct rx_peer *tp;
	    struct rx_debugPeer tpeer;

	    tl = sizeof(struct rx_debugPeer) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    memset(&tpeer, 0, sizeof(tpeer));
	    for (i = 0; i < rx_hashTableSize; i++) {
#if !defined(KERNEL)
		/* the time complexity of the algorithm used here
		 * exponentially increases with the number of peers.
		 *
		 * Yielding after processing each hash table entry
		 * and dropping rx_peerHashTable_lock
		 * also increases the risk that we will miss a new
		 * entry - but we are willing to live with this
		 * limitation since this is meant for debugging only
		 */
#ifdef AFS_PTHREAD_ENV
		pthread_yield();
#endif
#endif
		MUTEX_ENTER(&rx_peerHashTable_lock);
		for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
		    if (tin.index-- <= 0) {
			tp->refCount++;
			MUTEX_EXIT(&rx_peerHashTable_lock);

			MUTEX_ENTER(&tp->peer_lock);
			tpeer.host = tp->host;
			tpeer.port = tp->port;
			tpeer.ifMTU = htons(tp->ifMTU);
			tpeer.idleWhen = htonl(tp->idleWhen);
			tpeer.refCount = htons(tp->refCount);
			tpeer.burstSize = tp->burstSize;
			tpeer.burst = tp->burst;
			tpeer.burstWait.sec = htonl(tp->burstWait.sec);
			tpeer.burstWait.usec = htonl(tp->burstWait.usec);
			tpeer.rtt = htonl(tp->rtt);
			tpeer.rtt_dev = htonl(tp->rtt_dev);
			tpeer.timeout.sec = htonl(tp->timeout.sec);
			tpeer.timeout.usec = htonl(tp->timeout.usec);
			tpeer.nSent = htonl(tp->nSent);
			tpeer.reSends = htonl(tp->reSends);
			tpeer.inPacketSkew = htonl(tp->inPacketSkew);
			tpeer.outPacketSkew = htonl(tp->outPacketSkew);
			tpeer.rateFlag = htonl(tp->rateFlag);
			tpeer.natMTU = htons(tp->natMTU);
			tpeer.maxMTU = htons(tp->maxMTU);
			tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
			tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
			tpeer.MTU = htons(tp->MTU);
			tpeer.cwind = htons(tp->cwind);
			tpeer.nDgramPackets = htons(tp->nDgramPackets);
			tpeer.congestSeq = htons(tp->congestSeq);
			tpeer.bytesSent.high = htonl(tp->bytesSent.high);
			tpeer.bytesSent.low = htonl(tp->bytesSent.low);
			tpeer.bytesReceived.high =
			    htonl(tp->bytesReceived.high);
			tpeer.bytesReceived.low =
			    htonl(tp->bytesReceived.low);
			MUTEX_EXIT(&tp->peer_lock);

			MUTEX_ENTER(&rx_peerHashTable_lock);
			tp->refCount--;
			MUTEX_EXIT(&rx_peerHashTable_lock);

			rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
				       (char *)&tpeer);
			tl = ap->length;
			ap->length = sizeof(struct rx_debugPeer);
			rxi_SendDebugPacket(ap, asocket, ahost, aport,
					    istack);
			ap->length = tl;
			return ap;
		    }
		}
		MUTEX_EXIT(&rx_peerHashTable_lock);
	    }
	    /* if we make it here, there are no interesting packets */
	    tpeer.host = htonl(0xffffffff);	/* means end */
	    rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
			   (char *)&tpeer);
	    tl = ap->length;
	    ap->length = sizeof(struct rx_debugPeer);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    ap->length = tl;
	    break;
	}
    case RX_DEBUGI_RXSTATS:{
	    int i;
	    afs_int32 *s;

	    tl = sizeof(rx_stats) - ap->length;
	    if (tl > 0)
		tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
	    if (tl > 0)
		return ap;

	    /* Since its all int32s convert to network order with a loop. */
	    if (rx_stats_active)
		MUTEX_ENTER(&rx_stats_mutex);
	    s = (afs_int32 *) & rx_stats;
	    for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
		rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));

	    tl = ap->length;
	    ap->length = sizeof(rx_stats);
	    if (rx_stats_active)
		MUTEX_EXIT(&rx_stats_mutex);
	    rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	    ap->length = tl;
	    break;
	}

    default:
	/* error response packet */
	tin.type = htonl(RX_DEBUGI_BADTYPE);
	tin.index = tin.type;
	rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
	tl = ap->length;
	ap->length = sizeof(struct rx_debugIn);
	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	ap->length = tl;
	break;
    }
    return ap;
}
struct rx_packet *
rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
			 afs_uint32 ahost, short aport, int istack)
{
    afs_int32 tl;

    /*
     * Only respond to client-initiated version requests, and
     * clear that flag in the response.
     */
    if (ap->header.flags & RX_CLIENT_INITIATED) {
	char buf[65];

	ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
	rxi_EncodePacketHeader(ap);
	memset(buf, 0, sizeof(buf));
	strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
	rx_packetwrite(ap, 0, 65, buf);
	tl = ap->length;
	ap->length = 65;
	rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
	ap->length = tl;
    }

    return ap;
}
/* send a debug packet back to the sender */
static void
rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
		    afs_uint32 ahost, short aport, afs_int32 istack)
{
    struct sockaddr_in taddr;
    unsigned int i, nbytes, savelen = 0;
    int saven = 0;
#ifdef KERNEL
    int waslocked = ISAFS_GLOCK();
#endif

    taddr.sin_family = AF_INET;
    taddr.sin_port = aport;
    taddr.sin_addr.s_addr = ahost;
#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
    taddr.sin_len = sizeof(struct sockaddr_in);
#endif

    /* We need to trim the niovecs. */
    nbytes = apacket->length;
    for (i = 1; i < apacket->niovecs; i++) {
	if (nbytes <= apacket->wirevec[i].iov_len) {
	    savelen = apacket->wirevec[i].iov_len;
	    saven = apacket->niovecs;
	    apacket->wirevec[i].iov_len = nbytes;
	    apacket->niovecs = i + 1;	/* so condition fails because i == niovecs */
	} else
	    nbytes -= apacket->wirevec[i].iov_len;
    }
#ifdef KERNEL
#ifdef RX_KERNEL_TRACE
    if (ICL_SETACTIVE(afs_iclSetp)) {
	if (!waslocked)
	    AFS_GLOCK();
	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		   "before osi_NetSend()");
	AFS_GUNLOCK();
    }
#else
    if (waslocked)
	AFS_GUNLOCK();
#endif
#endif
    /* debug packets are not reliably delivered, hence the cast below. */
    (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
		      apacket->length + RX_HEADER_SIZE, istack);
#ifdef KERNEL
#ifdef RX_KERNEL_TRACE
    if (ICL_SETACTIVE(afs_iclSetp)) {
	AFS_GLOCK();
	afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		   "after osi_NetSend()");
	if (!waslocked)
	    AFS_GUNLOCK();
    }
#else
    if (waslocked)
	AFS_GLOCK();
#endif
#endif
    if (saven) {		/* means we truncated the packet above. */
	apacket->wirevec[i - 1].iov_len = savelen;
	apacket->niovecs = saven;
    }
}
/* Send the packet to appropriate destination for the specified
 * call.  The header is first encoded and placed in the packet.
 */
void
rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
	       struct rx_packet *p, int istack)
{
#if defined(KERNEL)
    int waslocked;
#endif
    int code;
    struct sockaddr_in addr;
    struct rx_peer *peer = conn->peer;
    osi_socket socket;
#ifdef RXDEBUG
    char deliveryType = 'S';
#endif
    /* The address we're sending the packet to */
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = peer->port;
    addr.sin_addr.s_addr = peer->host;

    /* This stuff should be revamped, I think, so that most, if not
     * all, of the header stuff is always added here.  We could
     * probably do away with the encode/decode routines. XXXXX */

    /* Stamp each packet with a unique serial number.  The serial
     * number is maintained on a connection basis because some types
     * of security may be based on the serial number of the packet,
     * and security is handled on a per authenticated-connection
     * basis. */
    /* Pre-increment, to guarantee no zero serial number; a zero
     * serial number means the packet was never sent. */
    MUTEX_ENTER(&conn->conn_data_lock);
    p->header.serial = ++conn->serial;
    if (p->length > conn->peer->maxPacketSize) {
	if ((p->header.type == RX_PACKET_TYPE_ACK) &&
	    (p->header.flags & RX_REQUEST_ACK)) {
	    conn->lastPingSize = p->length;
	    conn->lastPingSizeSer = p->header.serial;
	} else if (p->header.seq != 0) {
	    conn->lastPacketSize = p->length;
	    conn->lastPacketSizeSeq = p->header.seq;
	}
    }
    MUTEX_EXIT(&conn->conn_data_lock);
    /* This is so we can adjust retransmit time-outs better in the face of
     * rapidly changing round-trip times.  RTO estimation is not a la Karn.
     */
    if (p->firstSerial == 0) {
	p->firstSerial = p->header.serial;
    }
#ifdef RXDEBUG
    /* If an output tracer function is defined, call it with the packet and
     * network address.  Note this function may modify its arguments. */
    if (rx_almostSent) {
	int drop = (*rx_almostSent) (p, &addr);
	/* drop packet if return value is non-zero? */
	if (drop)
	    deliveryType = 'D';	/* Drop the packet */
    }
#endif

    /* Get network byte order header */
    rxi_EncodePacketHeader(p);	/* XXX in the event of rexmit, etc, don't need to
				 * touch ALL the fields */

    /* Send the packet out on the same socket that related packets are being
     * sent on */
    socket =
	(conn->type ==
	 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);

#ifdef RXDEBUG
    /* Possibly drop this packet,  for testing purposes */
    if ((deliveryType == 'D')
	|| ((rx_intentionallyDroppedPacketsPer100 > 0)
	    && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
	deliveryType = 'D';	/* Drop the packet */
    } else {
	deliveryType = 'S';	/* Send the packet */
#endif /* RXDEBUG */

	/* Loop until the packet is sent.  We'd prefer just to use a
	 * blocking socket, but unfortunately the interface doesn't
	 * allow us to have the socket block in send mode, and not
	 * block in receive mode */
#ifdef KERNEL
	waslocked = ISAFS_GLOCK();
#ifdef RX_KERNEL_TRACE
	if (ICL_SETACTIVE(afs_iclSetp)) {
	    if (!waslocked)
		AFS_GLOCK();
	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		       "before osi_NetSend()");
	    AFS_GUNLOCK();
	}
#else
	if (waslocked)
	    AFS_GUNLOCK();
#endif
#endif
	if ((code =
	     osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
			 p->length + RX_HEADER_SIZE, istack)) != 0) {
	    /* send failed, so let's hurry up the resend, eh? */
	    if (rx_stats_active)
		rx_atomic_inc(&rx_stats.netSendFailures);
	    p->retryTime = p->timeSent;	/* resend it very soon */
	    clock_Addmsec(&(p->retryTime),
			  10 + (((afs_uint32) p->backoff) << 8));
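	    /* clock_Addmsec works in milliseconds, so the retry is pushed out
	     * by 10ms plus 256ms per unit of accumulated backoff
	     * (p->backoff << 8). */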
	    /* Some systems are nice and tell us right away that we cannot
	     * reach this recipient by returning an error code.
	     * So, when this happens let's "down" the host NOW so
	     * we don't sit around waiting for this host to timeout later.
	     */
	    if (call &&
#ifdef AFS_NT40_ENV
		((code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH))
#elif defined(AFS_LINUX20_ENV)
		code == -ENETUNREACH
#elif defined(AFS_DARWIN_ENV)
		code == EHOSTUNREACH
#else
		0
#endif
		)
		call->lastReceiveTime = 0;
	}
#ifdef KERNEL
#ifdef RX_KERNEL_TRACE
	if (ICL_SETACTIVE(afs_iclSetp)) {
	    AFS_GLOCK();
	    afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
		       "after osi_NetSend()");
	    if (!waslocked)
		AFS_GUNLOCK();
	}
#else
	if (waslocked)
	    AFS_GLOCK();
#endif
#endif
#ifdef RXDEBUG
    }
    dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d\n",
	 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
	 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
	 p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
#endif
    if (rx_stats_active) {
	rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
	MUTEX_ENTER(&peer->peer_lock);
	hadd32(peer->bytesSent, p->length);
	MUTEX_EXIT(&peer->peer_lock);
    }
}
/* Send a list of packets to appropriate destination for the specified
 * connection.  The headers are first encoded and placed in the packets.
 */
void
rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
		   struct rx_packet **list, int len, int istack)
{
#if defined(AFS_SUN5_ENV) && defined(KERNEL)
    int waslocked;
#endif
    struct sockaddr_in addr;
    struct rx_peer *peer = conn->peer;
    osi_socket socket;
    struct rx_packet *p = NULL;
    struct iovec wirevec[RX_MAXIOVECS];
    int i, length, code;
    afs_uint32 serial;
    afs_uint32 temp;
    struct rx_jumboHeader *jp;
#ifdef RXDEBUG
    char deliveryType = 'S';
#endif
    /* The address we're sending the packet to */
    addr.sin_family = AF_INET;
    addr.sin_port = peer->port;
    addr.sin_addr.s_addr = peer->host;

    if (len + 1 > RX_MAXIOVECS) {
	osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
    }

    /*
     * Stamp the packets in this jumbogram with consecutive serial numbers
     */
    MUTEX_ENTER(&conn->conn_data_lock);
    serial = conn->serial;
    conn->serial += len;
    for (i = 0; i < len; i++) {
	p = list[i];
	if (p->length > conn->peer->maxPacketSize) {
	    /* a ping *or* a sequenced packet can count */
	    if ((p->length > conn->peer->maxPacketSize)) {
		if (((p->header.type == RX_PACKET_TYPE_ACK) &&
		     (p->header.flags & RX_REQUEST_ACK)) &&
		    ((i == 0) || (p->length >= conn->lastPingSize))) {
		    conn->lastPingSize = p->length;
		    conn->lastPingSizeSer = serial + i;
		} else if ((p->header.seq != 0) &&
			   ((i == 0) || (p->length >= conn->lastPacketSize))) {
		    conn->lastPacketSize = p->length;
		    conn->lastPacketSizeSeq = p->header.seq;
		}
	    }
	}
    }
    MUTEX_EXIT(&conn->conn_data_lock);

    /* This stuff should be revamped, I think, so that most, if not
     * all, of the header stuff is always added here.  We could
     * probably do away with the encode/decode routines. XXXXX */

    jp = NULL;
    length = RX_HEADER_SIZE;
    wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
    wirevec[0].iov_len = RX_HEADER_SIZE;
    for (i = 0; i < len; i++) {
	p = list[i];

	/* The whole 3.5 jumbogram scheme relies on packets fitting
	 * in a single packet buffer. */
	if (p->niovecs > 2) {
	    osi_Panic("rxi_SendPacketList, niovecs > 2\n");
	}

	/* Set the RX_JUMBO_PACKET flags in all but the last packets
	 * in this jumbogram */
	if (i < len - 1) {
	    if (p->length != RX_JUMBOBUFFERSIZE) {
		osi_Panic("rxi_SendPacketList, length != jumbo size\n");
	    }
	    p->header.flags |= RX_JUMBO_PACKET;
	    length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
	    wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
	} else {
	    wirevec[i + 1].iov_len = p->length;
	    length += p->length;
	}
	wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
	if (jp != NULL) {
	    /* Convert jumbo packet header to network byte order */
	    temp = (afs_uint32) (p->header.flags) << 24;
	    temp |= (afs_uint32) (p->header.spare);
	    *(afs_uint32 *) jp = htonl(temp);
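	    /* The abbreviated header is a single 32-bit word: this packet's
	     * flags in the top byte and its header.spare (the jumbogram
	     * checksum field) in the low-order bits, matching the decode in
	     * rxi_SplitJumboPacket above (temp >> 24 / (u_short)temp). */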
2435 jp = (struct rx_jumboHeader *)
2436 ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2438 /* Stamp each packet with a unique serial number. The serial
2439 * number is maintained on a connection basis because some types
2440 * of security may be based on the serial number of the packet,
2441 * and security is handled on a per authenticated-connection
2443 /* Pre-increment, to guarantee no zero serial number; a zero
2444 * serial number means the packet was never sent. */
2445 p->header.serial = ++serial;
2446 /* This is so we can adjust retransmit time-outs better in the face of
2447 * rapidly changing round-trip times. RTO estimation is not a la Karn.
2449 if (p->firstSerial == 0) {
2450 p->firstSerial = p->header.serial;
2451 }
2453 /* If an output tracer function is defined, call it with the packet and
2454 * network address. Note this function may modify its arguments. */
2455 if (rx_almostSent) {
2456 int drop = (*rx_almostSent) (p, &addr);
2457 /* drop packet if return value is non-zero? */
2458 if (drop)
2459 deliveryType = 'D'; /* Drop the packet */
2460 }
2463 /* Get network byte order header */
2464 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
2465 * touch ALL the fields */
2466 }
2468 /* Send the packet out on the same socket that related packets are being
2472 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2474 #ifdef RXDEBUG
2475 /* Possibly drop this packet, for testing purposes */
2476 if ((deliveryType == 'D')
2477 || ((rx_intentionallyDroppedPacketsPer100 > 0)
2478 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2479 deliveryType = 'D'; /* Drop the packet */
2480 } else {
2481 deliveryType = 'S'; /* Send the packet */
2482 #endif /* RXDEBUG */
2484 /* Loop until the packet is sent. We'd prefer just to use a
2485 * blocking socket, but unfortunately the interface doesn't
2486 * allow us to have the socket block in send mode, and not
2487 * block in receive mode */
2488 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2489 waslocked = ISAFS_GLOCK();
2490 if (!istack && waslocked)
2491 AFS_GUNLOCK();
2492 #endif
2493 if ((code =
2494 osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2495 istack)) != 0) {
2496 /* send failed, so let's hurry up the resend, eh? */
2497 if (rx_stats_active)
2498 rx_atomic_inc(&rx_stats.netSendFailures);
2499 for (i = 0; i < len; i++) {
2500 p = list[i];
2501 p->retryTime = p->timeSent; /* resend it very soon */
2502 clock_Addmsec(&(p->retryTime),
2503 10 + (((afs_uint32) p->backoff) << 8));
2504 }
2505 /* Some systems are nice and tell us right away that we cannot
2506 * reach this recipient by returning an error code.
2507 * So, when this happens let's "down" the host NOW so
2508 * we don't sit around waiting for this host to timeout later.
2509 */
2510 if (call &&
2511 #ifdef AFS_NT40_ENV
2512 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2513 #elif defined(AFS_LINUX20_ENV)
2514 code == -ENETUNREACH
2515 #elif defined(AFS_DARWIN_ENV)
2516 code == EHOSTUNREACH
2517 #else
2518 0
2519 #endif
2520 )
2521 call->lastReceiveTime = 0;
2522 }
2523 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2524 if (!istack && waslocked)
2525 AFS_GLOCK();
2526 #endif
2527 #ifdef RXDEBUG
2528 }
2529 #endif
2530 osi_Assert(p != NULL);
2532 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d\n",
2533 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2534 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2535 p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2538 if (rx_stats_active) {
2539 rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2540 MUTEX_ENTER(&peer->peer_lock);
2541 hadd32(peer->bytesSent, p->length);
2542 MUTEX_EXIT(&peer->peer_lock);
2543 }
2544 }
2547 /* Send a "special" packet to the peer connection. If call is
2548 * specified, then the packet is directed to a specific call channel
2549 * associated with the connection, otherwise it is directed to the
2550 * connection only. Uses optionalPacket if it is supplied, rather than
2551 * allocating a new packet buffer. Nbytes is the length of the data
2552 * portion of the packet. If data is non-null, nbytes of data are
2553 * copied into the packet. Type is the type of the packet, as defined
2554 * in rx.h. Bug: there's a lot of duplication between this and other
2555 * routines. This needs to be cleaned up. */
2556 struct rx_packet *
2557 rxi_SendSpecial(struct rx_call *call,
2558 struct rx_connection *conn,
2559 struct rx_packet *optionalPacket, int type, char *data,
2560 int nbytes, int istack)
2561 {
2562 /* Some of the following stuff should be common code for all
2563 * packet sends (it's repeated elsewhere) */
2564 struct rx_packet *p;
2565 unsigned int i = 0;
2566 int savelen = 0, saven = 0;
2567 int channel, callNumber;
2568 if (call) {
2569 channel = call->channel;
2570 callNumber = *call->callNumber;
2571 /* BUSY packets refer to the next call on this connection */
2572 if (type == RX_PACKET_TYPE_BUSY) {
2573 callNumber++;
2574 }
2575 } else {
2576 channel = 0; /* connection-level packet: no call attached */
2577 callNumber = 0;
2578 }
2580 if (!(p = optionalPacket)) {
2581 p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2582 if (!p)
2583 osi_Panic("rxi_SendSpecial failure");
2584 }
2590 p->header.serviceId = conn->serviceId;
2591 p->header.securityIndex = conn->securityIndex;
2592 p->header.cid = (conn->cid | channel);
2593 p->header.callNumber = callNumber;
2594 p->header.seq = 0;
2595 p->header.epoch = conn->epoch;
2596 p->header.type = type;
2597 p->header.flags = 0;
2598 if (conn->type == RX_CLIENT_CONNECTION)
2599 p->header.flags |= RX_CLIENT_INITIATED;
2600 if (data)
2601 rx_packetwrite(p, 0, nbytes, data);
2603 for (i = 1; i < p->niovecs; i++) {
2604 if (nbytes <= p->wirevec[i].iov_len) {
2605 savelen = p->wirevec[i].iov_len;
2606 saven = p->niovecs;
2607 p->wirevec[i].iov_len = nbytes;
2608 p->niovecs = i + 1; /* so condition fails because i == niovecs */
2609 } else
2610 nbytes -= p->wirevec[i].iov_len;
2611 }
2613 if (call)
2614 rxi_Send(call, p, istack);
2615 else
2616 rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2617 if (saven) { /* means we truncated the packet above. We probably don't */
2618 /* really need to do this, but it seems safer this way, given that */
2619 /* sneaky optionalPacket... */
2620 p->wirevec[i - 1].iov_len = savelen;
2621 p->niovecs = saven;
2622 }
2623 if (!optionalPacket)
2624 rxi_FreePacket(p);
2625 return optionalPacket;
2626 }
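/* Typical usage sketch (hedged; the real call sites live elsewhere in
 * rx): sending a connection-level abort carrying a 4-byte error code,
 * letting rxi_SendSpecial allocate and free the temporary packet:
 *
 *   afs_int32 error = htonl(conn->error);
 *   rxi_SendSpecial((struct rx_call *)0, conn, (struct rx_packet *)0,
 *                   RX_PACKET_TYPE_ABORT, (char *)&error,
 *                   sizeof(error), istack);
 *
 * Passing a NULL optionalPacket means the packet is freed before the
 * function returns, so the caller gets back NULL and owns nothing. */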
2629 /* Encode the packet's header (from the struct header in the packet to
2630 * the net byte order representation in the wire representation of the
2631 * packet, which is what is actually sent out on the wire) */
2632 void
2633 rxi_EncodePacketHeader(struct rx_packet *p)
2634 {
2635 afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
2637 memset(buf, 0, RX_HEADER_SIZE);
2638 *buf++ = htonl(p->header.epoch);
2639 *buf++ = htonl(p->header.cid);
2640 *buf++ = htonl(p->header.callNumber);
2641 *buf++ = htonl(p->header.seq);
2642 *buf++ = htonl(p->header.serial);
2643 *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2644 | (((afs_uint32) p->header.flags) << 16)
2645 | (p->header.userStatus << 8) | p->header.securityIndex);
2646 /* Note: top 16 bits of this next word were reserved */
2647 *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2648 }
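/* Worked example with illustrative field values (editorial, not from the
 * original source): for type=1 (DATA), flags=0x04, userStatus=0 and
 * securityIndex=2, the packed word built above is
 * (1<<24)|(0x04<<16)|(0<<8)|2 = 0x01040002, before the htonl(). */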
2650 /* Decode the packet's header (from net byte order to a struct header) */
2651 void
2652 rxi_DecodePacketHeader(struct rx_packet *p)
2653 {
2654 afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
2655 afs_uint32 temp;
2657 p->header.epoch = ntohl(*buf);
2658 buf++;
2659 p->header.cid = ntohl(*buf);
2660 buf++;
2661 p->header.callNumber = ntohl(*buf);
2662 buf++;
2663 p->header.seq = ntohl(*buf);
2664 buf++;
2665 p->header.serial = ntohl(*buf);
2666 buf++;
2668 temp = ntohl(*buf);
2669 buf++;
2671 /* C will truncate byte fields to bytes for me */
2672 p->header.type = temp >> 24;
2673 p->header.flags = temp >> 16;
2674 p->header.userStatus = temp >> 8;
2675 p->header.securityIndex = temp >> 0;
2678 temp = ntohl(*buf);
2679 buf++;
2680 p->header.serviceId = (temp & 0xffff);
2681 p->header.spare = temp >> 16;
2682 /* Note: top 16 bits of this last word are the security checksum */
2683 }
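/* Truncation example matching the encode side (illustrative values):
 * for temp = 0x01040002, temp>>16 is 0x0104, and assigning it to the
 * one-byte flags field keeps only the low byte, 0x04; likewise
 * temp>>24 yields type 1 and temp>>0 yields securityIndex 2. */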
2685 /*
2686 * LOCKS HELD: called with call->lock held.
2688 * PrepareSendPacket is the only place in the code that
2689 * can increment call->tnext. This could become an atomic
2690 * in the future. Beyond that there is nothing in this
2691 * function that requires the call being locked. This
2692 * function can only be called by the application thread.
2693 */
2694 void
2695 rxi_PrepareSendPacket(struct rx_call *call,
2696 struct rx_packet *p, int last)
2697 {
2698 struct rx_connection *conn = call->conn;
2699 afs_uint32 seq = call->tnext++;
2700 unsigned int i;
2701 afs_int32 len; /* len must be a signed type; it can go negative */
2703 /* No data packets on call 0. Where do these come from? */
2704 if (*call->callNumber == 0)
2705 *call->callNumber = 1;
2707 MUTEX_EXIT(&call->lock);
2708 p->flags &= ~RX_PKTFLAG_ACKED;
2709 p->header.cid = (conn->cid | call->channel);
2710 p->header.serviceId = conn->serviceId;
2711 p->header.securityIndex = conn->securityIndex;
2713 p->header.callNumber = *call->callNumber;
2714 p->header.seq = seq;
2715 p->header.epoch = conn->epoch;
2716 p->header.type = RX_PACKET_TYPE_DATA;
2717 p->header.flags = 0;
2718 p->header.spare = 0;
2719 if (conn->type == RX_CLIENT_CONNECTION)
2720 p->header.flags |= RX_CLIENT_INITIATED;
2722 if (last)
2723 p->header.flags |= RX_LAST_PACKET;
2725 clock_Zero(&p->retryTime); /* Never yet transmitted */
2726 clock_Zero(&p->firstSent); /* Never yet transmitted */
2727 p->header.serial = 0; /* Another way of saying never transmitted... */
2730 /* Now that we're sure this is the last data on the call, make sure
2731 * that the "length" and the sum of the iov_lens matches. */
2732 len = p->length + call->conn->securityHeaderSize;
2734 for (i = 1; i < p->niovecs && len > 0; i++) {
2735 len -= p->wirevec[i].iov_len;
2736 }
2737 if (len > 0) {
2738 osi_Panic("PrepareSendPacket 1\n"); /* MTUXXX */
2739 } else if (i < p->niovecs) {
2740 /* Free any extra elements in the wirevec */
2741 #if defined(RX_ENABLE_TSFPQ)
2742 rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2743 #else /* !RX_ENABLE_TSFPQ */
2744 MUTEX_ENTER(&rx_freePktQ_lock);
2745 rxi_FreeDataBufsNoLock(p, i);
2746 MUTEX_EXIT(&rx_freePktQ_lock);
2747 #endif /* !RX_ENABLE_TSFPQ */
2748 p->niovecs = i;
2749 }
2752 p->wirevec[i - 1].iov_len += len;
2753 RXS_PreparePacket(conn->securityObject, call, p);
2754 MUTEX_ENTER(&call->lock);
2755 }
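/* Length-adjustment example (illustrative numbers, not from the original
 * source): if p->length plus the security header is 1000 bytes but the
 * iovecs sum to 1024, the loop above exits with len = -24 and the final
 * iov_len is shrunk by 24, so the wire vector matches the advertised
 * packet length exactly; this is why len must be a signed type. */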
2757 /* Given an interface MTU size, calculate an adjusted MTU size that
2758 * will make efficient use of the RX buffers when the peer is sending
2759 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
2760 int
2761 rxi_AdjustIfMTU(int mtu)
2762 {
2763 int adjMTU;
2764 int frags;
2766 if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2767 return mtu;
2768 adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2769 if (mtu <= adjMTU) {
2770 return mtu;
2771 }
2772 mtu -= adjMTU;
2776 frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2777 return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2778 }
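/* Worked arithmetic (assuming jumbograms are enabled and the usual sizes
 * RX_HEADER_SIZE=28, RX_JUMBOBUFFERSIZE=1412, RX_JUMBOHEADERSIZE=4):
 * for an Ethernet-style mtu of 1500, adjMTU is 28+1412+4 = 1444; the
 * remainder 1500-1444 = 56 holds no further 1416-byte jumbo slot
 * (56/1416 = 0), so the function returns 1444. */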
2780 /* Given an interface MTU size, and the peer's advertised max receive
2781 * size, calculate an adjusted maxMTU size that makes efficient use
2782 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2783 int
2784 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2785 {
2786 int maxMTU = mtu * rxi_nSendFrags;
2787 maxMTU = MIN(maxMTU, peerMaxMTU);
2788 return rxi_AdjustIfMTU(maxMTU);
2789 }
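/* Sketch of the interaction (illustrative values): with mtu = 1444,
 * rxi_nSendFrags = 4 and a peer advertising 5000, maxMTU becomes
 * MIN(1444*4, 5000) = 5000, which rxi_AdjustIfMTU then rounds down to
 * 1444 + 2*1416 = 4276, the largest size packing whole jumbo slots. */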
2791 /* Given a packet size, figure out how many datagram packets will fit.
2792 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2793 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2794 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2795 int
2796 rxi_AdjustDgramPackets(int frags, int mtu)
2797 {
2798 int maxMTU;
2799 if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2800 return 0;
2801 }
2802 maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2803 maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2804 /* subtract the size of the first and last packets */
2805 maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2806 if (maxMTU <= 0) {
2807 return 1;
2808 }
2809 return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2810 }
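/* Reading the arithmetic above (symbolically, since UDP_HDR_SIZE and
 * IPv6_FRAG_HDR_SIZE are platform constants): frags*(mtu+UDP_HDR_SIZE)
 * - UDP_HDR_SIZE is the usable payload of one datagram carried in
 * `frags` fragments, capped at RX_MAX_PACKET_SIZE. Removing the first
 * packet (rx header + jumbo buffer + jumbo header) and the trailing
 * jumbo buffer leaves the middle slots, each RX_JUMBOBUFFERSIZE +
 * RX_JUMBOHEADERSIZE wide, hence 2 + maxMTU/slotsize packets total. */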
2813 /*
2814 * This function can be used by the Windows Cache Manager
2815 * to dump the list of all rx packets so that we can determine
2816 * where the packet leakage is.
2817 */
2818 int rx_DumpPackets(FILE *outputFile, char *cookie)
2819 {
2820 #ifdef RXDEBUG_PACKET
2821 struct rx_packet *p;
2822 #ifdef AFS_NT40_ENV
2823 DWORD zilch = 0;
2824 char output[2048];
2825 #define RXDPRINTF sprintf
2826 #define RXDPRINTOUT output
2827 #else
2828 #define RXDPRINTF fprintf
2829 #define RXDPRINTOUT outputFile
2830 #endif
2832 NETPRI;
2833 MUTEX_ENTER(&rx_freePktQ_lock);
2834 RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
2835 #ifdef AFS_NT40_ENV
2836 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2837 #endif
2839 for (p = rx_mallocedP; p; p = p->allNextp) {
2840 RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, backoff=%u, length=%u header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
2841 cookie, p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec, p->retryTime.sec, p->retryTime.usec,
2842 p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->backoff, (afs_uint32)p->length,
2843 p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
2844 (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
2845 (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
2846 #ifdef AFS_NT40_ENV
2847 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2848 #endif
2849 }
2851 RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
2852 #ifdef AFS_NT40_ENV
2853 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2854 #endif
2856 MUTEX_EXIT(&rx_freePktQ_lock);
2857 USERPRI;
2858 #endif /* RXDEBUG_PACKET */
2859 return 0;
2860 }