src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #ifdef KERNEL
  12 #include "afs/param.h"
  13 #else
  14 #include <afs/param.h>
  15 #endif
  16
  17 #ifdef KERNEL
  18 #if defined(UKERNEL)
  19 #include "afs/sysincludes.h"
  20 #include "afsincludes.h"
  21 #include "rx/rx_kcommon.h"
  22 #include "rx/rx_clock.h"
  23 #include "rx/rx_queue.h"
  24 #include "rx/rx_packet.h"
  25 #include "rx/rx_atomic.h"
  26 #include "rx/rx_internal.h"
  27 #include "rx/rx_stats.h"
  28 #else /* defined(UKERNEL) */
  29 #ifdef RX_KERNEL_TRACE
  30 #include "../rx/rx_kcommon.h"
  31 #endif
  32 #include "h/types.h"
  33 #ifndef AFS_LINUX20_ENV
  34 #include "h/systm.h"
  35 #endif
  36 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  37 #include "afs/sysincludes.h"
  38 #endif
  39 #if defined(AFS_OBSD_ENV)
  40 #include "h/proc.h"
  41 #endif
  42 #include "h/socket.h"
  43 #if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  44 #if     !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  45 #include "sys/mount.h"          /* it gets pulled in by something later anyway */
  46 #endif
  47 #include "h/mbuf.h"
  48 #endif
  49 #include "netinet/in.h"
  50 #include "afs/afs_osi.h"
  51 #include "rx_kmutex.h"
  52 #include "rx/rx_clock.h"
  53 #include "rx/rx_queue.h"
  54 #include "rx_atomic.h"
  55 #ifdef  AFS_SUN5_ENV
  56 #include <sys/sysmacros.h>
  57 #endif
  58 #include "rx/rx_packet.h"
  59 #include "rx_internal.h"
  60 #include "rx_stats.h"
  61 #endif /* defined(UKERNEL) */
  62 #include "rx/rx_globals.h"
  63 #else /* KERNEL */
  64 #include "sys/types.h"
  65 #include <sys/stat.h>
  66 #include <errno.h>
  67 #if defined(AFS_NT40_ENV)
  68 #include <winsock2.h>
  69 #ifndef EWOULDBLOCK
  70 #define EWOULDBLOCK WSAEWOULDBLOCK
  71 #endif
  72 #include "rx_user.h"
  73 #include "rx_xmit_nt.h"
  74 #include <stdlib.h>
  75 #else
  76 #include <sys/socket.h>
  77 #include <netinet/in.h>
  78 #endif
  79 #include "rx_clock.h"
  80 #include "rx.h"
  81 #include "rx_queue.h"
  82 #ifdef  AFS_SUN5_ENV
  83 #include <sys/sysmacros.h>
  84 #endif
  85 #include "rx_packet.h"
  86 #include "rx_atomic.h"
  87 #include "rx_globals.h"
  88 #include "rx_internal.h"
  89 #include "rx_stats.h"
  90 #include <lwp.h>
  91 #include <assert.h>
  92 #include <string.h>
  93 #ifdef HAVE_UNISTD_H
  94 #include <unistd.h>
  95 #endif
  96 #endif /* KERNEL */
  97
  98 #ifdef RX_LOCKS_DB
  99 /* rxdb_fileID is used to identify the lock location, along with line#. */
 100 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
 101 #endif /* RX_LOCKS_DB */
 102 static struct rx_packet *rx_mallocedP = 0;    /* packet most recently set up by the rxi_MorePackets* routines; with RXDEBUG_PACKET each new packet's allNextp links to the previous value */
 103 #ifdef RXDEBUG_PACKET
 104 static afs_uint32       rx_packet_id = 0;    /* value stored in packetId of the next packet created; post-incremented on each use */
 105 #endif
 106
 107 extern char cml_version_number[];
 108
 109 static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);    /* defined below, once per RX_ENABLE_TSFPQ setting */
 110
 111 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
 112                                 afs_uint32 ahost, short aport,
 113                                 afs_int32 istack);
 114
 115 #ifdef RX_ENABLE_TSFPQ
 116 static int
 117 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);    /* frees continuation buffers to the thread-local pool */
 118 #else
 119 static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
 120                                    afs_uint32 first,
 121                                    struct rx_queue * q);    /* chains continuation buffers onto q; returns how many */
 122 #endif
 123
 124 /* some rules about packets:
 125  * 1.  When a packet is allocated, the final iov_buf contains room for
 126  * a security trailer, but iov_len masks that fact.  If the security
 127  * package wants to add the trailer, it may do so, and then extend
 128  * iov_len appropriately.  For this reason, packet's niovecs and
 129  * iov_len fields should be accurate before calling PreparePacket.
 130 */
 131
 132 /* Preconditions:
 133  *        all packet buffers (iov_base) are integral multiples of
 134  *        the word size.
 135  *        offset is an integral multiple of the word size.
 136  */
 137 afs_int32
 138 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 139 {
 140     unsigned int i;
 141     size_t l;
 142     for (l = 0, i = 1; i < packet->niovecs; i++) {
 143         if (l + packet->wirevec[i].iov_len > offset) {
 144             return
 145                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 146                                  (offset - l)));
 147         }
 148         l += packet->wirevec[i].iov_len;
 149     }
 150
 151     return 0;
 152 }
 153
 154 /* Preconditions:
 155  *        all packet buffers (iov_base) are integral multiples of the word size.
 156  *        offset is an integral multiple of the word size.
 157  */
 158 afs_int32
 159 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 160 {
 161     unsigned int i;
 162     size_t l;
 163     for (l = 0, i = 1; i < packet->niovecs; i++) {
 164         if (l + packet->wirevec[i].iov_len > offset) {
 165             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 166                              (offset - l))) = data;
 167             return 0;
 168         }
 169         l += packet->wirevec[i].iov_len;
 170     }
 171
 172     return 0;
 173 }
 174
 175 /* Preconditions:
 176  *        all packet buffers (iov_base) are integral multiples of the
 177  *        word size.
 178  *        offset is an integral multiple of the word size.
 179  * Packet Invariants:
 180  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 181  */
 182 afs_int32
 183 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 184                   char *out)
 185 {
 186     unsigned int i, j, l, r;
 187     for (l = 0, i = 1; i < packet->niovecs; i++) {
 188         if (l + packet->wirevec[i].iov_len > offset) {
 189             break;
 190         }
 191         l += packet->wirevec[i].iov_len;
 192     }
 193
 194     /* i is the iovec which contains the first little bit of data in which we
 195      * are interested.  l is the total length of everything prior to this iovec.
 196      * j is the number of bytes we can safely copy out of this iovec.
 197      * offset only applies to the first iovec.
 198      */
 199     r = resid;
 200     while ((r > 0) && (i < packet->niovecs)) {
 201         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 202         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 203         r -= j;
 204         out += j;
 205         l += packet->wirevec[i].iov_len;
 206         offset = l;
 207         i++;
 208     }
 209
 210     return (r ? (resid - r) : resid);
 211 }
 212
 213
 214 /* Preconditions:
 215  *        all packet buffers (iov_base) are integral multiples of the
 216  *        word size.
 217  *        offset is an integral multiple of the word size.
 218  */
 219 afs_int32
 220 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 221 {
 222     unsigned int i, j, l, o, r;
 223     char *b;
 224
 225     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 226         if (l + packet->wirevec[i].iov_len > o) {
 227             break;
 228         }
 229         l += packet->wirevec[i].iov_len;
 230     }
 231
 232     /* i is the iovec which contains the first little bit of data in which we
 233      * are interested.  l is the total length of everything prior to this iovec.
 234      * j is the number of bytes we can safely copy out of this iovec.
 235      * offset only applies to the first iovec.
 236      */
 237     r = resid;
 238     while ((r > 0) && (i <= RX_MAXWVECS)) {
 239         if (i >= packet->niovecs)
 240             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 241                 break;
 242
 243         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 244         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 245         memcpy(b, in, j);
 246         r -= j;
 247         in += j;
 248         l += packet->wirevec[i].iov_len;
 249         offset = l;
 250         i++;
 251     }
 252
 253     return (r ? (resid - r) : resid);
 254 }
 255
 256 int
 257 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 258 {
 259     struct rx_packet *p, *np;
 260
 261     num_pkts = AllocPacketBufs(class, num_pkts, q);
 262
 263     for (queue_Scan(q, p, np, rx_packet)) {
 264         RX_PACKET_IOV_FULLINIT(p);
 265     }
 266
 267     return num_pkts;
 268 }
 269
 270 #ifdef RX_ENABLE_TSFPQ
 271 static int
 272 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 273 {
 274     struct rx_ts_info_t * rx_ts_info;
 275     int transfer;
 276     SPLVAR;
 277
 278     RX_TS_INFO_GET(rx_ts_info);
 279
 280     transfer = num_pkts - rx_ts_info->_FPQ.len;
 281     if (transfer > 0) {
 282         NETPRI;
 283         MUTEX_ENTER(&rx_freePktQ_lock);
 284         transfer = MAX(transfer, rx_TSFPQGlobSize);
 285         if (transfer > rx_nFreePackets) {
 286             /* alloc enough for us, plus a few globs for other threads */
 287             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 288         }
 289
 290         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 291
 292         MUTEX_EXIT(&rx_freePktQ_lock);
 293         USERPRI;
 294     }
 295
 296     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 297
 298     return num_pkts;
 299 }
 300 #else /* RX_ENABLE_TSFPQ */
 301 static int
 302 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 303 {
 304     struct rx_packet *c;
 305     int i;
 306 #ifdef KERNEL
 307     int overq = 0;
 308 #endif
 309     SPLVAR;
 310
 311     NETPRI;
 312
 313     MUTEX_ENTER(&rx_freePktQ_lock);
 314
 315 #ifdef KERNEL
 316     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 317          num_pkts--, overq++);
 318
 319     if (overq) {
 320         rxi_NeedMorePackets = TRUE;
 321         if (rx_stats_active) {
 322             switch (class) {
 323             case RX_PACKET_CLASS_RECEIVE:
 324                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 325                 break;
 326             case RX_PACKET_CLASS_SEND:
 327                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 328                 break;
 329             case RX_PACKET_CLASS_SPECIAL:
 330                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 331                 break;
 332             case RX_PACKET_CLASS_RECV_CBUF:
 333                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 334                 break;
 335             case RX_PACKET_CLASS_SEND_CBUF:
 336                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 337                 break;
 338             }
 339         }
 340     }
 341
 342     if (rx_nFreePackets < num_pkts)
 343         num_pkts = rx_nFreePackets;
 344
 345     if (!num_pkts) {
 346         rxi_NeedMorePackets = TRUE;
 347         goto done;
 348     }
 349 #else /* KERNEL */
 350     if (rx_nFreePackets < num_pkts) {
 351         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 352     }
 353 #endif /* KERNEL */
 354
 355     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 356          i < num_pkts;
 357          i++, c=queue_Next(c, rx_packet)) {
 358         RX_FPQ_MARK_USED(c);
 359     }
 360
 361     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 362
 363     rx_nFreePackets -= num_pkts;
 364
 365 #ifdef KERNEL
 366   done:
 367 #endif
 368     MUTEX_EXIT(&rx_freePktQ_lock);
 369
 370     USERPRI;
 371     return num_pkts;
 372 }
 373 #endif /* RX_ENABLE_TSFPQ */
 374
 375 /*
 376  * Free a queue of packets, along with any continuation buffers attached to them
 377  */
 378 #ifdef RX_ENABLE_TSFPQ
 379 /* num_pkts=0 means queue length is unknown */
 380 int
 381 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 382 {
 383     struct rx_ts_info_t * rx_ts_info;
 384     struct rx_packet *c, *nc;
 385     SPLVAR;
 386
 387     osi_Assert(num_pkts >= 0);
 388     RX_TS_INFO_GET(rx_ts_info);
 389
 390     if (!num_pkts) {
 391         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 392             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 393         }
 394     } else {
 395         for (queue_Scan(q, c, nc, rx_packet)) {
 396             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 397         }
 398     }
 399
 400     if (num_pkts) {
 401         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 402     }
 403
 404     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 405         NETPRI;
 406         MUTEX_ENTER(&rx_freePktQ_lock);
 407
 408         RX_TS_FPQ_LTOG(rx_ts_info);
 409
 410         /* Wakeup anyone waiting for packets */
 411         rxi_PacketsUnWait();
 412
 413         MUTEX_EXIT(&rx_freePktQ_lock);
 414         USERPRI;
 415     }
 416
 417     return num_pkts;
 418 }
 419 #else /* RX_ENABLE_TSFPQ */
 420 /* num_pkts=0 means queue length is unknown */
 421 int
 422 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 423 {
 424     struct rx_queue cbs;
 425     struct rx_packet *p, *np;
 426     int qlen = 0;
 427     SPLVAR;
 428
 429     osi_Assert(num_pkts >= 0);
 430     queue_Init(&cbs);
 431
 432     if (!num_pkts) {
 433         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 434             if (p->niovecs > 2) {
 435                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 436             }
 437             RX_FPQ_MARK_FREE(p);
 438         }
 439         if (!num_pkts)
 440             return 0;
 441     } else {
 442         for (queue_Scan(q, p, np, rx_packet)) {
 443             if (p->niovecs > 2) {
 444                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 445             }
 446             RX_FPQ_MARK_FREE(p);
 447         }
 448     }
 449
 450     if (qlen) {
 451         queue_SpliceAppend(q, &cbs);
 452         qlen += num_pkts;
 453     } else
 454         qlen = num_pkts;
 455
 456     NETPRI;
 457     MUTEX_ENTER(&rx_freePktQ_lock);
 458
 459     queue_SpliceAppend(&rx_freePacketQueue, q);
 460     rx_nFreePackets += qlen;
 461
 462     /* Wakeup anyone waiting for packets */
 463     rxi_PacketsUnWait();
 464
 465     MUTEX_EXIT(&rx_freePktQ_lock);
 466     USERPRI;
 467
 468     return num_pkts;
 469 }
 470 #endif /* RX_ENABLE_TSFPQ */
 471
 472 /* this one is kind of awful.
 473  * In rxkad, the packet has been all shortened, and everything, ready for
 474  * sending.  All of a sudden, we discover we need some of that space back.
 475  * This isn't terribly general, because it knows that the packets are only
 476  * rounded up to the EBS (userdata + security header).
 477  */
 478 int
 479 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 480 {
 481     int i;
 482     i = p->niovecs - 1;
 483     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 484         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 485             p->wirevec[i].iov_len += nb;
 486             return 0;
 487         }
 488     } else {
 489         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 490             p->wirevec[i].iov_len += nb;
 491             return 0;
 492         }
 493     }
 494
 495     return 0;
 496 }
 497
 498 /* get sufficient space to store nb bytes of data (or more), and hook
 499  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 500  * returns the number of bytes >0 which it failed to come up with.
 501  * Don't need to worry about locking on packet, since only
 502  * one thread can manipulate one at a time. Locking on continution
 503  * packets is handled by AllocPacketBufs */
 504 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 505 int
 506 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 507 {
 508     int i, nv;
 509     struct rx_queue q;
 510     struct rx_packet *cb, *ncb;
 511
 512     /* compute the number of cbuf's we need */
 513     nv = nb / RX_CBUFFERSIZE;
 514     if ((nv * RX_CBUFFERSIZE) < nb)
 515         nv++;
 516     if ((nv + p->niovecs) > RX_MAXWVECS)
 517         nv = RX_MAXWVECS - p->niovecs;
 518     if (nv < 1)
 519         return nb;
 520
 521     /* allocate buffers */
 522     queue_Init(&q);
 523     nv = AllocPacketBufs(class, nv, &q);
 524
 525     /* setup packet iovs */
 526     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 527         queue_Remove(cb);
 528         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 529         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 530     }
 531
 532     nb -= (nv * RX_CBUFFERSIZE);
 533     p->length += (nv * RX_CBUFFERSIZE);
 534     p->niovecs += nv;
 535
 536     return nb;
 537 }
 538
 539 /* Add more packet buffers */
 540 #ifdef RX_ENABLE_TSFPQ
 541 void
 542 rxi_MorePackets(int apackets)
 543 {
 544     struct rx_packet *p, *e;
 545     struct rx_ts_info_t * rx_ts_info;
 546     int getme;
 547     SPLVAR;
 548
 549     getme = apackets * sizeof(struct rx_packet);
 550     p = (struct rx_packet *)osi_Alloc(getme);
 551     osi_Assert(p);
 552
 553     PIN(p, getme);              /* XXXXX */
 554     memset(p, 0, getme);
 555     RX_TS_INFO_GET(rx_ts_info);
 556
 557     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 558     /* TSFPQ patch also needs to keep track of total packets */
 559
 560     MUTEX_ENTER(&rx_packets_mutex);
 561     rx_nPackets += apackets;
 562     RX_TS_FPQ_COMPUTE_LIMITS;
 563     MUTEX_EXIT(&rx_packets_mutex);
 564
 565     for (e = p + apackets; p < e; p++) {
 566         RX_PACKET_IOV_INIT(p);
 567         p->niovecs = 2;
 568
 569         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 570
 571         NETPRI;
 572         MUTEX_ENTER(&rx_freePktQ_lock);
 573 #ifdef RXDEBUG_PACKET
 574         p->packetId = rx_packet_id++;
 575         p->allNextp = rx_mallocedP;
 576 #endif /* RXDEBUG_PACKET */
 577         rx_mallocedP = p;
 578         MUTEX_EXIT(&rx_freePktQ_lock);
 579         USERPRI;
 580     }
 581     rx_ts_info->_FPQ.delta += apackets;
 582
 583     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 584         NETPRI;
 585         MUTEX_ENTER(&rx_freePktQ_lock);
 586
 587         RX_TS_FPQ_LTOG(rx_ts_info);
 588         rxi_NeedMorePackets = FALSE;
 589         rxi_PacketsUnWait();
 590
 591         MUTEX_EXIT(&rx_freePktQ_lock);
 592         USERPRI;
 593     }
 594 }
 595 #else /* RX_ENABLE_TSFPQ */
 596 void
 597 rxi_MorePackets(int apackets)
 598 {
 599     struct rx_packet *p, *e;
 600     int getme;
 601     SPLVAR;
 602
 603     getme = apackets * sizeof(struct rx_packet);
 604     p = (struct rx_packet *)osi_Alloc(getme);
 605     osi_Assert(p);
 606
 607     PIN(p, getme);              /* XXXXX */
 608     memset(p, 0, getme);
 609     NETPRI;
 610     MUTEX_ENTER(&rx_freePktQ_lock);
 611
 612     for (e = p + apackets; p < e; p++) {
 613         RX_PACKET_IOV_INIT(p);
 614 #ifdef RX_TRACK_PACKETS
 615         p->flags |= RX_PKTFLAG_FREE;
 616 #endif
 617         p->niovecs = 2;
 618
 619         queue_Append(&rx_freePacketQueue, p);
 620 #ifdef RXDEBUG_PACKET
 621         p->packetId = rx_packet_id++;
 622         p->allNextp = rx_mallocedP;
 623 #endif /* RXDEBUG_PACKET */
 624         rx_mallocedP = p;
 625     }
 626
 627     rx_nPackets += apackets;
 628     rx_nFreePackets += apackets;
 629     rxi_NeedMorePackets = FALSE;
 630     rxi_PacketsUnWait();
 631
 632     MUTEX_EXIT(&rx_freePktQ_lock);
 633     USERPRI;
 634 }
 635 #endif /* RX_ENABLE_TSFPQ */
 636
 637 #ifdef RX_ENABLE_TSFPQ
 638 void
 639 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 640 {
 641     struct rx_packet *p, *e;
 642     struct rx_ts_info_t * rx_ts_info;
 643     int getme;
 644     SPLVAR;
 645
 646     getme = apackets * sizeof(struct rx_packet);
 647     p = (struct rx_packet *)osi_Alloc(getme);
 648
 649     PIN(p, getme);              /* XXXXX */
 650     memset(p, 0, getme);
 651     RX_TS_INFO_GET(rx_ts_info);
 652
 653     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 654     /* TSFPQ patch also needs to keep track of total packets */
 655     MUTEX_ENTER(&rx_packets_mutex);
 656     rx_nPackets += apackets;
 657     RX_TS_FPQ_COMPUTE_LIMITS;
 658     MUTEX_EXIT(&rx_packets_mutex);
 659
 660     for (e = p + apackets; p < e; p++) {
 661         RX_PACKET_IOV_INIT(p);
 662         p->niovecs = 2;
 663         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 664
 665         NETPRI;
 666         MUTEX_ENTER(&rx_freePktQ_lock);
 667 #ifdef RXDEBUG_PACKET
 668         p->packetId = rx_packet_id++;
 669         p->allNextp = rx_mallocedP;
 670 #endif /* RXDEBUG_PACKET */
 671         rx_mallocedP = p;
 672         MUTEX_EXIT(&rx_freePktQ_lock);
 673         USERPRI;
 674     }
 675     rx_ts_info->_FPQ.delta += apackets;
 676
 677     if (flush_global &&
 678         (num_keep_local < apackets)) {
 679         NETPRI;
 680         MUTEX_ENTER(&rx_freePktQ_lock);
 681
 682         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 683         rxi_NeedMorePackets = FALSE;
 684         rxi_PacketsUnWait();
 685
 686         MUTEX_EXIT(&rx_freePktQ_lock);
 687         USERPRI;
 688     }
 689 }
 690 #endif /* RX_ENABLE_TSFPQ */
 691
 692 #ifndef KERNEL
 693 /* Add more packet buffers */
 694 void
 695 rxi_MorePacketsNoLock(int apackets)
 696 {
 697 #ifdef RX_ENABLE_TSFPQ
 698     struct rx_ts_info_t * rx_ts_info;
 699 #endif /* RX_ENABLE_TSFPQ */
 700     struct rx_packet *p, *e;
 701     int getme;
 702
 703     /* allocate enough packets that 1/4 of the packets will be able
 704      * to hold maximal amounts of data */
 705     apackets += (apackets / 4)
 706         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 707     do {
 708         getme = apackets * sizeof(struct rx_packet);
 709         p = (struct rx_packet *)osi_Alloc(getme);
 710         if (p == NULL) {
 711             apackets -= apackets / 4;
 712             osi_Assert(apackets > 0);
 713         }
 714     } while(p == NULL);
 715     memset(p, 0, getme);
 716
 717 #ifdef RX_ENABLE_TSFPQ
 718     RX_TS_INFO_GET(rx_ts_info);
 719     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 720 #endif /* RX_ENABLE_TSFPQ */
 721
 722     for (e = p + apackets; p < e; p++) {
 723         RX_PACKET_IOV_INIT(p);
 724 #ifdef RX_TRACK_PACKETS
 725         p->flags |= RX_PKTFLAG_FREE;
 726 #endif
 727         p->niovecs = 2;
 728
 729         queue_Append(&rx_freePacketQueue, p);
 730 #ifdef RXDEBUG_PACKET
 731         p->packetId = rx_packet_id++;
 732         p->allNextp = rx_mallocedP;
 733 #endif /* RXDEBUG_PACKET */
 734         rx_mallocedP = p;
 735     }
 736
 737     rx_nFreePackets += apackets;
 738     MUTEX_ENTER(&rx_packets_mutex);
 739     rx_nPackets += apackets;
 740 #ifdef RX_ENABLE_TSFPQ
 741     RX_TS_FPQ_COMPUTE_LIMITS;
 742 #endif /* RX_ENABLE_TSFPQ */
 743     MUTEX_EXIT(&rx_packets_mutex);
 744     rxi_NeedMorePackets = FALSE;
 745     rxi_PacketsUnWait();
 746 }
 747 #endif /* !KERNEL */
 748
 749 void
 750 rxi_FreeAllPackets(void)
 751 {
 752     /* must be called at proper interrupt level, etcetera */
 753     /* MTUXXX need to free all Packets */
 754     osi_Free(rx_mallocedP,
 755              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 756     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 757 }
 758
 759 #ifdef RX_ENABLE_TSFPQ
 760 void
 761 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 762 {
 763     struct rx_ts_info_t * rx_ts_info;
 764     int xfer;
 765     SPLVAR;
 766
 767     RX_TS_INFO_GET(rx_ts_info);
 768
 769     if (num_keep_local != rx_ts_info->_FPQ.len) {
 770         NETPRI;
 771         MUTEX_ENTER(&rx_freePktQ_lock);
 772         if (num_keep_local < rx_ts_info->_FPQ.len) {
 773             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 774             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 775             rxi_PacketsUnWait();
 776         } else {
 777             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 778             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 779                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 780             if (rx_nFreePackets < xfer) {
 781                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 782             }
 783             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 784         }
 785         MUTEX_EXIT(&rx_freePktQ_lock);
 786         USERPRI;
 787     }
 788 }
 789
 790 void
 791 rxi_FlushLocalPacketsTSFPQ(void)
 792 {
 793     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 794 }
 795 #endif /* RX_ENABLE_TSFPQ */
 796
 797 /* Allocate more packets iff we need more continuation buffers */
 798 /* In kernel, can't page in memory with interrupts disabled, so we
 799  * don't use the event mechanism. */
 800 void
 801 rx_CheckPackets(void)
 802 {
 803     if (rxi_NeedMorePackets) {
 804         rxi_MorePackets(rx_maxSendWindow);
 805     }
 806 }
 807
 808 /* In the packet freeing routine below, the assumption is that
 809    we want all of the packets to be used equally frequently, so that we
 810    don't get packet buffers paging out.  It would be just as valid to
 811    assume that we DO want them to page out if not many are being used.
 812    In any event, we assume the former, and append the packets to the end
 813    of the free list.  */
 814 /* This explanation is bogus.  The free list doesn't remain in any kind of
 815    useful order for long: the packets in use get pretty much randomly scattered
 816    across all the pages.  In order to permit unused {packets,bufs} to page out, they
 817    must be stored so that packets which are adjacent in memory are adjacent in the
 818    free list.  An array springs rapidly to mind.
 819    */
 820
 821 /* Actually free the packet p. */
 822 #ifdef RX_ENABLE_TSFPQ
 823 void
 824 rxi_FreePacketNoLock(struct rx_packet *p)
 825 {
 826     struct rx_ts_info_t * rx_ts_info;
 827     dpf(("Free %"AFS_PTR_FMT"\n", p));
 828
 829     RX_TS_INFO_GET(rx_ts_info);
 830     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 831     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 832         RX_TS_FPQ_LTOG(rx_ts_info);
 833     }
 834 }
 835 #else /* RX_ENABLE_TSFPQ */
 836 void
 837 rxi_FreePacketNoLock(struct rx_packet *p)
 838 {
 839     dpf(("Free %"AFS_PTR_FMT"\n", p));
 840
 841     RX_FPQ_MARK_FREE(p);
 842     rx_nFreePackets++;
 843     queue_Append(&rx_freePacketQueue, p);
 844 }
 845 #endif /* RX_ENABLE_TSFPQ */
 846
 847 #ifdef RX_ENABLE_TSFPQ
 848 void
 849 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 850 {
 851     struct rx_ts_info_t * rx_ts_info;
 852     dpf(("Free %"AFS_PTR_FMT"\n", p));
 853
 854     RX_TS_INFO_GET(rx_ts_info);
 855     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 856
 857     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 858         NETPRI;
 859         MUTEX_ENTER(&rx_freePktQ_lock);
 860
 861         RX_TS_FPQ_LTOG(rx_ts_info);
 862
 863         /* Wakeup anyone waiting for packets */
 864         rxi_PacketsUnWait();
 865
 866         MUTEX_EXIT(&rx_freePktQ_lock);
 867         USERPRI;
 868     }
 869 }
 870 #endif /* RX_ENABLE_TSFPQ */
 871
 872 /*
 873  * free continuation buffers off a packet into a queue
 874  *
 875  * [IN] p      -- packet from which continuation buffers will be freed
 876  * [IN] first  -- iovec offset of first continuation buffer to free
 877  * [IN] q      -- queue into which continuation buffers will be chained
 878  *
 879  * returns:
 880  *   number of continuation buffers freed
 881  */
 882 #ifndef RX_ENABLE_TSFPQ
 883 static int
 884 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 885 {
 886     struct iovec *iov;
 887     struct rx_packet * cb;
 888     int count = 0;
 889
 890     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 891         iov = &p->wirevec[first];
 892         if (!iov->iov_base)
 893             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 894         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 895         RX_FPQ_MARK_FREE(cb);
 896         queue_Append(q, cb);
 897     }
 898     p->length = 0;
 899     p->niovecs = 0;
 900
 901     return count;
 902 }
 903 #endif
 904
 905 /*
 906  * free packet continuation buffers into the global free packet pool
 907  *
 908  * [IN] p      -- packet from which to free continuation buffers
 909  * [IN] first  -- iovec offset of first continuation buffer to free
 910  *
 911  * returns:
 912  *   zero always
 913  */
 914 int
 915 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 916 {
 917     struct iovec *iov;
 918
 919     for (first = MAX(2, first); first < p->niovecs; first++) {
 920         iov = &p->wirevec[first];
 921         if (!iov->iov_base)
 922             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 923         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 924     }
 925     p->length = 0;
 926     p->niovecs = 0;
 927
 928     return 0;
 929 }
 930
 931 #ifdef RX_ENABLE_TSFPQ
 932 /*
 933  * free packet continuation buffers into the thread-local free pool
 934  *
 935  * [IN] p             -- packet from which continuation buffers will be freed
 936  * [IN] first         -- iovec offset of first continuation buffer to free
 937  *                       any value less than 2, the min number of iovecs,
 938  *                       is treated as if it is 2.
 939  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 940  *                       global free pool before returning
 941  *
 942  * returns:
 943  *   zero always
 944  */
 945 static int
 946 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 947 {
 948     struct iovec *iov;
 949     struct rx_ts_info_t * rx_ts_info;
 950
 951     RX_TS_INFO_GET(rx_ts_info);
 952
 953     for (first = MAX(2, first); first < p->niovecs; first++) {
 954         iov = &p->wirevec[first];
 955         if (!iov->iov_base)
 956             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 957         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 958     }
 959     p->length = 0;
 960     p->niovecs = 0;
 961
 962     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 963         NETPRI;
 964         MUTEX_ENTER(&rx_freePktQ_lock);
 965
 966         RX_TS_FPQ_LTOG(rx_ts_info);
 967
 968         /* Wakeup anyone waiting for packets */
 969         rxi_PacketsUnWait();
 970
 971         MUTEX_EXIT(&rx_freePktQ_lock);
 972         USERPRI;
 973     }
 974     return 0;
 975 }
 976 #endif /* RX_ENABLE_TSFPQ */
 977
 978 int rxi_nBadIovecs = 0;
 979
 980 /* rxi_RestoreDataBufs
 981  *
 982  * Restore the correct sizes to the iovecs. Called when reusing a packet
 983  * for reading off the wire.
 984  */
 985 void
 986 rxi_RestoreDataBufs(struct rx_packet *p)
 987 {
 988     unsigned int i;
 989     struct iovec *iov = &p->wirevec[2];
 990
 991     RX_PACKET_IOV_INIT(p);
 992
 993     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 994         if (!iov->iov_base) {
 995             rxi_nBadIovecs++;
 996             p->niovecs = i;
 997             break;
 998         }
 999         iov->iov_len = RX_CBUFFERSIZE;
1000     }
1001 }
1002
1003 #ifdef RX_ENABLE_TSFPQ
1004 int
1005 rxi_TrimDataBufs(struct rx_packet *p, int first)
1006 {
1007     int length;
1008     struct iovec *iov, *end;
1009     struct rx_ts_info_t * rx_ts_info;
1010     SPLVAR;
1011
1012     if (first != 1)
1013         osi_Panic("TrimDataBufs 1: first must be 1");
1014
1015     /* Skip over continuation buffers containing message data */
1016     iov = &p->wirevec[2];
1017     end = iov + (p->niovecs - 2);
1018     length = p->length - p->wirevec[1].iov_len;
1019     for (; iov < end && length > 0; iov++) {
1020         if (!iov->iov_base)
1021             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1022         length -= iov->iov_len;
1023     }
1024
1025     /* iov now points to the first empty data buffer. */
1026     if (iov >= end)
1027         return 0;
1028
1029     RX_TS_INFO_GET(rx_ts_info);
1030     for (; iov < end; iov++) {
1031         if (!iov->iov_base)
1032             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1033         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1034         p->niovecs--;
1035     }
1036     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1037         NETPRI;
1038         MUTEX_ENTER(&rx_freePktQ_lock);
1039
1040         RX_TS_FPQ_LTOG(rx_ts_info);
1041         rxi_PacketsUnWait();
1042
1043         MUTEX_EXIT(&rx_freePktQ_lock);
1044         USERPRI;
1045     }
1046
1047     return 0;
1048 }
1049 #else /* RX_ENABLE_TSFPQ */
1050 int
1051 rxi_TrimDataBufs(struct rx_packet *p, int first)
1052 {
1053     int length;
1054     struct iovec *iov, *end;
1055     SPLVAR;
1056
1057     if (first != 1)
1058         osi_Panic("TrimDataBufs 1: first must be 1");
1059
1060     /* Skip over continuation buffers containing message data */
1061     iov = &p->wirevec[2];
1062     end = iov + (p->niovecs - 2);
1063     length = p->length - p->wirevec[1].iov_len;
1064     for (; iov < end && length > 0; iov++) {
1065         if (!iov->iov_base)
1066             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1067         length -= iov->iov_len;
1068     }
1069
1070     /* iov now points to the first empty data buffer. */
1071     if (iov >= end)
1072         return 0;
1073
1074     NETPRI;
1075     MUTEX_ENTER(&rx_freePktQ_lock);
1076
1077     for (; iov < end; iov++) {
1078         if (!iov->iov_base)
1079             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1080         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1081         p->niovecs--;
1082     }
1083     rxi_PacketsUnWait();
1084
1085     MUTEX_EXIT(&rx_freePktQ_lock);
1086     USERPRI;
1087
1088     return 0;
1089 }
1090 #endif /* RX_ENABLE_TSFPQ */
1091
1092 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1093  * remove it yourself first if you call this routine. */
1094 #ifdef RX_ENABLE_TSFPQ
1095 void
1096 rxi_FreePacket(struct rx_packet *p)
1097 {
1098     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1099     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1100 }
1101 #else /* RX_ENABLE_TSFPQ */
1102 void
1103 rxi_FreePacket(struct rx_packet *p)
1104 {
1105     SPLVAR;
1106
1107     NETPRI;
1108     MUTEX_ENTER(&rx_freePktQ_lock);
1109
1110     rxi_FreeDataBufsNoLock(p, 2);
1111     rxi_FreePacketNoLock(p);
1112     /* Wakeup anyone waiting for packets */
1113     rxi_PacketsUnWait();
1114
1115     MUTEX_EXIT(&rx_freePktQ_lock);
1116     USERPRI;
1117 }
1118 #endif /* RX_ENABLE_TSFPQ */
1119
1120 /* rxi_AllocPacket sets up p->length so it reflects the number of
1121  * bytes in the packet at this point, **not including** the header.
1122  * The header is absolutely necessary, besides, this is the way the
1123  * length field is usually used */
1124 #ifdef RX_ENABLE_TSFPQ
1125 struct rx_packet *
1126 rxi_AllocPacketNoLock(int class)
1127 {
1128     struct rx_packet *p;
1129     struct rx_ts_info_t * rx_ts_info;
1130
1131     RX_TS_INFO_GET(rx_ts_info);
1132
1133 #ifdef KERNEL
1134     if (rxi_OverQuota(class)) {
1135         rxi_NeedMorePackets = TRUE;
1136         if (rx_stats_active) {
1137             switch (class) {
1138             case RX_PACKET_CLASS_RECEIVE:
1139                 rx_atomic_inc(rx_stats.receivePktAllocFailures);
1140                 break;
1141             case RX_PACKET_CLASS_SEND:
1142                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1143                 break;
1144             case RX_PACKET_CLASS_SPECIAL:
1145                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1146                 break;
1147             case RX_PACKET_CLASS_RECV_CBUF:
1148                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1149                 break;
1150             case RX_PACKET_CLASS_SEND_CBUF:
1151                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1152                 break;
1153             }
1154         }
1155         return (struct rx_packet *)0;
1156     }
1157 #endif /* KERNEL */
1158
1159     if (rx_stats_active)
1160         rx_atomic_inc(&rx_stats.packetRequests);
1161     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1162
1163 #ifdef KERNEL
1164         if (queue_IsEmpty(&rx_freePacketQueue))
1165             osi_Panic("rxi_AllocPacket error");
1166 #else /* KERNEL */
1167         if (queue_IsEmpty(&rx_freePacketQueue))
1168             rxi_MorePacketsNoLock(rx_maxSendWindow);
1169 #endif /* KERNEL */
1170
1171
1172         RX_TS_FPQ_GTOL(rx_ts_info);
1173     }
1174
1175     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1176
1177     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1178
1179
1180     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1181      * order to truncate outbound packets.  In the near future, may need
1182      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1183      */
1184     RX_PACKET_IOV_FULLINIT(p);
1185     return p;
1186 }
1187 #else /* RX_ENABLE_TSFPQ */
1188 struct rx_packet *
1189 rxi_AllocPacketNoLock(int class)
1190 {
1191     struct rx_packet *p;
1192
1193 #ifdef KERNEL
1194     if (rxi_OverQuota(class)) {
1195         rxi_NeedMorePackets = TRUE;
1196         if (rx_stats_active) {
1197             switch (class) {
1198             case RX_PACKET_CLASS_RECEIVE:
1199                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1200                 break;
1201             case RX_PACKET_CLASS_SEND:
1202                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1203                 break;
1204             case RX_PACKET_CLASS_SPECIAL:
1205                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1206                 break;
1207             case RX_PACKET_CLASS_RECV_CBUF:
1208                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1209                 break;
1210             case RX_PACKET_CLASS_SEND_CBUF:
1211                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1212                 break;
1213             }
1214         }
1215         return (struct rx_packet *)0;
1216     }
1217 #endif /* KERNEL */
1218
1219     if (rx_stats_active)
1220         rx_atomic_inc(&rx_stats.packetRequests);
1221
1222 #ifdef KERNEL
1223     if (queue_IsEmpty(&rx_freePacketQueue))
1224         osi_Panic("rxi_AllocPacket error");
1225 #else /* KERNEL */
1226     if (queue_IsEmpty(&rx_freePacketQueue))
1227         rxi_MorePacketsNoLock(rx_maxSendWindow);
1228 #endif /* KERNEL */
1229
1230     rx_nFreePackets--;
1231     p = queue_First(&rx_freePacketQueue, rx_packet);
1232     queue_Remove(p);
1233     RX_FPQ_MARK_USED(p);
1234
1235     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1236
1237
1238     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1239      * order to truncate outbound packets.  In the near future, may need
1240      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1241      */
1242     RX_PACKET_IOV_FULLINIT(p);
1243     return p;
1244 }
1245 #endif /* RX_ENABLE_TSFPQ */
1246
1247 #ifdef RX_ENABLE_TSFPQ
1248 struct rx_packet *
1249 rxi_AllocPacketTSFPQ(int class, int pull_global)
1250 {
1251     struct rx_packet *p;
1252     struct rx_ts_info_t * rx_ts_info;
1253
1254     RX_TS_INFO_GET(rx_ts_info);
1255
1256     if (rx_stats_active)
1257         rx_atomic_inc(&rx_stats.packetRequests);
1258     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1259         MUTEX_ENTER(&rx_freePktQ_lock);
1260
1261         if (queue_IsEmpty(&rx_freePacketQueue))
1262             rxi_MorePacketsNoLock(rx_maxSendWindow);
1263
1264         RX_TS_FPQ_GTOL(rx_ts_info);
1265
1266         MUTEX_EXIT(&rx_freePktQ_lock);
1267     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1268         return NULL;
1269     }
1270
1271     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1272
1273     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1274
1275     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1276      * order to truncate outbound packets.  In the near future, may need
1277      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1278      */
1279     RX_PACKET_IOV_FULLINIT(p);
1280     return p;
1281 }
1282 #endif /* RX_ENABLE_TSFPQ */
1283
1284 #ifdef RX_ENABLE_TSFPQ
1285 struct rx_packet *
1286 rxi_AllocPacket(int class)
1287 {
1288     struct rx_packet *p;
1289
1290     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1291     return p;
1292 }
1293 #else /* RX_ENABLE_TSFPQ */
1294 struct rx_packet *
1295 rxi_AllocPacket(int class)
1296 {
1297     struct rx_packet *p;
1298
1299     MUTEX_ENTER(&rx_freePktQ_lock);
1300     p = rxi_AllocPacketNoLock(class);
1301     MUTEX_EXIT(&rx_freePktQ_lock);
1302     return p;
1303 }
1304 #endif /* RX_ENABLE_TSFPQ */
1305
1306 /* This guy comes up with as many buffers as it {takes,can get} given
1307  * the MTU for this call. It also sets the packet length before
1308  * returning.  caution: this is often called at NETPRI
1309  * Called with call locked.
1310  */
1311 struct rx_packet *
1312 rxi_AllocSendPacket(struct rx_call *call, int want)
1313 {
1314     struct rx_packet *p = (struct rx_packet *)0;
1315     int mud;
1316     unsigned delta;
1317
1318     SPLVAR;
1319     mud = call->MTU - RX_HEADER_SIZE;
1320     delta =
1321         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1322         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1323
1324 #ifdef RX_ENABLE_TSFPQ
1325     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1326         want += delta;
1327         want = MIN(want, mud);
1328
1329         if ((unsigned)want > p->length)
1330             (void)rxi_AllocDataBuf(p, (want - p->length),
1331                                    RX_PACKET_CLASS_SEND_CBUF);
1332
1333         if (p->length > mud)
1334             p->length = mud;
1335
1336         if (delta >= p->length) {
1337             rxi_FreePacket(p);
1338             p = NULL;
1339         } else {
1340             p->length -= delta;
1341         }
1342         return p;
1343     }
1344 #endif /* RX_ENABLE_TSFPQ */
1345
1346     while (!(call->error)) {
1347         MUTEX_ENTER(&rx_freePktQ_lock);
1348         /* if an error occurred, or we get the packet we want, we're done */
1349         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1350             MUTEX_EXIT(&rx_freePktQ_lock);
1351
1352             want += delta;
1353             want = MIN(want, mud);
1354
1355             if ((unsigned)want > p->length)
1356                 (void)rxi_AllocDataBuf(p, (want - p->length),
1357                                        RX_PACKET_CLASS_SEND_CBUF);
1358
1359             if (p->length > mud)
1360                 p->length = mud;
1361
1362             if (delta >= p->length) {
1363                 rxi_FreePacket(p);
1364                 p = NULL;
1365             } else {
1366                 p->length -= delta;
1367             }
1368             break;
1369         }
1370
1371         /* no error occurred, and we didn't get a packet, so we sleep.
1372          * At this point, we assume that packets will be returned
1373          * sooner or later, as packets are acknowledged, and so we
1374          * just wait.  */
1375         NETPRI;
1376         call->flags |= RX_CALL_WAIT_PACKETS;
1377         MUTEX_ENTER(&rx_refcnt_mutex);
1378         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1379         MUTEX_EXIT(&rx_refcnt_mutex);
1380         MUTEX_EXIT(&call->lock);
1381         rx_waitingForPackets = 1;
1382
1383 #ifdef  RX_ENABLE_LOCKS
1384         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1385 #else
1386         osi_rxSleep(&rx_waitingForPackets);
1387 #endif
1388         MUTEX_EXIT(&rx_freePktQ_lock);
1389         MUTEX_ENTER(&call->lock);
1390         MUTEX_ENTER(&rx_refcnt_mutex);
1391         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1392         MUTEX_EXIT(&rx_refcnt_mutex);
1393         call->flags &= ~RX_CALL_WAIT_PACKETS;
1394         USERPRI;
1395     }
1396
1397     return p;
1398 }
1399
1400 #ifndef KERNEL
1401 #ifdef AFS_NT40_ENV
1402 /* Windows does not use file descriptors. */
1403 #define CountFDs(amax) 0
1404 #else
/* Count the descriptors in [0, amax) for which fstat() succeeds, i.e.
 * the file descriptors currently in use. */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd = 0;
    int in_use = 0;

    while (fd < amax) {
        if (fstat(fd, &sb) == 0)
            in_use++;
        fd++;
    }
    return in_use;
}
1421 #endif /* AFS_NT40_ENV */
1422 #else /* KERNEL */
1423
1424 #define CountFDs(amax) amax
1425
1426 #endif /* KERNEL */
1427
1428 #if !defined(KERNEL) || defined(UKERNEL)
1429
1430 /* This function reads a single packet from the interface into the
1431  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1432  * (host,port) of the sender are stored in the supplied variables, and
1433  * the data length of the packet is stored in the packet structure.
1434  * The header is decoded. */
1435 int
1436 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1437                u_short * port)
1438 {
1439     struct sockaddr_in from;
1440     unsigned int nbytes;
1441     afs_int32 rlen;
1442     afs_uint32 tlen, savelen;
1443     struct msghdr msg;
1444     rx_computelen(p, tlen);
1445     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1446
1447     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1448     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1449                                  * it once in order to avoid races.  */
1450     tlen = rlen - tlen;
1451     if (tlen > 0) {
1452         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1453         if (tlen > 0) {
1454             tlen = rlen - tlen;
1455         } else
1456             tlen = rlen;
1457     } else
1458         tlen = rlen;
1459
1460     /* Extend the last iovec for padding, it's just to make sure that the
1461      * read doesn't return more data than we expect, and is done to get around
1462      * our problems caused by the lack of a length field in the rx header.
1463      * Use the extra buffer that follows the localdata in each packet
1464      * structure. */
1465     savelen = p->wirevec[p->niovecs - 1].iov_len;
1466     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1467
1468     memset(&msg, 0, sizeof(msg));
1469     msg.msg_name = (char *)&from;
1470     msg.msg_namelen = sizeof(struct sockaddr_in);
1471     msg.msg_iov = p->wirevec;
1472     msg.msg_iovlen = p->niovecs;
1473     nbytes = rxi_Recvmsg(socket, &msg, 0);
1474
1475     /* restore the vec to its correct state */
1476     p->wirevec[p->niovecs - 1].iov_len = savelen;
1477
1478     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1479     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1480         if (nbytes < 0 && errno == EWOULDBLOCK) {
1481             if (rx_stats_active)
1482                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1483         } else if (nbytes <= 0) {
1484             if (rx_stats_active) {
1485                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1486                 rx_stats.bogusHost = from.sin_addr.s_addr;
1487             }
1488             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1489                  ntohs(from.sin_port), nbytes));
1490         }
1491         return 0;
1492     }
1493 #ifdef RXDEBUG
1494     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1495                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1496         rxi_DecodePacketHeader(p);
1497
1498         *host = from.sin_addr.s_addr;
1499         *port = from.sin_port;
1500
1501         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1502               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1503               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1504               p->length));
1505 #ifdef RX_TRIMDATABUFS
1506         rxi_TrimDataBufs(p, 1);
1507 #endif
1508         return 0;
1509     }
1510 #endif
1511     else {
1512         /* Extract packet header. */
1513         rxi_DecodePacketHeader(p);
1514
1515         *host = from.sin_addr.s_addr;
1516         *port = from.sin_port;
1517         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1518             if (rx_stats_active) {
1519                 struct rx_peer *peer;
1520                 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1521                 /*
1522                  * Try to look up this peer structure.  If it doesn't exist,
1523                  * don't create a new one -
1524                  * we don't keep count of the bytes sent/received if a peer
1525                  * structure doesn't already exist.
1526                  *
1527                  * The peer/connection cleanup code assumes that there is 1 peer
1528                  * per connection.  If we actually created a peer structure here
1529                  * and this packet was an rxdebug packet, the peer structure would
1530                  * never be cleaned up.
1531                  */
1532                 peer = rxi_FindPeer(*host, *port, 0, 0);
1533                 /* Since this may not be associated with a connection,
1534                  * it may have no refCount, meaning we could race with
1535                  * ReapConnections
1536                  */
1537                 if (peer && (peer->refCount > 0)) {
1538                     MUTEX_ENTER(&peer->peer_lock);
1539                     hadd32(peer->bytesReceived, p->length);
1540                     MUTEX_EXIT(&peer->peer_lock);
1541                 }
1542             }
1543         }
1544
1545 #ifdef RX_TRIMDATABUFS
1546         /* Free any empty packet buffers at the end of this packet */
1547         rxi_TrimDataBufs(p, 1);
1548 #endif
1549         return 1;
1550     }
1551 }
1552
1553 #endif /* !KERNEL || UKERNEL */
1554
1555 /* This function splits off the first packet in a jumbo packet.
1556  * As of AFS 3.5, jumbograms contain more than one fixed size
1557  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1558  * last packet header. All packets (except the last) are padded to
1559  * fall on RX_CBUFFERSIZE boundaries.
1560  * HACK: We store the length of the first n-1 packets in the
1561  * last two pad bytes. */
1562
1563 struct rx_packet *
1564 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1565                      int first)
1566 {
1567     struct rx_packet *np;
1568     struct rx_jumboHeader *jp;
1569     int niov, i;
1570     struct iovec *iov;
1571     int length;
1572     afs_uint32 temp;
1573
1574     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1575      * bytes in length. All but the first packet are preceded by
1576      * an abbreviated four byte header. The length of the last packet
1577      * is calculated from the size of the jumbogram. */
1578     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1579
1580     if ((int)p->length < length) {
1581         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1582         return NULL;
1583     }
1584     niov = p->niovecs - 2;
1585     if (niov < 1) {
1586         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1587         return NULL;
1588     }
1589     iov = &p->wirevec[2];
1590     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1591
1592     /* Get a pointer to the abbreviated packet header */
1593     jp = (struct rx_jumboHeader *)
1594         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1595
1596     /* Set up the iovecs for the next packet */
1597     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1598     np->wirevec[0].iov_len = sizeof(struct rx_header);
1599     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1600     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1601     np->niovecs = niov + 1;
1602     for (i = 2, iov++; i <= niov; i++, iov++) {
1603         np->wirevec[i] = *iov;
1604     }
1605     np->length = p->length - length;
1606     p->length = RX_JUMBOBUFFERSIZE;
1607     p->niovecs = 2;
1608
1609     /* Convert the jumbo packet header to host byte order */
1610     temp = ntohl(*(afs_uint32 *) jp);
1611     jp->flags = (u_char) (temp >> 24);
1612     jp->cksum = (u_short) (temp);
1613
1614     /* Fill in the packet header */
1615     np->header = p->header;
1616     np->header.serial = p->header.serial + 1;
1617     np->header.seq = p->header.seq + 1;
1618     np->header.flags = jp->flags;
1619     np->header.spare = jp->cksum;
1620
1621     return np;
1622 }
1623
1624 #ifndef KERNEL
1625 /* Send a udp datagram */
1626 int
1627 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1628             int length, int istack)
1629 {
1630     struct msghdr msg;
1631         int ret;
1632
1633     memset(&msg, 0, sizeof(msg));
1634     msg.msg_iov = dvec;
1635     msg.msg_iovlen = nvecs;
1636     msg.msg_name = addr;
1637     msg.msg_namelen = sizeof(struct sockaddr_in);
1638
1639     ret = rxi_Sendmsg(socket, &msg, 0);
1640
1641     return ret;
1642 }
1643 #elif !defined(UKERNEL)
1644 /*
1645  * message receipt is done in rxk_input or rx_put.
1646  */
1647
1648 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1649 /*
1650  * Copy an mblock to the contiguous area pointed to by cp.
1651  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1652  * but it doesn't really.
1653  * Returns the number of bytes not transferred.
1654  * The message is NOT changed.
1655  */
1656 static int
1657 cpytoc(mblk_t * mp, int off, int len, char *cp)
1658 {
1659     int n;
1660
1661     for (; mp && len > 0; mp = mp->b_cont) {
1662         if (mp->b_datap->db_type != M_DATA) {
1663             return -1;
1664         }
1665         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1666         memcpy(cp, (char *)mp->b_rptr, n);
1667         cp += n;
1668         len -= n;
1669         mp->b_rptr += n;
1670     }
1671     return (len);
1672 }
1673
1674 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1675  * but it doesn't really.
1676  * This sucks, anyway, do it like m_cpy.... below
1677  */
1678 static int
1679 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1680            int niovs)
1681 {
1682     int m, n, o, t, i;
1683
1684     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1685         if (mp->b_datap->db_type != M_DATA) {
1686             return -1;
1687         }
1688         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1689         len -= n;
1690         while (n) {
1691             if (!t) {
1692                 o = 0;
1693                 i++;
1694                 t = iovs[i].iov_len;
1695             }
1696             m = MIN(n, t);
1697             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1698             mp->b_rptr += m;
1699             o += m;
1700             t -= m;
1701             n -= m;
1702         }
1703     }
1704     return (len);
1705 }
1706
1707 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1708 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1709 #else
1710 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1711 static int
1712 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1713 {
1714     caddr_t p1, p2;
1715     unsigned int l1, l2, i, t;
1716
1717     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1718         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1719
1720     while (off && m)
1721         if (m->m_len <= off) {
1722             off -= m->m_len;
1723             m = m->m_next;
1724             continue;
1725         } else
1726             break;
1727
1728     if (m == NULL)
1729         return len;
1730
1731     p1 = mtod(m, caddr_t) + off;
1732     l1 = m->m_len - off;
1733     i = 0;
1734     p2 = iovs[0].iov_base;
1735     l2 = iovs[0].iov_len;
1736
1737     while (len) {
1738         t = MIN(l1, MIN(l2, (unsigned int)len));
1739         memcpy(p2, p1, t);
1740         p1 += t;
1741         p2 += t;
1742         l1 -= t;
1743         l2 -= t;
1744         len -= t;
1745         if (!l1) {
1746             m = m->m_next;
1747             if (!m)
1748                 break;
1749             p1 = mtod(m, caddr_t);
1750             l1 = m->m_len;
1751         }
1752         if (!l2) {
1753             if (++i >= niovs)
1754                 break;
1755             p2 = iovs[i].iov_base;
1756             l2 = iovs[i].iov_len;
1757         }
1758
1759     }
1760
1761     return len;
1762 }
1763 #endif /* LINUX */
1764 #endif /* AFS_SUN5_ENV */
1765
1766 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1767 int
1768 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1769 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1770      mblk_t *amb;
1771 #else
1772      struct mbuf *amb;
1773 #endif
1774      void (*free) ();
1775      struct rx_packet *phandle;
1776      int hdr_len, data_len;
1777 {
1778     int code;
1779
1780     code =
1781         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1782                      phandle->niovecs);
1783     (*free) (amb);
1784
1785     return code;
1786 }
1787 #endif /* LINUX */
1788 #endif /*KERNEL && !UKERNEL */
1789
1790
1791 /* send a response to a debug packet */
1792
1793 struct rx_packet *
1794 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1795                        afs_uint32 ahost, short aport, int istack)
1796 {
1797     struct rx_debugIn tin;
1798     afs_int32 tl;
1799     struct rx_serverQueueEntry *np, *nqe;
1800
1801     /*
1802      * Only respond to client-initiated Rx debug packets,
1803      * and clear the client flag in the response.
1804      */
1805     if (ap->header.flags & RX_CLIENT_INITIATED) {
1806         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1807         rxi_EncodePacketHeader(ap);
1808     } else {
1809         return ap;
1810     }
1811
1812     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1813     /* all done with packet, now set length to the truth, so we can
1814      * reuse this packet */
1815     rx_computelen(ap, ap->length);
1816
1817     tin.type = ntohl(tin.type);
1818     tin.index = ntohl(tin.index);
1819     switch (tin.type) {
1820     case RX_DEBUGI_GETSTATS:{
1821             struct rx_debugStats tstat;
1822
1823             /* get basic stats */
1824             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1825             tstat.version = RX_DEBUGI_VERSION;
1826 #ifndef RX_ENABLE_LOCKS
1827             tstat.waitingForPackets = rx_waitingForPackets;
1828 #endif
1829             MUTEX_ENTER(&rx_serverPool_lock);
1830             tstat.nFreePackets = htonl(rx_nFreePackets);
1831             tstat.nPackets = htonl(rx_nPackets);
1832             tstat.callsExecuted = htonl(rxi_nCalls);
1833             tstat.packetReclaims = htonl(rx_packetReclaims);
1834             tstat.usedFDs = CountFDs(64);
1835             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1836             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1837             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1838                         tstat.idleThreads);
1839             MUTEX_EXIT(&rx_serverPool_lock);
1840             tstat.idleThreads = htonl(tstat.idleThreads);
1841             tl = sizeof(struct rx_debugStats) - ap->length;
1842             if (tl > 0)
1843                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1844
1845             if (tl <= 0) {
1846                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1847                                (char *)&tstat);
1848                 ap->length = sizeof(struct rx_debugStats);
1849                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1850                 rx_computelen(ap, ap->length);
1851             }
1852             break;
1853         }
1854
1855     case RX_DEBUGI_GETALLCONN:
1856     case RX_DEBUGI_GETCONN:{
1857             unsigned int i, j;
1858             struct rx_connection *tc;
1859             struct rx_call *tcall;
1860             struct rx_debugConn tconn;
1861             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1862
1863
1864             tl = sizeof(struct rx_debugConn) - ap->length;
1865             if (tl > 0)
1866                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1867             if (tl > 0)
1868                 return ap;
1869
1870             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1871             /* get N'th (maybe) "interesting" connection info */
1872             for (i = 0; i < rx_hashTableSize; i++) {
1873 #if !defined(KERNEL)
1874                 /* the time complexity of the algorithm used here
1875                  * exponentially increses with the number of connections.
1876                  */
1877 #ifdef AFS_PTHREAD_ENV
1878                 pthread_yield();
1879 #else
1880                 (void)IOMGR_Poll();
1881 #endif
1882 #endif
1883                 MUTEX_ENTER(&rx_connHashTable_lock);
1884                 /* We might be slightly out of step since we are not
1885                  * locking each call, but this is only debugging output.
1886                  */
1887                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1888                     if ((all || rxi_IsConnInteresting(tc))
1889                         && tin.index-- <= 0) {
1890                         tconn.host = tc->peer->host;
1891                         tconn.port = tc->peer->port;
1892                         tconn.cid = htonl(tc->cid);
1893                         tconn.epoch = htonl(tc->epoch);
1894                         tconn.serial = htonl(tc->serial);
1895                         for (j = 0; j < RX_MAXCALLS; j++) {
1896                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1897                             if ((tcall = tc->call[j])) {
1898                                 tconn.callState[j] = tcall->state;
1899                                 tconn.callMode[j] = tcall->mode;
1900                                 tconn.callFlags[j] = tcall->flags;
1901                                 if (queue_IsNotEmpty(&tcall->rq))
1902                                     tconn.callOther[j] |= RX_OTHER_IN;
1903                                 if (queue_IsNotEmpty(&tcall->tq))
1904                                     tconn.callOther[j] |= RX_OTHER_OUT;
1905                             } else
1906                                 tconn.callState[j] = RX_STATE_NOTINIT;
1907                         }
1908
1909                         tconn.natMTU = htonl(tc->peer->natMTU);
1910                         tconn.error = htonl(tc->error);
1911                         tconn.flags = tc->flags;
1912                         tconn.type = tc->type;
1913                         tconn.securityIndex = tc->securityIndex;
1914                         if (tc->securityObject) {
1915                             RXS_GetStats(tc->securityObject, tc,
1916                                          &tconn.secStats);
1917 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1918 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1919                             DOHTONL(flags);
1920                             DOHTONL(expires);
1921                             DOHTONL(packetsReceived);
1922                             DOHTONL(packetsSent);
1923                             DOHTONL(bytesReceived);
1924                             DOHTONL(bytesSent);
1925                             for (i = 0;
1926                                  i <
1927                                  sizeof(tconn.secStats.spares) /
1928                                  sizeof(short); i++)
1929                                 DOHTONS(spares[i]);
1930                             for (i = 0;
1931                                  i <
1932                                  sizeof(tconn.secStats.sparel) /
1933                                  sizeof(afs_int32); i++)
1934                                 DOHTONL(sparel[i]);
1935                         }
1936
1937                         MUTEX_EXIT(&rx_connHashTable_lock);
1938                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1939                                        (char *)&tconn);
1940                         tl = ap->length;
1941                         ap->length = sizeof(struct rx_debugConn);
1942                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1943                                             istack);
1944                         ap->length = tl;
1945                         return ap;
1946                     }
1947                 }
1948                 MUTEX_EXIT(&rx_connHashTable_lock);
1949             }
1950             /* if we make it here, there are no interesting packets */
1951             tconn.cid = htonl(0xffffffff);      /* means end */
1952             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1953                            (char *)&tconn);
1954             tl = ap->length;
1955             ap->length = sizeof(struct rx_debugConn);
1956             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1957             ap->length = tl;
1958             break;
1959         }
1960
1961         /*
1962          * Pass back all the peer structures we have available
1963          */
1964
1965     case RX_DEBUGI_GETPEER:{
1966             unsigned int i;
1967             struct rx_peer *tp;
1968             struct rx_debugPeer tpeer;
1969
1970
1971             tl = sizeof(struct rx_debugPeer) - ap->length;
1972             if (tl > 0)
1973                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1974             if (tl > 0)
1975                 return ap;
1976
1977             memset(&tpeer, 0, sizeof(tpeer));
1978             for (i = 0; i < rx_hashTableSize; i++) {
1979 #if !defined(KERNEL)
                /* the time complexity of the algorithm used here
                 * exponentially increases with the number of peers.
                 *
                 * Yielding after processing each hash table entry
                 * and dropping rx_peerHashTable_lock
                 * also increases the risk that we will miss a new
                 * entry - but we are willing to live with this
                 * limitation since this is meant for debugging only
                 */
1989 #ifdef AFS_PTHREAD_ENV
1990                 pthread_yield();
1991 #else
1992                 (void)IOMGR_Poll();
1993 #endif
1994 #endif
1995                 MUTEX_ENTER(&rx_peerHashTable_lock);
1996                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1997                     if (tin.index-- <= 0) {
1998                         tp->refCount++;
1999                         MUTEX_EXIT(&rx_peerHashTable_lock);
2000
2001                         MUTEX_ENTER(&tp->peer_lock);
2002                         tpeer.host = tp->host;
2003                         tpeer.port = tp->port;
2004                         tpeer.ifMTU = htons(tp->ifMTU);
2005                         tpeer.idleWhen = htonl(tp->idleWhen);
2006                         tpeer.refCount = htons(tp->refCount);
2007                         tpeer.burstSize = tp->burstSize;
2008                         tpeer.burst = tp->burst;
2009                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
2010                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
2011                         tpeer.rtt = htonl(tp->rtt);
2012                         tpeer.rtt_dev = htonl(tp->rtt_dev);
2013                         tpeer.timeout.sec = htonl(tp->timeout.sec);
2014                         tpeer.timeout.usec = htonl(tp->timeout.usec);
2015                         tpeer.nSent = htonl(tp->nSent);
2016                         tpeer.reSends = htonl(tp->reSends);
2017                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
2018                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
2019                         tpeer.rateFlag = htonl(tp->rateFlag);
2020                         tpeer.natMTU = htons(tp->natMTU);
2021                         tpeer.maxMTU = htons(tp->maxMTU);
2022                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2023                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2024                         tpeer.MTU = htons(tp->MTU);
2025                         tpeer.cwind = htons(tp->cwind);
2026                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2027                         tpeer.congestSeq = htons(tp->congestSeq);
2028                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2029                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2030                         tpeer.bytesReceived.high =
2031                             htonl(tp->bytesReceived.high);
2032                         tpeer.bytesReceived.low =
2033                             htonl(tp->bytesReceived.low);
2034                         MUTEX_EXIT(&tp->peer_lock);
2035
2036                         MUTEX_ENTER(&rx_peerHashTable_lock);
2037                         tp->refCount--;
2038                         MUTEX_EXIT(&rx_peerHashTable_lock);
2039
2040                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2041                                        (char *)&tpeer);
2042                         tl = ap->length;
2043                         ap->length = sizeof(struct rx_debugPeer);
2044                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2045                                             istack);
2046                         ap->length = tl;
2047                         return ap;
2048                     }
2049                 }
2050                 MUTEX_EXIT(&rx_peerHashTable_lock);
2051             }
2052             /* if we make it here, there are no interesting packets */
2053             tpeer.host = htonl(0xffffffff);     /* means end */
2054             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2055                            (char *)&tpeer);
2056             tl = ap->length;
2057             ap->length = sizeof(struct rx_debugPeer);
2058             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2059             ap->length = tl;
2060             break;
2061         }
2062
2063     case RX_DEBUGI_RXSTATS:{
2064             int i;
2065             afs_int32 *s;
2066
2067             tl = sizeof(rx_stats) - ap->length;
2068             if (tl > 0)
2069                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2070             if (tl > 0)
2071                 return ap;
2072
2073             /* Since its all int32s convert to network order with a loop. */
2074         if (rx_stats_active)
2075             MUTEX_ENTER(&rx_stats_mutex);
2076             s = (afs_int32 *) & rx_stats;
2077             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2078                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2079
2080             tl = ap->length;
2081             ap->length = sizeof(rx_stats);
2082         if (rx_stats_active)
2083             MUTEX_EXIT(&rx_stats_mutex);
2084             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2085             ap->length = tl;
2086             break;
2087         }
2088
2089     default:
2090         /* error response packet */
2091         tin.type = htonl(RX_DEBUGI_BADTYPE);
2092         tin.index = tin.type;
2093         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2094         tl = ap->length;
2095         ap->length = sizeof(struct rx_debugIn);
2096         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2097         ap->length = tl;
2098         break;
2099     }
2100     return ap;
2101 }
2102
2103 struct rx_packet *
2104 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2105                          afs_uint32 ahost, short aport, int istack)
2106 {
2107     afs_int32 tl;
2108
2109     /*
2110      * Only respond to client-initiated version requests, and
2111      * clear that flag in the response.
2112      */
2113     if (ap->header.flags & RX_CLIENT_INITIATED) {
2114         char buf[66];
2115
2116         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2117         rxi_EncodePacketHeader(ap);
2118         memset(buf, 0, sizeof(buf));
2119         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2120         rx_packetwrite(ap, 0, 65, buf);
2121         tl = ap->length;
2122         ap->length = 65;
2123         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2124         ap->length = tl;
2125     }
2126
2127     return ap;
2128 }
2129
2130
2131 /* send a debug packet back to the sender */
2132 static void
2133 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2134                     afs_uint32 ahost, short aport, afs_int32 istack)
2135 {
2136     struct sockaddr_in taddr;
2137     unsigned int i, nbytes, savelen = 0;
2138     int saven = 0;
2139 #ifdef KERNEL
2140     int waslocked = ISAFS_GLOCK();
2141 #endif
2142
2143     taddr.sin_family = AF_INET;
2144     taddr.sin_port = aport;
2145     taddr.sin_addr.s_addr = ahost;
2146 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2147     taddr.sin_len = sizeof(struct sockaddr_in);
2148 #endif
2149
2150     /* We need to trim the niovecs. */
2151     nbytes = apacket->length;
2152     for (i = 1; i < apacket->niovecs; i++) {
2153         if (nbytes <= apacket->wirevec[i].iov_len) {
2154             savelen = apacket->wirevec[i].iov_len;
2155             saven = apacket->niovecs;
2156             apacket->wirevec[i].iov_len = nbytes;
2157             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2158         } else
2159             nbytes -= apacket->wirevec[i].iov_len;
2160     }
2161 #ifdef KERNEL
2162 #ifdef RX_KERNEL_TRACE
2163     if (ICL_SETACTIVE(afs_iclSetp)) {
2164         if (!waslocked)
2165             AFS_GLOCK();
2166         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2167                    "before osi_NetSend()");
2168         AFS_GUNLOCK();
2169     } else
2170 #else
2171     if (waslocked)
2172         AFS_GUNLOCK();
2173 #endif
2174 #endif
2175     /* debug packets are not reliably delivered, hence the cast below. */
2176     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2177                       apacket->length + RX_HEADER_SIZE, istack);
2178 #ifdef KERNEL
2179 #ifdef RX_KERNEL_TRACE
2180     if (ICL_SETACTIVE(afs_iclSetp)) {
2181         AFS_GLOCK();
2182         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2183                    "after osi_NetSend()");
2184         if (!waslocked)
2185             AFS_GUNLOCK();
2186     } else
2187 #else
2188     if (waslocked)
2189         AFS_GLOCK();
2190 #endif
2191 #endif
2192     if (saven) {                /* means we truncated the packet above. */
2193         apacket->wirevec[i - 1].iov_len = savelen;
2194         apacket->niovecs = saven;
2195     }
2196
2197 }
2198
2199 /* Send the packet to appropriate destination for the specified
2200  * call.  The header is first encoded and placed in the packet.
2201  */
2202 void
2203 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2204                struct rx_packet *p, int istack)
2205 {
2206 #if defined(KERNEL)
2207     int waslocked;
2208 #endif
2209     int code;
2210     struct sockaddr_in addr;
2211     struct rx_peer *peer = conn->peer;
2212     osi_socket socket;
2213 #ifdef RXDEBUG
2214     char deliveryType = 'S';
2215 #endif
2216     /* The address we're sending the packet to */
2217     memset(&addr, 0, sizeof(addr));
2218     addr.sin_family = AF_INET;
2219     addr.sin_port = peer->port;
2220     addr.sin_addr.s_addr = peer->host;
2221
2222     /* This stuff should be revamped, I think, so that most, if not
2223      * all, of the header stuff is always added here.  We could
2224      * probably do away with the encode/decode routines. XXXXX */
2225
2226     /* Stamp each packet with a unique serial number.  The serial
2227      * number is maintained on a connection basis because some types
2228      * of security may be based on the serial number of the packet,
2229      * and security is handled on a per authenticated-connection
2230      * basis. */
2231     /* Pre-increment, to guarantee no zero serial number; a zero
2232      * serial number means the packet was never sent. */
2233     MUTEX_ENTER(&conn->conn_data_lock);
2234     p->header.serial = ++conn->serial;
2235     if (p->length > conn->peer->maxPacketSize) {
2236         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2237             (p->header.flags & RX_REQUEST_ACK)) {
2238             conn->lastPingSize = p->length;
2239             conn->lastPingSizeSer = p->header.serial;
2240         } else if (p->header.seq != 0) {
2241             conn->lastPacketSize = p->length;
2242             conn->lastPacketSizeSeq = p->header.seq;
2243         }
2244     }
2245     MUTEX_EXIT(&conn->conn_data_lock);
2246     /* This is so we can adjust retransmit time-outs better in the face of
2247      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2248      */
2249     if (p->firstSerial == 0) {
2250         p->firstSerial = p->header.serial;
2251     }
2252 #ifdef RXDEBUG
2253     /* If an output tracer function is defined, call it with the packet and
2254      * network address.  Note this function may modify its arguments. */
2255     if (rx_almostSent) {
2256         int drop = (*rx_almostSent) (p, &addr);
2257         /* drop packet if return value is non-zero? */
2258         if (drop)
2259             deliveryType = 'D'; /* Drop the packet */
2260     }
2261 #endif
2262
2263     /* Get network byte order header */
2264     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2265                                  * touch ALL the fields */
2266
2267     /* Send the packet out on the same socket that related packets are being
2268      * received on */
2269     socket =
2270         (conn->type ==
2271          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2272
2273 #ifdef RXDEBUG
2274     /* Possibly drop this packet,  for testing purposes */
2275     if ((deliveryType == 'D')
2276         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2277             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2278         deliveryType = 'D';     /* Drop the packet */
2279     } else {
2280         deliveryType = 'S';     /* Send the packet */
2281 #endif /* RXDEBUG */
2282
2283         /* Loop until the packet is sent.  We'd prefer just to use a
2284          * blocking socket, but unfortunately the interface doesn't
2285          * allow us to have the socket block in send mode, and not
2286          * block in receive mode */
2287 #ifdef KERNEL
2288         waslocked = ISAFS_GLOCK();
2289 #ifdef RX_KERNEL_TRACE
2290         if (ICL_SETACTIVE(afs_iclSetp)) {
2291             if (!waslocked)
2292                 AFS_GLOCK();
2293             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2294                        "before osi_NetSend()");
2295             AFS_GUNLOCK();
2296         } else
2297 #else
2298         if (waslocked)
2299             AFS_GUNLOCK();
2300 #endif
2301 #endif
2302         if ((code =
2303              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2304                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2305             /* send failed, so let's hurry up the resend, eh? */
2306             if (rx_stats_active)
2307                 rx_atomic_inc(&rx_stats.netSendFailures);
2308             p->retryTime = p->timeSent; /* resend it very soon */
2309             clock_Addmsec(&(p->retryTime),
2310                           10 + (((afs_uint32) p->backoff) << 8));
2311             /* Some systems are nice and tell us right away that we cannot
2312              * reach this recipient by returning an error code.
2313              * So, when this happens let's "down" the host NOW so
2314              * we don't sit around waiting for this host to timeout later.
2315              */
2316             if (call &&
2317 #ifdef AFS_NT40_ENV
2318                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2319 #elif defined(AFS_LINUX20_ENV)
2320                 code == -ENETUNREACH
2321 #elif defined(AFS_DARWIN_ENV)
2322                 code == EHOSTUNREACH
2323 #else
2324                 0
2325 #endif
2326                 )
2327                 call->lastReceiveTime = 0;
2328         }
2329 #ifdef KERNEL
2330 #ifdef RX_KERNEL_TRACE
2331         if (ICL_SETACTIVE(afs_iclSetp)) {
2332             AFS_GLOCK();
2333             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2334                        "after osi_NetSend()");
2335             if (!waslocked)
2336                 AFS_GUNLOCK();
2337         } else
2338 #else
2339         if (waslocked)
2340             AFS_GLOCK();
2341 #endif
2342 #endif
2343 #ifdef RXDEBUG
2344     }
2345     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d\n",
2346           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2347           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2348           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2349 #endif
2350     if (rx_stats_active) {
2351         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2352         MUTEX_ENTER(&peer->peer_lock);
2353         hadd32(peer->bytesSent, p->length);
2354         MUTEX_EXIT(&peer->peer_lock);
2355     }
2356 }
2357
2358 /* Send a list of packets to appropriate destination for the specified
2359  * connection.  The headers are first encoded and placed in the packets.
2360  */
2361 void
2362 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2363                    struct rx_packet **list, int len, int istack)
2364 {
2365 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2366     int waslocked;
2367 #endif
2368     struct sockaddr_in addr;
2369     struct rx_peer *peer = conn->peer;
2370     osi_socket socket;
2371     struct rx_packet *p = NULL;
2372     struct iovec wirevec[RX_MAXIOVECS];
2373     int i, length, code;
2374     afs_uint32 serial;
2375     afs_uint32 temp;
2376     struct rx_jumboHeader *jp;
2377 #ifdef RXDEBUG
2378     char deliveryType = 'S';
2379 #endif
2380     /* The address we're sending the packet to */
2381     addr.sin_family = AF_INET;
2382     addr.sin_port = peer->port;
2383     addr.sin_addr.s_addr = peer->host;
2384
2385     if (len + 1 > RX_MAXIOVECS) {
2386         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2387     }
2388
2389     /*
2390      * Stamp the packets in this jumbogram with consecutive serial numbers
2391      */
2392     MUTEX_ENTER(&conn->conn_data_lock);
2393     serial = conn->serial;
2394     conn->serial += len;
2395     for (i = 0; i < len; i++) {
2396         p = list[i];
2397         if (p->length > conn->peer->maxPacketSize) {
2398             /* a ping *or* a sequenced packet can count */
2399             if ((p->length > conn->peer->maxPacketSize)) {
2400                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2401                      (p->header.flags & RX_REQUEST_ACK)) &&
2402                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2403                     conn->lastPingSize = p->length;
2404                     conn->lastPingSizeSer = serial + i;
2405                 } else if ((p->header.seq != 0) &&
2406                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2407                     conn->lastPacketSize = p->length;
2408                     conn->lastPacketSizeSeq = p->header.seq;
2409                 }
2410             }
2411         }
2412     }
2413     MUTEX_EXIT(&conn->conn_data_lock);
2414
2415
2416     /* This stuff should be revamped, I think, so that most, if not
2417      * all, of the header stuff is always added here.  We could
2418      * probably do away with the encode/decode routines. XXXXX */
2419
2420     jp = NULL;
2421     length = RX_HEADER_SIZE;
2422     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2423     wirevec[0].iov_len = RX_HEADER_SIZE;
2424     for (i = 0; i < len; i++) {
2425         p = list[i];
2426
2427         /* The whole 3.5 jumbogram scheme relies on packets fitting
2428          * in a single packet buffer. */
2429         if (p->niovecs > 2) {
2430             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2431         }
2432
2433         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2434          * in this chunk.  */
2435         if (i < len - 1) {
2436             if (p->length != RX_JUMBOBUFFERSIZE) {
2437                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2438             }
2439             p->header.flags |= RX_JUMBO_PACKET;
2440             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2441             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2442         } else {
2443             wirevec[i + 1].iov_len = p->length;
2444             length += p->length;
2445         }
2446         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2447         if (jp != NULL) {
2448             /* Convert jumbo packet header to network byte order */
2449             temp = (afs_uint32) (p->header.flags) << 24;
2450             temp |= (afs_uint32) (p->header.spare);
2451             *(afs_uint32 *) jp = htonl(temp);
2452         }
2453         jp = (struct rx_jumboHeader *)
2454             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2455
2456         /* Stamp each packet with a unique serial number.  The serial
2457          * number is maintained on a connection basis because some types
2458          * of security may be based on the serial number of the packet,
2459          * and security is handled on a per authenticated-connection
2460          * basis. */
2461         /* Pre-increment, to guarantee no zero serial number; a zero
2462          * serial number means the packet was never sent. */
2463         p->header.serial = ++serial;
2464         /* This is so we can adjust retransmit time-outs better in the face of
2465          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2466          */
2467         if (p->firstSerial == 0) {
2468             p->firstSerial = p->header.serial;
2469         }
2470 #ifdef RXDEBUG
2471         /* If an output tracer function is defined, call it with the packet and
2472          * network address.  Note this function may modify its arguments. */
2473         if (rx_almostSent) {
2474             int drop = (*rx_almostSent) (p, &addr);
2475             /* drop packet if return value is non-zero? */
2476             if (drop)
2477                 deliveryType = 'D';     /* Drop the packet */
2478         }
2479 #endif
2480
2481         /* Get network byte order header */
2482         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2483                                          * touch ALL the fields */
2484     }
2485
2486     /* Send the packet out on the same socket that related packets are being
2487      * received on */
2488     socket =
2489         (conn->type ==
2490          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2491
2492 #ifdef RXDEBUG
2493     /* Possibly drop this packet,  for testing purposes */
2494     if ((deliveryType == 'D')
2495         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2496             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2497         deliveryType = 'D';     /* Drop the packet */
2498     } else {
2499         deliveryType = 'S';     /* Send the packet */
2500 #endif /* RXDEBUG */
2501
2502         /* Loop until the packet is sent.  We'd prefer just to use a
2503          * blocking socket, but unfortunately the interface doesn't
2504          * allow us to have the socket block in send mode, and not
2505          * block in receive mode */
2506 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2507         waslocked = ISAFS_GLOCK();
2508         if (!istack && waslocked)
2509             AFS_GUNLOCK();
2510 #endif
2511         if ((code =
2512              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2513                          istack)) != 0) {
2514             /* send failed, so let's hurry up the resend, eh? */
2515             if (rx_stats_active)
2516                 rx_atomic_inc(&rx_stats.netSendFailures);
2517             for (i = 0; i < len; i++) {
2518                 p = list[i];
2519                 p->retryTime = p->timeSent;     /* resend it very soon */
2520                 clock_Addmsec(&(p->retryTime),
2521                               10 + (((afs_uint32) p->backoff) << 8));
2522             }
2523             /* Some systems are nice and tell us right away that we cannot
2524              * reach this recipient by returning an error code.
2525              * So, when this happens let's "down" the host NOW so
2526              * we don't sit around waiting for this host to timeout later.
2527              */
2528             if (call &&
2529 #ifdef AFS_NT40_ENV
2530                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2531 #elif defined(AFS_LINUX20_ENV)
2532                 code == -ENETUNREACH
2533 #elif defined(AFS_DARWIN_ENV)
2534                 code == EHOSTUNREACH
2535 #else
2536                 0
2537 #endif
2538                 )
2539                 call->lastReceiveTime = 0;
2540         }
2541 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2542         if (!istack && waslocked)
2543             AFS_GLOCK();
2544 #endif
2545 #ifdef RXDEBUG
2546     }
2547
2548     assert(p != NULL);
2549
2550     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d\n",
2551           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2552           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2553           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2554
2555 #endif
2556     if (rx_stats_active) {
2557         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2558         MUTEX_ENTER(&peer->peer_lock);
2559         hadd32(peer->bytesSent, p->length);
2560         MUTEX_EXIT(&peer->peer_lock);
2561     }
2562 }
2563
2564
2565 /* Send a "special" packet to the peer connection.  If call is
2566  * specified, then the packet is directed to a specific call channel
2567  * associated with the connection, otherwise it is directed to the
2568  * connection only. Uses optionalPacket if it is supplied, rather than
2569  * allocating a new packet buffer.  Nbytes is the length of the data
2570  * portion of the packet.  If data is non-null, nbytes of data are
2571  * copied into the packet.  Type is the type of the packet, as defined
2572  * in rx.h.  Bug: there's a lot of duplication between this and other
2573  * routines.  This needs to be cleaned up. */
2574 struct rx_packet *
2575 rxi_SendSpecial(struct rx_call *call,
2576                 struct rx_connection *conn,
2577                 struct rx_packet *optionalPacket, int type, char *data,
2578                 int nbytes, int istack)
2579 {
2580     /* Some of the following stuff should be common code for all
2581      * packet sends (it's repeated elsewhere) */
2582     struct rx_packet *p;
2583     unsigned int i = 0;
2584     int savelen = 0, saven = 0;
2585     int channel, callNumber;
2586     if (call) {
2587         channel = call->channel;
2588         callNumber = *call->callNumber;
2589         /* BUSY packets refer to the next call on this connection */
2590         if (type == RX_PACKET_TYPE_BUSY) {
2591             callNumber++;
2592         }
2593     } else {
2594         channel = 0;
2595         callNumber = 0;
2596     }
2597     p = optionalPacket;
2598     if (!p) {
2599         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2600         if (!p)
2601             osi_Panic("rxi_SendSpecial failure");
2602     }
2603
2604     if (nbytes != -1)
2605         p->length = nbytes;
2606     else
2607         nbytes = p->length;
2608     p->header.serviceId = conn->serviceId;
2609     p->header.securityIndex = conn->securityIndex;
2610     p->header.cid = (conn->cid | channel);
2611     p->header.callNumber = callNumber;
2612     p->header.seq = 0;
2613     p->header.epoch = conn->epoch;
2614     p->header.type = type;
2615     p->header.flags = 0;
2616     if (conn->type == RX_CLIENT_CONNECTION)
2617         p->header.flags |= RX_CLIENT_INITIATED;
2618     if (data)
2619         rx_packetwrite(p, 0, nbytes, data);
2620
2621     for (i = 1; i < p->niovecs; i++) {
2622         if (nbytes <= p->wirevec[i].iov_len) {
2623             savelen = p->wirevec[i].iov_len;
2624             saven = p->niovecs;
2625             p->wirevec[i].iov_len = nbytes;
2626             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2627         } else
2628             nbytes -= p->wirevec[i].iov_len;
2629     }
2630
2631     if (call)
2632         rxi_Send(call, p, istack);
2633     else
2634         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2635     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2636         /* really need to do this, but it seems safer this way, given that  */
2637         /* sneaky optionalPacket... */
2638         p->wirevec[i - 1].iov_len = savelen;
2639         p->niovecs = saven;
2640     }
2641     if (!optionalPacket)
2642         rxi_FreePacket(p);
2643     return optionalPacket;
2644 }
2645
2646
2647 /* Encode the packet's header (from the struct header in the packet to
2648  * the net byte order representation in the wire representation of the
2649  * packet, which is what is actually sent out on the wire) */
2650 void
2651 rxi_EncodePacketHeader(struct rx_packet *p)
2652 {
2653     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2654
2655     memset(buf, 0, RX_HEADER_SIZE);
2656     *buf++ = htonl(p->header.epoch);
2657     *buf++ = htonl(p->header.cid);
2658     *buf++ = htonl(p->header.callNumber);
2659     *buf++ = htonl(p->header.seq);
2660     *buf++ = htonl(p->header.serial);
2661     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2662                    | (((afs_uint32) p->header.flags) << 16)
2663                    | (p->header.userStatus << 8) | p->header.securityIndex);
2664     /* Note: top 16 bits of this next word were reserved */
2665     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2666 }
2667
2668 /* Decode the packet's header (from net byte order to a struct header) */
2669 void
2670 rxi_DecodePacketHeader(struct rx_packet *p)
2671 {
2672     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2673     afs_uint32 temp;
2674
2675     p->header.epoch = ntohl(*buf);
2676     buf++;
2677     p->header.cid = ntohl(*buf);
2678     buf++;
2679     p->header.callNumber = ntohl(*buf);
2680     buf++;
2681     p->header.seq = ntohl(*buf);
2682     buf++;
2683     p->header.serial = ntohl(*buf);
2684     buf++;
2685
2686     temp = ntohl(*buf);
2687     buf++;
2688
2689     /* C will truncate byte fields to bytes for me */
2690     p->header.type = temp >> 24;
2691     p->header.flags = temp >> 16;
2692     p->header.userStatus = temp >> 8;
2693     p->header.securityIndex = temp >> 0;
2694
2695     temp = ntohl(*buf);
2696     buf++;
2697
2698     p->header.serviceId = (temp & 0xffff);
2699     p->header.spare = temp >> 16;
2700     /* Note: top 16 bits of this last word are the security checksum */
2701 }
2702
2703 /*
2704  * LOCKS HELD: called with call->lock held.
2705  *
2706  * PrepareSendPacket is the only place in the code that
2707  * can increment call->tnext.  This could become an atomic
2708  * in the future.  Beyond that there is nothing in this
2709  * function that requires the call being locked.  This
2710  * function can only be called by the application thread.
2711  */
2712 void
2713 rxi_PrepareSendPacket(struct rx_call *call,
2714                       struct rx_packet *p, int last)
2715 {
2716     struct rx_connection *conn = call->conn;
2717     afs_uint32 seq = call->tnext++;
2718     unsigned int i;
2719     afs_int32 len;              /* len must be a signed type; it can go negative */
2720
2721     /* No data packets on call 0. Where do these come from? */
2722     if (*call->callNumber == 0)
2723         *call->callNumber = 1;
2724
2725     MUTEX_EXIT(&call->lock);
2726     p->flags &= ~RX_PKTFLAG_ACKED;
2727     p->header.cid = (conn->cid | call->channel);
2728     p->header.serviceId = conn->serviceId;
2729     p->header.securityIndex = conn->securityIndex;
2730
2731     p->header.callNumber = *call->callNumber;
2732     p->header.seq = seq;
2733     p->header.epoch = conn->epoch;
2734     p->header.type = RX_PACKET_TYPE_DATA;
2735     p->header.flags = 0;
2736     p->header.spare = 0;
2737     if (conn->type == RX_CLIENT_CONNECTION)
2738         p->header.flags |= RX_CLIENT_INITIATED;
2739
2740     if (last)
2741         p->header.flags |= RX_LAST_PACKET;
2742
2743     clock_Zero(&p->retryTime);  /* Never yet transmitted */
2744     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2745     p->header.serial = 0;       /* Another way of saying never transmitted... */
2746     p->backoff = 0;
2747
2748     /* Now that we're sure this is the last data on the call, make sure
2749      * that the "length" and the sum of the iov_lens matches. */
2750     len = p->length + call->conn->securityHeaderSize;
2751
2752     for (i = 1; i < p->niovecs && len > 0; i++) {
2753         len -= p->wirevec[i].iov_len;
2754     }
2755     if (len > 0) {
2756         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2757     } else if (i < p->niovecs) {
2758         /* Free any extra elements in the wirevec */
2759 #if defined(RX_ENABLE_TSFPQ)
2760         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2761 #else /* !RX_ENABLE_TSFPQ */
2762         MUTEX_ENTER(&rx_freePktQ_lock);
2763         rxi_FreeDataBufsNoLock(p, i);
2764         MUTEX_EXIT(&rx_freePktQ_lock);
2765 #endif /* !RX_ENABLE_TSFPQ */
2766
2767         p->niovecs = i;
2768     }
2769     if (len)
2770         p->wirevec[i - 1].iov_len += len;
2771     RXS_PreparePacket(conn->securityObject, call, p);
2772     MUTEX_ENTER(&call->lock);
2773 }
2774
2775 /* Given an interface MTU size, calculate an adjusted MTU size that
2776  * will make efficient use of the RX buffers when the peer is sending
2777  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2778 int
2779 rxi_AdjustIfMTU(int mtu)
2780 {
2781     int adjMTU;
2782     int frags;
2783
2784     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2785         return mtu;
2786     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2787     if (mtu <= adjMTU) {
2788         return mtu;
2789     }
2790     mtu -= adjMTU;
2791     if (mtu <= 0) {
2792         return adjMTU;
2793     }
2794     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2795     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2796 }
2797
2798 /* Given an interface MTU size, and the peer's advertised max receive
2799  * size, calculate an adjisted maxMTU size that makes efficient use
2800  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2801 int
2802 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2803 {
2804     int maxMTU = mtu * rxi_nSendFrags;
2805     maxMTU = MIN(maxMTU, peerMaxMTU);
2806     return rxi_AdjustIfMTU(maxMTU);
2807 }
2808
2809 /* Given a packet size, figure out how many datagram packet will fit.
2810  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2811  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2812  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2813 int
2814 rxi_AdjustDgramPackets(int frags, int mtu)
2815 {
2816     int maxMTU;
2817     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2818         return 1;
2819     }
2820     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2821     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2822     /* subtract the size of the first and last packets */
2823     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2824     if (maxMTU < 0) {
2825         return 1;
2826     }
2827     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2828 }
2829
2830 #ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * outputFile - destination of the dump.  With AFS_NT40_ENV it is handed
 *              to WriteFile(); otherwise it is used with fprintf().
 * cookie     - caller-supplied string prefixed to every line written.
 *
 * When RXDEBUG_PACKET is defined, a start line, one line per packet on
 * the rx_mallocedP list, and an end line are written while
 * rx_freePktQ_lock is held.  Without RXDEBUG_PACKET nothing is written.
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* WriteFile reports bytes written through a DWORD */
    /* NOTE(review): sprintf into this fixed buffer is not bounded by the
     * length of cookie -- callers must keep cookie short. */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void * argument, hence the explicit cast of p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, backoff=%u, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec, p->retryTime.sec, p->retryTime.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->backoff, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
2879 #endif