src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 #ifdef KERNEL
  14 # if defined(UKERNEL)
  15 #  include "afs/sysincludes.h"
  16 #  include "afsincludes.h"
  17 #  include "rx_kcommon.h"
  18 # else /* defined(UKERNEL) */
  19 #  ifdef RX_KERNEL_TRACE
  20 #   include "rx_kcommon.h"
  21 #  endif
  22 #  include "h/types.h"
  23 #  ifndef AFS_LINUX20_ENV
  24 #   include "h/systm.h"
  25 #  endif
  26 #  if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  27 #   include "afs/sysincludes.h"
  28 #  endif
  29 #  if defined(AFS_OBSD_ENV)
  30 #   include "h/proc.h"
  31 #  endif
  32 #  include "h/socket.h"
  33 #  if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  34 #   if  !defined(AFS_AIX41_ENV)
  35 #    include "sys/mount.h"              /* it gets pulled in by something later anyway */
  36 #   endif
  37 #   include "h/mbuf.h"
  38 #  endif
  39 #  include "netinet/in.h"
  40 #  include "afs/afs_osi.h"
  41 #  include "rx_kmutex.h"
  42 # endif /* defined(UKERNEL) */
  43 #else /* KERNEL */
  44 # include <roken.h>
  45 # include <assert.h>
  46 # include <afs/opr.h>
  47 # if defined(AFS_NT40_ENV)
  48 #  ifndef EWOULDBLOCK
  49 #   define EWOULDBLOCK WSAEWOULDBLOCK
  50 #  endif
  51 #  include "rx_user.h"
  52 #  include "rx_xmit_nt.h"
  53 # endif
  54 # include <lwp.h>
  55 #endif /* KERNEL */
  56
  57 #ifdef  AFS_SUN5_ENV
  58 # include <sys/sysmacros.h>
  59 #endif
  60
  61 #include <opr/queue.h>
  62
  63 #include "rx.h"
  64 #include "rx_clock.h"
  65 #include "rx_packet.h"
  66 #include "rx_atomic.h"
  67 #include "rx_globals.h"
  68 #include "rx_internal.h"
  69 #include "rx_stats.h"
  70
  71 #include "rx_peer.h"
  72 #include "rx_conn.h"
  73 #include "rx_call.h"
  74
  75 /*!
  76  * \brief structure used to keep track of allocated packets
  77  */
  78 struct rx_mallocedPacket {
  79     struct opr_queue entry;     /*!< chained using opr_queue */
  80     struct rx_packet *addr;     /*!< address of the first element */
  81     afs_uint32 size;            /*!< array size in bytes */
  82 };
  83
  84 #ifdef RX_LOCKS_DB
  85 /* rxdb_fileID is used to identify the lock location, along with line#. */
  86 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  87 #endif /* RX_LOCKS_DB */
  88 static struct rx_packet *rx_mallocedP = 0;
  89 #ifdef RXDEBUG_PACKET
  90 static afs_uint32       rx_packet_id = 0;
  91 #endif
  92
  93 extern char cml_version_number[];
  94
  95 static int AllocPacketBufs(int class, int num_pkts, struct opr_queue *q);
  96
  97 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  98                                 afs_uint32 ahost, short aport,
  99                                 afs_int32 istack);
 100 static struct rx_packet *rxi_AllocPacketNoLock(int class);
 101
 102 #ifndef KERNEL
 103 static void rxi_MorePacketsNoLock(int apackets);
 104 #endif
 105
 106 #ifdef RX_ENABLE_TSFPQ
 107 static int rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first,
 108                                  int flush_global);
 109 static void rxi_AdjustLocalPacketsTSFPQ(int num_keep_local,
 110                                         int allow_overcommit);
 111 #else
 112 static void rxi_FreePacketNoLock(struct rx_packet *p);
 113 static int rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first);
 114 static int rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first,
 115                                    struct opr_queue * q);
 116 #endif
 117
 118 extern struct opr_queue rx_idleServerQueue;
 119
 120 /* some rules about packets:
 121  * 1.  When a packet is allocated, the final iov_buf contains room for
 122  * a security trailer, but iov_len masks that fact.  If the security
 123  * package wants to add the trailer, it may do so, and then extend
 124  * iov_len appropriately.  For this reason, packet's niovecs and
 125  * iov_len fields should be accurate before calling PreparePacket.
 126 */
 127
 128 /* Preconditions:
 129  *        all packet buffers (iov_base) are integral multiples of
 130  *        the word size.
 131  *        offset is an integral multiple of the word size.
 132  */
 133 afs_int32
 134 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 135 {
 136     unsigned int i;
 137     size_t l;
 138     for (l = 0, i = 1; i < packet->niovecs; i++) {
 139         if (l + packet->wirevec[i].iov_len > offset) {
 140             return
 141                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 142                                  (offset - l)));
 143         }
 144         l += packet->wirevec[i].iov_len;
 145     }
 146
 147     return 0;
 148 }
 149
 150 /* Preconditions:
 151  *        all packet buffers (iov_base) are integral multiples of the word size.
 152  *        offset is an integral multiple of the word size.
 153  */
 154 afs_int32
 155 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 156 {
 157     unsigned int i;
 158     size_t l;
 159     for (l = 0, i = 1; i < packet->niovecs; i++) {
 160         if (l + packet->wirevec[i].iov_len > offset) {
 161             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 162                              (offset - l))) = data;
 163             return 0;
 164         }
 165         l += packet->wirevec[i].iov_len;
 166     }
 167
 168     return 0;
 169 }
 170
 171 /* Preconditions:
 172  *        all packet buffers (iov_base) are integral multiples of the
 173  *        word size.
 174  *        offset is an integral multiple of the word size.
 175  * Packet Invariants:
 176  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 177  */
 178 afs_int32
 179 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 180                   char *out)
 181 {
 182     unsigned int i, j, l, r;
 183     for (l = 0, i = 1; i < packet->niovecs; i++) {
 184         if (l + packet->wirevec[i].iov_len > offset) {
 185             break;
 186         }
 187         l += packet->wirevec[i].iov_len;
 188     }
 189
 190     /* i is the iovec which contains the first little bit of data in which we
 191      * are interested.  l is the total length of everything prior to this iovec.
 192      * j is the number of bytes we can safely copy out of this iovec.
 193      * offset only applies to the first iovec.
 194      */
 195     r = resid;
 196     while ((r > 0) && (i < packet->niovecs)) {
 197         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 198         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 199         r -= j;
 200         out += j;
 201         l += packet->wirevec[i].iov_len;
 202         offset = l;
 203         i++;
 204     }
 205
 206     return (r ? (resid - r) : resid);
 207 }
 208
 209
 210 /* Preconditions:
 211  *        all packet buffers (iov_base) are integral multiples of the
 212  *        word size.
 213  *        offset is an integral multiple of the word size.
 214  */
 215 afs_int32
 216 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 217 {
 218     unsigned int i, j, l, o, r;
 219     char *b;
 220
 221     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 222         if (l + packet->wirevec[i].iov_len > o) {
 223             break;
 224         }
 225         l += packet->wirevec[i].iov_len;
 226     }
 227
 228     /* i is the iovec which contains the first little bit of data in which we
 229      * are interested.  l is the total length of everything prior to this iovec.
 230      * j is the number of bytes we can safely copy out of this iovec.
 231      * offset only applies to the first iovec.
 232      */
 233     r = resid;
 234     while ((r > 0) && (i <= RX_MAXWVECS)) {
 235         if (i >= packet->niovecs)
 236             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 237                 break;
 238
 239         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 240         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 241         memcpy(b, in, j);
 242         r -= j;
 243         in += j;
 244         l += packet->wirevec[i].iov_len;
 245         offset = l;
 246         i++;
 247     }
 248
 249     return (r ? (resid - r) : resid);
 250 }
 251
 252 int
 253 rxi_AllocPackets(int class, int num_pkts, struct opr_queue * q)
 254 {
 255     struct opr_queue *c;
 256
 257     num_pkts = AllocPacketBufs(class, num_pkts, q);
 258
 259     for (opr_queue_Scan(q, c)) {
 260         RX_PACKET_IOV_FULLINIT(opr_queue_Entry(c, struct rx_packet, entry));
 261     }
 262
 263     return num_pkts;
 264 }
 265
 266 #ifdef RX_ENABLE_TSFPQ
 267 static int
 268 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
 269 {
 270     struct rx_ts_info_t * rx_ts_info;
 271     int transfer;
 272     SPLVAR;
 273
 274     RX_TS_INFO_GET(rx_ts_info);
 275
 276     transfer = num_pkts - rx_ts_info->_FPQ.len;
 277     if (transfer > 0) {
 278         NETPRI;
 279         MUTEX_ENTER(&rx_freePktQ_lock);
 280         transfer = MAX(transfer, rx_TSFPQGlobSize);
 281         if (transfer > rx_nFreePackets) {
 282             /* alloc enough for us, plus a few globs for other threads */
 283             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 284         }
 285
 286         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 287
 288         MUTEX_EXIT(&rx_freePktQ_lock);
 289         USERPRI;
 290     }
 291
 292     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 293
 294     return num_pkts;
 295 }
 296 #else /* RX_ENABLE_TSFPQ */
 297 static int
 298 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
 299 {
 300     struct rx_packet *c;
 301     int i;
 302 #ifdef KERNEL
 303     int overq = 0;
 304 #endif
 305     SPLVAR;
 306
 307     NETPRI;
 308
 309     MUTEX_ENTER(&rx_freePktQ_lock);
 310
 311 #ifdef KERNEL
 312     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 313          num_pkts--, overq++);
 314
 315     if (overq) {
 316         rxi_NeedMorePackets = TRUE;
 317         if (rx_stats_active) {
 318             switch (class) {
 319             case RX_PACKET_CLASS_RECEIVE:
 320                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 321                 break;
 322             case RX_PACKET_CLASS_SEND:
 323                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 324                 break;
 325             case RX_PACKET_CLASS_SPECIAL:
 326                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 327                 break;
 328             case RX_PACKET_CLASS_RECV_CBUF:
 329                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 330                 break;
 331             case RX_PACKET_CLASS_SEND_CBUF:
 332                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 333                 break;
 334             }
 335         }
 336     }
 337
 338     if (rx_nFreePackets < num_pkts)
 339         num_pkts = rx_nFreePackets;
 340
 341     if (!num_pkts) {
 342         rxi_NeedMorePackets = TRUE;
 343         goto done;
 344     }
 345 #else /* KERNEL */
 346     if (rx_nFreePackets < num_pkts) {
 347         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 348     }
 349 #endif /* KERNEL */
 350
 351     for (i=0, c=opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
 352          i < num_pkts;
 353          i++, c=opr_queue_Next(&c->entry, struct rx_packet, entry)) {
 354         RX_FPQ_MARK_USED(c);
 355     }
 356
 357     opr_queue_SplitBeforeAppend(&rx_freePacketQueue, q, &c->entry);
 358
 359     rx_nFreePackets -= num_pkts;
 360
 361 #ifdef KERNEL
 362   done:
 363 #endif
 364     MUTEX_EXIT(&rx_freePktQ_lock);
 365
 366     USERPRI;
 367     return num_pkts;
 368 }
 369 #endif /* RX_ENABLE_TSFPQ */
 370
/*
 * Free a queue of packets, together with any continuation buffers
 * attached to them.
 */
 374 #ifdef RX_ENABLE_TSFPQ
 375 /* num_pkts=0 means queue length is unknown */
 376 int
 377 rxi_FreePackets(int num_pkts, struct opr_queue * q)
 378 {
 379     struct rx_ts_info_t * rx_ts_info;
 380     struct opr_queue *cursor, *store;
 381     SPLVAR;
 382
 383     osi_Assert(num_pkts >= 0);
 384     RX_TS_INFO_GET(rx_ts_info);
 385
 386     if (!num_pkts) {
 387         for (opr_queue_ScanSafe(q, cursor, store)) {
 388             num_pkts++;
 389             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
 390                                                  entry), 2, 0);
 391         }
 392     } else {
 393         for (opr_queue_ScanSafe(q, cursor, store)) {
 394             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
 395                                                  entry), 2, 0);
 396         }
 397     }
 398
 399     if (num_pkts) {
 400         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 401     }
 402
 403     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 404         NETPRI;
 405         MUTEX_ENTER(&rx_freePktQ_lock);
 406
 407         RX_TS_FPQ_LTOG(rx_ts_info);
 408
 409         /* Wakeup anyone waiting for packets */
 410         rxi_PacketsUnWait();
 411
 412         MUTEX_EXIT(&rx_freePktQ_lock);
 413         USERPRI;
 414     }
 415
 416     return num_pkts;
 417 }
 418 #else /* RX_ENABLE_TSFPQ */
 419 /* num_pkts=0 means queue length is unknown */
 420 int
 421 rxi_FreePackets(int num_pkts, struct opr_queue *q)
 422 {
 423     struct opr_queue cbs;
 424     struct opr_queue *cursor, *store;
 425     int qlen = 0;
 426     SPLVAR;
 427
 428     osi_Assert(num_pkts >= 0);
 429     opr_queue_Init(&cbs);
 430
 431     if (!num_pkts) {
 432         for (opr_queue_ScanSafe(q, cursor, store)) {
 433             struct rx_packet *p
 434                 = opr_queue_Entry(cursor, struct rx_packet, entry);
 435             if (p->niovecs > 2) {
 436                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 437             }
 438             RX_FPQ_MARK_FREE(p);
 439             num_pkts++;
 440         }
 441         if (!num_pkts)
 442             return 0;
 443     } else {
 444         for (opr_queue_ScanSafe(q, cursor, store)) {
 445             struct rx_packet *p
 446                 = opr_queue_Entry(cursor, struct rx_packet, entry);
 447
 448             if (p->niovecs > 2) {
 449                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 450             }
 451             RX_FPQ_MARK_FREE(p);
 452         }
 453     }
 454
 455     if (qlen) {
 456         opr_queue_SpliceAppend(q, &cbs);
 457         qlen += num_pkts;
 458     } else
 459         qlen = num_pkts;
 460
 461     NETPRI;
 462     MUTEX_ENTER(&rx_freePktQ_lock);
 463
 464     opr_queue_SpliceAppend(&rx_freePacketQueue, q);
 465     rx_nFreePackets += qlen;
 466
 467     /* Wakeup anyone waiting for packets */
 468     rxi_PacketsUnWait();
 469
 470     MUTEX_EXIT(&rx_freePktQ_lock);
 471     USERPRI;
 472
 473     return num_pkts;
 474 }
 475 #endif /* RX_ENABLE_TSFPQ */
 476
 477 /* this one is kind of awful.
 478  * In rxkad, the packet has been all shortened, and everything, ready for
 479  * sending.  All of a sudden, we discover we need some of that space back.
 480  * This isn't terribly general, because it knows that the packets are only
 481  * rounded up to the EBS (userdata + security header).
 482  */
 483 int
 484 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 485 {
 486     int i;
 487     i = p->niovecs - 1;
 488     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 489         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 490             p->wirevec[i].iov_len += nb;
 491             return 0;
 492         }
 493     } else {
 494         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 495             p->wirevec[i].iov_len += nb;
 496             return 0;
 497         }
 498     }
 499
 500     return 0;
 501 }
 502
 503 /* get sufficient space to store nb bytes of data (or more), and hook
 504  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 505  * returns the number of bytes >0 which it failed to come up with.
 506  * Don't need to worry about locking on packet, since only
 507  * one thread can manipulate one at a time. Locking on continution
 508  * packets is handled by AllocPacketBufs */
 509 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 510 int
 511 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 512 {
 513     int i, nv;
 514     struct opr_queue q, *cursor, *store;
 515
 516     /* compute the number of cbuf's we need */
 517     nv = nb / RX_CBUFFERSIZE;
 518     if ((nv * RX_CBUFFERSIZE) < nb)
 519         nv++;
 520     if ((nv + p->niovecs) > RX_MAXWVECS)
 521         nv = RX_MAXWVECS - p->niovecs;
 522     if (nv < 1)
 523         return nb;
 524
 525     /* allocate buffers */
 526     opr_queue_Init(&q);
 527     nv = AllocPacketBufs(class, nv, &q);
 528
 529     /* setup packet iovs */
 530     i = p ->niovecs;
 531     for (opr_queue_ScanSafe(&q, cursor, store)) {
 532         struct rx_packet *cb
 533             = opr_queue_Entry(cursor, struct rx_packet, entry);
 534
 535         opr_queue_Remove(&cb->entry);
 536         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 537         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 538         i++;
 539     }
 540
 541     nb -= (nv * RX_CBUFFERSIZE);
 542     p->length += (nv * RX_CBUFFERSIZE);
 543     p->niovecs += nv;
 544
 545     return nb;
 546 }
 547
 548 /**
 549  * Register allocated packets.
 550  *
 551  * @param[in] addr array of packets
 552  * @param[in] npkt number of packets
 553  *
 554  * @return none
 555  */
 556 static void
 557 registerPackets(struct rx_packet *addr, afs_uint32 npkt)
 558 {
 559     struct rx_mallocedPacket *mp;
 560
 561     mp = osi_Alloc(sizeof(*mp));
 562
 563     osi_Assert(mp != NULL);
 564     memset(mp, 0, sizeof(*mp));
 565
 566     mp->addr = addr;
 567     mp->size = npkt * sizeof(struct rx_packet);
 568     osi_Assert(npkt <= MAX_AFS_UINT32 / sizeof(struct rx_packet));
 569
 570     MUTEX_ENTER(&rx_mallocedPktQ_lock);
 571     opr_queue_Append(&rx_mallocedPacketQueue, &mp->entry);
 572     MUTEX_EXIT(&rx_mallocedPktQ_lock);
 573 }
 574
 575 /* Add more packet buffers */
 576 #ifdef RX_ENABLE_TSFPQ
 577 void
 578 rxi_MorePackets(int apackets)
 579 {
 580     struct rx_packet *p, *e;
 581     struct rx_ts_info_t * rx_ts_info;
 582     int getme;
 583     SPLVAR;
 584
 585     getme = apackets * sizeof(struct rx_packet);
 586     p = osi_Alloc(getme);
 587     osi_Assert(p);
 588     registerPackets(p, apackets);
 589
 590     PIN(p, getme);              /* XXXXX */
 591     memset(p, 0, getme);
 592     RX_TS_INFO_GET(rx_ts_info);
 593
 594     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 595     /* TSFPQ patch also needs to keep track of total packets */
 596
 597     MUTEX_ENTER(&rx_packets_mutex);
 598     rx_nPackets += apackets;
 599     RX_TS_FPQ_COMPUTE_LIMITS;
 600     MUTEX_EXIT(&rx_packets_mutex);
 601
 602     for (e = p + apackets; p < e; p++) {
 603         RX_PACKET_IOV_INIT(p);
 604         p->niovecs = 2;
 605
 606         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 607
 608         NETPRI;
 609         MUTEX_ENTER(&rx_freePktQ_lock);
 610 #ifdef RXDEBUG_PACKET
 611         p->packetId = rx_packet_id++;
 612         p->allNextp = rx_mallocedP;
 613 #endif /* RXDEBUG_PACKET */
 614         rx_mallocedP = p;
 615         MUTEX_EXIT(&rx_freePktQ_lock);
 616         USERPRI;
 617     }
 618     rx_ts_info->_FPQ.delta += apackets;
 619
 620     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 621         NETPRI;
 622         MUTEX_ENTER(&rx_freePktQ_lock);
 623
 624         RX_TS_FPQ_LTOG(rx_ts_info);
 625         rxi_NeedMorePackets = FALSE;
 626         rxi_PacketsUnWait();
 627
 628         MUTEX_EXIT(&rx_freePktQ_lock);
 629         USERPRI;
 630     }
 631 }
 632 #else /* RX_ENABLE_TSFPQ */
 633 void
 634 rxi_MorePackets(int apackets)
 635 {
 636     struct rx_packet *p, *e;
 637     int getme;
 638     SPLVAR;
 639
 640     getme = apackets * sizeof(struct rx_packet);
 641     p = osi_Alloc(getme);
 642     osi_Assert(p);
 643     registerPackets(p, apackets);
 644
 645     PIN(p, getme);              /* XXXXX */
 646     memset(p, 0, getme);
 647     NETPRI;
 648     MUTEX_ENTER(&rx_freePktQ_lock);
 649
 650     for (e = p + apackets; p < e; p++) {
 651         RX_PACKET_IOV_INIT(p);
 652 #ifdef RX_TRACK_PACKETS
 653         p->flags |= RX_PKTFLAG_FREE;
 654 #endif
 655         p->niovecs = 2;
 656
 657         opr_queue_Append(&rx_freePacketQueue, &p->entry);
 658 #ifdef RXDEBUG_PACKET
 659         p->packetId = rx_packet_id++;
 660         p->allNextp = rx_mallocedP;
 661 #endif /* RXDEBUG_PACKET */
 662         rx_mallocedP = p;
 663     }
 664
 665     rx_nPackets += apackets;
 666     rx_nFreePackets += apackets;
 667     rxi_NeedMorePackets = FALSE;
 668     rxi_PacketsUnWait();
 669
 670     MUTEX_EXIT(&rx_freePktQ_lock);
 671     USERPRI;
 672 }
 673 #endif /* RX_ENABLE_TSFPQ */
 674
 675 #ifdef RX_ENABLE_TSFPQ
 676 void
 677 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 678 {
 679     struct rx_packet *p, *e;
 680     struct rx_ts_info_t * rx_ts_info;
 681     int getme;
 682     SPLVAR;
 683
 684     getme = apackets * sizeof(struct rx_packet);
 685     p = osi_Alloc(getme);
 686     registerPackets(p, apackets);
 687
 688     PIN(p, getme);              /* XXXXX */
 689     memset(p, 0, getme);
 690     RX_TS_INFO_GET(rx_ts_info);
 691
 692     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 693     /* TSFPQ patch also needs to keep track of total packets */
 694     MUTEX_ENTER(&rx_packets_mutex);
 695     rx_nPackets += apackets;
 696     RX_TS_FPQ_COMPUTE_LIMITS;
 697     MUTEX_EXIT(&rx_packets_mutex);
 698
 699     for (e = p + apackets; p < e; p++) {
 700         RX_PACKET_IOV_INIT(p);
 701         p->niovecs = 2;
 702         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 703
 704         NETPRI;
 705         MUTEX_ENTER(&rx_freePktQ_lock);
 706 #ifdef RXDEBUG_PACKET
 707         p->packetId = rx_packet_id++;
 708         p->allNextp = rx_mallocedP;
 709 #endif /* RXDEBUG_PACKET */
 710         rx_mallocedP = p;
 711         MUTEX_EXIT(&rx_freePktQ_lock);
 712         USERPRI;
 713     }
 714     rx_ts_info->_FPQ.delta += apackets;
 715
 716     if (flush_global &&
 717         (num_keep_local < apackets)) {
 718         NETPRI;
 719         MUTEX_ENTER(&rx_freePktQ_lock);
 720
 721         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 722         rxi_NeedMorePackets = FALSE;
 723         rxi_PacketsUnWait();
 724
 725         MUTEX_EXIT(&rx_freePktQ_lock);
 726         USERPRI;
 727     }
 728 }
 729 #endif /* RX_ENABLE_TSFPQ */
 730
 731 #ifndef KERNEL
 732 /* Add more packet buffers */
 733 static void
 734 rxi_MorePacketsNoLock(int apackets)
 735 {
 736 #ifdef RX_ENABLE_TSFPQ
 737     struct rx_ts_info_t * rx_ts_info;
 738 #endif /* RX_ENABLE_TSFPQ */
 739     struct rx_packet *p, *e;
 740     int getme;
 741
 742     /* allocate enough packets that 1/4 of the packets will be able
 743      * to hold maximal amounts of data */
 744     apackets += (apackets / 4)
 745         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 746     do {
 747         getme = apackets * sizeof(struct rx_packet);
 748         p = osi_Alloc(getme);
 749         if (p == NULL) {
 750             apackets -= apackets / 4;
 751             osi_Assert(apackets > 0);
 752         }
 753     } while(p == NULL);
 754     memset(p, 0, getme);
 755     registerPackets(p, apackets);
 756
 757 #ifdef RX_ENABLE_TSFPQ
 758     RX_TS_INFO_GET(rx_ts_info);
 759     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 760 #endif /* RX_ENABLE_TSFPQ */
 761
 762     for (e = p + apackets; p < e; p++) {
 763         RX_PACKET_IOV_INIT(p);
 764 #ifdef RX_TRACK_PACKETS
 765         p->flags |= RX_PKTFLAG_FREE;
 766 #endif
 767         p->niovecs = 2;
 768
 769         opr_queue_Append(&rx_freePacketQueue, &p->entry);
 770 #ifdef RXDEBUG_PACKET
 771         p->packetId = rx_packet_id++;
 772         p->allNextp = rx_mallocedP;
 773 #endif /* RXDEBUG_PACKET */
 774         rx_mallocedP = p;
 775     }
 776
 777     rx_nFreePackets += apackets;
 778     MUTEX_ENTER(&rx_packets_mutex);
 779     rx_nPackets += apackets;
 780 #ifdef RX_ENABLE_TSFPQ
 781     RX_TS_FPQ_COMPUTE_LIMITS;
 782 #endif /* RX_ENABLE_TSFPQ */
 783     MUTEX_EXIT(&rx_packets_mutex);
 784     rxi_NeedMorePackets = FALSE;
 785     rxi_PacketsUnWait();
 786 }
 787 #endif /* !KERNEL */
 788
 789 void
 790 rxi_FreeAllPackets(void)
 791 {
 792     struct rx_mallocedPacket *mp;
 793
 794     MUTEX_ENTER(&rx_mallocedPktQ_lock);
 795
 796     while (!opr_queue_IsEmpty(&rx_mallocedPacketQueue)) {
 797         mp = opr_queue_First(&rx_mallocedPacketQueue,
 798                              struct rx_mallocedPacket, entry);
 799         opr_queue_Remove(&mp->entry);
 800         osi_Free(mp->addr, mp->size);
 801         UNPIN(mp->addr, mp->size);
 802         osi_Free(mp, sizeof(*mp));
 803     }
 804     MUTEX_EXIT(&rx_mallocedPktQ_lock);
 805 }
 806
 807 #ifdef RX_ENABLE_TSFPQ
 808 static void
 809 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 810 {
 811     struct rx_ts_info_t * rx_ts_info;
 812     int xfer;
 813     SPLVAR;
 814
 815     RX_TS_INFO_GET(rx_ts_info);
 816
 817     if (num_keep_local != rx_ts_info->_FPQ.len) {
 818         NETPRI;
 819         MUTEX_ENTER(&rx_freePktQ_lock);
 820         if (num_keep_local < rx_ts_info->_FPQ.len) {
 821             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 822             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 823             rxi_PacketsUnWait();
 824         } else {
 825             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 826             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 827                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 828             if (rx_nFreePackets < xfer) {
 829                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 830             }
 831             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 832         }
 833         MUTEX_EXIT(&rx_freePktQ_lock);
 834         USERPRI;
 835     }
 836 }
 837
 838 void
 839 rxi_FlushLocalPacketsTSFPQ(void)
 840 {
 841     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 842 }
 843 #endif /* RX_ENABLE_TSFPQ */
 844
 845 /* Allocate more packets iff we need more continuation buffers */
 846 /* In kernel, can't page in memory with interrupts disabled, so we
 847  * don't use the event mechanism. */
 848 void
 849 rx_CheckPackets(void)
 850 {
 851     if (rxi_NeedMorePackets) {
 852         rxi_MorePackets(rx_maxSendWindow);
 853     }
 854 }
 855
 856 /* In the packet freeing routine below, the assumption is that
 857    we want all of the packets to be used equally frequently, so that we
 858    don't get packet buffers paging out.  It would be just as valid to
 859    assume that we DO want them to page out if not many are being used.
 860    In any event, we assume the former, and append the packets to the end
 861    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
 868
 869 /* Actually free the packet p. */
 870 #ifndef RX_ENABLE_TSFPQ
 871 static void
 872 rxi_FreePacketNoLock(struct rx_packet *p)
 873 {
 874     dpf(("Free %p\n", p));
 875
 876     RX_FPQ_MARK_FREE(p);
 877     rx_nFreePackets++;
 878     opr_queue_Append(&rx_freePacketQueue, &p->entry);
 879 }
 880 #endif /* RX_ENABLE_TSFPQ */
 881
 882 #ifdef RX_ENABLE_TSFPQ
 883 static void
 884 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 885 {
 886     struct rx_ts_info_t * rx_ts_info;
 887     dpf(("Free %p\n", p));
 888
 889     RX_TS_INFO_GET(rx_ts_info);
 890     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 891
 892     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 893         NETPRI;
 894         MUTEX_ENTER(&rx_freePktQ_lock);
 895
 896         RX_TS_FPQ_LTOG(rx_ts_info);
 897
 898         /* Wakeup anyone waiting for packets */
 899         rxi_PacketsUnWait();
 900
 901         MUTEX_EXIT(&rx_freePktQ_lock);
 902         USERPRI;
 903     }
 904 }
 905 #endif /* RX_ENABLE_TSFPQ */
 906
 907 /*
 908  * free continuation buffers off a packet into a queue
 909  *
 910  * [IN] p      -- packet from which continuation buffers will be freed
 911  * [IN] first  -- iovec offset of first continuation buffer to free
 912  * [IN] q      -- queue into which continuation buffers will be chained
 913  *
 914  * returns:
 915  *   number of continuation buffers freed
 916  */
 917 #ifndef RX_ENABLE_TSFPQ
 918 static int
 919 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct opr_queue * q)
 920 {
 921     struct iovec *iov;
 922     struct rx_packet * cb;
 923     int count = 0;
 924
 925     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 926         iov = &p->wirevec[first];
 927         if (!iov->iov_base)
 928             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 929         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 930         RX_FPQ_MARK_FREE(cb);
 931         opr_queue_Append(q, &cb->entry);
 932     }
 933     p->length = 0;
 934     p->niovecs = 0;
 935
 936     return count;
 937 }
 938
 939 /*
 940  * free packet continuation buffers into the global free packet pool
 941  *
 942  * [IN] p      -- packet from which to free continuation buffers
 943  * [IN] first  -- iovec offset of first continuation buffer to free
 944  *
 945  * returns:
 946  *   zero always
 947  */
 948 static int
 949 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 950 {
 951     struct iovec *iov;
 952
 953     for (first = MAX(2, first); first < p->niovecs; first++) {
 954         iov = &p->wirevec[first];
 955         if (!iov->iov_base)
 956             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 957         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 958     }
 959     p->length = 0;
 960     p->niovecs = 0;
 961
 962     return 0;
 963 }
 964
 965 #else
 966
 967 /*
 968  * free packet continuation buffers into the thread-local free pool
 969  *
 970  * [IN] p             -- packet from which continuation buffers will be freed
 971  * [IN] first         -- iovec offset of first continuation buffer to free
 972  *                       any value less than 2, the min number of iovecs,
 973  *                       is treated as if it is 2.
 974  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 975  *                       global free pool before returning
 976  *
 977  * returns:
 978  *   zero always
 979  */
 980 static int
 981 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 982 {
 983     struct iovec *iov;
 984     struct rx_ts_info_t * rx_ts_info;
 985
 986     RX_TS_INFO_GET(rx_ts_info);
 987
 988     for (first = MAX(2, first); first < p->niovecs; first++) {
 989         iov = &p->wirevec[first];
 990         if (!iov->iov_base)
 991             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 992         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 993     }
 994     p->length = 0;
 995     p->niovecs = 0;
 996
 997     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 998         NETPRI;
 999         MUTEX_ENTER(&rx_freePktQ_lock);
1000
1001         RX_TS_FPQ_LTOG(rx_ts_info);
1002
1003         /* Wakeup anyone waiting for packets */
1004         rxi_PacketsUnWait();
1005
1006         MUTEX_EXIT(&rx_freePktQ_lock);
1007         USERPRI;
1008     }
1009     return 0;
1010 }
1011 #endif /* RX_ENABLE_TSFPQ */
1012
/* Incremented by rxi_RestoreDataBufs each time it finds a continuation
 * iovec whose base pointer is NULL. */
int rxi_nBadIovecs = 0;
1014
1015 /* rxi_RestoreDataBufs
1016  *
1017  * Restore the correct sizes to the iovecs. Called when reusing a packet
1018  * for reading off the wire.
1019  */
1020 void
1021 rxi_RestoreDataBufs(struct rx_packet *p)
1022 {
1023     unsigned int i;
1024     struct iovec *iov;
1025
1026     RX_PACKET_IOV_INIT(p);
1027
1028     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
1029         if (!iov->iov_base) {
1030             rxi_nBadIovecs++;
1031             p->niovecs = i;
1032             break;
1033         }
1034         iov->iov_len = RX_CBUFFERSIZE;
1035     }
1036 }
1037
1038 #ifdef RX_ENABLE_TSFPQ
1039 int
1040 rxi_TrimDataBufs(struct rx_packet *p, int first)
1041 {
1042     int length;
1043     struct iovec *iov, *end;
1044     struct rx_ts_info_t * rx_ts_info;
1045     SPLVAR;
1046
1047     if (first != 1)
1048         osi_Panic("TrimDataBufs 1: first must be 1");
1049
1050     /* Skip over continuation buffers containing message data */
1051     iov = &p->wirevec[2];
1052     end = iov + (p->niovecs - 2);
1053     length = p->length - p->wirevec[1].iov_len;
1054     for (; iov < end && length > 0; iov++) {
1055         if (!iov->iov_base)
1056             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1057         length -= iov->iov_len;
1058     }
1059
1060     /* iov now points to the first empty data buffer. */
1061     if (iov >= end)
1062         return 0;
1063
1064     RX_TS_INFO_GET(rx_ts_info);
1065     for (; iov < end; iov++) {
1066         if (!iov->iov_base)
1067             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1068         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1069         p->niovecs--;
1070     }
1071     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1072         NETPRI;
1073         MUTEX_ENTER(&rx_freePktQ_lock);
1074
1075         RX_TS_FPQ_LTOG(rx_ts_info);
1076         rxi_PacketsUnWait();
1077
1078         MUTEX_EXIT(&rx_freePktQ_lock);
1079         USERPRI;
1080     }
1081
1082     return 0;
1083 }
1084 #else /* RX_ENABLE_TSFPQ */
1085 int
1086 rxi_TrimDataBufs(struct rx_packet *p, int first)
1087 {
1088     int length;
1089     struct iovec *iov, *end;
1090     SPLVAR;
1091
1092     if (first != 1)
1093         osi_Panic("TrimDataBufs 1: first must be 1");
1094
1095     /* Skip over continuation buffers containing message data */
1096     iov = &p->wirevec[2];
1097     end = iov + (p->niovecs - 2);
1098     length = p->length - p->wirevec[1].iov_len;
1099     for (; iov < end && length > 0; iov++) {
1100         if (!iov->iov_base)
1101             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1102         length -= iov->iov_len;
1103     }
1104
1105     /* iov now points to the first empty data buffer. */
1106     if (iov >= end)
1107         return 0;
1108
1109     NETPRI;
1110     MUTEX_ENTER(&rx_freePktQ_lock);
1111
1112     for (; iov < end; iov++) {
1113         if (!iov->iov_base)
1114             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1115         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1116         p->niovecs--;
1117     }
1118     rxi_PacketsUnWait();
1119
1120     MUTEX_EXIT(&rx_freePktQ_lock);
1121     USERPRI;
1122
1123     return 0;
1124 }
1125 #endif /* RX_ENABLE_TSFPQ */
1126
1127 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1128  * remove it yourself first if you call this routine. */
1129 #ifdef RX_ENABLE_TSFPQ
/* Thread-specific free packet queue variant: release p's continuation
 * buffers, then p itself. */
void
rxi_FreePacket(struct rx_packet *p)
{
    /* flush_global is 0 here, so this call never moves packets to the
     * global pool on its own. */
    rxi_FreeDataBufsTSFPQ(p, 2, 0);
    /* NOTE(review): RX_TS_FPQ_FLUSH_GLOBAL presumably asks for a single
     * over-quota flush covering p and its buffers -- confirm against
     * rxi_FreePacketTSFPQ. */
    rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
}
1136 #else /* RX_ENABLE_TSFPQ */
/* Global free packet queue variant: with rx_freePktQ_lock held (inside a
 * NETPRI/USERPRI bracket), release p's continuation buffers, release p
 * itself, and call rxi_PacketsUnWait. */
void
rxi_FreePacket(struct rx_packet *p)
{
    SPLVAR;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    /* Continuation buffers go first; this also zeroes p->length and
     * p->niovecs. */
    rxi_FreeDataBufsNoLock(p, 2);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
1153 #endif /* RX_ENABLE_TSFPQ */
1154
1155 /* rxi_AllocPacket sets up p->length so it reflects the number of
1156  * bytes in the packet at this point, **not including** the header.
1157  * The header is absolutely necessary, besides, this is the way the
1158  * length field is usually used */
1159 #ifdef RX_ENABLE_TSFPQ
1160 static struct rx_packet *
1161 rxi_AllocPacketNoLock(int class)
1162 {
1163     struct rx_packet *p;
1164     struct rx_ts_info_t * rx_ts_info;
1165
1166     RX_TS_INFO_GET(rx_ts_info);
1167
1168     if (rx_stats_active)
1169         rx_atomic_inc(&rx_stats.packetRequests);
1170     if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1171
1172 #ifdef KERNEL
1173         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1174             osi_Panic("rxi_AllocPacket error");
1175 #else /* KERNEL */
1176         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1177             rxi_MorePacketsNoLock(rx_maxSendWindow);
1178 #endif /* KERNEL */
1179
1180
1181         RX_TS_FPQ_GTOL(rx_ts_info);
1182     }
1183
1184     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1185
1186     dpf(("Alloc %p, class %d\n", p, class));
1187
1188
1189     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1190      * order to truncate outbound packets.  In the near future, may need
1191      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1192      */
1193     RX_PACKET_IOV_FULLINIT(p);
1194     return p;
1195 }
1196 #else /* RX_ENABLE_TSFPQ */
1197 static struct rx_packet *
1198 rxi_AllocPacketNoLock(int class)
1199 {
1200     struct rx_packet *p;
1201
1202 #ifdef KERNEL
1203     if (rxi_OverQuota(class)) {
1204         rxi_NeedMorePackets = TRUE;
1205         if (rx_stats_active) {
1206             switch (class) {
1207             case RX_PACKET_CLASS_RECEIVE:
1208                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1209                 break;
1210             case RX_PACKET_CLASS_SEND:
1211                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1212                 break;
1213             case RX_PACKET_CLASS_SPECIAL:
1214                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1215                 break;
1216             case RX_PACKET_CLASS_RECV_CBUF:
1217                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1218                 break;
1219             case RX_PACKET_CLASS_SEND_CBUF:
1220                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1221                 break;
1222             }
1223         }
1224         return (struct rx_packet *)0;
1225     }
1226 #endif /* KERNEL */
1227
1228     if (rx_stats_active)
1229         rx_atomic_inc(&rx_stats.packetRequests);
1230
1231 #ifdef KERNEL
1232     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1233         osi_Panic("rxi_AllocPacket error");
1234 #else /* KERNEL */
1235     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1236         rxi_MorePacketsNoLock(rx_maxSendWindow);
1237 #endif /* KERNEL */
1238
1239     rx_nFreePackets--;
1240     p = opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
1241     opr_queue_Remove(&p->entry);
1242     RX_FPQ_MARK_USED(p);
1243
1244     dpf(("Alloc %p, class %d\n", p, class));
1245
1246
1247     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1248      * order to truncate outbound packets.  In the near future, may need
1249      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1250      */
1251     RX_PACKET_IOV_FULLINIT(p);
1252     return p;
1253 }
1254 #endif /* RX_ENABLE_TSFPQ */
1255
1256 #ifdef RX_ENABLE_TSFPQ
1257 static struct rx_packet *
1258 rxi_AllocPacketTSFPQ(int class, int pull_global)
1259 {
1260     struct rx_packet *p;
1261     struct rx_ts_info_t * rx_ts_info;
1262
1263     RX_TS_INFO_GET(rx_ts_info);
1264
1265     if (rx_stats_active)
1266         rx_atomic_inc(&rx_stats.packetRequests);
1267     if (pull_global && opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1268         MUTEX_ENTER(&rx_freePktQ_lock);
1269
1270         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1271             rxi_MorePacketsNoLock(rx_maxSendWindow);
1272
1273         RX_TS_FPQ_GTOL(rx_ts_info);
1274
1275         MUTEX_EXIT(&rx_freePktQ_lock);
1276     } else if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1277         return NULL;
1278     }
1279
1280     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1281
1282     dpf(("Alloc %p, class %d\n", p, class));
1283
1284     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1285      * order to truncate outbound packets.  In the near future, may need
1286      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1287      */
1288     RX_PACKET_IOV_FULLINIT(p);
1289     return p;
1290 }
1291 #endif /* RX_ENABLE_TSFPQ */
1292
1293 #ifdef RX_ENABLE_TSFPQ
1294 struct rx_packet *
1295 rxi_AllocPacket(int class)
1296 {
1297     struct rx_packet *p;
1298
1299     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1300     return p;
1301 }
1302 #else /* RX_ENABLE_TSFPQ */
1303 struct rx_packet *
1304 rxi_AllocPacket(int class)
1305 {
1306     struct rx_packet *p;
1307
1308     MUTEX_ENTER(&rx_freePktQ_lock);
1309     p = rxi_AllocPacketNoLock(class);
1310     MUTEX_EXIT(&rx_freePktQ_lock);
1311     return p;
1312 }
1313 #endif /* RX_ENABLE_TSFPQ */
1314
/* This guy comes up with as many buffers as it {takes,can get} given
 * the MTU for this call. It also sets the packet length before
 * returning.  caution: this is often called at NETPRI
 * Called with call locked.
 *
 * [IN] call -- call the packet is for; its MTU bounds the packet size and
 *              its connection supplies the security header/trailer sizes
 * [IN] want -- number of payload bytes the caller would like room for
 *
 * The returned packet's length excludes the room reserved for the
 * security header and maximum trailer.  While waiting for a free packet
 * call->lock is dropped and re-acquired, and RX_CALL_WAIT_PACKETS is set
 * on the call for the duration of the wait.
 *
 * returns:
 *   the packet, or NULL if call->error is set before a packet is obtained
 *   or if the packet obtained cannot hold more than the security overhead
 */
struct rx_packet *
rxi_AllocSendPacket(struct rx_call *call, int want)
{
    struct rx_packet *p = (struct rx_packet *)0;
    int mud;
    unsigned delta;

    SPLVAR;
    /* mud: most data bytes a packet may carry for this call's MTU */
    mud = call->MTU - RX_HEADER_SIZE;
    /* delta: bytes reserved for the security header plus largest trailer */
    delta =
        rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
        rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));

#ifdef RX_ENABLE_TSFPQ
    /* Fast path: try the thread-local queue only (pull_global is 0).  On
     * failure fall through to the locked loop below. */
    if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
        want += delta;
        want = MIN(want, mud);

        /* Best effort: the return value is ignored, so the packet may end
         * up smaller than requested. */
        if ((unsigned)want > p->length)
            (void)rxi_AllocDataBuf(p, (want - p->length),
                                   RX_PACKET_CLASS_SEND_CBUF);

        if (p->length > mud)
            p->length = mud;

        /* No room left for payload once the security overhead is removed */
        if (delta >= p->length) {
            rxi_FreePacket(p);
            p = NULL;
        } else {
            p->length -= delta;
        }
        return p;
    }
#endif /* RX_ENABLE_TSFPQ */

    while (!(call->error)) {
        MUTEX_ENTER(&rx_freePktQ_lock);
        /* if an error occurred, or we get the packet we want, we're done */
        if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
            MUTEX_EXIT(&rx_freePktQ_lock);

            /* Same sizing steps as the thread-local fast path above; this
             * runs at most once because of the break below. */
            want += delta;
            want = MIN(want, mud);

            if ((unsigned)want > p->length)
                (void)rxi_AllocDataBuf(p, (want - p->length),
                                       RX_PACKET_CLASS_SEND_CBUF);

            if (p->length > mud)
                p->length = mud;

            if (delta >= p->length) {
                rxi_FreePacket(p);
                p = NULL;
            } else {
                p->length -= delta;
            }
            break;
        }

        /* no error occurred, and we didn't get a packet, so we sleep.
         * At this point, we assume that packets will be returned
         * sooner or later, as packets are acknowledged, and so we
         * just wait.  */
        NETPRI;
        call->flags |= RX_CALL_WAIT_PACKETS;
        /* Hold a reference on the call while its lock is released. */
        CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
        MUTEX_EXIT(&call->lock);
        rx_waitingForPackets = 1;

        /* rx_freePktQ_lock is still held from the top of this iteration */
#ifdef  RX_ENABLE_LOCKS
        CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
#else
        osi_rxSleep(&rx_waitingForPackets);
#endif
        MUTEX_EXIT(&rx_freePktQ_lock);
        MUTEX_ENTER(&call->lock);
        CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
        call->flags &= ~RX_CALL_WAIT_PACKETS;
        USERPRI;
    }

    return p;
}
1404
1405 #ifndef KERNEL
1406 #ifdef AFS_NT40_ENV
1407 /* Windows does not use file descriptors. */
1408 #define CountFDs(amax) 0
1409 #else
/* Count how many of the descriptors 0 .. amax-1 are in use; a descriptor
 * counts as used when fstat() on it succeeds. */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd = 0;
    int nused = 0;

    while (fd < amax) {
        if (fstat(fd, &sb) == 0)
            nused++;
        fd++;
    }
    return nused;
}
1426 #endif /* AFS_NT40_ENV */
1427 #else /* KERNEL */
1428
1429 #define CountFDs(amax) amax
1430
1431 #endif /* KERNEL */
1432
1433 #if !defined(KERNEL) || defined(UKERNEL)
1434
/* This function reads a single packet from the interface into the
 * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
 * (host,port) of the sender are stored in the supplied variables, and
 * the data length of the packet is stored in the packet structure.
 * The header is decoded.
 *
 * [IN]  socket -- socket to read from (via rxi_Recvmsg)
 * [IN]  p      -- packet to fill; continuation buffers may be added to it
 * [OUT] host   -- sender address, copied as-is from the received sockaddr
 * [OUT] port   -- sender port, copied as-is from the received sockaddr
 *
 * returns:
 *   1 when a packet was read and its header decoded;
 *   0 when the read failed or would block, when the datagram size is out
 *   of range, or (RXDEBUG builds only) when the packet was deliberately
 *   dropped.  *host and *port are not written on the bogus-packet path.
 */
int
rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
               u_short * port)
{
    struct sockaddr_in from;
    int nbytes;
    afs_int32 rlen;
    afs_uint32 tlen, savelen;
    struct msghdr msg;
    rx_computelen(p, tlen);
    rx_SetDataSize(p, tlen);    /* this is the size of the user data area */

    tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
    rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
                                 * it once in order to avoid races.  */
    /* Bytes still missing to hold an rlen-sized datagram.
     * NOTE(review): tlen is unsigned, so the "> 0" tests below are true
     * for any nonzero value, including a wrapped negative difference --
     * confirm this is intended before changing the types. */
    tlen = rlen - tlen;
    if (tlen > 0) {
        /* presumably returns the number of bytes it could not supply --
         * confirm against rxi_AllocDataBuf */
        tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
        if (tlen > 0) {
            tlen = rlen - tlen;
        } else
            tlen = rlen;
    } else
        tlen = rlen;
    /* tlen now holds the largest datagram size accepted below */

    /* Extend the last iovec for padding, it's just to make sure that the
     * read doesn't return more data than we expect, and is done to get around
     * our problems caused by the lack of a length field in the rx header.
     * Use the extra buffer that follows the localdata in each packet
     * structure. */
    savelen = p->wirevec[p->niovecs - 1].iov_len;
    p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;

    memset(&msg, 0, sizeof(msg));
    msg.msg_name = (char *)&from;
    msg.msg_namelen = sizeof(struct sockaddr_in);
    msg.msg_iov = p->wirevec;
    msg.msg_iovlen = p->niovecs;
    nbytes = rxi_Recvmsg(socket, &msg, 0);

    /* restore the vec to its correct state */
    p->wirevec[p->niovecs - 1].iov_len = savelen;

    /* The length is truncated to 16 bits; the 0x8000 test below rejects
     * any value with the top bit set, which presumably also catches
     * datagrams shorter than the header (negative difference). */
    p->length = (u_short)(nbytes - RX_HEADER_SIZE);
    if (nbytes < 0 || (nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
        if (nbytes < 0 && errno == EWOULDBLOCK) {
            if (rx_stats_active)
                rx_atomic_inc(&rx_stats.noPacketOnRead);
        } else if (nbytes <= 0) {
            /* Only failed or empty reads are counted and logged here;
             * oversized or too-short datagrams are rejected silently. */
            if (rx_stats_active) {
                rx_atomic_inc(&rx_stats.bogusPacketOnRead);
                rx_stats.bogusHost = from.sin_addr.s_addr;
            }
            dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
                 ntohs(from.sin_port), nbytes));
        }
        return 0;
    }
#ifdef RXDEBUG
    /* Debug aid: drop roughly rx_intentionallyDroppedOnReadPer100 percent
     * of otherwise valid packets. */
    else if ((rx_intentionallyDroppedOnReadPer100 > 0)
                && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
        rxi_DecodePacketHeader(p);

        *host = from.sin_addr.s_addr;
        *port = from.sin_port;

        /* NOTE(review): p->header.type is used as an index here without
         * the range check applied on the normal path below. */
        dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
              p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
              p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
              p->length));
#ifdef RX_TRIMDATABUFS
        rxi_TrimDataBufs(p, 1);
#endif
        return 0;
    }
#endif
    else {
        /* Extract packet header. */
        rxi_DecodePacketHeader(p);

        *host = from.sin_addr.s_addr;
        *port = from.sin_port;
        /* Per-type read counter, only for types inside the table range */
        if (rx_stats_active
            && p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {

                rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
        }

#ifdef RX_TRIMDATABUFS
        /* Free any empty packet buffers at the end of this packet */
        rxi_TrimDataBufs(p, 1);
#endif
        return 1;
    }
}
1535
1536 #endif /* !KERNEL || UKERNEL */
1537
1538 /* This function splits off the first packet in a jumbo packet.
1539  * As of AFS 3.5, jumbograms contain more than one fixed size
1540  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1541  * last packet header. All packets (except the last) are padded to
1542  * fall on RX_CBUFFERSIZE boundaries.
1543  * HACK: We store the length of the first n-1 packets in the
1544  * last two pad bytes. */
1545
1546 struct rx_packet *
1547 rxi_SplitJumboPacket(struct rx_packet *p)
1548 {
1549     struct rx_packet *np;
1550     struct rx_jumboHeader *jp;
1551     int niov, i;
1552     struct iovec *iov;
1553     int length;
1554     afs_uint32 temp;
1555
1556     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1557      * bytes in length. All but the first packet are preceded by
1558      * an abbreviated four byte header. The length of the last packet
1559      * is calculated from the size of the jumbogram. */
1560     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1561
1562     if ((int)p->length < length) {
1563         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1564         return NULL;
1565     }
1566     niov = p->niovecs - 2;
1567     if (niov < 1) {
1568         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1569         return NULL;
1570     }
1571     iov = &p->wirevec[2];
1572     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1573
1574     /* Get a pointer to the abbreviated packet header */
1575     jp = (struct rx_jumboHeader *)
1576         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1577
1578     /* Set up the iovecs for the next packet */
1579     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1580     np->wirevec[0].iov_len = sizeof(struct rx_header);
1581     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1582     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1583     np->niovecs = niov + 1;
1584     for (i = 2, iov++; i <= niov; i++, iov++) {
1585         np->wirevec[i] = *iov;
1586     }
1587     np->length = p->length - length;
1588     p->length = RX_JUMBOBUFFERSIZE;
1589     p->niovecs = 2;
1590
1591     /* Convert the jumbo packet header to host byte order */
1592     temp = ntohl(*(afs_uint32 *) jp);
1593     jp->flags = (u_char) (temp >> 24);
1594     jp->cksum = (u_short) (temp);
1595
1596     /* Fill in the packet header */
1597     np->header = p->header;
1598     np->header.serial = p->header.serial + 1;
1599     np->header.seq = p->header.seq + 1;
1600     np->header.userStatus = 0;
1601     np->header.flags = jp->flags;
1602     np->header.spare = jp->cksum;
1603
1604     return np;
1605 }
1606
1607 #ifndef KERNEL
1608 /* Send a udp datagram */
1609 int
1610 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1611             int length, int istack)
1612 {
1613     struct msghdr msg;
1614         int ret;
1615
1616     memset(&msg, 0, sizeof(msg));
1617     msg.msg_iov = dvec;
1618     msg.msg_iovlen = nvecs;
1619     msg.msg_name = addr;
1620     msg.msg_namelen = sizeof(struct sockaddr_in);
1621
1622     ret = rxi_Sendmsg(socket, &msg, 0);
1623
1624     return ret;
1625 }
1626 #elif !defined(UKERNEL)
1627 /*
1628  * message receipt is done in rxk_input or rx_put.
1629  */
1630
1631 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1632 /*
1633  * Copy an mblock to the contiguous area pointed to by cp.
1634  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1635  * but it doesn't really.
1636  * Returns the number of bytes not transferred.
1637  * The message is NOT changed.
1638  */
1639 static int
1640 cpytoc(mblk_t * mp, int off, int len, char *cp)
1641 {
1642     int n;
1643
1644     for (; mp && len > 0; mp = mp->b_cont) {
1645         if (mp->b_datap->db_type != M_DATA) {
1646             return -1;
1647         }
1648         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1649         memcpy(cp, (char *)mp->b_rptr, n);
1650         cp += n;
1651         len -= n;
1652         mp->b_rptr += n;
1653     }
1654     return (len);
1655 }
1656
1657 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1658  * but it doesn't really.
1659  * This sucks, anyway, do it like m_cpy.... below
1660  */
1661 static int
1662 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1663            int niovs)
1664 {
1665     int m, n, o, t, i;
1666
1667     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1668         if (mp->b_datap->db_type != M_DATA) {
1669             return -1;
1670         }
1671         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1672         len -= n;
1673         while (n) {
1674             if (!t) {
1675                 o = 0;
1676                 i++;
1677                 t = iovs[i].iov_len;
1678             }
1679             m = MIN(n, t);
1680             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1681             mp->b_rptr += m;
1682             o += m;
1683             t -= m;
1684             n -= m;
1685         }
1686     }
1687     return (len);
1688 }
1689
1690 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1691 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1692 #else
1693 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1694 static int
1695 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1696 {
1697     caddr_t p1, p2;
1698     unsigned int l1, l2, i, t;
1699
1700     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1701         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1702
1703     while (off && m)
1704         if (m->m_len <= off) {
1705             off -= m->m_len;
1706             m = m->m_next;
1707             continue;
1708         } else
1709             break;
1710
1711     if (m == NULL)
1712         return len;
1713
1714     p1 = mtod(m, caddr_t) + off;
1715     l1 = m->m_len - off;
1716     i = 0;
1717     p2 = iovs[0].iov_base;
1718     l2 = iovs[0].iov_len;
1719
1720     while (len) {
1721         t = MIN(l1, MIN(l2, (unsigned int)len));
1722         memcpy(p2, p1, t);
1723         p1 += t;
1724         p2 += t;
1725         l1 -= t;
1726         l2 -= t;
1727         len -= t;
1728         if (!l1) {
1729             m = m->m_next;
1730             if (!m)
1731                 break;
1732             p1 = mtod(m, caddr_t);
1733             l1 = m->m_len;
1734         }
1735         if (!l2) {
1736             if (++i >= niovs)
1737                 break;
1738             p2 = iovs[i].iov_base;
1739             l2 = iovs[i].iov_len;
1740         }
1741
1742     }
1743
1744     return len;
1745 }
1746 #endif /* LINUX */
1747 #endif /* AFS_SUN5_ENV */
1748
1749 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
/*
 * Copy the contents of a network buffer chain into a packet's iovecs and
 * then release the chain.
 *
 * amb      -- source buffer chain (mblk_t on SUN5/HPUX110, mbuf otherwise)
 * free     -- callback invoked on amb once the copy has been attempted
 * hdr_len  -- passed to m_cpytoiovec as the offset to start copying from
 *             (the SUN5/HPUX110 implementation, cpytoiovec, ignores it)
 * data_len -- number of bytes to copy
 * phandle  -- destination packet; its wirevec and niovecs describe where
 *             the data goes
 *
 * Returns the m_cpytoiovec result: the number of bytes that were not
 * copied (the SUN5/HPUX110 implementation may also return -1).
 */
#if defined(AFS_NBSD_ENV)
int
rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
#else
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
#endif /* AFS_NBSD_ENV */
{
    int code;

    code =
        m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
                     phandle->niovecs);
    /* amb is released whether or not everything was copied */
    (*free) (amb);

    return code;
}
1775 #endif /* LINUX */
1776 #endif /*KERNEL && !UKERNEL */
1777
1778
1779 /* send a response to a debug packet */
1780
1781 struct rx_packet *
1782 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1783                        afs_uint32 ahost, short aport, int istack)
1784 {
1785     struct rx_debugIn tin;
1786     afs_int32 tl;
1787
1788     /*
1789      * Only respond to client-initiated Rx debug packets,
1790      * and clear the client flag in the response.
1791      */
1792     if (ap->header.flags & RX_CLIENT_INITIATED) {
1793         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1794         rxi_EncodePacketHeader(ap);
1795     } else {
1796         return ap;
1797     }
1798
1799     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1800     /* all done with packet, now set length to the truth, so we can
1801      * reuse this packet */
1802     rx_computelen(ap, ap->length);
1803
1804     tin.type = ntohl(tin.type);
1805     tin.index = ntohl(tin.index);
1806     switch (tin.type) {
1807     case RX_DEBUGI_GETSTATS:{
1808             struct rx_debugStats tstat;
1809
1810             /* get basic stats */
1811             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1812             tstat.version = RX_DEBUGI_VERSION;
1813 #ifndef RX_ENABLE_LOCKS
1814             tstat.waitingForPackets = rx_waitingForPackets;
1815 #endif
1816             MUTEX_ENTER(&rx_serverPool_lock);
1817             tstat.nFreePackets = htonl(rx_nFreePackets);
1818             tstat.nPackets = htonl(rx_nPackets);
1819             tstat.callsExecuted = htonl(rxi_nCalls);
1820             tstat.packetReclaims = htonl(rx_packetReclaims);
1821             tstat.usedFDs = CountFDs(64);
1822             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1823             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1824             tstat.idleThreads = opr_queue_Count(&rx_idleServerQueue);
1825             MUTEX_EXIT(&rx_serverPool_lock);
1826             tstat.idleThreads = htonl(tstat.idleThreads);
1827             tl = sizeof(struct rx_debugStats) - ap->length;
1828             if (tl > 0)
1829                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1830
1831             if (tl <= 0) {
1832                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1833                                (char *)&tstat);
1834                 ap->length = sizeof(struct rx_debugStats);
1835                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1836                 rx_computelen(ap, ap->length);
1837             }
1838             break;
1839         }
1840
1841     case RX_DEBUGI_GETALLCONN:
1842     case RX_DEBUGI_GETCONN:{
1843             unsigned int i, j;
1844             struct rx_connection *tc;
1845             struct rx_call *tcall;
1846             struct rx_debugConn tconn;
1847             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1848
1849
1850             tl = sizeof(struct rx_debugConn) - ap->length;
1851             if (tl > 0)
1852                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1853             if (tl > 0)
1854                 return ap;
1855
1856             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1857             /* get N'th (maybe) "interesting" connection info */
1858             for (i = 0; i < rx_hashTableSize; i++) {
1859 #if !defined(KERNEL)
1860                 /* the time complexity of the algorithm used here
                 * exponentially increases with the number of connections.
1862                  */
1863 #ifdef AFS_PTHREAD_ENV
1864                 pthread_yield();
1865 #else
1866                 (void)IOMGR_Poll();
1867 #endif
1868 #endif
1869                 MUTEX_ENTER(&rx_connHashTable_lock);
1870                 /* We might be slightly out of step since we are not
1871                  * locking each call, but this is only debugging output.
1872                  */
1873                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1874                     if ((all || rxi_IsConnInteresting(tc))
1875                         && tin.index-- <= 0) {
1876                         int do_secstats = 0;
1877                         tconn.host = tc->peer->host;
1878                         tconn.port = tc->peer->port;
1879                         tconn.cid = htonl(tc->cid);
1880                         tconn.epoch = htonl(tc->epoch);
1881                         tconn.serial = htonl(tc->serial);
1882                         for (j = 0; j < RX_MAXCALLS; j++) {
1883                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1884                             if ((tcall = tc->call[j])) {
1885                                 tconn.callState[j] = tcall->state;
1886                                 tconn.callMode[j] = tcall->app.mode;
1887                                 tconn.callFlags[j] = tcall->flags;
1888                                 if (!opr_queue_IsEmpty(&tcall->rq))
1889                                     tconn.callOther[j] |= RX_OTHER_IN;
1890                                 if (!opr_queue_IsEmpty(&tcall->tq))
1891                                     tconn.callOther[j] |= RX_OTHER_OUT;
1892                             } else
1893                                 tconn.callState[j] = RX_STATE_NOTINIT;
1894                         }
1895
1896                         tconn.natMTU = htonl(tc->peer->natMTU);
1897                         tconn.error = htonl(tc->error);
1898                         tconn.flags = (u_char) (tc->flags & 0xff);  /* compat. */
1899                         tconn.type = tc->type;
1900                         tconn.securityIndex = tc->securityIndex;
1901                         if (tc->securityObject) {
1902                             int code;
1903                             code = RXS_GetStats(tc->securityObject, tc,
1904                                                 &tconn.secStats);
1905                             if (code == 0) {
1906                                 do_secstats = 1;
1907                             }
1908                         }
1909                         if (do_secstats) {
1910 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1911 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1912                             DOHTONL(flags);
1913                             DOHTONL(expires);
1914                             DOHTONL(packetsReceived);
1915                             DOHTONL(packetsSent);
1916                             DOHTONL(bytesReceived);
1917                             DOHTONL(bytesSent);
1918                             for (i = 0;
1919                                  i <
1920                                  sizeof(tconn.secStats.spares) /
1921                                  sizeof(short); i++)
1922                                 DOHTONS(spares[i]);
1923                             for (i = 0;
1924                                  i <
1925                                  sizeof(tconn.secStats.sparel) /
1926                                  sizeof(afs_int32); i++)
1927                                 DOHTONL(sparel[i]);
1928                         } else {
1929                             memset(&tconn.secStats, 0, sizeof(tconn.secStats));
1930                         }
1931
1932                         MUTEX_EXIT(&rx_connHashTable_lock);
1933                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1934                                        (char *)&tconn);
1935                         tl = ap->length;
1936                         ap->length = sizeof(struct rx_debugConn);
1937                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1938                                             istack);
1939                         ap->length = tl;
1940                         return ap;
1941                     }
1942                 }
1943                 MUTEX_EXIT(&rx_connHashTable_lock);
1944             }
1945             /* if we make it here, there are no interesting packets */
1946             tconn.cid = htonl(0xffffffff);      /* means end */
1947             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1948                            (char *)&tconn);
1949             tl = ap->length;
1950             ap->length = sizeof(struct rx_debugConn);
1951             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1952             ap->length = tl;
1953             break;
1954         }
1955
1956         /*
1957          * Pass back all the peer structures we have available
1958          */
1959
1960     case RX_DEBUGI_GETPEER:{
1961             unsigned int i;
1962             struct rx_peer *tp;
1963             struct rx_debugPeer tpeer;
1964
1965
1966             tl = sizeof(struct rx_debugPeer) - ap->length;
1967             if (tl > 0)
1968                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1969             if (tl > 0)
1970                 return ap;
1971
1972             memset(&tpeer, 0, sizeof(tpeer));
1973             for (i = 0; i < rx_hashTableSize; i++) {
1974 #if !defined(KERNEL)
1975                 /* the time complexity of the algorithm used here
1976                  * exponentially increses with the number of peers.
1977                  *
1978                  * Yielding after processing each hash table entry
1979                  * and dropping rx_peerHashTable_lock.
1980                  * also increases the risk that we will miss a new
1981                  * entry - but we are willing to live with this
1982                  * limitation since this is meant for debugging only
1983                  */
1984 #ifdef AFS_PTHREAD_ENV
1985                 pthread_yield();
1986 #else
1987                 (void)IOMGR_Poll();
1988 #endif
1989 #endif
1990                 MUTEX_ENTER(&rx_peerHashTable_lock);
1991                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1992                     if (tin.index-- <= 0) {
1993                         tp->refCount++;
1994                         MUTEX_EXIT(&rx_peerHashTable_lock);
1995
1996                         MUTEX_ENTER(&tp->peer_lock);
1997                         tpeer.host = tp->host;
1998                         tpeer.port = tp->port;
1999                         tpeer.ifMTU = htons(tp->ifMTU);
2000                         tpeer.idleWhen = htonl(tp->idleWhen);
2001                         tpeer.refCount = htons(tp->refCount);
2002                         tpeer.burstSize = 0;
2003                         tpeer.burst = 0;
2004                         tpeer.burstWait.sec = 0;
2005                         tpeer.burstWait.usec = 0;
2006                         tpeer.rtt = htonl(tp->rtt);
2007                         tpeer.rtt_dev = htonl(tp->rtt_dev);
2008                         tpeer.nSent = htonl(tp->nSent);
2009                         tpeer.reSends = htonl(tp->reSends);
2010                         tpeer.natMTU = htons(tp->natMTU);
2011                         tpeer.maxMTU = htons(tp->maxMTU);
2012                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2013                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2014                         tpeer.MTU = htons(tp->MTU);
2015                         tpeer.cwind = htons(tp->cwind);
2016                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2017                         tpeer.congestSeq = htons(tp->congestSeq);
2018                         tpeer.bytesSent.high =
2019                             htonl(tp->bytesSent >> 32);
2020                         tpeer.bytesSent.low =
2021                             htonl(tp->bytesSent & MAX_AFS_UINT32);
2022                         tpeer.bytesReceived.high =
2023                             htonl(tp->bytesReceived >> 32);
2024                         tpeer.bytesReceived.low =
2025                             htonl(tp->bytesReceived & MAX_AFS_UINT32);
2026                         MUTEX_EXIT(&tp->peer_lock);
2027
2028                         MUTEX_ENTER(&rx_peerHashTable_lock);
2029                         tp->refCount--;
2030                         MUTEX_EXIT(&rx_peerHashTable_lock);
2031
2032                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2033                                        (char *)&tpeer);
2034                         tl = ap->length;
2035                         ap->length = sizeof(struct rx_debugPeer);
2036                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2037                                             istack);
2038                         ap->length = tl;
2039                         return ap;
2040                     }
2041                 }
2042                 MUTEX_EXIT(&rx_peerHashTable_lock);
2043             }
2044             /* if we make it here, there are no interesting packets */
2045             tpeer.host = htonl(0xffffffff);     /* means end */
2046             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2047                            (char *)&tpeer);
2048             tl = ap->length;
2049             ap->length = sizeof(struct rx_debugPeer);
2050             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2051             ap->length = tl;
2052             break;
2053         }
2054
2055     case RX_DEBUGI_RXSTATS:{
2056             int i;
2057             afs_int32 *s;
2058
2059             tl = sizeof(rx_stats) - ap->length;
2060             if (tl > 0)
2061                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2062             if (tl > 0)
2063                 return ap;
2064
2065             /* Since its all int32s convert to network order with a loop. */
2066             if (rx_stats_active)
2067                 MUTEX_ENTER(&rx_stats_mutex);
2068             s = (afs_int32 *) & rx_stats;
2069             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2070                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2071
2072             tl = ap->length;
2073             ap->length = sizeof(rx_stats);
2074             if (rx_stats_active)
2075                 MUTEX_EXIT(&rx_stats_mutex);
2076             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2077             ap->length = tl;
2078             break;
2079         }
2080
2081     default:
2082         /* error response packet */
2083         tin.type = htonl(RX_DEBUGI_BADTYPE);
2084         tin.index = tin.type;
2085         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2086         tl = ap->length;
2087         ap->length = sizeof(struct rx_debugIn);
2088         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2089         ap->length = tl;
2090         break;
2091     }
2092     return ap;
2093 }
2094
2095 struct rx_packet *
2096 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2097                          afs_uint32 ahost, short aport, int istack)
2098 {
2099     afs_int32 tl;
2100
2101     /*
2102      * Only respond to client-initiated version requests, and
2103      * clear that flag in the response.
2104      */
2105     if (ap->header.flags & RX_CLIENT_INITIATED) {
2106         char buf[66];
2107
2108         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2109         rxi_EncodePacketHeader(ap);
2110         memset(buf, 0, sizeof(buf));
2111         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2112         rx_packetwrite(ap, 0, 65, buf);
2113         tl = ap->length;
2114         ap->length = 65;
2115         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2116         ap->length = tl;
2117     }
2118
2119     return ap;
2120 }
2121
2122
2123 /* send a debug packet back to the sender */
2124 static void
2125 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2126                     afs_uint32 ahost, short aport, afs_int32 istack)
2127 {
2128     struct sockaddr_in taddr;
2129     unsigned int i, nbytes, savelen = 0;
2130     int saven = 0;
2131 #ifdef KERNEL
2132     int waslocked = ISAFS_GLOCK();
2133 #endif
2134
2135     taddr.sin_family = AF_INET;
2136     taddr.sin_port = aport;
2137     taddr.sin_addr.s_addr = ahost;
2138     memset(&taddr.sin_zero, 0, sizeof(taddr.sin_zero));
2139 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2140     taddr.sin_len = sizeof(struct sockaddr_in);
2141 #endif
2142
2143     /* We need to trim the niovecs. */
2144     nbytes = apacket->length;
2145     for (i = 1; i < apacket->niovecs; i++) {
2146         if (nbytes <= apacket->wirevec[i].iov_len) {
2147             savelen = apacket->wirevec[i].iov_len;
2148             saven = apacket->niovecs;
2149             apacket->wirevec[i].iov_len = nbytes;
2150             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2151         } else
2152             nbytes -= apacket->wirevec[i].iov_len;
2153     }
2154 #ifdef KERNEL
2155 #ifdef RX_KERNEL_TRACE
2156     if (ICL_SETACTIVE(afs_iclSetp)) {
2157         if (!waslocked)
2158             AFS_GLOCK();
2159         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2160                    "before rxi_NetSend()");
2161         AFS_GUNLOCK();
2162     }
2163 #else
2164     if (waslocked)
2165         AFS_GUNLOCK();
2166 #endif
2167 #endif
2168     /* debug packets are not reliably delivered, hence the cast below. */
2169     (void)rxi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2170                       apacket->length + RX_HEADER_SIZE, istack);
2171 #ifdef KERNEL
2172 #ifdef RX_KERNEL_TRACE
2173     if (ICL_SETACTIVE(afs_iclSetp)) {
2174         AFS_GLOCK();
2175         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2176                    "after rxi_NetSend()");
2177         if (!waslocked)
2178             AFS_GUNLOCK();
2179     }
2180 #else
2181     if (waslocked)
2182         AFS_GLOCK();
2183 #endif
2184 #endif
2185     if (saven) {                /* means we truncated the packet above. */
2186         apacket->wirevec[i - 1].iov_len = savelen;
2187         apacket->niovecs = saven;
2188     }
2189
2190 }
2191
2192 static void
2193 rxi_NetSendError(struct rx_call *call, int code)
2194 {
2195     int down = 0;
2196 #ifdef AFS_NT40_ENV
2197     if (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) {
2198         down = 1;
2199     }
2200     if (code == -WSAEHOSTUNREACH) {
2201         down = 1;
2202     }
2203 #elif defined(AFS_LINUX20_ENV)
2204     if (code == -ENETUNREACH) {
2205         down = 1;
2206     }
2207 #elif defined(AFS_DARWIN_ENV)
2208     if (code == EHOSTUNREACH) {
2209         down = 1;
2210     }
2211 #endif
2212     if (down) {
2213         call->lastReceiveTime = 0;
2214     }
2215 }
2216
2217 /* Send the packet to appropriate destination for the specified
2218  * call.  The header is first encoded and placed in the packet.
2219  */
2220 void
2221 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2222                struct rx_packet *p, int istack)
2223 {
2224 #if defined(KERNEL)
2225     int waslocked;
2226 #endif
2227     int code;
2228     struct sockaddr_in addr;
2229     struct rx_peer *peer = conn->peer;
2230     osi_socket socket;
2231 #ifdef RXDEBUG
2232     char deliveryType = 'S';
2233 #endif
2234     /* The address we're sending the packet to */
2235     memset(&addr, 0, sizeof(addr));
2236     addr.sin_family = AF_INET;
2237     addr.sin_port = peer->port;
2238     addr.sin_addr.s_addr = peer->host;
2239     memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2240
2241     /* This stuff should be revamped, I think, so that most, if not
2242      * all, of the header stuff is always added here.  We could
2243      * probably do away with the encode/decode routines. XXXXX */
2244
2245     /* Stamp each packet with a unique serial number.  The serial
2246      * number is maintained on a connection basis because some types
2247      * of security may be based on the serial number of the packet,
2248      * and security is handled on a per authenticated-connection
2249      * basis. */
2250     /* Pre-increment, to guarantee no zero serial number; a zero
2251      * serial number means the packet was never sent. */
2252     MUTEX_ENTER(&conn->conn_data_lock);
2253     p->header.serial = ++conn->serial;
2254     if (p->length > conn->peer->maxPacketSize) {
2255         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2256             (p->header.flags & RX_REQUEST_ACK)) {
2257             conn->lastPingSize = p->length;
2258             conn->lastPingSizeSer = p->header.serial;
2259         } else if (p->header.seq != 0) {
2260             conn->lastPacketSize = p->length;
2261             conn->lastPacketSizeSeq = p->header.seq;
2262         }
2263     }
2264     MUTEX_EXIT(&conn->conn_data_lock);
2265     /* This is so we can adjust retransmit time-outs better in the face of
2266      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2267      */
2268     if (p->firstSerial == 0) {
2269         p->firstSerial = p->header.serial;
2270     }
2271 #ifdef RXDEBUG
2272     /* If an output tracer function is defined, call it with the packet and
2273      * network address.  Note this function may modify its arguments. */
2274     if (rx_almostSent) {
2275         int drop = (*rx_almostSent) (p, &addr);
2276         /* drop packet if return value is non-zero? */
2277         if (drop)
2278             deliveryType = 'D'; /* Drop the packet */
2279     }
2280 #endif
2281
2282     /* Get network byte order header */
2283     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2284                                  * touch ALL the fields */
2285
2286     /* Send the packet out on the same socket that related packets are being
2287      * received on */
2288     socket =
2289         (conn->type ==
2290          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2291
2292 #ifdef RXDEBUG
2293     /* Possibly drop this packet,  for testing purposes */
2294     if ((deliveryType == 'D')
2295         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2296             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2297         deliveryType = 'D';     /* Drop the packet */
2298     } else {
2299         deliveryType = 'S';     /* Send the packet */
2300 #endif /* RXDEBUG */
2301
2302         /* Loop until the packet is sent.  We'd prefer just to use a
2303          * blocking socket, but unfortunately the interface doesn't
2304          * allow us to have the socket block in send mode, and not
2305          * block in receive mode */
2306 #ifdef KERNEL
2307         waslocked = ISAFS_GLOCK();
2308 #ifdef RX_KERNEL_TRACE
2309         if (ICL_SETACTIVE(afs_iclSetp)) {
2310             if (!waslocked)
2311                 AFS_GLOCK();
2312             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2313                        "before rxi_NetSend()");
2314             AFS_GUNLOCK();
2315         }
2316 #else
2317         if (waslocked)
2318             AFS_GUNLOCK();
2319 #endif
2320 #endif
2321         if ((code =
2322              rxi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2323                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2324             /* send failed, so let's hurry up the resend, eh? */
2325             if (rx_stats_active)
2326                 rx_atomic_inc(&rx_stats.netSendFailures);
2327             p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2328
2329             /* Some systems are nice and tell us right away that we cannot
2330              * reach this recipient by returning an error code.
2331              * So, when this happens let's "down" the host NOW so
2332              * we don't sit around waiting for this host to timeout later.
2333              */
2334             if (call) {
2335                 rxi_NetSendError(call, code);
2336             }
2337         }
2338 #ifdef KERNEL
2339 #ifdef RX_KERNEL_TRACE
2340         if (ICL_SETACTIVE(afs_iclSetp)) {
2341             AFS_GLOCK();
2342             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2343                        "after rxi_NetSend()");
2344             if (!waslocked)
2345                 AFS_GUNLOCK();
2346         }
2347 #else
2348         if (waslocked)
2349             AFS_GLOCK();
2350 #endif
2351 #endif
2352 #ifdef RXDEBUG
2353     }
2354     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %p len %d\n",
2355           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2356           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2357           p->header.seq, p->header.flags, p, p->length));
2358 #endif
2359     if (rx_stats_active) {
2360         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2361         MUTEX_ENTER(&peer->peer_lock);
2362         peer->bytesSent += p->length;
2363         MUTEX_EXIT(&peer->peer_lock);
2364     }
2365 }
2366
2367 /* Send a list of packets to appropriate destination for the specified
2368  * connection.  The headers are first encoded and placed in the packets.
2369  */
2370 void
2371 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2372                    struct rx_packet **list, int len, int istack)
2373 {
2374 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2375     int waslocked;
2376 #endif
2377     struct sockaddr_in addr;
2378     struct rx_peer *peer = conn->peer;
2379     osi_socket socket;
2380     struct rx_packet *p = NULL;
2381     struct iovec wirevec[RX_MAXIOVECS];
2382     int i, length, code;
2383     afs_uint32 serial;
2384     afs_uint32 temp;
2385     struct rx_jumboHeader *jp;
2386 #ifdef RXDEBUG
2387     char deliveryType = 'S';
2388 #endif
2389     /* The address we're sending the packet to */
2390     addr.sin_family = AF_INET;
2391     addr.sin_port = peer->port;
2392     addr.sin_addr.s_addr = peer->host;
2393     memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2394
2395     if (len + 1 > RX_MAXIOVECS) {
2396         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2397     }
2398
2399     /*
2400      * Stamp the packets in this jumbogram with consecutive serial numbers
2401      */
2402     MUTEX_ENTER(&conn->conn_data_lock);
2403     serial = conn->serial;
2404     conn->serial += len;
2405     for (i = 0; i < len; i++) {
2406         p = list[i];
2407         /* a ping *or* a sequenced packet can count */
2408         if (p->length > conn->peer->maxPacketSize) {
2409             if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2410                  (p->header.flags & RX_REQUEST_ACK)) &&
2411                 ((i == 0) || (p->length >= conn->lastPingSize))) {
2412                 conn->lastPingSize = p->length;
2413                 conn->lastPingSizeSer = serial + i;
2414             } else if ((p->header.seq != 0) &&
2415                        ((i == 0) || (p->length >= conn->lastPacketSize))) {
2416                 conn->lastPacketSize = p->length;
2417                 conn->lastPacketSizeSeq = p->header.seq;
2418             }
2419         }
2420     }
2421     MUTEX_EXIT(&conn->conn_data_lock);
2422
2423
2424     /* This stuff should be revamped, I think, so that most, if not
2425      * all, of the header stuff is always added here.  We could
2426      * probably do away with the encode/decode routines. XXXXX */
2427
2428     jp = NULL;
2429     length = RX_HEADER_SIZE;
2430     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2431     wirevec[0].iov_len = RX_HEADER_SIZE;
2432     for (i = 0; i < len; i++) {
2433         p = list[i];
2434
2435         /* The whole 3.5 jumbogram scheme relies on packets fitting
2436          * in a single packet buffer. */
2437         if (p->niovecs > 2) {
2438             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2439         }
2440
2441         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2442          * in this chunk.  */
2443         if (i < len - 1) {
2444             if (p->length != RX_JUMBOBUFFERSIZE) {
2445                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2446             }
2447             p->header.flags |= RX_JUMBO_PACKET;
2448             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2449             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2450         } else {
2451             wirevec[i + 1].iov_len = p->length;
2452             length += p->length;
2453         }
2454         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2455         if (jp != NULL) {
2456             /* Convert jumbo packet header to network byte order */
2457             temp = (afs_uint32) (p->header.flags) << 24;
2458             temp |= (afs_uint32) (p->header.spare);
2459             *(afs_uint32 *) jp = htonl(temp);
2460         }
2461         jp = (struct rx_jumboHeader *)
2462             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2463
2464         /* Stamp each packet with a unique serial number.  The serial
2465          * number is maintained on a connection basis because some types
2466          * of security may be based on the serial number of the packet,
2467          * and security is handled on a per authenticated-connection
2468          * basis. */
2469         /* Pre-increment, to guarantee no zero serial number; a zero
2470          * serial number means the packet was never sent. */
2471         p->header.serial = ++serial;
2472         /* This is so we can adjust retransmit time-outs better in the face of
2473          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2474          */
2475         if (p->firstSerial == 0) {
2476             p->firstSerial = p->header.serial;
2477         }
2478 #ifdef RXDEBUG
2479         /* If an output tracer function is defined, call it with the packet and
2480          * network address.  Note this function may modify its arguments. */
2481         if (rx_almostSent) {
2482             int drop = (*rx_almostSent) (p, &addr);
2483             /* drop packet if return value is non-zero? */
2484             if (drop)
2485                 deliveryType = 'D';     /* Drop the packet */
2486         }
2487 #endif
2488
2489         /* Get network byte order header */
2490         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2491                                          * touch ALL the fields */
2492     }
2493
2494     /* Send the packet out on the same socket that related packets are being
2495      * received on */
2496     socket =
2497         (conn->type ==
2498          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2499
2500 #ifdef RXDEBUG
2501     /* Possibly drop this packet,  for testing purposes */
2502     if ((deliveryType == 'D')
2503         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2504             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2505         deliveryType = 'D';     /* Drop the packet */
2506     } else {
2507         deliveryType = 'S';     /* Send the packet */
2508 #endif /* RXDEBUG */
2509
2510         /* Loop until the packet is sent.  We'd prefer just to use a
2511          * blocking socket, but unfortunately the interface doesn't
2512          * allow us to have the socket block in send mode, and not
2513          * block in receive mode */
2514 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2515         waslocked = ISAFS_GLOCK();
2516         if (!istack && waslocked)
2517             AFS_GUNLOCK();
2518 #endif
2519         if ((code =
2520              rxi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2521                          istack)) != 0) {
2522             /* send failed, so let's hurry up the resend, eh? */
2523             if (rx_stats_active)
2524                 rx_atomic_inc(&rx_stats.netSendFailures);
2525             for (i = 0; i < len; i++) {
2526                 p = list[i];
2527                 p->flags &= ~RX_PKTFLAG_SENT;  /* resend it very soon */
2528             }
2529             /* Some systems are nice and tell us right away that we cannot
2530              * reach this recipient by returning an error code.
2531              * So, when this happens let's "down" the host NOW so
2532              * we don't sit around waiting for this host to timeout later.
2533              */
2534             if (call) {
2535                 rxi_NetSendError(call, code);
2536             }
2537         }
2538 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2539         if (!istack && waslocked)
2540             AFS_GLOCK();
2541 #endif
2542 #ifdef RXDEBUG
2543     }
2544
2545     osi_Assert(p != NULL);
2546
2547     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %p len %d\n",
2548           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2549           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2550           p->header.seq, p->header.flags, p, p->length));
2551
2552 #endif
2553     if (rx_stats_active) {
2554         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2555         MUTEX_ENTER(&peer->peer_lock);
2556         peer->bytesSent += p->length;
2557         MUTEX_EXIT(&peer->peer_lock);
2558     }
2559 }
2560
2561 /* Send a raw abort packet, without any call or connection structures */
2562 void
2563 rxi_SendRawAbort(osi_socket socket, afs_uint32 host, u_short port,
2564                  afs_uint32 serial, afs_int32 error,
2565                  struct rx_packet *source, int istack)
2566 {
2567     struct rx_header theader;
2568     struct sockaddr_in addr;
2569     struct iovec iov[2];
2570
2571     memset(&theader, 0, sizeof(theader));
2572     theader.epoch = htonl(source->header.epoch);
2573     theader.callNumber = htonl(source->header.callNumber);
2574     theader.serial = htonl(serial);
2575     theader.type = RX_PACKET_TYPE_ABORT;
2576     theader.serviceId = htons(source->header.serviceId);
2577     theader.securityIndex = source->header.securityIndex;
2578     theader.cid = htonl(source->header.cid);
2579
2580     /*
2581      * If the abort is being sent in response to a server initiated packet,
2582      * set client_initiated in the abort to ensure it is not associated by
2583      * the receiver with a connection in the opposite direction.
2584      */
2585     if ((source->header.flags & RX_CLIENT_INITIATED) != RX_CLIENT_INITIATED)
2586         theader.flags |= RX_CLIENT_INITIATED;
2587
2588     error = htonl(error);
2589
2590     iov[0].iov_base = &theader;
2591     iov[0].iov_len = sizeof(struct rx_header);
2592     iov[1].iov_base = &error;
2593     iov[1].iov_len = sizeof(error);
2594
2595     addr.sin_family = AF_INET;
2596     addr.sin_addr.s_addr = host;
2597     addr.sin_port = port;
2598     memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2599 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2600     addr.sin_len = sizeof(struct sockaddr_in);
2601 #endif
2602
2603     rxi_NetSend(socket, &addr, iov, 2,
2604                 sizeof(struct rx_header) + sizeof(error), istack);
2605 }
2606
2607 /* Send a "special" packet to the peer connection.  If call is
2608  * specified, then the packet is directed to a specific call channel
2609  * associated with the connection, otherwise it is directed to the
2610  * connection only. Uses optionalPacket if it is supplied, rather than
2611  * allocating a new packet buffer.  Nbytes is the length of the data
2612  * portion of the packet.  If data is non-null, nbytes of data are
2613  * copied into the packet.  Type is the type of the packet, as defined
2614  * in rx.h.  Bug: there's a lot of duplication between this and other
2615  * routines.  This needs to be cleaned up. */
2616 struct rx_packet *
2617 rxi_SendSpecial(struct rx_call *call,
2618                 struct rx_connection *conn,
2619                 struct rx_packet *optionalPacket, int type, char *data,
2620                 int nbytes, int istack)
2621 {
2622     /* Some of the following stuff should be common code for all
2623      * packet sends (it's repeated elsewhere) */
2624     struct rx_packet *p;
2625     unsigned int i = 0;
2626     int savelen = 0, saven = 0;
2627     int channel, callNumber;
2628     if (call) {
2629         channel = call->channel;
2630         callNumber = *call->callNumber;
2631         /* BUSY packets refer to the next call on this connection */
2632         if (type == RX_PACKET_TYPE_BUSY) {
2633             callNumber++;
2634         }
2635     } else {
2636         channel = 0;
2637         callNumber = 0;
2638     }
2639     p = optionalPacket;
2640     if (!p) {
2641         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2642         if (!p)
2643             osi_Panic("rxi_SendSpecial failure");
2644     }
2645
2646     if (nbytes != -1)
2647         p->length = nbytes;
2648     else
2649         nbytes = p->length;
2650     p->header.serviceId = conn->serviceId;
2651     p->header.securityIndex = conn->securityIndex;
2652     p->header.cid = (conn->cid | channel);
2653     p->header.callNumber = callNumber;
2654     p->header.seq = 0;
2655     p->header.epoch = conn->epoch;
2656     p->header.type = type;
2657     p->header.userStatus = 0;
2658     p->header.flags = 0;
2659     if (conn->type == RX_CLIENT_CONNECTION)
2660         p->header.flags |= RX_CLIENT_INITIATED;
2661     if (data)
2662         rx_packetwrite(p, 0, nbytes, data);
2663
2664     for (i = 1; i < p->niovecs; i++) {
2665         if (nbytes <= p->wirevec[i].iov_len) {
2666             savelen = p->wirevec[i].iov_len;
2667             saven = p->niovecs;
2668             p->wirevec[i].iov_len = nbytes;
2669             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2670         } else
2671             nbytes -= p->wirevec[i].iov_len;
2672     }
2673
2674     if (call)
2675         rxi_Send(call, p, istack);
2676     else
2677         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2678     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2679         /* really need to do this, but it seems safer this way, given that  */
2680         /* sneaky optionalPacket... */
2681         p->wirevec[i - 1].iov_len = savelen;
2682         p->niovecs = saven;
2683     }
2684     if (!optionalPacket)
2685         rxi_FreePacket(p);
2686     return optionalPacket;
2687 }
2688
2689
2690 /* Encode the packet's header (from the struct header in the packet to
2691  * the net byte order representation in the wire representation of the
2692  * packet, which is what is actually sent out on the wire) */
2693 void
2694 rxi_EncodePacketHeader(struct rx_packet *p)
2695 {
2696     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2697
2698     memset(buf, 0, RX_HEADER_SIZE);
2699     *buf++ = htonl(p->header.epoch);
2700     *buf++ = htonl(p->header.cid);
2701     *buf++ = htonl(p->header.callNumber);
2702     *buf++ = htonl(p->header.seq);
2703     *buf++ = htonl(p->header.serial);
2704     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2705                    | (((afs_uint32) p->header.flags) << 16)
2706                    | (p->header.userStatus << 8) | p->header.securityIndex);
2707     /* Note: top 16 bits of this next word were reserved */
2708     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2709 }
2710
2711 /* Decode the packet's header (from net byte order to a struct header) */
2712 void
2713 rxi_DecodePacketHeader(struct rx_packet *p)
2714 {
2715     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2716     afs_uint32 temp;
2717
2718     p->header.epoch = ntohl(*buf);
2719     buf++;
2720     p->header.cid = ntohl(*buf);
2721     buf++;
2722     p->header.callNumber = ntohl(*buf);
2723     buf++;
2724     p->header.seq = ntohl(*buf);
2725     buf++;
2726     p->header.serial = ntohl(*buf);
2727     buf++;
2728
2729     temp = ntohl(*buf);
2730     buf++;
2731
2732     /* C will truncate byte fields to bytes for me */
2733     p->header.type = temp >> 24;
2734     p->header.flags = temp >> 16;
2735     p->header.userStatus = temp >> 8;
2736     p->header.securityIndex = temp >> 0;
2737
2738     temp = ntohl(*buf);
2739     buf++;
2740
2741     p->header.serviceId = (temp & 0xffff);
2742     p->header.spare = temp >> 16;
2743     /* Note: top 16 bits of this last word are the security checksum */
2744 }
2745
2746 /*
2747  * LOCKS HELD: called with call->lock held.
2748  *
2749  * PrepareSendPacket is the only place in the code that
2750  * can increment call->tnext.  This could become an atomic
2751  * in the future.  Beyond that there is nothing in this
2752  * function that requires the call being locked.  This
2753  * function can only be called by the application thread.
2754  */
2755 void
2756 rxi_PrepareSendPacket(struct rx_call *call,
2757                       struct rx_packet *p, int last)
2758 {
2759     struct rx_connection *conn = call->conn;
2760     afs_uint32 seq = call->tnext++;
2761     unsigned int i;
2762     afs_int32 len;              /* len must be a signed type; it can go negative */
2763     int code;
2764
2765     /* No data packets on call 0. Where do these come from? */
2766     if (*call->callNumber == 0)
2767         *call->callNumber = 1;
2768
2769     MUTEX_EXIT(&call->lock);
2770     p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
2771
2772     p->header.cid = (conn->cid | call->channel);
2773     p->header.serviceId = conn->serviceId;
2774     p->header.securityIndex = conn->securityIndex;
2775
2776     p->header.callNumber = *call->callNumber;
2777     p->header.seq = seq;
2778     p->header.epoch = conn->epoch;
2779     p->header.type = RX_PACKET_TYPE_DATA;
2780     p->header.userStatus = 0;
2781     p->header.flags = 0;
2782     p->header.spare = 0;
2783     if (conn->type == RX_CLIENT_CONNECTION)
2784         p->header.flags |= RX_CLIENT_INITIATED;
2785
2786     if (last)
2787         p->header.flags |= RX_LAST_PACKET;
2788
2789     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2790     p->header.serial = 0;       /* Another way of saying never transmitted... */
2791
2792     /* Now that we're sure this is the last data on the call, make sure
2793      * that the "length" and the sum of the iov_lens matches. */
2794     len = p->length + call->conn->securityHeaderSize;
2795
2796     for (i = 1; i < p->niovecs && len > 0; i++) {
2797         len -= p->wirevec[i].iov_len;
2798     }
2799     if (len > 0) {
2800         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2801     } else if (i < p->niovecs) {
2802         /* Free any extra elements in the wirevec */
2803 #if defined(RX_ENABLE_TSFPQ)
2804         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2805 #else /* !RX_ENABLE_TSFPQ */
2806         MUTEX_ENTER(&rx_freePktQ_lock);
2807         rxi_FreeDataBufsNoLock(p, i);
2808         MUTEX_EXIT(&rx_freePktQ_lock);
2809 #endif /* !RX_ENABLE_TSFPQ */
2810
2811         p->niovecs = i;
2812     }
2813     if (len)
2814         p->wirevec[i - 1].iov_len += len;
2815     MUTEX_ENTER(&call->lock);
2816     code = RXS_PreparePacket(conn->securityObject, call, p);
2817     if (code) {
2818         MUTEX_EXIT(&call->lock);
2819         rxi_ConnectionError(conn, code);
2820         MUTEX_ENTER(&conn->conn_data_lock);
2821         p = rxi_SendConnectionAbort(conn, p, 0, 0);
2822         MUTEX_EXIT(&conn->conn_data_lock);
2823         MUTEX_ENTER(&call->lock);
2824         /* setting a connection error means all calls for that conn are also
2825          * error'd. if this call does not have an error by now, something is
2826          * very wrong, and we risk sending data in the clear that is supposed
2827          * to be encrypted. */
2828         osi_Assert(call->error);
2829     }
2830 }
2831
2832 /* Given an interface MTU size, calculate an adjusted MTU size that
2833  * will make efficient use of the RX buffers when the peer is sending
2834  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2835 int
2836 rxi_AdjustIfMTU(int mtu)
2837 {
2838     int adjMTU;
2839     int frags;
2840
2841     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2842         return mtu;
2843     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2844     if (mtu <= adjMTU) {
2845         return mtu;
2846     }
2847     mtu -= adjMTU;
2848     if (mtu <= 0) {
2849         return adjMTU;
2850     }
2851     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2852     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2853 }
2854
2855 /* Given an interface MTU size, and the peer's advertised max receive
2856  * size, calculate an adjisted maxMTU size that makes efficient use
2857  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2858 int
2859 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2860 {
2861     int maxMTU = mtu * rxi_nSendFrags;
2862     maxMTU = MIN(maxMTU, peerMaxMTU);
2863     return rxi_AdjustIfMTU(maxMTU);
2864 }
2865
2866 /* Given a packet size, figure out how many datagram packet will fit.
2867  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2868  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2869  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2870 int
2871 rxi_AdjustDgramPackets(int frags, int mtu)
2872 {
2873     int maxMTU;
2874     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2875         return 1;
2876     }
2877     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2878     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2879     /* subtract the size of the first and last packets */
2880     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2881     if (maxMTU < 0) {
2882         return 1;
2883     }
2884     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2885 }
2886
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * When RXDEBUG_PACKET is defined, a start line, one line per packet on
 * the rx_mallocedP list (linked through allNextp), and an end line are
 * emitted, each prefixed with cookie.  The list is walked with
 * rx_freePktQ_lock held, inside NETPRI/USERPRI.  Under AFS_NT40_ENV
 * each line is formatted into a local buffer and written with
 * WriteFile, passing outputFile as the handle argument; elsewhere the
 * lines go to outputFile through fprintf.  Without RXDEBUG_PACKET the
 * function does nothing.
 *
 * outputFile - destination for the dump
 * cookie     - string placed at the start of every line
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* byte count out-parameter of WriteFile; unused */
    /* NOTE(review): sprintf into this buffer is unbounded; a very long
     * cookie could overrun it.  Confirm callers pass short cookies. */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void * argument, hence the cast on p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif