src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 #ifdef KERNEL
  14 # if defined(UKERNEL)
  15 #  include "afs/sysincludes.h"
  16 #  include "afsincludes.h"
  17 #  include "rx_kcommon.h"
  18 # else /* defined(UKERNEL) */
  19 #  ifdef RX_KERNEL_TRACE
  20 #   include "rx_kcommon.h"
  21 #  endif
  22 #  include "h/types.h"
  23 #  ifndef AFS_LINUX20_ENV
  24 #   include "h/systm.h"
  25 #  endif
  26 #  if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  27 #   include "afs/sysincludes.h"
  28 #  endif
  29 #  if defined(AFS_OBSD_ENV)
  30 #   include "h/proc.h"
  31 #  endif
  32 #  include "h/socket.h"
  33 #  if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  34 #   if  !defined(AFS_AIX41_ENV)
  35 #    include "sys/mount.h"              /* it gets pulled in by something later anyway */
  36 #   endif
  37 #   include "h/mbuf.h"
  38 #  endif
  39 #  include "netinet/in.h"
  40 #  include "afs/afs_osi.h"
  41 #  include "rx_kmutex.h"
  42 # endif /* defined(UKERNEL) */
  43 #else /* KERNEL */
  44 # include <roken.h>
  45 # include <assert.h>
  46 # include <afs/opr.h>
  47 # if defined(AFS_NT40_ENV)
  48 #  ifndef EWOULDBLOCK
  49 #   define EWOULDBLOCK WSAEWOULDBLOCK
  50 #  endif
  51 #  include "rx_user.h"
  52 #  include "rx_xmit_nt.h"
  53 # endif
  54 # include <lwp.h>
  55 #endif /* KERNEL */
  56
  57 #ifdef  AFS_SUN5_ENV
  58 # include <sys/sysmacros.h>
  59 #endif
  60
  61 #include <opr/queue.h>
  62
  63 #include "rx.h"
  64 #include "rx_clock.h"
  65 #include "rx_packet.h"
  66 #include "rx_atomic.h"
  67 #include "rx_globals.h"
  68 #include "rx_internal.h"
  69 #include "rx_stats.h"
  70
  71 #include "rx_peer.h"
  72 #include "rx_conn.h"
  73 #include "rx_call.h"
  74
  75 /*!
  76  * \brief structure used to keep track of allocated packets
  77  */
  78 struct rx_mallocedPacket {
  79     struct opr_queue entry;     /*!< chained using opr_queue */
  80     struct rx_packet *addr;     /*!< address of the first element */
  81     afs_uint32 size;            /*!< array size in bytes */
  82 };
  83
  84 #ifdef RX_LOCKS_DB
  85 /* rxdb_fileID is used to identify the lock location, along with line#. */
  86 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  87 #endif /* RX_LOCKS_DB */
  88 static struct rx_packet *rx_mallocedP = 0;
  89 #ifdef RXDEBUG_PACKET
  90 static afs_uint32       rx_packet_id = 0;
  91 #endif
  92
  93 extern char cml_version_number[];
  94
  95 static int AllocPacketBufs(int class, int num_pkts, struct opr_queue *q);
  96
  97 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  98                                 afs_uint32 ahost, short aport,
  99                                 afs_int32 istack);
 100 static struct rx_packet *rxi_AllocPacketNoLock(int class);
 101
 102 #ifndef KERNEL
 103 static void rxi_MorePacketsNoLock(int apackets);
 104 #endif
 105
 106 #ifdef RX_ENABLE_TSFPQ
 107 static int rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first,
 108                                  int flush_global);
 109 static void rxi_AdjustLocalPacketsTSFPQ(int num_keep_local,
 110                                         int allow_overcommit);
 111 #else
 112 static void rxi_FreePacketNoLock(struct rx_packet *p);
 113 static int rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first);
 114 static int rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first,
 115                                    struct opr_queue * q);
 116 #endif
 117
 118 extern struct opr_queue rx_idleServerQueue;
 119
 120 /* some rules about packets:
 121  * 1.  When a packet is allocated, the final iov_buf contains room for
 122  * a security trailer, but iov_len masks that fact.  If the security
 123  * package wants to add the trailer, it may do so, and then extend
 124  * iov_len appropriately.  For this reason, packet's niovecs and
 125  * iov_len fields should be accurate before calling PreparePacket.
 126 */
 127
 128 /* Preconditions:
 129  *        all packet buffers (iov_base) are integral multiples of
 130  *        the word size.
 131  *        offset is an integral multiple of the word size.
 132  */
 133 afs_int32
 134 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 135 {
 136     unsigned int i;
 137     size_t l;
 138     for (l = 0, i = 1; i < packet->niovecs; i++) {
 139         if (l + packet->wirevec[i].iov_len > offset) {
 140             return
 141                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 142                                  (offset - l)));
 143         }
 144         l += packet->wirevec[i].iov_len;
 145     }
 146
 147     return 0;
 148 }
 149
 150 /* Preconditions:
 151  *        all packet buffers (iov_base) are integral multiples of the word size.
 152  *        offset is an integral multiple of the word size.
 153  */
 154 afs_int32
 155 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 156 {
 157     unsigned int i;
 158     size_t l;
 159     for (l = 0, i = 1; i < packet->niovecs; i++) {
 160         if (l + packet->wirevec[i].iov_len > offset) {
 161             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 162                              (offset - l))) = data;
 163             return 0;
 164         }
 165         l += packet->wirevec[i].iov_len;
 166     }
 167
 168     return 0;
 169 }
 170
 171 /* Preconditions:
 172  *        all packet buffers (iov_base) are integral multiples of the
 173  *        word size.
 174  *        offset is an integral multiple of the word size.
 175  * Packet Invariants:
 176  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 177  */
 178 afs_int32
 179 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 180                   char *out)
 181 {
 182     unsigned int i, j, l, r;
 183     for (l = 0, i = 1; i < packet->niovecs; i++) {
 184         if (l + packet->wirevec[i].iov_len > offset) {
 185             break;
 186         }
 187         l += packet->wirevec[i].iov_len;
 188     }
 189
 190     /* i is the iovec which contains the first little bit of data in which we
 191      * are interested.  l is the total length of everything prior to this iovec.
 192      * j is the number of bytes we can safely copy out of this iovec.
 193      * offset only applies to the first iovec.
 194      */
 195     r = resid;
 196     while ((r > 0) && (i < packet->niovecs)) {
 197         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 198         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 199         r -= j;
 200         out += j;
 201         l += packet->wirevec[i].iov_len;
 202         offset = l;
 203         i++;
 204     }
 205
 206     return (r ? (resid - r) : resid);
 207 }
 208
 209
 210 /* Preconditions:
 211  *        all packet buffers (iov_base) are integral multiples of the
 212  *        word size.
 213  *        offset is an integral multiple of the word size.
 214  */
 215 afs_int32
 216 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 217 {
 218     unsigned int i, j, l, o, r;
 219     char *b;
 220
 221     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 222         if (l + packet->wirevec[i].iov_len > o) {
 223             break;
 224         }
 225         l += packet->wirevec[i].iov_len;
 226     }
 227
 228     /* i is the iovec which contains the first little bit of data in which we
 229      * are interested.  l is the total length of everything prior to this iovec.
 230      * j is the number of bytes we can safely copy out of this iovec.
 231      * offset only applies to the first iovec.
 232      */
 233     r = resid;
 234     while ((r > 0) && (i <= RX_MAXWVECS)) {
 235         if (i >= packet->niovecs)
 236             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 237                 break;
 238
 239         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 240         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 241         memcpy(b, in, j);
 242         r -= j;
 243         in += j;
 244         l += packet->wirevec[i].iov_len;
 245         offset = l;
 246         i++;
 247     }
 248
 249     return (r ? (resid - r) : resid);
 250 }
 251
 252 int
 253 rxi_AllocPackets(int class, int num_pkts, struct opr_queue * q)
 254 {
 255     struct opr_queue *c;
 256
 257     num_pkts = AllocPacketBufs(class, num_pkts, q);
 258
 259     for (opr_queue_Scan(q, c)) {
 260         RX_PACKET_IOV_FULLINIT(opr_queue_Entry(c, struct rx_packet, entry));
 261     }
 262
 263     return num_pkts;
 264 }
 265
 266 #ifdef RX_ENABLE_TSFPQ
 267 static int
 268 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
 269 {
 270     struct rx_ts_info_t * rx_ts_info;
 271     int transfer;
 272     SPLVAR;
 273
 274     RX_TS_INFO_GET(rx_ts_info);
 275
 276     transfer = num_pkts - rx_ts_info->_FPQ.len;
 277     if (transfer > 0) {
 278         NETPRI;
 279         MUTEX_ENTER(&rx_freePktQ_lock);
 280         transfer = MAX(transfer, rx_TSFPQGlobSize);
 281         if (transfer > rx_nFreePackets) {
 282             /* alloc enough for us, plus a few globs for other threads */
 283             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 284         }
 285
 286         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 287
 288         MUTEX_EXIT(&rx_freePktQ_lock);
 289         USERPRI;
 290     }
 291
 292     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 293
 294     return num_pkts;
 295 }
 296 #else /* RX_ENABLE_TSFPQ */
 297 static int
 298 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
 299 {
 300     struct rx_packet *c;
 301     int i;
 302 #ifdef KERNEL
 303     int overq = 0;
 304 #endif
 305     SPLVAR;
 306
 307     NETPRI;
 308
 309     MUTEX_ENTER(&rx_freePktQ_lock);
 310
 311 #ifdef KERNEL
 312     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 313          num_pkts--, overq++);
 314
 315     if (overq) {
 316         rxi_NeedMorePackets = TRUE;
 317         if (rx_stats_active) {
 318             switch (class) {
 319             case RX_PACKET_CLASS_RECEIVE:
 320                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 321                 break;
 322             case RX_PACKET_CLASS_SEND:
 323                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 324                 break;
 325             case RX_PACKET_CLASS_SPECIAL:
 326                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 327                 break;
 328             case RX_PACKET_CLASS_RECV_CBUF:
 329                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 330                 break;
 331             case RX_PACKET_CLASS_SEND_CBUF:
 332                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 333                 break;
 334             }
 335         }
 336     }
 337
 338     if (rx_nFreePackets < num_pkts)
 339         num_pkts = rx_nFreePackets;
 340
 341     if (!num_pkts) {
 342         rxi_NeedMorePackets = TRUE;
 343         goto done;
 344     }
 345 #else /* KERNEL */
 346     if (rx_nFreePackets < num_pkts) {
 347         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 348     }
 349 #endif /* KERNEL */
 350
 351     for (i=0, c=opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
 352          i < num_pkts;
 353          i++, c=opr_queue_Next(&c->entry, struct rx_packet, entry)) {
 354         RX_FPQ_MARK_USED(c);
 355     }
 356
 357     opr_queue_SplitBeforeAppend(&rx_freePacketQueue, q, &c->entry);
 358
 359     rx_nFreePackets -= num_pkts;
 360
 361 #ifdef KERNEL
 362   done:
 363 #endif
 364     MUTEX_EXIT(&rx_freePktQ_lock);
 365
 366     USERPRI;
 367     return num_pkts;
 368 }
 369 #endif /* RX_ENABLE_TSFPQ */
 370
 371 /*
 372  * Free a packet currently used as a continuation buffer
 373  */
 374 #ifdef RX_ENABLE_TSFPQ
 375 /* num_pkts=0 means queue length is unknown */
 376 int
 377 rxi_FreePackets(int num_pkts, struct opr_queue * q)
 378 {
 379     struct rx_ts_info_t * rx_ts_info;
 380     struct opr_queue *cursor, *store;
 381     SPLVAR;
 382
 383     osi_Assert(num_pkts >= 0);
 384     RX_TS_INFO_GET(rx_ts_info);
 385
 386     if (!num_pkts) {
 387         for (opr_queue_ScanSafe(q, cursor, store)) {
 388             num_pkts++;
 389             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
 390                                                  entry), 2, 0);
 391         }
 392     } else {
 393         for (opr_queue_ScanSafe(q, cursor, store)) {
 394             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
 395                                                  entry), 2, 0);
 396         }
 397     }
 398
 399     if (num_pkts) {
 400         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 401     }
 402
 403     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 404         NETPRI;
 405         MUTEX_ENTER(&rx_freePktQ_lock);
 406
 407         RX_TS_FPQ_LTOG(rx_ts_info);
 408
 409         /* Wakeup anyone waiting for packets */
 410         rxi_PacketsUnWait();
 411
 412         MUTEX_EXIT(&rx_freePktQ_lock);
 413         USERPRI;
 414     }
 415
 416     return num_pkts;
 417 }
 418 #else /* RX_ENABLE_TSFPQ */
 419 /* num_pkts=0 means queue length is unknown */
 420 int
 421 rxi_FreePackets(int num_pkts, struct opr_queue *q)
 422 {
 423     struct opr_queue cbs;
 424     struct opr_queue *cursor, *store;
 425     int qlen = 0;
 426     SPLVAR;
 427
 428     osi_Assert(num_pkts >= 0);
 429     opr_queue_Init(&cbs);
 430
 431     if (!num_pkts) {
 432         for (opr_queue_ScanSafe(q, cursor, store)) {
 433             struct rx_packet *p
 434                 = opr_queue_Entry(cursor, struct rx_packet, entry);
 435             if (p->niovecs > 2) {
 436                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 437             }
 438             RX_FPQ_MARK_FREE(p);
 439             num_pkts++;
 440         }
 441         if (!num_pkts)
 442             return 0;
 443     } else {
 444         for (opr_queue_ScanSafe(q, cursor, store)) {
 445             struct rx_packet *p
 446                 = opr_queue_Entry(cursor, struct rx_packet, entry);
 447
 448             if (p->niovecs > 2) {
 449                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 450             }
 451             RX_FPQ_MARK_FREE(p);
 452         }
 453     }
 454
 455     if (qlen) {
 456         opr_queue_SpliceAppend(q, &cbs);
 457         qlen += num_pkts;
 458     } else
 459         qlen = num_pkts;
 460
 461     NETPRI;
 462     MUTEX_ENTER(&rx_freePktQ_lock);
 463
 464     opr_queue_SpliceAppend(&rx_freePacketQueue, q);
 465     rx_nFreePackets += qlen;
 466
 467     /* Wakeup anyone waiting for packets */
 468     rxi_PacketsUnWait();
 469
 470     MUTEX_EXIT(&rx_freePktQ_lock);
 471     USERPRI;
 472
 473     return num_pkts;
 474 }
 475 #endif /* RX_ENABLE_TSFPQ */
 476
 477 /* this one is kind of awful.
 478  * In rxkad, the packet has been all shortened, and everything, ready for
 479  * sending.  All of a sudden, we discover we need some of that space back.
 480  * This isn't terribly general, because it knows that the packets are only
 481  * rounded up to the EBS (userdata + security header).
 482  */
 483 int
 484 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 485 {
 486     int i;
 487     i = p->niovecs - 1;
 488     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 489         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 490             p->wirevec[i].iov_len += nb;
 491             return 0;
 492         }
 493     } else {
 494         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 495             p->wirevec[i].iov_len += nb;
 496             return 0;
 497         }
 498     }
 499
 500     return 0;
 501 }
 502
 503 /* get sufficient space to store nb bytes of data (or more), and hook
 504  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 505  * returns the number of bytes >0 which it failed to come up with.
 506  * Don't need to worry about locking on packet, since only
 507  * one thread can manipulate one at a time. Locking on continution
 508  * packets is handled by AllocPacketBufs */
 509 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 510 int
 511 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 512 {
 513     int i, nv;
 514     struct opr_queue q, *cursor, *store;
 515
 516     /* compute the number of cbuf's we need */
 517     nv = nb / RX_CBUFFERSIZE;
 518     if ((nv * RX_CBUFFERSIZE) < nb)
 519         nv++;
 520     if ((nv + p->niovecs) > RX_MAXWVECS)
 521         nv = RX_MAXWVECS - p->niovecs;
 522     if (nv < 1)
 523         return nb;
 524
 525     /* allocate buffers */
 526     opr_queue_Init(&q);
 527     nv = AllocPacketBufs(class, nv, &q);
 528
 529     /* setup packet iovs */
 530     i = p ->niovecs;
 531     for (opr_queue_ScanSafe(&q, cursor, store)) {
 532         struct rx_packet *cb
 533             = opr_queue_Entry(cursor, struct rx_packet, entry);
 534
 535         opr_queue_Remove(&cb->entry);
 536         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 537         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 538         i++;
 539     }
 540
 541     nb -= (nv * RX_CBUFFERSIZE);
 542     p->length += (nv * RX_CBUFFERSIZE);
 543     p->niovecs += nv;
 544
 545     return nb;
 546 }
 547
 548 /**
 549  * Register allocated packets.
 550  *
 551  * @param[in] addr array of packets
 552  * @param[in] npkt number of packets
 553  *
 554  * @return none
 555  */
 556 static void
 557 registerPackets(struct rx_packet *addr, afs_uint32 npkt)
 558 {
 559     struct rx_mallocedPacket *mp;
 560
 561     mp = osi_Alloc(sizeof(*mp));
 562
 563     osi_Assert(mp != NULL);
 564     memset(mp, 0, sizeof(*mp));
 565
 566     mp->addr = addr;
 567     mp->size = npkt * sizeof(struct rx_packet);
 568     osi_Assert(npkt <= MAX_AFS_UINT32 / sizeof(struct rx_packet));
 569
 570     MUTEX_ENTER(&rx_mallocedPktQ_lock);
 571     opr_queue_Append(&rx_mallocedPacketQueue, &mp->entry);
 572     MUTEX_EXIT(&rx_mallocedPktQ_lock);
 573 }
 574
 575 /* Add more packet buffers */
 576 #ifdef RX_ENABLE_TSFPQ
 577 void
 578 rxi_MorePackets(int apackets)
 579 {
 580     struct rx_packet *p, *e;
 581     struct rx_ts_info_t * rx_ts_info;
 582     int getme;
 583     SPLVAR;
 584
 585     getme = apackets * sizeof(struct rx_packet);
 586     p = osi_Alloc(getme);
 587     osi_Assert(p);
 588     registerPackets(p, apackets);
 589
 590     PIN(p, getme);              /* XXXXX */
 591     memset(p, 0, getme);
 592     RX_TS_INFO_GET(rx_ts_info);
 593
 594     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 595     /* TSFPQ patch also needs to keep track of total packets */
 596
 597     MUTEX_ENTER(&rx_packets_mutex);
 598     rx_nPackets += apackets;
 599     RX_TS_FPQ_COMPUTE_LIMITS;
 600     MUTEX_EXIT(&rx_packets_mutex);
 601
 602     for (e = p + apackets; p < e; p++) {
 603         RX_PACKET_IOV_INIT(p);
 604         p->niovecs = 2;
 605
 606         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 607
 608         NETPRI;
 609         MUTEX_ENTER(&rx_freePktQ_lock);
 610 #ifdef RXDEBUG_PACKET
 611         p->packetId = rx_packet_id++;
 612         p->allNextp = rx_mallocedP;
 613 #endif /* RXDEBUG_PACKET */
 614         rx_mallocedP = p;
 615         MUTEX_EXIT(&rx_freePktQ_lock);
 616         USERPRI;
 617     }
 618     rx_ts_info->_FPQ.delta += apackets;
 619
 620     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 621         NETPRI;
 622         MUTEX_ENTER(&rx_freePktQ_lock);
 623
 624         RX_TS_FPQ_LTOG(rx_ts_info);
 625         rxi_NeedMorePackets = FALSE;
 626         rxi_PacketsUnWait();
 627
 628         MUTEX_EXIT(&rx_freePktQ_lock);
 629         USERPRI;
 630     }
 631 }
 632 #else /* RX_ENABLE_TSFPQ */
 633 void
 634 rxi_MorePackets(int apackets)
 635 {
 636     struct rx_packet *p, *e;
 637     int getme;
 638     SPLVAR;
 639
 640     getme = apackets * sizeof(struct rx_packet);
 641     p = osi_Alloc(getme);
 642     osi_Assert(p);
 643     registerPackets(p, apackets);
 644
 645     PIN(p, getme);              /* XXXXX */
 646     memset(p, 0, getme);
 647     NETPRI;
 648     MUTEX_ENTER(&rx_freePktQ_lock);
 649
 650     for (e = p + apackets; p < e; p++) {
 651         RX_PACKET_IOV_INIT(p);
 652 #ifdef RX_TRACK_PACKETS
 653         p->flags |= RX_PKTFLAG_FREE;
 654 #endif
 655         p->niovecs = 2;
 656
 657         opr_queue_Append(&rx_freePacketQueue, &p->entry);
 658 #ifdef RXDEBUG_PACKET
 659         p->packetId = rx_packet_id++;
 660         p->allNextp = rx_mallocedP;
 661 #endif /* RXDEBUG_PACKET */
 662         rx_mallocedP = p;
 663     }
 664
 665     rx_nPackets += apackets;
 666     rx_nFreePackets += apackets;
 667     rxi_NeedMorePackets = FALSE;
 668     rxi_PacketsUnWait();
 669
 670     MUTEX_EXIT(&rx_freePktQ_lock);
 671     USERPRI;
 672 }
 673 #endif /* RX_ENABLE_TSFPQ */
 674
 675 #ifdef RX_ENABLE_TSFPQ
 676 void
 677 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 678 {
 679     struct rx_packet *p, *e;
 680     struct rx_ts_info_t * rx_ts_info;
 681     int getme;
 682     SPLVAR;
 683
 684     getme = apackets * sizeof(struct rx_packet);
 685     p = osi_Alloc(getme);
 686     registerPackets(p, apackets);
 687
 688     PIN(p, getme);              /* XXXXX */
 689     memset(p, 0, getme);
 690     RX_TS_INFO_GET(rx_ts_info);
 691
 692     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 693     /* TSFPQ patch also needs to keep track of total packets */
 694     MUTEX_ENTER(&rx_packets_mutex);
 695     rx_nPackets += apackets;
 696     RX_TS_FPQ_COMPUTE_LIMITS;
 697     MUTEX_EXIT(&rx_packets_mutex);
 698
 699     for (e = p + apackets; p < e; p++) {
 700         RX_PACKET_IOV_INIT(p);
 701         p->niovecs = 2;
 702         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 703
 704         NETPRI;
 705         MUTEX_ENTER(&rx_freePktQ_lock);
 706 #ifdef RXDEBUG_PACKET
 707         p->packetId = rx_packet_id++;
 708         p->allNextp = rx_mallocedP;
 709 #endif /* RXDEBUG_PACKET */
 710         rx_mallocedP = p;
 711         MUTEX_EXIT(&rx_freePktQ_lock);
 712         USERPRI;
 713     }
 714     rx_ts_info->_FPQ.delta += apackets;
 715
 716     if (flush_global &&
 717         (num_keep_local < apackets)) {
 718         NETPRI;
 719         MUTEX_ENTER(&rx_freePktQ_lock);
 720
 721         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 722         rxi_NeedMorePackets = FALSE;
 723         rxi_PacketsUnWait();
 724
 725         MUTEX_EXIT(&rx_freePktQ_lock);
 726         USERPRI;
 727     }
 728 }
 729 #endif /* RX_ENABLE_TSFPQ */
 730
 731 #ifndef KERNEL
 732 /* Add more packet buffers */
 733 static void
 734 rxi_MorePacketsNoLock(int apackets)
 735 {
 736 #ifdef RX_ENABLE_TSFPQ
 737     struct rx_ts_info_t * rx_ts_info;
 738 #endif /* RX_ENABLE_TSFPQ */
 739     struct rx_packet *p, *e;
 740     int getme;
 741
 742     /* allocate enough packets that 1/4 of the packets will be able
 743      * to hold maximal amounts of data */
 744     apackets += (apackets / 4)
 745         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 746     do {
 747         getme = apackets * sizeof(struct rx_packet);
 748         p = osi_Alloc(getme);
 749         if (p == NULL) {
 750             apackets -= apackets / 4;
 751             osi_Assert(apackets > 0);
 752         }
 753     } while(p == NULL);
 754     memset(p, 0, getme);
 755     registerPackets(p, apackets);
 756
 757 #ifdef RX_ENABLE_TSFPQ
 758     RX_TS_INFO_GET(rx_ts_info);
 759     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 760 #endif /* RX_ENABLE_TSFPQ */
 761
 762     for (e = p + apackets; p < e; p++) {
 763         RX_PACKET_IOV_INIT(p);
 764 #ifdef RX_TRACK_PACKETS
 765         p->flags |= RX_PKTFLAG_FREE;
 766 #endif
 767         p->niovecs = 2;
 768
 769         opr_queue_Append(&rx_freePacketQueue, &p->entry);
 770 #ifdef RXDEBUG_PACKET
 771         p->packetId = rx_packet_id++;
 772         p->allNextp = rx_mallocedP;
 773 #endif /* RXDEBUG_PACKET */
 774         rx_mallocedP = p;
 775     }
 776
 777     rx_nFreePackets += apackets;
 778     MUTEX_ENTER(&rx_packets_mutex);
 779     rx_nPackets += apackets;
 780 #ifdef RX_ENABLE_TSFPQ
 781     RX_TS_FPQ_COMPUTE_LIMITS;
 782 #endif /* RX_ENABLE_TSFPQ */
 783     MUTEX_EXIT(&rx_packets_mutex);
 784     rxi_NeedMorePackets = FALSE;
 785     rxi_PacketsUnWait();
 786 }
 787 #endif /* !KERNEL */
 788
 789 void
 790 rxi_FreeAllPackets(void)
 791 {
 792     struct rx_mallocedPacket *mp;
 793
 794     MUTEX_ENTER(&rx_mallocedPktQ_lock);
 795
 796     while (!opr_queue_IsEmpty(&rx_mallocedPacketQueue)) {
 797         mp = opr_queue_First(&rx_mallocedPacketQueue,
 798                              struct rx_mallocedPacket, entry);
 799         opr_queue_Remove(&mp->entry);
 800         osi_Free(mp->addr, mp->size);
 801         UNPIN(mp->addr, mp->size);
 802         osi_Free(mp, sizeof(*mp));
 803     }
 804     MUTEX_EXIT(&rx_mallocedPktQ_lock);
 805 }
 806
 807 #ifdef RX_ENABLE_TSFPQ
 808 static void
 809 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 810 {
 811     struct rx_ts_info_t * rx_ts_info;
 812     int xfer;
 813     SPLVAR;
 814
 815     RX_TS_INFO_GET(rx_ts_info);
 816
 817     if (num_keep_local != rx_ts_info->_FPQ.len) {
 818         NETPRI;
 819         MUTEX_ENTER(&rx_freePktQ_lock);
 820         if (num_keep_local < rx_ts_info->_FPQ.len) {
 821             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 822             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 823             rxi_PacketsUnWait();
 824         } else {
 825             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 826             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 827                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 828             if (rx_nFreePackets < xfer) {
 829                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 830             }
 831             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 832         }
 833         MUTEX_EXIT(&rx_freePktQ_lock);
 834         USERPRI;
 835     }
 836 }
 837
 838 void
 839 rxi_FlushLocalPacketsTSFPQ(void)
 840 {
 841     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 842 }
 843 #endif /* RX_ENABLE_TSFPQ */
 844
 845 /* Allocate more packets iff we need more continuation buffers */
 846 /* In kernel, can't page in memory with interrupts disabled, so we
 847  * don't use the event mechanism. */
 848 void
 849 rx_CheckPackets(void)
 850 {
 851     if (rxi_NeedMorePackets) {
 852         rxi_MorePackets(rx_maxSendWindow);
 853     }
 854 }
 855
 856 /* In the packet freeing routine below, the assumption is that
 857    we want all of the packets to be used equally frequently, so that we
 858    don't get packet buffers paging out.  It would be just as valid to
 859    assume that we DO want them to page out if not many are being used.
 860    In any event, we assume the former, and append the packets to the end
 861    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
 868
 869 /* Actually free the packet p. */
 870 #ifndef RX_ENABLE_TSFPQ
 871 static void
 872 rxi_FreePacketNoLock(struct rx_packet *p)
 873 {
 874     dpf(("Free %"AFS_PTR_FMT"\n", p));
 875
 876     RX_FPQ_MARK_FREE(p);
 877     rx_nFreePackets++;
 878     opr_queue_Append(&rx_freePacketQueue, &p->entry);
 879 }
 880 #endif /* RX_ENABLE_TSFPQ */
 881
 882 #ifdef RX_ENABLE_TSFPQ
 883 static void
 884 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 885 {
 886     struct rx_ts_info_t * rx_ts_info;
 887     dpf(("Free %"AFS_PTR_FMT"\n", p));
 888
 889     RX_TS_INFO_GET(rx_ts_info);
 890     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 891
 892     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 893         NETPRI;
 894         MUTEX_ENTER(&rx_freePktQ_lock);
 895
 896         RX_TS_FPQ_LTOG(rx_ts_info);
 897
 898         /* Wakeup anyone waiting for packets */
 899         rxi_PacketsUnWait();
 900
 901         MUTEX_EXIT(&rx_freePktQ_lock);
 902         USERPRI;
 903     }
 904 }
 905 #endif /* RX_ENABLE_TSFPQ */
 906
 907 /*
 908  * free continuation buffers off a packet into a queue
 909  *
 910  * [IN] p      -- packet from which continuation buffers will be freed
 911  * [IN] first  -- iovec offset of first continuation buffer to free
 912  * [IN] q      -- queue into which continuation buffers will be chained
 913  *
 914  * returns:
 915  *   number of continuation buffers freed
 916  */
 917 #ifndef RX_ENABLE_TSFPQ
 918 static int
 919 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct opr_queue * q)
 920 {
 921     struct iovec *iov;
 922     struct rx_packet * cb;
 923     int count = 0;
 924
 925     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 926         iov = &p->wirevec[first];
 927         if (!iov->iov_base)
 928             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 929         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 930         RX_FPQ_MARK_FREE(cb);
 931         opr_queue_Append(q, &cb->entry);
 932     }
 933     p->length = 0;
 934     p->niovecs = 0;
 935
 936     return count;
 937 }
 938
 939 /*
 940  * free packet continuation buffers into the global free packet pool
 941  *
 942  * [IN] p      -- packet from which to free continuation buffers
 943  * [IN] first  -- iovec offset of first continuation buffer to free
 944  *
 945  * returns:
 946  *   zero always
 947  */
 948 static int
 949 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 950 {
 951     struct iovec *iov;
 952
 953     for (first = MAX(2, first); first < p->niovecs; first++) {
 954         iov = &p->wirevec[first];
 955         if (!iov->iov_base)
 956             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 957         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 958     }
 959     p->length = 0;
 960     p->niovecs = 0;
 961
 962     return 0;
 963 }
 964
 965 #else
 966
 967 /*
 968  * free packet continuation buffers into the thread-local free pool
 969  *
 970  * [IN] p             -- packet from which continuation buffers will be freed
 971  * [IN] first         -- iovec offset of first continuation buffer to free
 972  *                       any value less than 2, the min number of iovecs,
 973  *                       is treated as if it is 2.
 974  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 975  *                       global free pool before returning
 976  *
 977  * returns:
 978  *   zero always
 979  */
 980 static int
 981 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 982 {
 983     struct iovec *iov;
 984     struct rx_ts_info_t * rx_ts_info;
 985
 986     RX_TS_INFO_GET(rx_ts_info);
 987
 988     for (first = MAX(2, first); first < p->niovecs; first++) {
 989         iov = &p->wirevec[first];
 990         if (!iov->iov_base)
 991             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 992         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 993     }
 994     p->length = 0;
 995     p->niovecs = 0;
 996
 997     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 998         NETPRI;
 999         MUTEX_ENTER(&rx_freePktQ_lock);
1000
1001         RX_TS_FPQ_LTOG(rx_ts_info);
1002
1003         /* Wakeup anyone waiting for packets */
1004         rxi_PacketsUnWait();
1005
1006         MUTEX_EXIT(&rx_freePktQ_lock);
1007         USERPRI;
1008     }
1009     return 0;
1010 }
1011 #endif /* RX_ENABLE_TSFPQ */
1012
1013 int rxi_nBadIovecs = 0;
1014
1015 /* rxi_RestoreDataBufs
1016  *
1017  * Restore the correct sizes to the iovecs. Called when reusing a packet
1018  * for reading off the wire.
1019  */
1020 void
1021 rxi_RestoreDataBufs(struct rx_packet *p)
1022 {
1023     unsigned int i;
1024     struct iovec *iov;
1025
1026     RX_PACKET_IOV_INIT(p);
1027
1028     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
1029         if (!iov->iov_base) {
1030             rxi_nBadIovecs++;
1031             p->niovecs = i;
1032             break;
1033         }
1034         iov->iov_len = RX_CBUFFERSIZE;
1035     }
1036 }
1037
1038 #ifdef RX_ENABLE_TSFPQ
1039 int
1040 rxi_TrimDataBufs(struct rx_packet *p, int first)
1041 {
1042     int length;
1043     struct iovec *iov, *end;
1044     struct rx_ts_info_t * rx_ts_info;
1045     SPLVAR;
1046
1047     if (first != 1)
1048         osi_Panic("TrimDataBufs 1: first must be 1");
1049
1050     /* Skip over continuation buffers containing message data */
1051     iov = &p->wirevec[2];
1052     end = iov + (p->niovecs - 2);
1053     length = p->length - p->wirevec[1].iov_len;
1054     for (; iov < end && length > 0; iov++) {
1055         if (!iov->iov_base)
1056             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1057         length -= iov->iov_len;
1058     }
1059
1060     /* iov now points to the first empty data buffer. */
1061     if (iov >= end)
1062         return 0;
1063
1064     RX_TS_INFO_GET(rx_ts_info);
1065     for (; iov < end; iov++) {
1066         if (!iov->iov_base)
1067             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1068         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1069         p->niovecs--;
1070     }
1071     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1072         NETPRI;
1073         MUTEX_ENTER(&rx_freePktQ_lock);
1074
1075         RX_TS_FPQ_LTOG(rx_ts_info);
1076         rxi_PacketsUnWait();
1077
1078         MUTEX_EXIT(&rx_freePktQ_lock);
1079         USERPRI;
1080     }
1081
1082     return 0;
1083 }
1084 #else /* RX_ENABLE_TSFPQ */
1085 int
1086 rxi_TrimDataBufs(struct rx_packet *p, int first)
1087 {
1088     int length;
1089     struct iovec *iov, *end;
1090     SPLVAR;
1091
1092     if (first != 1)
1093         osi_Panic("TrimDataBufs 1: first must be 1");
1094
1095     /* Skip over continuation buffers containing message data */
1096     iov = &p->wirevec[2];
1097     end = iov + (p->niovecs - 2);
1098     length = p->length - p->wirevec[1].iov_len;
1099     for (; iov < end && length > 0; iov++) {
1100         if (!iov->iov_base)
1101             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1102         length -= iov->iov_len;
1103     }
1104
1105     /* iov now points to the first empty data buffer. */
1106     if (iov >= end)
1107         return 0;
1108
1109     NETPRI;
1110     MUTEX_ENTER(&rx_freePktQ_lock);
1111
1112     for (; iov < end; iov++) {
1113         if (!iov->iov_base)
1114             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1115         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1116         p->niovecs--;
1117     }
1118     rxi_PacketsUnWait();
1119
1120     MUTEX_EXIT(&rx_freePktQ_lock);
1121     USERPRI;
1122
1123     return 0;
1124 }
1125 #endif /* RX_ENABLE_TSFPQ */
1126
1127 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1128  * remove it yourself first if you call this routine. */
1129 #ifdef RX_ENABLE_TSFPQ
1130 void
1131 rxi_FreePacket(struct rx_packet *p)
1132 {
1133     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1134     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1135 }
1136 #else /* RX_ENABLE_TSFPQ */
1137 void
1138 rxi_FreePacket(struct rx_packet *p)
1139 {
1140     SPLVAR;
1141
1142     NETPRI;
1143     MUTEX_ENTER(&rx_freePktQ_lock);
1144
1145     rxi_FreeDataBufsNoLock(p, 2);
1146     rxi_FreePacketNoLock(p);
1147     /* Wakeup anyone waiting for packets */
1148     rxi_PacketsUnWait();
1149
1150     MUTEX_EXIT(&rx_freePktQ_lock);
1151     USERPRI;
1152 }
1153 #endif /* RX_ENABLE_TSFPQ */
1154
1155 /* rxi_AllocPacket sets up p->length so it reflects the number of
1156  * bytes in the packet at this point, **not including** the header.
1157  * The header is absolutely necessary, besides, this is the way the
1158  * length field is usually used */
1159 #ifdef RX_ENABLE_TSFPQ
1160 static struct rx_packet *
1161 rxi_AllocPacketNoLock(int class)
1162 {
1163     struct rx_packet *p;
1164     struct rx_ts_info_t * rx_ts_info;
1165
1166     RX_TS_INFO_GET(rx_ts_info);
1167
1168     if (rx_stats_active)
1169         rx_atomic_inc(&rx_stats.packetRequests);
1170     if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1171
1172 #ifdef KERNEL
1173         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1174             osi_Panic("rxi_AllocPacket error");
1175 #else /* KERNEL */
1176         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1177             rxi_MorePacketsNoLock(rx_maxSendWindow);
1178 #endif /* KERNEL */
1179
1180
1181         RX_TS_FPQ_GTOL(rx_ts_info);
1182     }
1183
1184     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1185
1186     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1187
1188
1189     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1190      * order to truncate outbound packets.  In the near future, may need
1191      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1192      */
1193     RX_PACKET_IOV_FULLINIT(p);
1194     return p;
1195 }
1196 #else /* RX_ENABLE_TSFPQ */
1197 static struct rx_packet *
1198 rxi_AllocPacketNoLock(int class)
1199 {
1200     struct rx_packet *p;
1201
1202 #ifdef KERNEL
1203     if (rxi_OverQuota(class)) {
1204         rxi_NeedMorePackets = TRUE;
1205         if (rx_stats_active) {
1206             switch (class) {
1207             case RX_PACKET_CLASS_RECEIVE:
1208                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1209                 break;
1210             case RX_PACKET_CLASS_SEND:
1211                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1212                 break;
1213             case RX_PACKET_CLASS_SPECIAL:
1214                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1215                 break;
1216             case RX_PACKET_CLASS_RECV_CBUF:
1217                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1218                 break;
1219             case RX_PACKET_CLASS_SEND_CBUF:
1220                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1221                 break;
1222             }
1223         }
1224         return (struct rx_packet *)0;
1225     }
1226 #endif /* KERNEL */
1227
1228     if (rx_stats_active)
1229         rx_atomic_inc(&rx_stats.packetRequests);
1230
1231 #ifdef KERNEL
1232     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1233         osi_Panic("rxi_AllocPacket error");
1234 #else /* KERNEL */
1235     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1236         rxi_MorePacketsNoLock(rx_maxSendWindow);
1237 #endif /* KERNEL */
1238
1239     rx_nFreePackets--;
1240     p = opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
1241     opr_queue_Remove(&p->entry);
1242     RX_FPQ_MARK_USED(p);
1243
1244     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1245
1246
1247     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1248      * order to truncate outbound packets.  In the near future, may need
1249      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1250      */
1251     RX_PACKET_IOV_FULLINIT(p);
1252     return p;
1253 }
1254 #endif /* RX_ENABLE_TSFPQ */
1255
1256 #ifdef RX_ENABLE_TSFPQ
1257 static struct rx_packet *
1258 rxi_AllocPacketTSFPQ(int class, int pull_global)
1259 {
1260     struct rx_packet *p;
1261     struct rx_ts_info_t * rx_ts_info;
1262
1263     RX_TS_INFO_GET(rx_ts_info);
1264
1265     if (rx_stats_active)
1266         rx_atomic_inc(&rx_stats.packetRequests);
1267     if (pull_global && opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1268         MUTEX_ENTER(&rx_freePktQ_lock);
1269
1270         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1271             rxi_MorePacketsNoLock(rx_maxSendWindow);
1272
1273         RX_TS_FPQ_GTOL(rx_ts_info);
1274
1275         MUTEX_EXIT(&rx_freePktQ_lock);
1276     } else if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1277         return NULL;
1278     }
1279
1280     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1281
1282     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1283
1284     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1285      * order to truncate outbound packets.  In the near future, may need
1286      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1287      */
1288     RX_PACKET_IOV_FULLINIT(p);
1289     return p;
1290 }
1291 #endif /* RX_ENABLE_TSFPQ */
1292
1293 #ifdef RX_ENABLE_TSFPQ
1294 struct rx_packet *
1295 rxi_AllocPacket(int class)
1296 {
1297     struct rx_packet *p;
1298
1299     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1300     return p;
1301 }
1302 #else /* RX_ENABLE_TSFPQ */
1303 struct rx_packet *
1304 rxi_AllocPacket(int class)
1305 {
1306     struct rx_packet *p;
1307
1308     MUTEX_ENTER(&rx_freePktQ_lock);
1309     p = rxi_AllocPacketNoLock(class);
1310     MUTEX_EXIT(&rx_freePktQ_lock);
1311     return p;
1312 }
1313 #endif /* RX_ENABLE_TSFPQ */
1314
1315 /* This guy comes up with as many buffers as it {takes,can get} given
1316  * the MTU for this call. It also sets the packet length before
1317  * returning.  caution: this is often called at NETPRI
1318  * Called with call locked.
1319  */
1320 struct rx_packet *
1321 rxi_AllocSendPacket(struct rx_call *call, int want)
1322 {
1323     struct rx_packet *p = (struct rx_packet *)0;
1324     int mud;
1325     unsigned delta;
1326
1327     SPLVAR;
1328     mud = call->MTU - RX_HEADER_SIZE;
1329     delta =
1330         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1331         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1332
1333 #ifdef RX_ENABLE_TSFPQ
1334     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1335         want += delta;
1336         want = MIN(want, mud);
1337
1338         if ((unsigned)want > p->length)
1339             (void)rxi_AllocDataBuf(p, (want - p->length),
1340                                    RX_PACKET_CLASS_SEND_CBUF);
1341
1342         if (p->length > mud)
1343             p->length = mud;
1344
1345         if (delta >= p->length) {
1346             rxi_FreePacket(p);
1347             p = NULL;
1348         } else {
1349             p->length -= delta;
1350         }
1351         return p;
1352     }
1353 #endif /* RX_ENABLE_TSFPQ */
1354
1355     while (!(call->error)) {
1356         MUTEX_ENTER(&rx_freePktQ_lock);
1357         /* if an error occurred, or we get the packet we want, we're done */
1358         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1359             MUTEX_EXIT(&rx_freePktQ_lock);
1360
1361             want += delta;
1362             want = MIN(want, mud);
1363
1364             if ((unsigned)want > p->length)
1365                 (void)rxi_AllocDataBuf(p, (want - p->length),
1366                                        RX_PACKET_CLASS_SEND_CBUF);
1367
1368             if (p->length > mud)
1369                 p->length = mud;
1370
1371             if (delta >= p->length) {
1372                 rxi_FreePacket(p);
1373                 p = NULL;
1374             } else {
1375                 p->length -= delta;
1376             }
1377             break;
1378         }
1379
1380         /* no error occurred, and we didn't get a packet, so we sleep.
1381          * At this point, we assume that packets will be returned
1382          * sooner or later, as packets are acknowledged, and so we
1383          * just wait.  */
1384         NETPRI;
1385         call->flags |= RX_CALL_WAIT_PACKETS;
1386         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1387         MUTEX_EXIT(&call->lock);
1388         rx_waitingForPackets = 1;
1389
1390 #ifdef  RX_ENABLE_LOCKS
1391         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1392 #else
1393         osi_rxSleep(&rx_waitingForPackets);
1394 #endif
1395         MUTEX_EXIT(&rx_freePktQ_lock);
1396         MUTEX_ENTER(&call->lock);
1397         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1398         call->flags &= ~RX_CALL_WAIT_PACKETS;
1399         USERPRI;
1400     }
1401
1402     return p;
1403 }
1404
1405 #ifndef KERNEL
1406 #ifdef AFS_NT40_ENV
1407 /* Windows does not use file descriptors. */
1408 #define CountFDs(amax) 0
1409 #else
/*
 * Count the file descriptors in the range [0, amax) for which fstat()
 * succeeds.  Returns 0 when amax is zero or negative.
 */
static int
CountFDs(int amax)
{
    struct stat sb;
    int nused = 0;
    int fd = 0;

    while (fd < amax) {
        /* fstat() returning 0 is taken to mean the descriptor is in use */
        if (fstat(fd, &sb) == 0)
            nused++;
        fd++;
    }

    return nused;
}
1426 #endif /* AFS_NT40_ENV */
1427 #else /* KERNEL */
1428
1429 #define CountFDs(amax) amax
1430
1431 #endif /* KERNEL */
1432
1433 #if !defined(KERNEL) || defined(UKERNEL)
1434
1435 /* This function reads a single packet from the interface into the
1436  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1437  * (host,port) of the sender are stored in the supplied variables, and
1438  * the data length of the packet is stored in the packet structure.
1439  * The header is decoded. */
1440 int
1441 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1442                u_short * port)
1443 {
1444     struct sockaddr_in from;
1445     int nbytes;
1446     afs_int32 rlen;
1447     afs_uint32 tlen, savelen;
1448     struct msghdr msg;
1449     rx_computelen(p, tlen);
1450     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1451
1452     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1453     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1454                                  * it once in order to avoid races.  */
1455     tlen = rlen - tlen;
1456     if (tlen > 0) {
1457         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1458         if (tlen > 0) {
1459             tlen = rlen - tlen;
1460         } else
1461             tlen = rlen;
1462     } else
1463         tlen = rlen;
1464
1465     /* Extend the last iovec for padding, it's just to make sure that the
1466      * read doesn't return more data than we expect, and is done to get around
1467      * our problems caused by the lack of a length field in the rx header.
1468      * Use the extra buffer that follows the localdata in each packet
1469      * structure. */
1470     savelen = p->wirevec[p->niovecs - 1].iov_len;
1471     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1472
1473     memset(&msg, 0, sizeof(msg));
1474     msg.msg_name = (char *)&from;
1475     msg.msg_namelen = sizeof(struct sockaddr_in);
1476     msg.msg_iov = p->wirevec;
1477     msg.msg_iovlen = p->niovecs;
1478     nbytes = rxi_Recvmsg(socket, &msg, 0);
1479
1480     /* restore the vec to its correct state */
1481     p->wirevec[p->niovecs - 1].iov_len = savelen;
1482
1483     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1484     if (nbytes < 0 || (nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
1485         if (nbytes < 0 && errno == EWOULDBLOCK) {
1486             if (rx_stats_active)
1487                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1488         } else if (nbytes <= 0) {
1489             if (rx_stats_active) {
1490                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1491                 rx_stats.bogusHost = from.sin_addr.s_addr;
1492             }
1493             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1494                  ntohs(from.sin_port), nbytes));
1495         }
1496         return 0;
1497     }
1498 #ifdef RXDEBUG
1499     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1500                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1501         rxi_DecodePacketHeader(p);
1502
1503         *host = from.sin_addr.s_addr;
1504         *port = from.sin_port;
1505
1506         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1507               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1508               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1509               p->length));
1510 #ifdef RX_TRIMDATABUFS
1511         rxi_TrimDataBufs(p, 1);
1512 #endif
1513         return 0;
1514     }
1515 #endif
1516     else {
1517         /* Extract packet header. */
1518         rxi_DecodePacketHeader(p);
1519
1520         *host = from.sin_addr.s_addr;
1521         *port = from.sin_port;
1522         if (rx_stats_active
1523             && p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1524
1525                 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1526         }
1527
1528 #ifdef RX_TRIMDATABUFS
1529         /* Free any empty packet buffers at the end of this packet */
1530         rxi_TrimDataBufs(p, 1);
1531 #endif
1532         return 1;
1533     }
1534 }
1535
1536 #endif /* !KERNEL || UKERNEL */
1537
1538 /* This function splits off the first packet in a jumbo packet.
1539  * As of AFS 3.5, jumbograms contain more than one fixed size
1540  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1541  * last packet header. All packets (except the last) are padded to
1542  * fall on RX_CBUFFERSIZE boundaries.
1543  * HACK: We store the length of the first n-1 packets in the
1544  * last two pad bytes. */
1545
1546 struct rx_packet *
1547 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1548                      int first)
1549 {
1550     struct rx_packet *np;
1551     struct rx_jumboHeader *jp;
1552     int niov, i;
1553     struct iovec *iov;
1554     int length;
1555     afs_uint32 temp;
1556
1557     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1558      * bytes in length. All but the first packet are preceded by
1559      * an abbreviated four byte header. The length of the last packet
1560      * is calculated from the size of the jumbogram. */
1561     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1562
1563     if ((int)p->length < length) {
1564         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1565         return NULL;
1566     }
1567     niov = p->niovecs - 2;
1568     if (niov < 1) {
1569         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1570         return NULL;
1571     }
1572     iov = &p->wirevec[2];
1573     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1574
1575     /* Get a pointer to the abbreviated packet header */
1576     jp = (struct rx_jumboHeader *)
1577         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1578
1579     /* Set up the iovecs for the next packet */
1580     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1581     np->wirevec[0].iov_len = sizeof(struct rx_header);
1582     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1583     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1584     np->niovecs = niov + 1;
1585     for (i = 2, iov++; i <= niov; i++, iov++) {
1586         np->wirevec[i] = *iov;
1587     }
1588     np->length = p->length - length;
1589     p->length = RX_JUMBOBUFFERSIZE;
1590     p->niovecs = 2;
1591
1592     /* Convert the jumbo packet header to host byte order */
1593     temp = ntohl(*(afs_uint32 *) jp);
1594     jp->flags = (u_char) (temp >> 24);
1595     jp->cksum = (u_short) (temp);
1596
1597     /* Fill in the packet header */
1598     np->header = p->header;
1599     np->header.serial = p->header.serial + 1;
1600     np->header.seq = p->header.seq + 1;
1601     np->header.userStatus = 0;
1602     np->header.flags = jp->flags;
1603     np->header.spare = jp->cksum;
1604
1605     return np;
1606 }
1607
1608 #ifndef KERNEL
1609 /* Send a udp datagram */
1610 int
1611 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1612             int length, int istack)
1613 {
1614     struct msghdr msg;
1615         int ret;
1616
1617     memset(&msg, 0, sizeof(msg));
1618     msg.msg_iov = dvec;
1619     msg.msg_iovlen = nvecs;
1620     msg.msg_name = addr;
1621     msg.msg_namelen = sizeof(struct sockaddr_in);
1622
1623     ret = rxi_Sendmsg(socket, &msg, 0);
1624
1625     return ret;
1626 }
1627 #elif !defined(UKERNEL)
1628 /*
1629  * message receipt is done in rxk_input or rx_put.
1630  */
1631
1632 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1633 /*
1634  * Copy an mblock to the contiguous area pointed to by cp.
1635  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1636  * but it doesn't really.
1637  * Returns the number of bytes not transferred.
1638  * The message is NOT changed.
1639  */
1640 static int
1641 cpytoc(mblk_t * mp, int off, int len, char *cp)
1642 {
1643     int n;
1644
1645     for (; mp && len > 0; mp = mp->b_cont) {
1646         if (mp->b_datap->db_type != M_DATA) {
1647             return -1;
1648         }
1649         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1650         memcpy(cp, (char *)mp->b_rptr, n);
1651         cp += n;
1652         len -= n;
1653         mp->b_rptr += n;
1654     }
1655     return (len);
1656 }
1657
1658 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1659  * but it doesn't really.
1660  * This sucks, anyway, do it like m_cpy.... below
1661  */
1662 static int
1663 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1664            int niovs)
1665 {
1666     int m, n, o, t, i;
1667
1668     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1669         if (mp->b_datap->db_type != M_DATA) {
1670             return -1;
1671         }
1672         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1673         len -= n;
1674         while (n) {
1675             if (!t) {
1676                 o = 0;
1677                 i++;
1678                 t = iovs[i].iov_len;
1679             }
1680             m = MIN(n, t);
1681             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1682             mp->b_rptr += m;
1683             o += m;
1684             t -= m;
1685             n -= m;
1686         }
1687     }
1688     return (len);
1689 }
1690
1691 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1692 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1693 #else
1694 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1695 static int
1696 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1697 {
1698     caddr_t p1, p2;
1699     unsigned int l1, l2, i, t;
1700
1701     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1702         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1703
1704     while (off && m)
1705         if (m->m_len <= off) {
1706             off -= m->m_len;
1707             m = m->m_next;
1708             continue;
1709         } else
1710             break;
1711
1712     if (m == NULL)
1713         return len;
1714
1715     p1 = mtod(m, caddr_t) + off;
1716     l1 = m->m_len - off;
1717     i = 0;
1718     p2 = iovs[0].iov_base;
1719     l2 = iovs[0].iov_len;
1720
1721     while (len) {
1722         t = MIN(l1, MIN(l2, (unsigned int)len));
1723         memcpy(p2, p1, t);
1724         p1 += t;
1725         p2 += t;
1726         l1 -= t;
1727         l2 -= t;
1728         len -= t;
1729         if (!l1) {
1730             m = m->m_next;
1731             if (!m)
1732                 break;
1733             p1 = mtod(m, caddr_t);
1734             l1 = m->m_len;
1735         }
1736         if (!l2) {
1737             if (++i >= niovs)
1738                 break;
1739             p2 = iovs[i].iov_base;
1740             l2 = iovs[i].iov_len;
1741         }
1742
1743     }
1744
1745     return len;
1746 }
1747 #endif /* LINUX */
1748 #endif /* AFS_SUN5_ENV */
1749
1750 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1751 #if defined(AFS_NBSD_ENV)
1752 int
1753 rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
1754 #else
1755 int
1756 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1757 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1758      mblk_t *amb;
1759 #else
1760      struct mbuf *amb;
1761 #endif
1762      void (*free) ();
1763      struct rx_packet *phandle;
1764      int hdr_len, data_len;
1765 #endif /* AFS_NBSD_ENV */
1766 {
1767     int code;
1768
1769     code =
1770         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1771                      phandle->niovecs);
1772     (*free) (amb);
1773
1774     return code;
1775 }
1776 #endif /* LINUX */
1777 #endif /*KERNEL && !UKERNEL */
1778
1779
1780 /* send a response to a debug packet */
1781
1782 struct rx_packet *
1783 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1784                        afs_uint32 ahost, short aport, int istack)
1785 {
1786     struct rx_debugIn tin;
1787     afs_int32 tl;
1788
1789     /*
1790      * Only respond to client-initiated Rx debug packets,
1791      * and clear the client flag in the response.
1792      */
1793     if (ap->header.flags & RX_CLIENT_INITIATED) {
1794         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1795         rxi_EncodePacketHeader(ap);
1796     } else {
1797         return ap;
1798     }
1799
1800     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1801     /* all done with packet, now set length to the truth, so we can
1802      * reuse this packet */
1803     rx_computelen(ap, ap->length);
1804
1805     tin.type = ntohl(tin.type);
1806     tin.index = ntohl(tin.index);
1807     switch (tin.type) {
1808     case RX_DEBUGI_GETSTATS:{
1809             struct rx_debugStats tstat;
1810
1811             /* get basic stats */
1812             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1813             tstat.version = RX_DEBUGI_VERSION;
1814 #ifndef RX_ENABLE_LOCKS
1815             tstat.waitingForPackets = rx_waitingForPackets;
1816 #endif
1817             MUTEX_ENTER(&rx_serverPool_lock);
1818             tstat.nFreePackets = htonl(rx_nFreePackets);
1819             tstat.nPackets = htonl(rx_nPackets);
1820             tstat.callsExecuted = htonl(rxi_nCalls);
1821             tstat.packetReclaims = htonl(rx_packetReclaims);
1822             tstat.usedFDs = CountFDs(64);
1823             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1824             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1825             tstat.idleThreads = opr_queue_Count(&rx_idleServerQueue);
1826             MUTEX_EXIT(&rx_serverPool_lock);
1827             tstat.idleThreads = htonl(tstat.idleThreads);
1828             tl = sizeof(struct rx_debugStats) - ap->length;
1829             if (tl > 0)
1830                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1831
1832             if (tl <= 0) {
1833                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1834                                (char *)&tstat);
1835                 ap->length = sizeof(struct rx_debugStats);
1836                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1837                 rx_computelen(ap, ap->length);
1838             }
1839             break;
1840         }
1841
1842     case RX_DEBUGI_GETALLCONN:
1843     case RX_DEBUGI_GETCONN:{
1844             unsigned int i, j;
1845             struct rx_connection *tc;
1846             struct rx_call *tcall;
1847             struct rx_debugConn tconn;
1848             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1849
1850
1851             tl = sizeof(struct rx_debugConn) - ap->length;
1852             if (tl > 0)
1853                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1854             if (tl > 0)
1855                 return ap;
1856
1857             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1858             /* get N'th (maybe) "interesting" connection info */
1859             for (i = 0; i < rx_hashTableSize; i++) {
1860 #if !defined(KERNEL)
1861                 /* the time complexity of the algorithm used here
1862                  * exponentially increases with the number of connections.
1863                  */
1864 #ifdef AFS_PTHREAD_ENV
1865                 pthread_yield();
1866 #else
1867                 (void)IOMGR_Poll();
1868 #endif
1869 #endif
1870                 MUTEX_ENTER(&rx_connHashTable_lock);
1871                 /* We might be slightly out of step since we are not
1872                  * locking each call, but this is only debugging output.
1873                  */
1874                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1875                     if ((all || rxi_IsConnInteresting(tc))
1876                         && tin.index-- <= 0) {
1877                         tconn.host = tc->peer->host;
1878                         tconn.port = tc->peer->port;
1879                         tconn.cid = htonl(tc->cid);
1880                         tconn.epoch = htonl(tc->epoch);
1881                         tconn.serial = htonl(tc->serial);
1882                         for (j = 0; j < RX_MAXCALLS; j++) {
1883                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1884                             if ((tcall = tc->call[j])) {
1885                                 tconn.callState[j] = tcall->state;
1886                                 tconn.callMode[j] = tcall->app.mode;
1887                                 tconn.callFlags[j] = tcall->flags;
1888                                 if (!opr_queue_IsEmpty(&tcall->rq))
1889                                     tconn.callOther[j] |= RX_OTHER_IN;
1890                                 if (!opr_queue_IsEmpty(&tcall->tq))
1891                                     tconn.callOther[j] |= RX_OTHER_OUT;
1892                             } else
1893                                 tconn.callState[j] = RX_STATE_NOTINIT;
1894                         }
1895
1896                         tconn.natMTU = htonl(tc->peer->natMTU);
1897                         tconn.error = htonl(tc->error);
1898                         tconn.flags = tc->flags;
1899                         tconn.type = tc->type;
1900                         tconn.securityIndex = tc->securityIndex;
1901                         if (tc->securityObject) {
1902                             RXS_GetStats(tc->securityObject, tc,
1903                                          &tconn.secStats);
1904 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1905 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1906                             DOHTONL(flags);
1907                             DOHTONL(expires);
1908                             DOHTONL(packetsReceived);
1909                             DOHTONL(packetsSent);
1910                             DOHTONL(bytesReceived);
1911                             DOHTONL(bytesSent);
1912                             for (i = 0;
1913                                  i <
1914                                  sizeof(tconn.secStats.spares) /
1915                                  sizeof(short); i++)
1916                                 DOHTONS(spares[i]);
1917                             for (i = 0;
1918                                  i <
1919                                  sizeof(tconn.secStats.sparel) /
1920                                  sizeof(afs_int32); i++)
1921                                 DOHTONL(sparel[i]);
1922                         }
1923
1924                         MUTEX_EXIT(&rx_connHashTable_lock);
1925                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1926                                        (char *)&tconn);
1927                         tl = ap->length;
1928                         ap->length = sizeof(struct rx_debugConn);
1929                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1930                                             istack);
1931                         ap->length = tl;
1932                         return ap;
1933                     }
1934                 }
1935                 MUTEX_EXIT(&rx_connHashTable_lock);
1936             }
1937             /* if we make it here, there are no interesting packets */
1938             tconn.cid = htonl(0xffffffff);      /* means end */
1939             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1940                            (char *)&tconn);
1941             tl = ap->length;
1942             ap->length = sizeof(struct rx_debugConn);
1943             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1944             ap->length = tl;
1945             break;
1946         }
1947
1948         /*
1949          * Pass back all the peer structures we have available
1950          */
1951
1952     case RX_DEBUGI_GETPEER:{
1953             unsigned int i;
1954             struct rx_peer *tp;
1955             struct rx_debugPeer tpeer;
1956
1957
1958             tl = sizeof(struct rx_debugPeer) - ap->length;
1959             if (tl > 0)
1960                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1961             if (tl > 0)
1962                 return ap;
1963
1964             memset(&tpeer, 0, sizeof(tpeer));
1965             for (i = 0; i < rx_hashTableSize; i++) {
1966 #if !defined(KERNEL)
1967                 /* the time complexity of the algorithm used here
                 * exponentially increases with the number of peers.
1969                  *
1970                  * Yielding after processing each hash table entry
1971                  * and dropping rx_peerHashTable_lock.
1972                  * also increases the risk that we will miss a new
1973                  * entry - but we are willing to live with this
1974                  * limitation since this is meant for debugging only
1975                  */
1976 #ifdef AFS_PTHREAD_ENV
1977                 pthread_yield();
1978 #else
1979                 (void)IOMGR_Poll();
1980 #endif
1981 #endif
1982                 MUTEX_ENTER(&rx_peerHashTable_lock);
1983                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1984                     if (tin.index-- <= 0) {
1985                         tp->refCount++;
1986                         MUTEX_EXIT(&rx_peerHashTable_lock);
1987
1988                         MUTEX_ENTER(&tp->peer_lock);
1989                         tpeer.host = tp->host;
1990                         tpeer.port = tp->port;
1991                         tpeer.ifMTU = htons(tp->ifMTU);
1992                         tpeer.idleWhen = htonl(tp->idleWhen);
1993                         tpeer.refCount = htons(tp->refCount);
1994                         tpeer.burstSize = 0;
1995                         tpeer.burst = 0;
1996                         tpeer.burstWait.sec = 0;
1997                         tpeer.burstWait.usec = 0;
1998                         tpeer.rtt = htonl(tp->rtt);
1999                         tpeer.rtt_dev = htonl(tp->rtt_dev);
2000                         tpeer.nSent = htonl(tp->nSent);
2001                         tpeer.reSends = htonl(tp->reSends);
2002                         tpeer.natMTU = htons(tp->natMTU);
2003                         tpeer.maxMTU = htons(tp->maxMTU);
2004                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2005                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2006                         tpeer.MTU = htons(tp->MTU);
2007                         tpeer.cwind = htons(tp->cwind);
2008                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2009                         tpeer.congestSeq = htons(tp->congestSeq);
2010                         tpeer.bytesSent.high =
2011                             htonl(tp->bytesSent >> 32);
2012                         tpeer.bytesSent.low =
2013                             htonl(tp->bytesSent & MAX_AFS_UINT32);
2014                         tpeer.bytesReceived.high =
2015                             htonl(tp->bytesReceived >> 32);
2016                         tpeer.bytesReceived.low =
2017                             htonl(tp->bytesReceived & MAX_AFS_UINT32);
2018                         MUTEX_EXIT(&tp->peer_lock);
2019
2020                         MUTEX_ENTER(&rx_peerHashTable_lock);
2021                         tp->refCount--;
2022                         MUTEX_EXIT(&rx_peerHashTable_lock);
2023
2024                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2025                                        (char *)&tpeer);
2026                         tl = ap->length;
2027                         ap->length = sizeof(struct rx_debugPeer);
2028                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2029                                             istack);
2030                         ap->length = tl;
2031                         return ap;
2032                     }
2033                 }
2034                 MUTEX_EXIT(&rx_peerHashTable_lock);
2035             }
2036             /* if we make it here, there are no interesting packets */
2037             tpeer.host = htonl(0xffffffff);     /* means end */
2038             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2039                            (char *)&tpeer);
2040             tl = ap->length;
2041             ap->length = sizeof(struct rx_debugPeer);
2042             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2043             ap->length = tl;
2044             break;
2045         }
2046
2047     case RX_DEBUGI_RXSTATS:{
2048             int i;
2049             afs_int32 *s;
2050
2051             tl = sizeof(rx_stats) - ap->length;
2052             if (tl > 0)
2053                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2054             if (tl > 0)
2055                 return ap;
2056
2057             /* Since its all int32s convert to network order with a loop. */
2058             if (rx_stats_active)
2059                 MUTEX_ENTER(&rx_stats_mutex);
2060             s = (afs_int32 *) & rx_stats;
2061             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2062                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2063
2064             tl = ap->length;
2065             ap->length = sizeof(rx_stats);
2066             if (rx_stats_active)
2067                 MUTEX_EXIT(&rx_stats_mutex);
2068             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2069             ap->length = tl;
2070             break;
2071         }
2072
2073     default:
2074         /* error response packet */
2075         tin.type = htonl(RX_DEBUGI_BADTYPE);
2076         tin.index = tin.type;
2077         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2078         tl = ap->length;
2079         ap->length = sizeof(struct rx_debugIn);
2080         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2081         ap->length = tl;
2082         break;
2083     }
2084     return ap;
2085 }
2086
2087 struct rx_packet *
2088 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2089                          afs_uint32 ahost, short aport, int istack)
2090 {
2091     afs_int32 tl;
2092
2093     /*
2094      * Only respond to client-initiated version requests, and
2095      * clear that flag in the response.
2096      */
2097     if (ap->header.flags & RX_CLIENT_INITIATED) {
2098         char buf[66];
2099
2100         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2101         rxi_EncodePacketHeader(ap);
2102         memset(buf, 0, sizeof(buf));
2103         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2104         rx_packetwrite(ap, 0, 65, buf);
2105         tl = ap->length;
2106         ap->length = 65;
2107         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2108         ap->length = tl;
2109     }
2110
2111     return ap;
2112 }
2113
2114
2115 /* send a debug packet back to the sender */
2116 static void
2117 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2118                     afs_uint32 ahost, short aport, afs_int32 istack)
2119 {
2120     struct sockaddr_in taddr;
2121     unsigned int i, nbytes, savelen = 0;
2122     int saven = 0;
2123 #ifdef KERNEL
2124     int waslocked = ISAFS_GLOCK();
2125 #endif
2126
2127     taddr.sin_family = AF_INET;
2128     taddr.sin_port = aport;
2129     taddr.sin_addr.s_addr = ahost;
2130     memset(&taddr.sin_zero, 0, sizeof(taddr.sin_zero));
2131 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2132     taddr.sin_len = sizeof(struct sockaddr_in);
2133 #endif
2134
2135     /* We need to trim the niovecs. */
2136     nbytes = apacket->length;
2137     for (i = 1; i < apacket->niovecs; i++) {
2138         if (nbytes <= apacket->wirevec[i].iov_len) {
2139             savelen = apacket->wirevec[i].iov_len;
2140             saven = apacket->niovecs;
2141             apacket->wirevec[i].iov_len = nbytes;
2142             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2143         } else
2144             nbytes -= apacket->wirevec[i].iov_len;
2145     }
2146 #ifdef KERNEL
2147 #ifdef RX_KERNEL_TRACE
2148     if (ICL_SETACTIVE(afs_iclSetp)) {
2149         if (!waslocked)
2150             AFS_GLOCK();
2151         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2152                    "before osi_NetSend()");
2153         AFS_GUNLOCK();
2154     }
2155 #else
2156     if (waslocked)
2157         AFS_GUNLOCK();
2158 #endif
2159 #endif
2160     /* debug packets are not reliably delivered, hence the cast below. */
2161     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2162                       apacket->length + RX_HEADER_SIZE, istack);
2163 #ifdef KERNEL
2164 #ifdef RX_KERNEL_TRACE
2165     if (ICL_SETACTIVE(afs_iclSetp)) {
2166         AFS_GLOCK();
2167         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2168                    "after osi_NetSend()");
2169         if (!waslocked)
2170             AFS_GUNLOCK();
2171     }
2172 #else
2173     if (waslocked)
2174         AFS_GLOCK();
2175 #endif
2176 #endif
2177     if (saven) {                /* means we truncated the packet above. */
2178         apacket->wirevec[i - 1].iov_len = savelen;
2179         apacket->niovecs = saven;
2180     }
2181
2182 }
2183
2184 static void
2185 rxi_NetSendError(struct rx_call *call, int code)
2186 {
2187     int down = 0;
2188 #ifdef AFS_NT40_ENV
2189     if (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) {
2190         down = 1;
2191     }
2192     if (code == -WSAEHOSTUNREACH) {
2193         down = 1;
2194     }
2195 #elif defined(AFS_LINUX20_ENV)
2196     if (code == -ENETUNREACH) {
2197         down = 1;
2198     }
2199 #elif defined(AFS_DARWIN_ENV)
2200     if (code == EHOSTUNREACH) {
2201         down = 1;
2202     }
2203 #endif
2204     if (down) {
2205         call->lastReceiveTime = 0;
2206     }
2207 }
2208
2209 /* Send the packet to appropriate destination for the specified
2210  * call.  The header is first encoded and placed in the packet.
2211  */
2212 void
2213 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2214                struct rx_packet *p, int istack)
2215 {
2216 #if defined(KERNEL)
2217     int waslocked;
2218 #endif
2219     int code;
2220     struct sockaddr_in addr;
2221     struct rx_peer *peer = conn->peer;
2222     osi_socket socket;
2223 #ifdef RXDEBUG
2224     char deliveryType = 'S';
2225 #endif
2226     /* The address we're sending the packet to */
2227     memset(&addr, 0, sizeof(addr));
2228     addr.sin_family = AF_INET;
2229     addr.sin_port = peer->port;
2230     addr.sin_addr.s_addr = peer->host;
2231     memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2232
2233     /* This stuff should be revamped, I think, so that most, if not
2234      * all, of the header stuff is always added here.  We could
2235      * probably do away with the encode/decode routines. XXXXX */
2236
2237     /* Stamp each packet with a unique serial number.  The serial
2238      * number is maintained on a connection basis because some types
2239      * of security may be based on the serial number of the packet,
2240      * and security is handled on a per authenticated-connection
2241      * basis. */
2242     /* Pre-increment, to guarantee no zero serial number; a zero
2243      * serial number means the packet was never sent. */
2244     MUTEX_ENTER(&conn->conn_data_lock);
2245     p->header.serial = ++conn->serial;
2246     if (p->length > conn->peer->maxPacketSize) {
2247         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2248             (p->header.flags & RX_REQUEST_ACK)) {
2249             conn->lastPingSize = p->length;
2250             conn->lastPingSizeSer = p->header.serial;
2251         } else if (p->header.seq != 0) {
2252             conn->lastPacketSize = p->length;
2253             conn->lastPacketSizeSeq = p->header.seq;
2254         }
2255     }
2256     MUTEX_EXIT(&conn->conn_data_lock);
2257     /* This is so we can adjust retransmit time-outs better in the face of
2258      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2259      */
2260     if (p->firstSerial == 0) {
2261         p->firstSerial = p->header.serial;
2262     }
2263 #ifdef RXDEBUG
2264     /* If an output tracer function is defined, call it with the packet and
2265      * network address.  Note this function may modify its arguments. */
2266     if (rx_almostSent) {
2267         int drop = (*rx_almostSent) (p, &addr);
2268         /* drop packet if return value is non-zero? */
2269         if (drop)
2270             deliveryType = 'D'; /* Drop the packet */
2271     }
2272 #endif
2273
2274     /* Get network byte order header */
2275     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2276                                  * touch ALL the fields */
2277
2278     /* Send the packet out on the same socket that related packets are being
2279      * received on */
2280     socket =
2281         (conn->type ==
2282          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2283
2284 #ifdef RXDEBUG
2285     /* Possibly drop this packet,  for testing purposes */
2286     if ((deliveryType == 'D')
2287         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2288             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2289         deliveryType = 'D';     /* Drop the packet */
2290     } else {
2291         deliveryType = 'S';     /* Send the packet */
2292 #endif /* RXDEBUG */
2293
2294         /* Loop until the packet is sent.  We'd prefer just to use a
2295          * blocking socket, but unfortunately the interface doesn't
2296          * allow us to have the socket block in send mode, and not
2297          * block in receive mode */
2298 #ifdef KERNEL
2299         waslocked = ISAFS_GLOCK();
2300 #ifdef RX_KERNEL_TRACE
2301         if (ICL_SETACTIVE(afs_iclSetp)) {
2302             if (!waslocked)
2303                 AFS_GLOCK();
2304             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2305                        "before osi_NetSend()");
2306             AFS_GUNLOCK();
2307         }
2308 #else
2309         if (waslocked)
2310             AFS_GUNLOCK();
2311 #endif
2312 #endif
2313         if ((code =
2314              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2315                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2316             /* send failed, so let's hurry up the resend, eh? */
2317             if (rx_stats_active)
2318                 rx_atomic_inc(&rx_stats.netSendFailures);
2319             p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2320
2321             /* Some systems are nice and tell us right away that we cannot
2322              * reach this recipient by returning an error code.
2323              * So, when this happens let's "down" the host NOW so
2324              * we don't sit around waiting for this host to timeout later.
2325              */
2326             if (call) {
2327                 rxi_NetSendError(call, code);
2328             }
2329         }
2330 #ifdef KERNEL
2331 #ifdef RX_KERNEL_TRACE
2332         if (ICL_SETACTIVE(afs_iclSetp)) {
2333             AFS_GLOCK();
2334             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2335                        "after osi_NetSend()");
2336             if (!waslocked)
2337                 AFS_GUNLOCK();
2338         }
2339 #else
2340         if (waslocked)
2341             AFS_GLOCK();
2342 #endif
2343 #endif
2344 #ifdef RXDEBUG
2345     }
2346     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2347           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2348           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2349           p->header.seq, p->header.flags, p, p->length));
2350 #endif
2351     if (rx_stats_active) {
2352         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2353         MUTEX_ENTER(&peer->peer_lock);
2354         peer->bytesSent += p->length;
2355         MUTEX_EXIT(&peer->peer_lock);
2356     }
2357 }
2358
2359 /* Send a list of packets to appropriate destination for the specified
2360  * connection.  The headers are first encoded and placed in the packets.
2361  */
2362 void
2363 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2364                    struct rx_packet **list, int len, int istack)
2365 {
2366 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2367     int waslocked;
2368 #endif
2369     struct sockaddr_in addr;
2370     struct rx_peer *peer = conn->peer;
2371     osi_socket socket;
2372     struct rx_packet *p = NULL;
2373     struct iovec wirevec[RX_MAXIOVECS];
2374     int i, length, code;
2375     afs_uint32 serial;
2376     afs_uint32 temp;
2377     struct rx_jumboHeader *jp;
2378 #ifdef RXDEBUG
2379     char deliveryType = 'S';
2380 #endif
2381     /* The address we're sending the packet to */
2382     addr.sin_family = AF_INET;
2383     addr.sin_port = peer->port;
2384     addr.sin_addr.s_addr = peer->host;
2385     memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2386
2387     if (len + 1 > RX_MAXIOVECS) {
2388         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2389     }
2390
2391     /*
2392      * Stamp the packets in this jumbogram with consecutive serial numbers
2393      */
2394     MUTEX_ENTER(&conn->conn_data_lock);
2395     serial = conn->serial;
2396     conn->serial += len;
2397     for (i = 0; i < len; i++) {
2398         p = list[i];
2399         /* a ping *or* a sequenced packet can count */
2400         if (p->length > conn->peer->maxPacketSize) {
2401             if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2402                  (p->header.flags & RX_REQUEST_ACK)) &&
2403                 ((i == 0) || (p->length >= conn->lastPingSize))) {
2404                 conn->lastPingSize = p->length;
2405                 conn->lastPingSizeSer = serial + i;
2406             } else if ((p->header.seq != 0) &&
2407                        ((i == 0) || (p->length >= conn->lastPacketSize))) {
2408                 conn->lastPacketSize = p->length;
2409                 conn->lastPacketSizeSeq = p->header.seq;
2410             }
2411         }
2412     }
2413     MUTEX_EXIT(&conn->conn_data_lock);
2414
2415
2416     /* This stuff should be revamped, I think, so that most, if not
2417      * all, of the header stuff is always added here.  We could
2418      * probably do away with the encode/decode routines. XXXXX */
2419
2420     jp = NULL;
2421     length = RX_HEADER_SIZE;
2422     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2423     wirevec[0].iov_len = RX_HEADER_SIZE;
2424     for (i = 0; i < len; i++) {
2425         p = list[i];
2426
2427         /* The whole 3.5 jumbogram scheme relies on packets fitting
2428          * in a single packet buffer. */
2429         if (p->niovecs > 2) {
2430             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2431         }
2432
2433         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2434          * in this chunk.  */
2435         if (i < len - 1) {
2436             if (p->length != RX_JUMBOBUFFERSIZE) {
2437                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2438             }
2439             p->header.flags |= RX_JUMBO_PACKET;
2440             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2441             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2442         } else {
2443             wirevec[i + 1].iov_len = p->length;
2444             length += p->length;
2445         }
2446         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2447         if (jp != NULL) {
2448             /* Convert jumbo packet header to network byte order */
2449             temp = (afs_uint32) (p->header.flags) << 24;
2450             temp |= (afs_uint32) (p->header.spare);
2451             *(afs_uint32 *) jp = htonl(temp);
2452         }
2453         jp = (struct rx_jumboHeader *)
2454             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2455
2456         /* Stamp each packet with a unique serial number.  The serial
2457          * number is maintained on a connection basis because some types
2458          * of security may be based on the serial number of the packet,
2459          * and security is handled on a per authenticated-connection
2460          * basis. */
2461         /* Pre-increment, to guarantee no zero serial number; a zero
2462          * serial number means the packet was never sent. */
2463         p->header.serial = ++serial;
2464         /* This is so we can adjust retransmit time-outs better in the face of
2465          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2466          */
2467         if (p->firstSerial == 0) {
2468             p->firstSerial = p->header.serial;
2469         }
2470 #ifdef RXDEBUG
2471         /* If an output tracer function is defined, call it with the packet and
2472          * network address.  Note this function may modify its arguments. */
2473         if (rx_almostSent) {
2474             int drop = (*rx_almostSent) (p, &addr);
2475             /* drop packet if return value is non-zero? */
2476             if (drop)
2477                 deliveryType = 'D';     /* Drop the packet */
2478         }
2479 #endif
2480
2481         /* Get network byte order header */
2482         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2483                                          * touch ALL the fields */
2484     }
2485
2486     /* Send the packet out on the same socket that related packets are being
2487      * received on */
2488     socket =
2489         (conn->type ==
2490          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2491
2492 #ifdef RXDEBUG
2493     /* Possibly drop this packet,  for testing purposes */
2494     if ((deliveryType == 'D')
2495         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2496             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2497         deliveryType = 'D';     /* Drop the packet */
2498     } else {
2499         deliveryType = 'S';     /* Send the packet */
2500 #endif /* RXDEBUG */
2501
2502         /* Loop until the packet is sent.  We'd prefer just to use a
2503          * blocking socket, but unfortunately the interface doesn't
2504          * allow us to have the socket block in send mode, and not
2505          * block in receive mode */
2506 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2507         waslocked = ISAFS_GLOCK();
2508         if (!istack && waslocked)
2509             AFS_GUNLOCK();
2510 #endif
2511         if ((code =
2512              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2513                          istack)) != 0) {
2514             /* send failed, so let's hurry up the resend, eh? */
2515             if (rx_stats_active)
2516                 rx_atomic_inc(&rx_stats.netSendFailures);
2517             for (i = 0; i < len; i++) {
2518                 p = list[i];
2519                 p->flags &= ~RX_PKTFLAG_SENT;  /* resend it very soon */
2520             }
2521             /* Some systems are nice and tell us right away that we cannot
2522              * reach this recipient by returning an error code.
2523              * So, when this happens let's "down" the host NOW so
2524              * we don't sit around waiting for this host to timeout later.
2525              */
2526             if (call) {
2527                 rxi_NetSendError(call, code);
2528             }
2529         }
2530 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2531         if (!istack && waslocked)
2532             AFS_GLOCK();
2533 #endif
2534 #ifdef RXDEBUG
2535     }
2536
2537     osi_Assert(p != NULL);
2538
2539     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2540           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2541           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2542           p->header.seq, p->header.flags, p, p->length));
2543
2544 #endif
2545     if (rx_stats_active) {
2546         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2547         MUTEX_ENTER(&peer->peer_lock);
2548         peer->bytesSent += p->length;
2549         MUTEX_EXIT(&peer->peer_lock);
2550     }
2551 }
2552
2553 /* Send a raw abort packet, without any call or connection structures */
2554 void
2555 rxi_SendRawAbort(osi_socket socket, afs_uint32 host, u_short port,
2556                  afs_uint32 serial, afs_int32 error,
2557                  struct rx_packet *source, int istack)
2558 {
2559     struct rx_header theader;
2560     struct sockaddr_in addr;
2561     struct iovec iov[2];
2562
2563     memset(&theader, 0, sizeof(theader));
2564     theader.epoch = htonl(source->header.epoch);
2565     theader.callNumber = htonl(source->header.callNumber);
2566     theader.serial = htonl(serial);
2567     theader.type = RX_PACKET_TYPE_ABORT;
2568     theader.serviceId = htons(source->header.serviceId);
2569     theader.securityIndex = source->header.securityIndex;
2570     theader.cid = htonl(source->header.cid);
2571
2572     /*
2573      * If the abort is being sent in response to a server initiated packet,
2574      * set client_initiated in the abort to ensure it is not associated by
2575      * the receiver with a connection in the opposite direction.
2576      */
2577     if ((source->header.flags & RX_CLIENT_INITIATED) != RX_CLIENT_INITIATED)
2578         theader.flags |= RX_CLIENT_INITIATED;
2579
2580     error = htonl(error);
2581
2582     iov[0].iov_base = &theader;
2583     iov[0].iov_len = sizeof(struct rx_header);
2584     iov[1].iov_base = &error;
2585     iov[1].iov_len = sizeof(error);
2586
2587     addr.sin_family = AF_INET;
2588     addr.sin_addr.s_addr = host;
2589     addr.sin_port = port;
2590     memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2591 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2592     addr.sin_len = sizeof(struct sockaddr_in);
2593 #endif
2594
2595     osi_NetSend(socket, &addr, iov, 2,
2596                 sizeof(struct rx_header) + sizeof(error), istack);
2597 }
2598
2599 /* Send a "special" packet to the peer connection.  If call is
2600  * specified, then the packet is directed to a specific call channel
2601  * associated with the connection, otherwise it is directed to the
2602  * connection only. Uses optionalPacket if it is supplied, rather than
2603  * allocating a new packet buffer.  Nbytes is the length of the data
2604  * portion of the packet.  If data is non-null, nbytes of data are
2605  * copied into the packet.  Type is the type of the packet, as defined
2606  * in rx.h.  Bug: there's a lot of duplication between this and other
2607  * routines.  This needs to be cleaned up. */
2608 struct rx_packet *
2609 rxi_SendSpecial(struct rx_call *call,
2610                 struct rx_connection *conn,
2611                 struct rx_packet *optionalPacket, int type, char *data,
2612                 int nbytes, int istack)
2613 {
2614     /* Some of the following stuff should be common code for all
2615      * packet sends (it's repeated elsewhere) */
2616     struct rx_packet *p;
2617     unsigned int i = 0;
2618     int savelen = 0, saven = 0;
2619     int channel, callNumber;
2620     if (call) {
2621         channel = call->channel;
2622         callNumber = *call->callNumber;
2623         /* BUSY packets refer to the next call on this connection */
2624         if (type == RX_PACKET_TYPE_BUSY) {
2625             callNumber++;
2626         }
2627     } else {
2628         channel = 0;
2629         callNumber = 0;
2630     }
2631     p = optionalPacket;
2632     if (!p) {
2633         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2634         if (!p)
2635             osi_Panic("rxi_SendSpecial failure");
2636     }
2637
2638     if (nbytes != -1)
2639         p->length = nbytes;
2640     else
2641         nbytes = p->length;
2642     p->header.serviceId = conn->serviceId;
2643     p->header.securityIndex = conn->securityIndex;
2644     p->header.cid = (conn->cid | channel);
2645     p->header.callNumber = callNumber;
2646     p->header.seq = 0;
2647     p->header.epoch = conn->epoch;
2648     p->header.type = type;
2649     p->header.userStatus = 0;
2650     p->header.flags = 0;
2651     if (conn->type == RX_CLIENT_CONNECTION)
2652         p->header.flags |= RX_CLIENT_INITIATED;
2653     if (data)
2654         rx_packetwrite(p, 0, nbytes, data);
2655
2656     for (i = 1; i < p->niovecs; i++) {
2657         if (nbytes <= p->wirevec[i].iov_len) {
2658             savelen = p->wirevec[i].iov_len;
2659             saven = p->niovecs;
2660             p->wirevec[i].iov_len = nbytes;
2661             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2662         } else
2663             nbytes -= p->wirevec[i].iov_len;
2664     }
2665
2666     if (call)
2667         rxi_Send(call, p, istack);
2668     else
2669         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2670     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2671         /* really need to do this, but it seems safer this way, given that  */
2672         /* sneaky optionalPacket... */
2673         p->wirevec[i - 1].iov_len = savelen;
2674         p->niovecs = saven;
2675     }
2676     if (!optionalPacket)
2677         rxi_FreePacket(p);
2678     return optionalPacket;
2679 }
2680
2681
2682 /* Encode the packet's header (from the struct header in the packet to
2683  * the net byte order representation in the wire representation of the
2684  * packet, which is what is actually sent out on the wire) */
2685 void
2686 rxi_EncodePacketHeader(struct rx_packet *p)
2687 {
2688     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2689
2690     memset(buf, 0, RX_HEADER_SIZE);
2691     *buf++ = htonl(p->header.epoch);
2692     *buf++ = htonl(p->header.cid);
2693     *buf++ = htonl(p->header.callNumber);
2694     *buf++ = htonl(p->header.seq);
2695     *buf++ = htonl(p->header.serial);
2696     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2697                    | (((afs_uint32) p->header.flags) << 16)
2698                    | (p->header.userStatus << 8) | p->header.securityIndex);
2699     /* Note: top 16 bits of this next word were reserved */
2700     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2701 }
2702
2703 /* Decode the packet's header (from net byte order to a struct header) */
2704 void
2705 rxi_DecodePacketHeader(struct rx_packet *p)
2706 {
2707     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2708     afs_uint32 temp;
2709
2710     p->header.epoch = ntohl(*buf);
2711     buf++;
2712     p->header.cid = ntohl(*buf);
2713     buf++;
2714     p->header.callNumber = ntohl(*buf);
2715     buf++;
2716     p->header.seq = ntohl(*buf);
2717     buf++;
2718     p->header.serial = ntohl(*buf);
2719     buf++;
2720
2721     temp = ntohl(*buf);
2722     buf++;
2723
2724     /* C will truncate byte fields to bytes for me */
2725     p->header.type = temp >> 24;
2726     p->header.flags = temp >> 16;
2727     p->header.userStatus = temp >> 8;
2728     p->header.securityIndex = temp >> 0;
2729
2730     temp = ntohl(*buf);
2731     buf++;
2732
2733     p->header.serviceId = (temp & 0xffff);
2734     p->header.spare = temp >> 16;
2735     /* Note: top 16 bits of this last word are the security checksum */
2736 }
2737
2738 /*
2739  * LOCKS HELD: called with call->lock held.
2740  *
2741  * PrepareSendPacket is the only place in the code that
2742  * can increment call->tnext.  This could become an atomic
2743  * in the future.  Beyond that there is nothing in this
2744  * function that requires the call being locked.  This
2745  * function can only be called by the application thread.
2746  */
2747 void
2748 rxi_PrepareSendPacket(struct rx_call *call,
2749                       struct rx_packet *p, int last)
2750 {
2751     struct rx_connection *conn = call->conn;
2752     afs_uint32 seq = call->tnext++;
2753     unsigned int i;
2754     afs_int32 len;              /* len must be a signed type; it can go negative */
2755     int code;
2756
2757     /* No data packets on call 0. Where do these come from? */
2758     if (*call->callNumber == 0)
2759         *call->callNumber = 1;
2760
2761     MUTEX_EXIT(&call->lock);
2762     p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
2763
2764     p->header.cid = (conn->cid | call->channel);
2765     p->header.serviceId = conn->serviceId;
2766     p->header.securityIndex = conn->securityIndex;
2767
2768     p->header.callNumber = *call->callNumber;
2769     p->header.seq = seq;
2770     p->header.epoch = conn->epoch;
2771     p->header.type = RX_PACKET_TYPE_DATA;
2772     p->header.userStatus = 0;
2773     p->header.flags = 0;
2774     p->header.spare = 0;
2775     if (conn->type == RX_CLIENT_CONNECTION)
2776         p->header.flags |= RX_CLIENT_INITIATED;
2777
2778     if (last)
2779         p->header.flags |= RX_LAST_PACKET;
2780
2781     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2782     p->header.serial = 0;       /* Another way of saying never transmitted... */
2783
2784     /* Now that we're sure this is the last data on the call, make sure
2785      * that the "length" and the sum of the iov_lens matches. */
2786     len = p->length + call->conn->securityHeaderSize;
2787
2788     for (i = 1; i < p->niovecs && len > 0; i++) {
2789         len -= p->wirevec[i].iov_len;
2790     }
2791     if (len > 0) {
2792         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2793     } else if (i < p->niovecs) {
2794         /* Free any extra elements in the wirevec */
2795 #if defined(RX_ENABLE_TSFPQ)
2796         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2797 #else /* !RX_ENABLE_TSFPQ */
2798         MUTEX_ENTER(&rx_freePktQ_lock);
2799         rxi_FreeDataBufsNoLock(p, i);
2800         MUTEX_EXIT(&rx_freePktQ_lock);
2801 #endif /* !RX_ENABLE_TSFPQ */
2802
2803         p->niovecs = i;
2804     }
2805     if (len)
2806         p->wirevec[i - 1].iov_len += len;
2807     MUTEX_ENTER(&call->lock);
2808     code = RXS_PreparePacket(conn->securityObject, call, p);
2809     if (code) {
2810         MUTEX_EXIT(&call->lock);
2811         rxi_ConnectionError(conn, code);
2812         MUTEX_ENTER(&conn->conn_data_lock);
2813         p = rxi_SendConnectionAbort(conn, p, 0, 0);
2814         MUTEX_EXIT(&conn->conn_data_lock);
2815         MUTEX_ENTER(&call->lock);
2816         /* setting a connection error means all calls for that conn are also
2817          * error'd. if this call does not have an error by now, something is
2818          * very wrong, and we risk sending data in the clear that is supposed
2819          * to be encrypted. */
2820         osi_Assert(call->error);
2821     }
2822 }
2823
2824 /* Given an interface MTU size, calculate an adjusted MTU size that
2825  * will make efficient use of the RX buffers when the peer is sending
2826  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2827 int
2828 rxi_AdjustIfMTU(int mtu)
2829 {
2830     int adjMTU;
2831     int frags;
2832
2833     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2834         return mtu;
2835     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2836     if (mtu <= adjMTU) {
2837         return mtu;
2838     }
2839     mtu -= adjMTU;
2840     if (mtu <= 0) {
2841         return adjMTU;
2842     }
2843     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2844     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2845 }
2846
2847 /* Given an interface MTU size, and the peer's advertised max receive
2848  * size, calculate an adjisted maxMTU size that makes efficient use
2849  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2850 int
2851 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2852 {
2853     int maxMTU = mtu * rxi_nSendFrags;
2854     maxMTU = MIN(maxMTU, peerMaxMTU);
2855     return rxi_AdjustIfMTU(maxMTU);
2856 }
2857
2858 /* Given a packet size, figure out how many datagram packet will fit.
2859  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2860  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2861  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2862 int
2863 rxi_AdjustDgramPackets(int frags, int mtu)
2864 {
2865     int maxMTU;
2866     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2867         return 1;
2868     }
2869     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2870     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2871     /* subtract the size of the first and last packets */
2872     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2873     if (maxMTU < 0) {
2874         return 1;
2875     }
2876     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2877 }
2878
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * outputFile: destination of the dump.  Written with fprintf() normally;
 *             under AFS_NT40_ENV it is handed to WriteFile() instead.
 *             NOTE(review): WriteFile() expects a HANDLE, so Windows
 *             callers presumably pass a HANDLE here -- confirm.
 * cookie:     string prefixed to every line of output.
 *
 * One line is emitted for each packet on the rx_mallocedP list, between
 * a start line and an end line, while rx_freePktQ_lock is held.  When
 * RXDEBUG_PACKET is not defined the function does nothing.
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* bytes-written out-parameter for WriteFile(); value unused */
    /* NOTE(review): lines are formatted into this fixed buffer with an
     * unbounded sprintf(); a very long cookie could overflow it. */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void * argument, hence the explicit cast of p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif