src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #ifdef KERNEL
  12 #include "afs/param.h"
  13 #else
  14 #include <afs/param.h>
  15 #endif
  16
  17 #ifdef KERNEL
  18 #if defined(UKERNEL)
  19 #include "afs/sysincludes.h"
  20 #include "afsincludes.h"
  21 #include "rx/rx_kcommon.h"
  22 #include "rx/rx_clock.h"
  23 #include "rx/rx_queue.h"
  24 #include "rx/rx_packet.h"
  25 #include "rx/rx_atomic.h"
  26 #include "rx/rx_internal.h"
  27 #else /* defined(UKERNEL) */
  28 #ifdef RX_KERNEL_TRACE
  29 #include "../rx/rx_kcommon.h"
  30 #endif
  31 #include "h/types.h"
  32 #ifndef AFS_LINUX20_ENV
  33 #include "h/systm.h"
  34 #endif
  35 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  36 #include "afs/sysincludes.h"
  37 #endif
  38 #if defined(AFS_OBSD_ENV)
  39 #include "h/proc.h"
  40 #endif
  41 #include "h/socket.h"
  42 #if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  43 #if     !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  44 #include "sys/mount.h"          /* it gets pulled in by something later anyway */
  45 #endif
  46 #include "h/mbuf.h"
  47 #endif
  48 #include "netinet/in.h"
  49 #include "afs/afs_osi.h"
  50 #include "rx_kmutex.h"
  51 #include "rx/rx_clock.h"
  52 #include "rx/rx_queue.h"
  53 #include "rx_atomic.h"
  54 #ifdef  AFS_SUN5_ENV
  55 #include <sys/sysmacros.h>
  56 #endif
  57 #include "rx/rx_packet.h"
  58 #include "rx_internal.h"
  59 #endif /* defined(UKERNEL) */
  60 #include "rx/rx_globals.h"
  61 #else /* KERNEL */
  62 #include "sys/types.h"
  63 #include <sys/stat.h>
  64 #include <errno.h>
  65 #if defined(AFS_NT40_ENV)
  66 #include <winsock2.h>
  67 #ifndef EWOULDBLOCK
  68 #define EWOULDBLOCK WSAEWOULDBLOCK
  69 #endif
  70 #include "rx_user.h"
  71 #include "rx_xmit_nt.h"
  72 #include <stdlib.h>
  73 #else
  74 #include <sys/socket.h>
  75 #include <netinet/in.h>
  76 #endif
  77 #include "rx_clock.h"
  78 #include "rx.h"
  79 #include "rx_queue.h"
  80 #ifdef  AFS_SUN5_ENV
  81 #include <sys/sysmacros.h>
  82 #endif
  83 #include "rx_packet.h"
  84 #include "rx_atomic.h"
  85 #include "rx_globals.h"
  86 #include "rx_internal.h"
  87 #include <lwp.h>
  88 #include <assert.h>
  89 #include <string.h>
  90 #ifdef HAVE_UNISTD_H
  91 #include <unistd.h>
  92 #endif
  93 #endif /* KERNEL */
  94
  95 #ifdef RX_LOCKS_DB
  96 /* rxdb_fileID is used to identify the lock location, along with line#. */
  97 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  98 #endif /* RX_LOCKS_DB */
  99 static struct rx_packet *rx_mallocedP = 0;
 100 #ifdef RXDEBUG_PACKET
 101 static afs_uint32       rx_packet_id = 0;
 102 #endif
 103
 104 extern char cml_version_number[];
 105
 106 static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);
 107
 108 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
 109                                 afs_uint32 ahost, short aport,
 110                                 afs_int32 istack);
 111
 112 #ifdef RX_ENABLE_TSFPQ
 113 static int
 114 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
 115 #else
 116 static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
 117                                    afs_uint32 first,
 118                                    struct rx_queue * q);
 119 #endif
 120
 121 /* some rules about packets:
 122  * 1.  When a packet is allocated, the final iov_buf contains room for
 123  * a security trailer, but iov_len masks that fact.  If the security
 124  * package wants to add the trailer, it may do so, and then extend
 125  * iov_len appropriately.  For this reason, packet's niovecs and
 126  * iov_len fields should be accurate before calling PreparePacket.
 127 */
 128
 129 /* Preconditions:
 130  *        all packet buffers (iov_base) are integral multiples of
 131  *        the word size.
 132  *        offset is an integral multiple of the word size.
 133  */
 134 afs_int32
 135 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 136 {
 137     unsigned int i;
 138     size_t l;
 139     for (l = 0, i = 1; i < packet->niovecs; i++) {
 140         if (l + packet->wirevec[i].iov_len > offset) {
 141             return
 142                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 143                                  (offset - l)));
 144         }
 145         l += packet->wirevec[i].iov_len;
 146     }
 147
 148     return 0;
 149 }
 150
 151 /* Preconditions:
 152  *        all packet buffers (iov_base) are integral multiples of the word size.
 153  *        offset is an integral multiple of the word size.
 154  */
 155 afs_int32
 156 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 157 {
 158     unsigned int i;
 159     size_t l;
 160     for (l = 0, i = 1; i < packet->niovecs; i++) {
 161         if (l + packet->wirevec[i].iov_len > offset) {
 162             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 163                              (offset - l))) = data;
 164             return 0;
 165         }
 166         l += packet->wirevec[i].iov_len;
 167     }
 168
 169     return 0;
 170 }
 171
 172 /* Preconditions:
 173  *        all packet buffers (iov_base) are integral multiples of the
 174  *        word size.
 175  *        offset is an integral multiple of the word size.
 176  * Packet Invariants:
 177  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 178  */
 179 afs_int32
 180 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 181                   char *out)
 182 {
 183     unsigned int i, j, l, r;
 184     for (l = 0, i = 1; i < packet->niovecs; i++) {
 185         if (l + packet->wirevec[i].iov_len > offset) {
 186             break;
 187         }
 188         l += packet->wirevec[i].iov_len;
 189     }
 190
 191     /* i is the iovec which contains the first little bit of data in which we
 192      * are interested.  l is the total length of everything prior to this iovec.
 193      * j is the number of bytes we can safely copy out of this iovec.
 194      * offset only applies to the first iovec.
 195      */
 196     r = resid;
 197     while ((r > 0) && (i < packet->niovecs)) {
 198         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 199         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 200         r -= j;
 201         out += j;
 202         l += packet->wirevec[i].iov_len;
 203         offset = l;
 204         i++;
 205     }
 206
 207     return (r ? (resid - r) : resid);
 208 }
 209
 210
 211 /* Preconditions:
 212  *        all packet buffers (iov_base) are integral multiples of the
 213  *        word size.
 214  *        offset is an integral multiple of the word size.
 215  */
 216 afs_int32
 217 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 218 {
 219     unsigned int i, j, l, o, r;
 220     char *b;
 221
 222     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 223         if (l + packet->wirevec[i].iov_len > o) {
 224             break;
 225         }
 226         l += packet->wirevec[i].iov_len;
 227     }
 228
 229     /* i is the iovec which contains the first little bit of data in which we
 230      * are interested.  l is the total length of everything prior to this iovec.
 231      * j is the number of bytes we can safely copy out of this iovec.
 232      * offset only applies to the first iovec.
 233      */
 234     r = resid;
 235     while ((r > 0) && (i <= RX_MAXWVECS)) {
 236         if (i >= packet->niovecs)
 237             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 238                 break;
 239
 240         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 241         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 242         memcpy(b, in, j);
 243         r -= j;
 244         in += j;
 245         l += packet->wirevec[i].iov_len;
 246         offset = l;
 247         i++;
 248     }
 249
 250     return (r ? (resid - r) : resid);
 251 }
 252
 253 int
 254 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 255 {
 256     struct rx_packet *p, *np;
 257
 258     num_pkts = AllocPacketBufs(class, num_pkts, q);
 259
 260     for (queue_Scan(q, p, np, rx_packet)) {
 261         RX_PACKET_IOV_FULLINIT(p);
 262     }
 263
 264     return num_pkts;
 265 }
 266
 267 #ifdef RX_ENABLE_TSFPQ
 268 static int
 269 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 270 {
 271     struct rx_ts_info_t * rx_ts_info;
 272     int transfer;
 273     SPLVAR;
 274
 275     RX_TS_INFO_GET(rx_ts_info);
 276
 277     transfer = num_pkts - rx_ts_info->_FPQ.len;
 278     if (transfer > 0) {
 279         NETPRI;
 280         MUTEX_ENTER(&rx_freePktQ_lock);
 281         transfer = MAX(transfer, rx_TSFPQGlobSize);
 282         if (transfer > rx_nFreePackets) {
 283             /* alloc enough for us, plus a few globs for other threads */
 284             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 285         }
 286
 287         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 288
 289         MUTEX_EXIT(&rx_freePktQ_lock);
 290         USERPRI;
 291     }
 292
 293     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 294
 295     return num_pkts;
 296 }
 297 #else /* RX_ENABLE_TSFPQ */
 298 static int
 299 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 300 {
 301     struct rx_packet *c;
 302     int i;
 303 #ifdef KERNEL
 304     int overq = 0;
 305 #endif
 306     SPLVAR;
 307
 308     NETPRI;
 309
 310     MUTEX_ENTER(&rx_freePktQ_lock);
 311
 312 #ifdef KERNEL
 313     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 314          num_pkts--, overq++);
 315
 316     if (overq) {
 317         rxi_NeedMorePackets = TRUE;
 318         if (rx_stats_active) {
 319             switch (class) {
 320             case RX_PACKET_CLASS_RECEIVE:
 321                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
 322                 break;
 323             case RX_PACKET_CLASS_SEND:
 324                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
 325                 break;
 326             case RX_PACKET_CLASS_SPECIAL:
 327                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
 328                 break;
 329             case RX_PACKET_CLASS_RECV_CBUF:
 330                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
 331                 break;
 332             case RX_PACKET_CLASS_SEND_CBUF:
 333                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
 334                 break;
 335             }
 336         }
 337     }
 338
 339     if (rx_nFreePackets < num_pkts)
 340         num_pkts = rx_nFreePackets;
 341
 342     if (!num_pkts) {
 343         rxi_NeedMorePackets = TRUE;
 344         goto done;
 345     }
 346 #else /* KERNEL */
 347     if (rx_nFreePackets < num_pkts) {
 348         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 349     }
 350 #endif /* KERNEL */
 351
 352     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 353          i < num_pkts;
 354          i++, c=queue_Next(c, rx_packet)) {
 355         RX_FPQ_MARK_USED(c);
 356     }
 357
 358     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 359
 360     rx_nFreePackets -= num_pkts;
 361
 362 #ifdef KERNEL
 363   done:
 364 #endif
 365     MUTEX_EXIT(&rx_freePktQ_lock);
 366
 367     USERPRI;
 368     return num_pkts;
 369 }
 370 #endif /* RX_ENABLE_TSFPQ */
 371
 372 /*
 373  * Free a packet currently used as a continuation buffer
 374  */
 375 #ifdef RX_ENABLE_TSFPQ
 376 /* num_pkts=0 means queue length is unknown */
 377 int
 378 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 379 {
 380     struct rx_ts_info_t * rx_ts_info;
 381     struct rx_packet *c, *nc;
 382     SPLVAR;
 383
 384     osi_Assert(num_pkts >= 0);
 385     RX_TS_INFO_GET(rx_ts_info);
 386
 387     if (!num_pkts) {
 388         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 389             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 390         }
 391     } else {
 392         for (queue_Scan(q, c, nc, rx_packet)) {
 393             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 394         }
 395     }
 396
 397     if (num_pkts) {
 398         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 399     }
 400
 401     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 402         NETPRI;
 403         MUTEX_ENTER(&rx_freePktQ_lock);
 404
 405         RX_TS_FPQ_LTOG(rx_ts_info);
 406
 407         /* Wakeup anyone waiting for packets */
 408         rxi_PacketsUnWait();
 409
 410         MUTEX_EXIT(&rx_freePktQ_lock);
 411         USERPRI;
 412     }
 413
 414     return num_pkts;
 415 }
 416 #else /* RX_ENABLE_TSFPQ */
 417 /* num_pkts=0 means queue length is unknown */
 418 int
 419 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 420 {
 421     struct rx_queue cbs;
 422     struct rx_packet *p, *np;
 423     int qlen = 0;
 424     SPLVAR;
 425
 426     osi_Assert(num_pkts >= 0);
 427     queue_Init(&cbs);
 428
 429     if (!num_pkts) {
 430         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 431             if (p->niovecs > 2) {
 432                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 433             }
 434             RX_FPQ_MARK_FREE(p);
 435         }
 436         if (!num_pkts)
 437             return 0;
 438     } else {
 439         for (queue_Scan(q, p, np, rx_packet)) {
 440             if (p->niovecs > 2) {
 441                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 442             }
 443             RX_FPQ_MARK_FREE(p);
 444         }
 445     }
 446
 447     if (qlen) {
 448         queue_SpliceAppend(q, &cbs);
 449         qlen += num_pkts;
 450     } else
 451         qlen = num_pkts;
 452
 453     NETPRI;
 454     MUTEX_ENTER(&rx_freePktQ_lock);
 455
 456     queue_SpliceAppend(&rx_freePacketQueue, q);
 457     rx_nFreePackets += qlen;
 458
 459     /* Wakeup anyone waiting for packets */
 460     rxi_PacketsUnWait();
 461
 462     MUTEX_EXIT(&rx_freePktQ_lock);
 463     USERPRI;
 464
 465     return num_pkts;
 466 }
 467 #endif /* RX_ENABLE_TSFPQ */
 468
 469 /* this one is kind of awful.
 470  * In rxkad, the packet has been all shortened, and everything, ready for
 471  * sending.  All of a sudden, we discover we need some of that space back.
 472  * This isn't terribly general, because it knows that the packets are only
 473  * rounded up to the EBS (userdata + security header).
 474  */
 475 int
 476 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 477 {
 478     int i;
 479     i = p->niovecs - 1;
 480     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 481         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 482             p->wirevec[i].iov_len += nb;
 483             return 0;
 484         }
 485     } else {
 486         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 487             p->wirevec[i].iov_len += nb;
 488             return 0;
 489         }
 490     }
 491
 492     return 0;
 493 }
 494
 495 /* get sufficient space to store nb bytes of data (or more), and hook
 496  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 497  * returns the number of bytes >0 which it failed to come up with.
 498  * Don't need to worry about locking on packet, since only
 499  * one thread can manipulate one at a time. Locking on continution
 500  * packets is handled by AllocPacketBufs */
 501 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 502 int
 503 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 504 {
 505     int i, nv;
 506     struct rx_queue q;
 507     struct rx_packet *cb, *ncb;
 508
 509     /* compute the number of cbuf's we need */
 510     nv = nb / RX_CBUFFERSIZE;
 511     if ((nv * RX_CBUFFERSIZE) < nb)
 512         nv++;
 513     if ((nv + p->niovecs) > RX_MAXWVECS)
 514         nv = RX_MAXWVECS - p->niovecs;
 515     if (nv < 1)
 516         return nb;
 517
 518     /* allocate buffers */
 519     queue_Init(&q);
 520     nv = AllocPacketBufs(class, nv, &q);
 521
 522     /* setup packet iovs */
 523     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 524         queue_Remove(cb);
 525         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 526         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 527     }
 528
 529     nb -= (nv * RX_CBUFFERSIZE);
 530     p->length += (nv * RX_CBUFFERSIZE);
 531     p->niovecs += nv;
 532
 533     return nb;
 534 }
 535
 536 /* Add more packet buffers */
 537 #ifdef RX_ENABLE_TSFPQ
 538 void
 539 rxi_MorePackets(int apackets)
 540 {
 541     struct rx_packet *p, *e;
 542     struct rx_ts_info_t * rx_ts_info;
 543     int getme;
 544     SPLVAR;
 545
 546     getme = apackets * sizeof(struct rx_packet);
 547     p = (struct rx_packet *)osi_Alloc(getme);
 548     osi_Assert(p);
 549
 550     PIN(p, getme);              /* XXXXX */
 551     memset(p, 0, getme);
 552     RX_TS_INFO_GET(rx_ts_info);
 553
 554     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 555     /* TSFPQ patch also needs to keep track of total packets */
 556
 557     MUTEX_ENTER(&rx_packets_mutex);
 558     rx_nPackets += apackets;
 559     RX_TS_FPQ_COMPUTE_LIMITS;
 560     MUTEX_EXIT(&rx_packets_mutex);
 561
 562     for (e = p + apackets; p < e; p++) {
 563         RX_PACKET_IOV_INIT(p);
 564         p->niovecs = 2;
 565
 566         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 567
 568         NETPRI;
 569         MUTEX_ENTER(&rx_freePktQ_lock);
 570 #ifdef RXDEBUG_PACKET
 571         p->packetId = rx_packet_id++;
 572         p->allNextp = rx_mallocedP;
 573 #endif /* RXDEBUG_PACKET */
 574         rx_mallocedP = p;
 575         MUTEX_EXIT(&rx_freePktQ_lock);
 576         USERPRI;
 577     }
 578     rx_ts_info->_FPQ.delta += apackets;
 579
 580     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 581         NETPRI;
 582         MUTEX_ENTER(&rx_freePktQ_lock);
 583
 584         RX_TS_FPQ_LTOG(rx_ts_info);
 585         rxi_NeedMorePackets = FALSE;
 586         rxi_PacketsUnWait();
 587
 588         MUTEX_EXIT(&rx_freePktQ_lock);
 589         USERPRI;
 590     }
 591 }
 592 #else /* RX_ENABLE_TSFPQ */
 593 void
 594 rxi_MorePackets(int apackets)
 595 {
 596     struct rx_packet *p, *e;
 597     int getme;
 598     SPLVAR;
 599
 600     getme = apackets * sizeof(struct rx_packet);
 601     p = (struct rx_packet *)osi_Alloc(getme);
 602     osi_Assert(p);
 603
 604     PIN(p, getme);              /* XXXXX */
 605     memset(p, 0, getme);
 606     NETPRI;
 607     MUTEX_ENTER(&rx_freePktQ_lock);
 608
 609     for (e = p + apackets; p < e; p++) {
 610         RX_PACKET_IOV_INIT(p);
 611 #ifdef RX_TRACK_PACKETS
 612         p->flags |= RX_PKTFLAG_FREE;
 613 #endif
 614         p->niovecs = 2;
 615
 616         queue_Append(&rx_freePacketQueue, p);
 617 #ifdef RXDEBUG_PACKET
 618         p->packetId = rx_packet_id++;
 619         p->allNextp = rx_mallocedP;
 620 #endif /* RXDEBUG_PACKET */
 621         rx_mallocedP = p;
 622     }
 623
 624     rx_nPackets += apackets;
 625     rx_nFreePackets += apackets;
 626     rxi_NeedMorePackets = FALSE;
 627     rxi_PacketsUnWait();
 628
 629     MUTEX_EXIT(&rx_freePktQ_lock);
 630     USERPRI;
 631 }
 632 #endif /* RX_ENABLE_TSFPQ */
 633
 634 #ifdef RX_ENABLE_TSFPQ
 635 void
 636 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 637 {
 638     struct rx_packet *p, *e;
 639     struct rx_ts_info_t * rx_ts_info;
 640     int getme;
 641     SPLVAR;
 642
 643     getme = apackets * sizeof(struct rx_packet);
 644     p = (struct rx_packet *)osi_Alloc(getme);
 645
 646     PIN(p, getme);              /* XXXXX */
 647     memset(p, 0, getme);
 648     RX_TS_INFO_GET(rx_ts_info);
 649
 650     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 651     /* TSFPQ patch also needs to keep track of total packets */
 652     MUTEX_ENTER(&rx_packets_mutex);
 653     rx_nPackets += apackets;
 654     RX_TS_FPQ_COMPUTE_LIMITS;
 655     MUTEX_EXIT(&rx_packets_mutex);
 656
 657     for (e = p + apackets; p < e; p++) {
 658         RX_PACKET_IOV_INIT(p);
 659         p->niovecs = 2;
 660         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 661
 662         NETPRI;
 663         MUTEX_ENTER(&rx_freePktQ_lock);
 664 #ifdef RXDEBUG_PACKET
 665         p->packetId = rx_packet_id++;
 666         p->allNextp = rx_mallocedP;
 667 #endif /* RXDEBUG_PACKET */
 668         rx_mallocedP = p;
 669         MUTEX_EXIT(&rx_freePktQ_lock);
 670         USERPRI;
 671     }
 672     rx_ts_info->_FPQ.delta += apackets;
 673
 674     if (flush_global &&
 675         (num_keep_local < apackets)) {
 676         NETPRI;
 677         MUTEX_ENTER(&rx_freePktQ_lock);
 678
 679         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 680         rxi_NeedMorePackets = FALSE;
 681         rxi_PacketsUnWait();
 682
 683         MUTEX_EXIT(&rx_freePktQ_lock);
 684         USERPRI;
 685     }
 686 }
 687 #endif /* RX_ENABLE_TSFPQ */
 688
 689 #ifndef KERNEL
 690 /* Add more packet buffers */
 691 void
 692 rxi_MorePacketsNoLock(int apackets)
 693 {
 694 #ifdef RX_ENABLE_TSFPQ
 695     struct rx_ts_info_t * rx_ts_info;
 696 #endif /* RX_ENABLE_TSFPQ */
 697     struct rx_packet *p, *e;
 698     int getme;
 699
 700     /* allocate enough packets that 1/4 of the packets will be able
 701      * to hold maximal amounts of data */
 702     apackets += (apackets / 4)
 703         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 704     do {
 705         getme = apackets * sizeof(struct rx_packet);
 706         p = (struct rx_packet *)osi_Alloc(getme);
 707         if (p == NULL) {
 708             apackets -= apackets / 4;
 709             osi_Assert(apackets > 0);
 710         }
 711     } while(p == NULL);
 712     memset(p, 0, getme);
 713
 714 #ifdef RX_ENABLE_TSFPQ
 715     RX_TS_INFO_GET(rx_ts_info);
 716     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 717 #endif /* RX_ENABLE_TSFPQ */
 718
 719     for (e = p + apackets; p < e; p++) {
 720         RX_PACKET_IOV_INIT(p);
 721 #ifdef RX_TRACK_PACKETS
 722         p->flags |= RX_PKTFLAG_FREE;
 723 #endif
 724         p->niovecs = 2;
 725
 726         queue_Append(&rx_freePacketQueue, p);
 727 #ifdef RXDEBUG_PACKET
 728         p->packetId = rx_packet_id++;
 729         p->allNextp = rx_mallocedP;
 730 #endif /* RXDEBUG_PACKET */
 731         rx_mallocedP = p;
 732     }
 733
 734     rx_nFreePackets += apackets;
 735     MUTEX_ENTER(&rx_packets_mutex);
 736     rx_nPackets += apackets;
 737 #ifdef RX_ENABLE_TSFPQ
 738     RX_TS_FPQ_COMPUTE_LIMITS;
 739 #endif /* RX_ENABLE_TSFPQ */
 740     MUTEX_EXIT(&rx_packets_mutex);
 741     rxi_NeedMorePackets = FALSE;
 742     rxi_PacketsUnWait();
 743 }
 744 #endif /* !KERNEL */
 745
 746 void
 747 rxi_FreeAllPackets(void)
 748 {
 749     /* must be called at proper interrupt level, etcetera */
 750     /* MTUXXX need to free all Packets */
 751     osi_Free(rx_mallocedP,
 752              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 753     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 754 }
 755
 756 #ifdef RX_ENABLE_TSFPQ
 757 void
 758 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 759 {
 760     struct rx_ts_info_t * rx_ts_info;
 761     int xfer;
 762     SPLVAR;
 763
 764     RX_TS_INFO_GET(rx_ts_info);
 765
 766     if (num_keep_local != rx_ts_info->_FPQ.len) {
 767         NETPRI;
 768         MUTEX_ENTER(&rx_freePktQ_lock);
 769         if (num_keep_local < rx_ts_info->_FPQ.len) {
 770             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 771             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 772             rxi_PacketsUnWait();
 773         } else {
 774             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 775             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 776                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 777             if (rx_nFreePackets < xfer) {
 778                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 779             }
 780             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 781         }
 782         MUTEX_EXIT(&rx_freePktQ_lock);
 783         USERPRI;
 784     }
 785 }
 786
 787 void
 788 rxi_FlushLocalPacketsTSFPQ(void)
 789 {
 790     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 791 }
 792 #endif /* RX_ENABLE_TSFPQ */
 793
 794 /* Allocate more packets iff we need more continuation buffers */
 795 /* In kernel, can't page in memory with interrupts disabled, so we
 796  * don't use the event mechanism. */
 797 void
 798 rx_CheckPackets(void)
 799 {
 800     if (rxi_NeedMorePackets) {
 801         rxi_MorePackets(rx_maxSendWindow);
 802     }
 803 }
 804
 805 /* In the packet freeing routine below, the assumption is that
 806    we want all of the packets to be used equally frequently, so that we
 807    don't get packet buffers paging out.  It would be just as valid to
 808    assume that we DO want them to page out if not many are being used.
 809    In any event, we assume the former, and append the packets to the end
 810    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
 817
 818 /* Actually free the packet p. */
 819 #ifdef RX_ENABLE_TSFPQ
 820 void
 821 rxi_FreePacketNoLock(struct rx_packet *p)
 822 {
 823     struct rx_ts_info_t * rx_ts_info;
 824     dpf(("Free %"AFS_PTR_FMT"\n", p));
 825
 826     RX_TS_INFO_GET(rx_ts_info);
 827     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 828     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 829         RX_TS_FPQ_LTOG(rx_ts_info);
 830     }
 831 }
 832 #else /* RX_ENABLE_TSFPQ */
 833 void
 834 rxi_FreePacketNoLock(struct rx_packet *p)
 835 {
 836     dpf(("Free %"AFS_PTR_FMT"\n", p));
 837
 838     RX_FPQ_MARK_FREE(p);
 839     rx_nFreePackets++;
 840     queue_Append(&rx_freePacketQueue, p);
 841 }
 842 #endif /* RX_ENABLE_TSFPQ */
 843
 844 #ifdef RX_ENABLE_TSFPQ
 845 void
 846 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 847 {
 848     struct rx_ts_info_t * rx_ts_info;
 849     dpf(("Free %"AFS_PTR_FMT"\n", p));
 850
 851     RX_TS_INFO_GET(rx_ts_info);
 852     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 853
 854     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 855         NETPRI;
 856         MUTEX_ENTER(&rx_freePktQ_lock);
 857
 858         RX_TS_FPQ_LTOG(rx_ts_info);
 859
 860         /* Wakeup anyone waiting for packets */
 861         rxi_PacketsUnWait();
 862
 863         MUTEX_EXIT(&rx_freePktQ_lock);
 864         USERPRI;
 865     }
 866 }
 867 #endif /* RX_ENABLE_TSFPQ */
 868
 869 /*
 870  * free continuation buffers off a packet into a queue
 871  *
 872  * [IN] p      -- packet from which continuation buffers will be freed
 873  * [IN] first  -- iovec offset of first continuation buffer to free
 874  * [IN] q      -- queue into which continuation buffers will be chained
 875  *
 876  * returns:
 877  *   number of continuation buffers freed
 878  */
 879 #ifndef RX_ENABLE_TSFPQ
 880 static int
 881 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 882 {
 883     struct iovec *iov;
 884     struct rx_packet * cb;
 885     int count = 0;
 886
 887     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 888         iov = &p->wirevec[first];
 889         if (!iov->iov_base)
 890             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 891         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 892         RX_FPQ_MARK_FREE(cb);
 893         queue_Append(q, cb);
 894     }
 895     p->length = 0;
 896     p->niovecs = 0;
 897
 898     return count;
 899 }
 900 #endif
 901
 902 /*
 903  * free packet continuation buffers into the global free packet pool
 904  *
 905  * [IN] p      -- packet from which to free continuation buffers
 906  * [IN] first  -- iovec offset of first continuation buffer to free
 907  *
 908  * returns:
 909  *   zero always
 910  */
 911 int
 912 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 913 {
 914     struct iovec *iov;
 915
 916     for (first = MAX(2, first); first < p->niovecs; first++) {
 917         iov = &p->wirevec[first];
 918         if (!iov->iov_base)
 919             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 920         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 921     }
 922     p->length = 0;
 923     p->niovecs = 0;
 924
 925     return 0;
 926 }
 927
 928 #ifdef RX_ENABLE_TSFPQ
 929 /*
 930  * free packet continuation buffers into the thread-local free pool
 931  *
 932  * [IN] p             -- packet from which continuation buffers will be freed
 933  * [IN] first         -- iovec offset of first continuation buffer to free
 934  *                       any value less than 2, the min number of iovecs,
 935  *                       is treated as if it is 2.
 936  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 937  *                       global free pool before returning
 938  *
 939  * returns:
 940  *   zero always
 941  */
 942 static int
 943 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 944 {
 945     struct iovec *iov;
 946     struct rx_ts_info_t * rx_ts_info;
 947
 948     RX_TS_INFO_GET(rx_ts_info);
 949
 950     for (first = MAX(2, first); first < p->niovecs; first++) {
 951         iov = &p->wirevec[first];
 952         if (!iov->iov_base)
 953             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 954         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 955     }
 956     p->length = 0;
 957     p->niovecs = 0;
 958
 959     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 960         NETPRI;
 961         MUTEX_ENTER(&rx_freePktQ_lock);
 962
 963         RX_TS_FPQ_LTOG(rx_ts_info);
 964
 965         /* Wakeup anyone waiting for packets */
 966         rxi_PacketsUnWait();
 967
 968         MUTEX_EXIT(&rx_freePktQ_lock);
 969         USERPRI;
 970     }
 971     return 0;
 972 }
 973 #endif /* RX_ENABLE_TSFPQ */
 974
 975 int rxi_nBadIovecs = 0;
 976
 977 /* rxi_RestoreDataBufs
 978  *
 979  * Restore the correct sizes to the iovecs. Called when reusing a packet
 980  * for reading off the wire.
 981  */
 982 void
 983 rxi_RestoreDataBufs(struct rx_packet *p)
 984 {
 985     unsigned int i;
 986     struct iovec *iov = &p->wirevec[2];
 987
 988     RX_PACKET_IOV_INIT(p);
 989
 990     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 991         if (!iov->iov_base) {
 992             rxi_nBadIovecs++;
 993             p->niovecs = i;
 994             break;
 995         }
 996         iov->iov_len = RX_CBUFFERSIZE;
 997     }
 998 }
 999
1000 #ifdef RX_ENABLE_TSFPQ
1001 int
1002 rxi_TrimDataBufs(struct rx_packet *p, int first)
1003 {
1004     int length;
1005     struct iovec *iov, *end;
1006     struct rx_ts_info_t * rx_ts_info;
1007     SPLVAR;
1008
1009     if (first != 1)
1010         osi_Panic("TrimDataBufs 1: first must be 1");
1011
1012     /* Skip over continuation buffers containing message data */
1013     iov = &p->wirevec[2];
1014     end = iov + (p->niovecs - 2);
1015     length = p->length - p->wirevec[1].iov_len;
1016     for (; iov < end && length > 0; iov++) {
1017         if (!iov->iov_base)
1018             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1019         length -= iov->iov_len;
1020     }
1021
1022     /* iov now points to the first empty data buffer. */
1023     if (iov >= end)
1024         return 0;
1025
1026     RX_TS_INFO_GET(rx_ts_info);
1027     for (; iov < end; iov++) {
1028         if (!iov->iov_base)
1029             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1030         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1031         p->niovecs--;
1032     }
1033     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1034         NETPRI;
1035         MUTEX_ENTER(&rx_freePktQ_lock);
1036
1037         RX_TS_FPQ_LTOG(rx_ts_info);
1038         rxi_PacketsUnWait();
1039
1040         MUTEX_EXIT(&rx_freePktQ_lock);
1041         USERPRI;
1042     }
1043
1044     return 0;
1045 }
1046 #else /* RX_ENABLE_TSFPQ */
1047 int
1048 rxi_TrimDataBufs(struct rx_packet *p, int first)
1049 {
1050     int length;
1051     struct iovec *iov, *end;
1052     SPLVAR;
1053
1054     if (first != 1)
1055         osi_Panic("TrimDataBufs 1: first must be 1");
1056
1057     /* Skip over continuation buffers containing message data */
1058     iov = &p->wirevec[2];
1059     end = iov + (p->niovecs - 2);
1060     length = p->length - p->wirevec[1].iov_len;
1061     for (; iov < end && length > 0; iov++) {
1062         if (!iov->iov_base)
1063             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1064         length -= iov->iov_len;
1065     }
1066
1067     /* iov now points to the first empty data buffer. */
1068     if (iov >= end)
1069         return 0;
1070
1071     NETPRI;
1072     MUTEX_ENTER(&rx_freePktQ_lock);
1073
1074     for (; iov < end; iov++) {
1075         if (!iov->iov_base)
1076             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1077         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1078         p->niovecs--;
1079     }
1080     rxi_PacketsUnWait();
1081
1082     MUTEX_EXIT(&rx_freePktQ_lock);
1083     USERPRI;
1084
1085     return 0;
1086 }
1087 #endif /* RX_ENABLE_TSFPQ */
1088
1089 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1090  * remove it yourself first if you call this routine. */
1091 #ifdef RX_ENABLE_TSFPQ
1092 void
1093 rxi_FreePacket(struct rx_packet *p)
1094 {
1095     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1096     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1097 }
1098 #else /* RX_ENABLE_TSFPQ */
/* Global free packet queue variant: release p's continuation buffers
 * (iovec 2 onward) and then p itself, all under rx_freePktQ_lock, and
 * wake any thread waiting for packets before the lock is dropped. */
void
rxi_FreePacket(struct rx_packet *p)
{
    SPLVAR;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    /* Continuation buffers first, then the packet that owned them. */
    rxi_FreeDataBufsNoLock(p, 2);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
1115 #endif /* RX_ENABLE_TSFPQ */
1116
1117 /* rxi_AllocPacket sets up p->length so it reflects the number of
1118  * bytes in the packet at this point, **not including** the header.
1119  * The header is absolutely necessary, besides, this is the way the
1120  * length field is usually used */
1121 #ifdef RX_ENABLE_TSFPQ
1122 struct rx_packet *
1123 rxi_AllocPacketNoLock(int class)
1124 {
1125     struct rx_packet *p;
1126     struct rx_ts_info_t * rx_ts_info;
1127
1128     RX_TS_INFO_GET(rx_ts_info);
1129
1130 #ifdef KERNEL
1131     if (rxi_OverQuota(class)) {
1132         rxi_NeedMorePackets = TRUE;
1133         if (rx_stats_active) {
1134             switch (class) {
1135             case RX_PACKET_CLASS_RECEIVE:
1136                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1137                 break;
1138             case RX_PACKET_CLASS_SEND:
1139                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1140                 break;
1141             case RX_PACKET_CLASS_SPECIAL:
1142                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1143                 break;
1144             case RX_PACKET_CLASS_RECV_CBUF:
1145                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1146                 break;
1147             case RX_PACKET_CLASS_SEND_CBUF:
1148                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1149                 break;
1150             }
1151         }
1152         return (struct rx_packet *)0;
1153     }
1154 #endif /* KERNEL */
1155
1156     if (rx_stats_active)
1157         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1158     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1159
1160 #ifdef KERNEL
1161         if (queue_IsEmpty(&rx_freePacketQueue))
1162             osi_Panic("rxi_AllocPacket error");
1163 #else /* KERNEL */
1164         if (queue_IsEmpty(&rx_freePacketQueue))
1165             rxi_MorePacketsNoLock(rx_maxSendWindow);
1166 #endif /* KERNEL */
1167
1168
1169         RX_TS_FPQ_GTOL(rx_ts_info);
1170     }
1171
1172     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1173
1174     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1175
1176
1177     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1178      * order to truncate outbound packets.  In the near future, may need
1179      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1180      */
1181     RX_PACKET_IOV_FULLINIT(p);
1182     return p;
1183 }
1184 #else /* RX_ENABLE_TSFPQ */
1185 struct rx_packet *
1186 rxi_AllocPacketNoLock(int class)
1187 {
1188     struct rx_packet *p;
1189
1190 #ifdef KERNEL
1191     if (rxi_OverQuota(class)) {
1192         rxi_NeedMorePackets = TRUE;
1193         if (rx_stats_active) {
1194             switch (class) {
1195             case RX_PACKET_CLASS_RECEIVE:
1196                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1197                 break;
1198             case RX_PACKET_CLASS_SEND:
1199                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1200                 break;
1201             case RX_PACKET_CLASS_SPECIAL:
1202                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1203                 break;
1204             case RX_PACKET_CLASS_RECV_CBUF:
1205                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1206                 break;
1207             case RX_PACKET_CLASS_SEND_CBUF:
1208                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1209                 break;
1210             }
1211         }
1212         return (struct rx_packet *)0;
1213     }
1214 #endif /* KERNEL */
1215
1216     if (rx_stats_active)
1217         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1218
1219 #ifdef KERNEL
1220     if (queue_IsEmpty(&rx_freePacketQueue))
1221         osi_Panic("rxi_AllocPacket error");
1222 #else /* KERNEL */
1223     if (queue_IsEmpty(&rx_freePacketQueue))
1224         rxi_MorePacketsNoLock(rx_maxSendWindow);
1225 #endif /* KERNEL */
1226
1227     rx_nFreePackets--;
1228     p = queue_First(&rx_freePacketQueue, rx_packet);
1229     queue_Remove(p);
1230     RX_FPQ_MARK_USED(p);
1231
1232     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1233
1234
1235     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1236      * order to truncate outbound packets.  In the near future, may need
1237      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1238      */
1239     RX_PACKET_IOV_FULLINIT(p);
1240     return p;
1241 }
1242 #endif /* RX_ENABLE_TSFPQ */
1243
1244 #ifdef RX_ENABLE_TSFPQ
1245 struct rx_packet *
1246 rxi_AllocPacketTSFPQ(int class, int pull_global)
1247 {
1248     struct rx_packet *p;
1249     struct rx_ts_info_t * rx_ts_info;
1250
1251     RX_TS_INFO_GET(rx_ts_info);
1252
1253     if (rx_stats_active)
1254         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1255     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1256         MUTEX_ENTER(&rx_freePktQ_lock);
1257
1258         if (queue_IsEmpty(&rx_freePacketQueue))
1259             rxi_MorePacketsNoLock(rx_maxSendWindow);
1260
1261         RX_TS_FPQ_GTOL(rx_ts_info);
1262
1263         MUTEX_EXIT(&rx_freePktQ_lock);
1264     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1265         return NULL;
1266     }
1267
1268     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1269
1270     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1271
1272     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1273      * order to truncate outbound packets.  In the near future, may need
1274      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1275      */
1276     RX_PACKET_IOV_FULLINIT(p);
1277     return p;
1278 }
1279 #endif /* RX_ENABLE_TSFPQ */
1280
1281 #ifdef RX_ENABLE_TSFPQ
1282 struct rx_packet *
1283 rxi_AllocPacket(int class)
1284 {
1285     struct rx_packet *p;
1286
1287     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1288     return p;
1289 }
1290 #else /* RX_ENABLE_TSFPQ */
1291 struct rx_packet *
1292 rxi_AllocPacket(int class)
1293 {
1294     struct rx_packet *p;
1295
1296     MUTEX_ENTER(&rx_freePktQ_lock);
1297     p = rxi_AllocPacketNoLock(class);
1298     MUTEX_EXIT(&rx_freePktQ_lock);
1299     return p;
1300 }
1301 #endif /* RX_ENABLE_TSFPQ */
1302
1303 /* This guy comes up with as many buffers as it {takes,can get} given
1304  * the MTU for this call. It also sets the packet length before
1305  * returning.  caution: this is often called at NETPRI
1306  * Called with call locked.
1307  */
1308 struct rx_packet *
1309 rxi_AllocSendPacket(struct rx_call *call, int want)
1310 {
1311     struct rx_packet *p = (struct rx_packet *)0;
1312     int mud;
1313     unsigned delta;
1314
1315     SPLVAR;
1316     mud = call->MTU - RX_HEADER_SIZE;
1317     delta =
1318         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1319         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1320
1321 #ifdef RX_ENABLE_TSFPQ
1322     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1323         want += delta;
1324         want = MIN(want, mud);
1325
1326         if ((unsigned)want > p->length)
1327             (void)rxi_AllocDataBuf(p, (want - p->length),
1328                                    RX_PACKET_CLASS_SEND_CBUF);
1329
1330         if (p->length > mud)
1331             p->length = mud;
1332
1333         if (delta >= p->length) {
1334             rxi_FreePacket(p);
1335             p = NULL;
1336         } else {
1337             p->length -= delta;
1338         }
1339         return p;
1340     }
1341 #endif /* RX_ENABLE_TSFPQ */
1342
1343     while (!(call->error)) {
1344         MUTEX_ENTER(&rx_freePktQ_lock);
1345         /* if an error occurred, or we get the packet we want, we're done */
1346         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1347             MUTEX_EXIT(&rx_freePktQ_lock);
1348
1349             want += delta;
1350             want = MIN(want, mud);
1351
1352             if ((unsigned)want > p->length)
1353                 (void)rxi_AllocDataBuf(p, (want - p->length),
1354                                        RX_PACKET_CLASS_SEND_CBUF);
1355
1356             if (p->length > mud)
1357                 p->length = mud;
1358
1359             if (delta >= p->length) {
1360                 rxi_FreePacket(p);
1361                 p = NULL;
1362             } else {
1363                 p->length -= delta;
1364             }
1365             break;
1366         }
1367
1368         /* no error occurred, and we didn't get a packet, so we sleep.
1369          * At this point, we assume that packets will be returned
1370          * sooner or later, as packets are acknowledged, and so we
1371          * just wait.  */
1372         NETPRI;
1373         call->flags |= RX_CALL_WAIT_PACKETS;
1374         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1375         MUTEX_EXIT(&call->lock);
1376         rx_waitingForPackets = 1;
1377
1378 #ifdef  RX_ENABLE_LOCKS
1379         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1380 #else
1381         osi_rxSleep(&rx_waitingForPackets);
1382 #endif
1383         MUTEX_EXIT(&rx_freePktQ_lock);
1384         MUTEX_ENTER(&call->lock);
1385         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1386         call->flags &= ~RX_CALL_WAIT_PACKETS;
1387         USERPRI;
1388     }
1389
1390     return p;
1391 }
1392
1393 #ifndef KERNEL
1394 #ifdef AFS_NT40_ENV
1395 /* Windows does not use file descriptors. */
1396 #define CountFDs(amax) 0
1397 #else
/* Count the used file descriptors in the range 0 .. amax-1.  A descriptor
 * counts as used when fstat() on it succeeds.  Returns 0 for amax <= 0. */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd = 0;
    int nopen = 0;

    while (fd < amax) {
        if (fstat(fd, &sb) == 0)
            nopen++;
        fd++;
    }
    return nopen;
}
1414 #endif /* AFS_NT40_ENV */
1415 #else /* KERNEL */
1416
1417 #define CountFDs(amax) amax
1418
1419 #endif /* KERNEL */
1420
1421 #if !defined(KERNEL) || defined(UKERNEL)
1422
1423 /* This function reads a single packet from the interface into the
1424  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1425  * (host,port) of the sender are stored in the supplied variables, and
1426  * the data length of the packet is stored in the packet structure.
1427  * The header is decoded. */
1428 int
1429 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1430                u_short * port)
1431 {
1432     struct sockaddr_in from;
1433     unsigned int nbytes;
1434     afs_int32 rlen;
1435     afs_uint32 tlen, savelen;
1436     struct msghdr msg;
1437     rx_computelen(p, tlen);
1438     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1439
1440     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1441     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1442                                  * it once in order to avoid races.  */
1443     tlen = rlen - tlen;
1444     if (tlen > 0) {
1445         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1446         if (tlen > 0) {
1447             tlen = rlen - tlen;
1448         } else
1449             tlen = rlen;
1450     } else
1451         tlen = rlen;
1452
1453     /* Extend the last iovec for padding, it's just to make sure that the
1454      * read doesn't return more data than we expect, and is done to get around
1455      * our problems caused by the lack of a length field in the rx header.
1456      * Use the extra buffer that follows the localdata in each packet
1457      * structure. */
1458     savelen = p->wirevec[p->niovecs - 1].iov_len;
1459     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1460
1461     memset(&msg, 0, sizeof(msg));
1462     msg.msg_name = (char *)&from;
1463     msg.msg_namelen = sizeof(struct sockaddr_in);
1464     msg.msg_iov = p->wirevec;
1465     msg.msg_iovlen = p->niovecs;
1466     nbytes = rxi_Recvmsg(socket, &msg, 0);
1467
1468     /* restore the vec to its correct state */
1469     p->wirevec[p->niovecs - 1].iov_len = savelen;
1470
1471     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1472     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1473         if (nbytes < 0 && errno == EWOULDBLOCK) {
1474             if (rx_stats_active)
1475                 rx_MutexIncrement(rx_stats.noPacketOnRead, rx_stats_mutex);
1476         } else if (nbytes <= 0) {
1477             if (rx_stats_active) {
1478                 MUTEX_ENTER(&rx_stats_mutex);
1479                 rx_stats.bogusPacketOnRead++;
1480                 rx_stats.bogusHost = from.sin_addr.s_addr;
1481                 MUTEX_EXIT(&rx_stats_mutex);
1482             }
1483             dpf(("B: bogus packet from [%x,%d] nb=%d", ntohl(from.sin_addr.s_addr),
1484                  ntohs(from.sin_port), nbytes));
1485         }
1486         return 0;
1487     }
1488 #ifdef RXDEBUG
1489     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1490                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1491         rxi_DecodePacketHeader(p);
1492
1493         *host = from.sin_addr.s_addr;
1494         *port = from.sin_port;
1495
1496         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d",
1497               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1498               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1499               p->length));
1500 #ifdef RX_TRIMDATABUFS
1501         rxi_TrimDataBufs(p, 1);
1502 #endif
1503         return 0;
1504     }
1505 #endif
1506     else {
1507         /* Extract packet header. */
1508         rxi_DecodePacketHeader(p);
1509
1510         *host = from.sin_addr.s_addr;
1511         *port = from.sin_port;
1512         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1513             if (rx_stats_active) {
1514                 struct rx_peer *peer;
1515                 rx_MutexIncrement(rx_stats.packetsRead[p->header.type - 1], rx_stats_mutex);
1516                 /*
1517                  * Try to look up this peer structure.  If it doesn't exist,
1518                  * don't create a new one -
1519                  * we don't keep count of the bytes sent/received if a peer
1520                  * structure doesn't already exist.
1521                  *
1522                  * The peer/connection cleanup code assumes that there is 1 peer
1523                  * per connection.  If we actually created a peer structure here
1524                  * and this packet was an rxdebug packet, the peer structure would
1525                  * never be cleaned up.
1526                  */
1527                 peer = rxi_FindPeer(*host, *port, 0, 0);
1528                 /* Since this may not be associated with a connection,
1529                  * it may have no refCount, meaning we could race with
1530                  * ReapConnections
1531                  */
1532                 if (peer && (peer->refCount > 0)) {
1533                     MUTEX_ENTER(&peer->peer_lock);
1534                     hadd32(peer->bytesReceived, p->length);
1535                     MUTEX_EXIT(&peer->peer_lock);
1536                 }
1537             }
1538         }
1539
1540 #ifdef RX_TRIMDATABUFS
1541         /* Free any empty packet buffers at the end of this packet */
1542         rxi_TrimDataBufs(p, 1);
1543 #endif
1544         return 1;
1545     }
1546 }
1547
1548 #endif /* !KERNEL || UKERNEL */
1549
1550 /* This function splits off the first packet in a jumbo packet.
1551  * As of AFS 3.5, jumbograms contain more than one fixed size
1552  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1553  * last packet header. All packets (except the last) are padded to
1554  * fall on RX_CBUFFERSIZE boundaries.
1555  * HACK: We store the length of the first n-1 packets in the
1556  * last two pad bytes. */
1557
1558 struct rx_packet *
1559 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1560                      int first)
1561 {
1562     struct rx_packet *np;
1563     struct rx_jumboHeader *jp;
1564     int niov, i;
1565     struct iovec *iov;
1566     int length;
1567     afs_uint32 temp;
1568
1569     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1570      * bytes in length. All but the first packet are preceded by
1571      * an abbreviated four byte header. The length of the last packet
1572      * is calculated from the size of the jumbogram. */
1573     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1574
1575     if ((int)p->length < length) {
1576         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1577         return NULL;
1578     }
1579     niov = p->niovecs - 2;
1580     if (niov < 1) {
1581         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1582         return NULL;
1583     }
1584     iov = &p->wirevec[2];
1585     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1586
1587     /* Get a pointer to the abbreviated packet header */
1588     jp = (struct rx_jumboHeader *)
1589         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1590
1591     /* Set up the iovecs for the next packet */
1592     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1593     np->wirevec[0].iov_len = sizeof(struct rx_header);
1594     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1595     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1596     np->niovecs = niov + 1;
1597     for (i = 2, iov++; i <= niov; i++, iov++) {
1598         np->wirevec[i] = *iov;
1599     }
1600     np->length = p->length - length;
1601     p->length = RX_JUMBOBUFFERSIZE;
1602     p->niovecs = 2;
1603
1604     /* Convert the jumbo packet header to host byte order */
1605     temp = ntohl(*(afs_uint32 *) jp);
1606     jp->flags = (u_char) (temp >> 24);
1607     jp->cksum = (u_short) (temp);
1608
1609     /* Fill in the packet header */
1610     np->header = p->header;
1611     np->header.serial = p->header.serial + 1;
1612     np->header.seq = p->header.seq + 1;
1613     np->header.flags = jp->flags;
1614     np->header.spare = jp->cksum;
1615
1616     return np;
1617 }
1618
1619 #ifndef KERNEL
1620 /* Send a udp datagram */
1621 int
1622 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1623             int length, int istack)
1624 {
1625     struct msghdr msg;
1626         int ret;
1627
1628     memset(&msg, 0, sizeof(msg));
1629     msg.msg_iov = dvec;
1630     msg.msg_iovlen = nvecs;
1631     msg.msg_name = addr;
1632     msg.msg_namelen = sizeof(struct sockaddr_in);
1633
1634     ret = rxi_Sendmsg(socket, &msg, 0);
1635
1636     return ret;
1637 }
1638 #elif !defined(UKERNEL)
1639 /*
1640  * message receipt is done in rxk_input or rx_put.
1641  */
1642
1643 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1644 /*
1645  * Copy an mblock to the contiguous area pointed to by cp.
1646  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1647  * but it doesn't really.
1648  * Returns the number of bytes not transferred.
1649  * The message is NOT changed.
1650  */
1651 static int
1652 cpytoc(mblk_t * mp, int off, int len, char *cp)
1653 {
1654     int n;
1655
1656     for (; mp && len > 0; mp = mp->b_cont) {
1657         if (mp->b_datap->db_type != M_DATA) {
1658             return -1;
1659         }
1660         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1661         memcpy(cp, (char *)mp->b_rptr, n);
1662         cp += n;
1663         len -= n;
1664         mp->b_rptr += n;
1665     }
1666     return (len);
1667 }
1668
1669 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1670  * but it doesn't really.
1671  * This sucks, anyway, do it like m_cpy.... below
1672  */
1673 static int
1674 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1675            int niovs)
1676 {
1677     int m, n, o, t, i;
1678
1679     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1680         if (mp->b_datap->db_type != M_DATA) {
1681             return -1;
1682         }
1683         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1684         len -= n;
1685         while (n) {
1686             if (!t) {
1687                 o = 0;
1688                 i++;
1689                 t = iovs[i].iov_len;
1690             }
1691             m = MIN(n, t);
1692             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1693             mp->b_rptr += m;
1694             o += m;
1695             t -= m;
1696             n -= m;
1697         }
1698     }
1699     return (len);
1700 }
1701
1702 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1703 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1704 #else
1705 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1706 static int
1707 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1708 {
1709     caddr_t p1, p2;
1710     unsigned int l1, l2, i, t;
1711
1712     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1713         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1714
1715     while (off && m)
1716         if (m->m_len <= off) {
1717             off -= m->m_len;
1718             m = m->m_next;
1719             continue;
1720         } else
1721             break;
1722
1723     if (m == NULL)
1724         return len;
1725
1726     p1 = mtod(m, caddr_t) + off;
1727     l1 = m->m_len - off;
1728     i = 0;
1729     p2 = iovs[0].iov_base;
1730     l2 = iovs[0].iov_len;
1731
1732     while (len) {
1733         t = MIN(l1, MIN(l2, (unsigned int)len));
1734         memcpy(p2, p1, t);
1735         p1 += t;
1736         p2 += t;
1737         l1 -= t;
1738         l2 -= t;
1739         len -= t;
1740         if (!l1) {
1741             m = m->m_next;
1742             if (!m)
1743                 break;
1744             p1 = mtod(m, caddr_t);
1745             l1 = m->m_len;
1746         }
1747         if (!l2) {
1748             if (++i >= niovs)
1749                 break;
1750             p2 = iovs[i].iov_base;
1751             l2 = iovs[i].iov_len;
1752         }
1753
1754     }
1755
1756     return len;
1757 }
1758 #endif /* LINUX */
1759 #endif /* AFS_SUN5_ENV */
1760
1761 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
/* Copy the payload of a received kernel message buffer into the iovecs
 * of an rx packet, then dispose of the buffer.
 *
 * amb      - the message buffer chain (an mblk_t on Solaris/HP-UX, an
 *            mbuf elsewhere).
 * free     - routine called on amb once the copy has been attempted; it
 *            is called whether or not the copy succeeded.
 * hdr_len  - passed to m_cpytoiovec as the offset to skip (the STREAMS
 *            implementation in this file ignores it).
 * data_len - passed to m_cpytoiovec as the number of bytes to copy.
 * phandle  - destination packet; its wirevec/niovecs receive the data.
 *
 * Returns the result of m_cpytoiovec: the number of bytes that could not
 * be transferred, or -1 from the STREAMS implementation when a block that
 * is not M_DATA is found.
 */
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
{
    int code;

    code =
        m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
                     phandle->niovecs);
    /* The buffer is released unconditionally, even after a failed copy. */
    (*free) (amb);

    return code;
}
1782 #endif /* LINUX */
1783 #endif /*KERNEL && !UKERNEL */
1784
1785
1786 /* send a response to a debug packet */
1787
1788 struct rx_packet *
1789 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1790                        afs_uint32 ahost, short aport, int istack)
1791 {
1792     struct rx_debugIn tin;
1793     afs_int32 tl;
1794     struct rx_serverQueueEntry *np, *nqe;
1795
1796     /*
1797      * Only respond to client-initiated Rx debug packets,
1798      * and clear the client flag in the response.
1799      */
1800     if (ap->header.flags & RX_CLIENT_INITIATED) {
1801         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1802         rxi_EncodePacketHeader(ap);
1803     } else {
1804         return ap;
1805     }
1806
1807     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1808     /* all done with packet, now set length to the truth, so we can
1809      * reuse this packet */
1810     rx_computelen(ap, ap->length);
1811
1812     tin.type = ntohl(tin.type);
1813     tin.index = ntohl(tin.index);
1814     switch (tin.type) {
1815     case RX_DEBUGI_GETSTATS:{
1816             struct rx_debugStats tstat;
1817
1818             /* get basic stats */
1819             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1820             tstat.version = RX_DEBUGI_VERSION;
1821 #ifndef RX_ENABLE_LOCKS
1822             tstat.waitingForPackets = rx_waitingForPackets;
1823 #endif
1824             MUTEX_ENTER(&rx_serverPool_lock);
1825             tstat.nFreePackets = htonl(rx_nFreePackets);
1826             tstat.nPackets = htonl(rx_nPackets);
1827             tstat.callsExecuted = htonl(rxi_nCalls);
1828             tstat.packetReclaims = htonl(rx_packetReclaims);
1829             tstat.usedFDs = CountFDs(64);
1830             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1831             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1832             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1833                         tstat.idleThreads);
1834             MUTEX_EXIT(&rx_serverPool_lock);
1835             tstat.idleThreads = htonl(tstat.idleThreads);
1836             tl = sizeof(struct rx_debugStats) - ap->length;
1837             if (tl > 0)
1838                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1839
1840             if (tl <= 0) {
1841                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1842                                (char *)&tstat);
1843                 ap->length = sizeof(struct rx_debugStats);
1844                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1845                 rx_computelen(ap, ap->length);
1846             }
1847             break;
1848         }
1849
1850     case RX_DEBUGI_GETALLCONN:
1851     case RX_DEBUGI_GETCONN:{
1852             unsigned int i, j;
1853             struct rx_connection *tc;
1854             struct rx_call *tcall;
1855             struct rx_debugConn tconn;
1856             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1857
1858
1859             tl = sizeof(struct rx_debugConn) - ap->length;
1860             if (tl > 0)
1861                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1862             if (tl > 0)
1863                 return ap;
1864
1865             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1866             /* get N'th (maybe) "interesting" connection info */
1867             for (i = 0; i < rx_hashTableSize; i++) {
1868 #if !defined(KERNEL)
1869                 /* the time complexity of the algorithm used here
1870                  * exponentially increses with the number of connections.
1871                  */
1872 #ifdef AFS_PTHREAD_ENV
1873                 pthread_yield();
1874 #else
1875                 (void)IOMGR_Poll();
1876 #endif
1877 #endif
1878                 MUTEX_ENTER(&rx_connHashTable_lock);
1879                 /* We might be slightly out of step since we are not
1880                  * locking each call, but this is only debugging output.
1881                  */
1882                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1883                     if ((all || rxi_IsConnInteresting(tc))
1884                         && tin.index-- <= 0) {
1885                         tconn.host = tc->peer->host;
1886                         tconn.port = tc->peer->port;
1887                         tconn.cid = htonl(tc->cid);
1888                         tconn.epoch = htonl(tc->epoch);
1889                         tconn.serial = htonl(tc->serial);
1890                         for (j = 0; j < RX_MAXCALLS; j++) {
1891                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1892                             if ((tcall = tc->call[j])) {
1893                                 tconn.callState[j] = tcall->state;
1894                                 tconn.callMode[j] = tcall->mode;
1895                                 tconn.callFlags[j] = tcall->flags;
1896                                 if (queue_IsNotEmpty(&tcall->rq))
1897                                     tconn.callOther[j] |= RX_OTHER_IN;
1898                                 if (queue_IsNotEmpty(&tcall->tq))
1899                                     tconn.callOther[j] |= RX_OTHER_OUT;
1900                             } else
1901                                 tconn.callState[j] = RX_STATE_NOTINIT;
1902                         }
1903
1904                         tconn.natMTU = htonl(tc->peer->natMTU);
1905                         tconn.error = htonl(tc->error);
1906                         tconn.flags = tc->flags;
1907                         tconn.type = tc->type;
1908                         tconn.securityIndex = tc->securityIndex;
1909                         if (tc->securityObject) {
1910                             RXS_GetStats(tc->securityObject, tc,
1911                                          &tconn.secStats);
1912 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1913 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1914                             DOHTONL(flags);
1915                             DOHTONL(expires);
1916                             DOHTONL(packetsReceived);
1917                             DOHTONL(packetsSent);
1918                             DOHTONL(bytesReceived);
1919                             DOHTONL(bytesSent);
1920                             for (i = 0;
1921                                  i <
1922                                  sizeof(tconn.secStats.spares) /
1923                                  sizeof(short); i++)
1924                                 DOHTONS(spares[i]);
1925                             for (i = 0;
1926                                  i <
1927                                  sizeof(tconn.secStats.sparel) /
1928                                  sizeof(afs_int32); i++)
1929                                 DOHTONL(sparel[i]);
1930                         }
1931
1932                         MUTEX_EXIT(&rx_connHashTable_lock);
1933                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1934                                        (char *)&tconn);
1935                         tl = ap->length;
1936                         ap->length = sizeof(struct rx_debugConn);
1937                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1938                                             istack);
1939                         ap->length = tl;
1940                         return ap;
1941                     }
1942                 }
1943                 MUTEX_EXIT(&rx_connHashTable_lock);
1944             }
1945             /* if we make it here, there are no interesting packets */
1946             tconn.cid = htonl(0xffffffff);      /* means end */
1947             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1948                            (char *)&tconn);
1949             tl = ap->length;
1950             ap->length = sizeof(struct rx_debugConn);
1951             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1952             ap->length = tl;
1953             break;
1954         }
1955
1956         /*
1957          * Pass back all the peer structures we have available
1958          */
1959
1960     case RX_DEBUGI_GETPEER:{
1961             unsigned int i;
1962             struct rx_peer *tp;
1963             struct rx_debugPeer tpeer;
1964
1965
1966             tl = sizeof(struct rx_debugPeer) - ap->length;
1967             if (tl > 0)
1968                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1969             if (tl > 0)
1970                 return ap;
1971
1972             memset(&tpeer, 0, sizeof(tpeer));
1973             for (i = 0; i < rx_hashTableSize; i++) {
1974 #if !defined(KERNEL)
1975                 /* the time complexity of the algorithm used here
1976                  * exponentially increases with the number of peers.
1977                  *
1978                  * Yielding after processing each hash table entry
1979                  * and dropping rx_peerHashTable_lock.
1980                  * also increases the risk that we will miss a new
1981                  * entry - but we are willing to live with this
1982                  * limitation since this is meant for debugging only
1983                  */
1984 #ifdef AFS_PTHREAD_ENV
1985                 pthread_yield();
1986 #else
1987                 (void)IOMGR_Poll();
1988 #endif
1989 #endif
1990                 MUTEX_ENTER(&rx_peerHashTable_lock);
1991                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1992                     if (tin.index-- <= 0) {
1993                         tp->refCount++;
1994                         MUTEX_EXIT(&rx_peerHashTable_lock);
1995
1996                         MUTEX_ENTER(&tp->peer_lock);
1997                         tpeer.host = tp->host;
1998                         tpeer.port = tp->port;
1999                         tpeer.ifMTU = htons(tp->ifMTU);
2000                         tpeer.idleWhen = htonl(tp->idleWhen);
2001                         tpeer.refCount = htons(tp->refCount);
2002                         tpeer.burstSize = tp->burstSize;
2003                         tpeer.burst = tp->burst;
2004                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
2005                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
2006                         tpeer.rtt = htonl(tp->rtt);
2007                         tpeer.rtt_dev = htonl(tp->rtt_dev);
2008                         tpeer.timeout.sec = htonl(tp->timeout.sec);
2009                         tpeer.timeout.usec = htonl(tp->timeout.usec);
2010                         tpeer.nSent = htonl(tp->nSent);
2011                         tpeer.reSends = htonl(tp->reSends);
2012                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
2013                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
2014                         tpeer.rateFlag = htonl(tp->rateFlag);
2015                         tpeer.natMTU = htons(tp->natMTU);
2016                         tpeer.maxMTU = htons(tp->maxMTU);
2017                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2018                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2019                         tpeer.MTU = htons(tp->MTU);
2020                         tpeer.cwind = htons(tp->cwind);
2021                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2022                         tpeer.congestSeq = htons(tp->congestSeq);
2023                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2024                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2025                         tpeer.bytesReceived.high =
2026                             htonl(tp->bytesReceived.high);
2027                         tpeer.bytesReceived.low =
2028                             htonl(tp->bytesReceived.low);
2029                         MUTEX_EXIT(&tp->peer_lock);
2030
2031                         MUTEX_ENTER(&rx_peerHashTable_lock);
2032                         tp->refCount--;
2033                         MUTEX_EXIT(&rx_peerHashTable_lock);
2034
2035                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2036                                        (char *)&tpeer);
2037                         tl = ap->length;
2038                         ap->length = sizeof(struct rx_debugPeer);
2039                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2040                                             istack);
2041                         ap->length = tl;
2042                         return ap;
2043                     }
2044                 }
2045                 MUTEX_EXIT(&rx_peerHashTable_lock);
2046             }
2047             /* if we make it here, there are no interesting packets */
2048             tpeer.host = htonl(0xffffffff);     /* means end */
2049             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2050                            (char *)&tpeer);
2051             tl = ap->length;
2052             ap->length = sizeof(struct rx_debugPeer);
2053             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2054             ap->length = tl;
2055             break;
2056         }
2057
2058     case RX_DEBUGI_RXSTATS:{
2059             int i;
2060             afs_int32 *s;
2061
2062             tl = sizeof(rx_stats) - ap->length;
2063             if (tl > 0)
2064                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2065             if (tl > 0)
2066                 return ap;
2067
2068             /* Since it's all int32s, convert to network order with a loop. */
2069         if (rx_stats_active)
2070             MUTEX_ENTER(&rx_stats_mutex);
2071             s = (afs_int32 *) & rx_stats;
2072             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2073                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2074
2075             tl = ap->length;
2076             ap->length = sizeof(rx_stats);
2077         if (rx_stats_active)
2078             MUTEX_EXIT(&rx_stats_mutex);
2079             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2080             ap->length = tl;
2081             break;
2082         }
2083
2084     default:
2085         /* error response packet */
2086         tin.type = htonl(RX_DEBUGI_BADTYPE);
2087         tin.index = tin.type;
2088         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2089         tl = ap->length;
2090         ap->length = sizeof(struct rx_debugIn);
2091         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2092         ap->length = tl;
2093         break;
2094     }
2095     return ap;
2096 }
2097
2098 struct rx_packet *
2099 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2100                          afs_uint32 ahost, short aport, int istack)
2101 {
2102     afs_int32 tl;
2103
2104     /*
2105      * Only respond to client-initiated version requests, and
2106      * clear that flag in the response.
2107      */
2108     if (ap->header.flags & RX_CLIENT_INITIATED) {
2109         char buf[66];
2110
2111         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2112         rxi_EncodePacketHeader(ap);
2113         memset(buf, 0, sizeof(buf));
2114         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2115         rx_packetwrite(ap, 0, 65, buf);
2116         tl = ap->length;
2117         ap->length = 65;
2118         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2119         ap->length = tl;
2120     }
2121
2122     return ap;
2123 }
2124
2125
2126 /* send a debug packet back to the sender */
2127 static void
2128 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2129                     afs_uint32 ahost, short aport, afs_int32 istack)
2130 {
2131     struct sockaddr_in taddr;
2132     unsigned int i, nbytes, savelen = 0;
2133     int saven = 0;
2134 #ifdef KERNEL
2135     int waslocked = ISAFS_GLOCK();
2136 #endif
2137
2138     taddr.sin_family = AF_INET;
2139     taddr.sin_port = aport;
2140     taddr.sin_addr.s_addr = ahost;
2141 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2142     taddr.sin_len = sizeof(struct sockaddr_in);
2143 #endif
2144
2145     /* We need to trim the niovecs. */
2146     nbytes = apacket->length;
2147     for (i = 1; i < apacket->niovecs; i++) {
2148         if (nbytes <= apacket->wirevec[i].iov_len) {
2149             savelen = apacket->wirevec[i].iov_len;
2150             saven = apacket->niovecs;
2151             apacket->wirevec[i].iov_len = nbytes;
2152             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2153         } else
2154             nbytes -= apacket->wirevec[i].iov_len;
2155     }
2156 #ifdef KERNEL
2157 #ifdef RX_KERNEL_TRACE
2158     if (ICL_SETACTIVE(afs_iclSetp)) {
2159         if (!waslocked)
2160             AFS_GLOCK();
2161         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2162                    "before osi_NetSend()");
2163         AFS_GUNLOCK();
2164     } else
2165 #else
2166     if (waslocked)
2167         AFS_GUNLOCK();
2168 #endif
2169 #endif
2170     /* debug packets are not reliably delivered, hence the cast below. */
2171     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2172                       apacket->length + RX_HEADER_SIZE, istack);
2173 #ifdef KERNEL
2174 #ifdef RX_KERNEL_TRACE
2175     if (ICL_SETACTIVE(afs_iclSetp)) {
2176         AFS_GLOCK();
2177         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2178                    "after osi_NetSend()");
2179         if (!waslocked)
2180             AFS_GUNLOCK();
2181     } else
2182 #else
2183     if (waslocked)
2184         AFS_GLOCK();
2185 #endif
2186 #endif
2187     if (saven) {                /* means we truncated the packet above. */
2188         apacket->wirevec[i - 1].iov_len = savelen;
2189         apacket->niovecs = saven;
2190     }
2191
2192 }
2193
2194 /* Send the packet to appropriate destination for the specified
2195  * call.  The header is first encoded and placed in the packet.
2196  */
2197 void
2198 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2199                struct rx_packet *p, int istack)
2200 {
2201 #if defined(KERNEL)
2202     int waslocked;
2203 #endif
2204     int code;
2205     struct sockaddr_in addr;
2206     struct rx_peer *peer = conn->peer;
2207     osi_socket socket;
2208 #ifdef RXDEBUG
2209     char deliveryType = 'S';
2210 #endif
2211     /* The address we're sending the packet to */
2212     memset(&addr, 0, sizeof(addr));
2213     addr.sin_family = AF_INET;
2214     addr.sin_port = peer->port;
2215     addr.sin_addr.s_addr = peer->host;
2216
2217     /* This stuff should be revamped, I think, so that most, if not
2218      * all, of the header stuff is always added here.  We could
2219      * probably do away with the encode/decode routines. XXXXX */
2220
2221     /* Stamp each packet with a unique serial number.  The serial
2222      * number is maintained on a connection basis because some types
2223      * of security may be based on the serial number of the packet,
2224      * and security is handled on a per authenticated-connection
2225      * basis. */
2226     /* Pre-increment, to guarantee no zero serial number; a zero
2227      * serial number means the packet was never sent. */
2228     MUTEX_ENTER(&conn->conn_data_lock);
2229     p->header.serial = ++conn->serial;
2230     if (p->length > conn->peer->maxPacketSize) {
2231         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2232             (p->header.flags & RX_REQUEST_ACK)) {
2233             conn->lastPingSize = p->length;
2234             conn->lastPingSizeSer = p->header.serial;
2235         } else if (p->header.seq != 0) {
2236             conn->lastPacketSize = p->length;
2237             conn->lastPacketSizeSeq = p->header.seq;
2238         }
2239     }
2240     MUTEX_EXIT(&conn->conn_data_lock);
2241     /* This is so we can adjust retransmit time-outs better in the face of
2242      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2243      */
2244     if (p->firstSerial == 0) {
2245         p->firstSerial = p->header.serial;
2246     }
2247 #ifdef RXDEBUG
2248     /* If an output tracer function is defined, call it with the packet and
2249      * network address.  Note this function may modify its arguments. */
2250     if (rx_almostSent) {
2251         int drop = (*rx_almostSent) (p, &addr);
2252         /* drop packet if return value is non-zero? */
2253         if (drop)
2254             deliveryType = 'D'; /* Drop the packet */
2255     }
2256 #endif
2257
2258     /* Get network byte order header */
2259     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2260                                  * touch ALL the fields */
2261
2262     /* Send the packet out on the same socket that related packets are being
2263      * received on */
2264     socket =
2265         (conn->type ==
2266          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2267
2268 #ifdef RXDEBUG
2269     /* Possibly drop this packet,  for testing purposes */
2270     if ((deliveryType == 'D')
2271         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2272             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2273         deliveryType = 'D';     /* Drop the packet */
2274     } else {
2275         deliveryType = 'S';     /* Send the packet */
2276 #endif /* RXDEBUG */
2277
2278         /* Loop until the packet is sent.  We'd prefer just to use a
2279          * blocking socket, but unfortunately the interface doesn't
2280          * allow us to have the socket block in send mode, and not
2281          * block in receive mode */
2282 #ifdef KERNEL
2283         waslocked = ISAFS_GLOCK();
2284 #ifdef RX_KERNEL_TRACE
2285         if (ICL_SETACTIVE(afs_iclSetp)) {
2286             if (!waslocked)
2287                 AFS_GLOCK();
2288             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2289                        "before osi_NetSend()");
2290             AFS_GUNLOCK();
2291         } else
2292 #else
2293         if (waslocked)
2294             AFS_GUNLOCK();
2295 #endif
2296 #endif
2297         if ((code =
2298              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2299                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2300             /* send failed, so let's hurry up the resend, eh? */
2301             if (rx_stats_active)
2302                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2303             p->retryTime = p->timeSent; /* resend it very soon */
2304             clock_Addmsec(&(p->retryTime),
2305                           10 + (((afs_uint32) p->backoff) << 8));
2306             /* Some systems are nice and tell us right away that we cannot
2307              * reach this recipient by returning an error code.
2308              * So, when this happens let's "down" the host NOW so
2309              * we don't sit around waiting for this host to timeout later.
2310              */
2311             if (call &&
2312 #ifdef AFS_NT40_ENV
2313                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2314 #elif defined(AFS_LINUX20_ENV)
2315                 code == -ENETUNREACH
2316 #elif defined(AFS_DARWIN_ENV)
2317                 code == EHOSTUNREACH
2318 #else
2319                 0
2320 #endif
2321                 )
2322                 call->lastReceiveTime = 0;
2323         }
2324 #ifdef KERNEL
2325 #ifdef RX_KERNEL_TRACE
2326         if (ICL_SETACTIVE(afs_iclSetp)) {
2327             AFS_GLOCK();
2328             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2329                        "after osi_NetSend()");
2330             if (!waslocked)
2331                 AFS_GUNLOCK();
2332         } else
2333 #else
2334         if (waslocked)
2335             AFS_GLOCK();
2336 #endif
2337 #endif
2338 #ifdef RXDEBUG
2339     }
2340     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2341           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2342           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2343           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2344 #endif
2345     if (rx_stats_active) {
2346         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2347         MUTEX_ENTER(&peer->peer_lock);
2348         hadd32(peer->bytesSent, p->length);
2349         MUTEX_EXIT(&peer->peer_lock);
2350     }
2351 }
2352
2353 /* Send a list of packets to appropriate destination for the specified
2354  * connection.  The headers are first encoded and placed in the packets.
2355  */
2356 void
2357 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2358                    struct rx_packet **list, int len, int istack)
2359 {
2360 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2361     int waslocked;
2362 #endif
2363     struct sockaddr_in addr;
2364     struct rx_peer *peer = conn->peer;
2365     osi_socket socket;
2366     struct rx_packet *p = NULL;
2367     struct iovec wirevec[RX_MAXIOVECS];
2368     int i, length, code;
2369     afs_uint32 serial;
2370     afs_uint32 temp;
2371     struct rx_jumboHeader *jp;
2372 #ifdef RXDEBUG
2373     char deliveryType = 'S';
2374 #endif
2375     /* The address we're sending the packet to */
2376     addr.sin_family = AF_INET;
2377     addr.sin_port = peer->port;
2378     addr.sin_addr.s_addr = peer->host;
2379
2380     if (len + 1 > RX_MAXIOVECS) {
2381         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2382     }
2383
2384     /*
2385      * Stamp the packets in this jumbogram with consecutive serial numbers
2386      */
2387     MUTEX_ENTER(&conn->conn_data_lock);
2388     serial = conn->serial;
2389     conn->serial += len;
2390     for (i = 0; i < len; i++) {
2391         p = list[i];
2392         if (p->length > conn->peer->maxPacketSize) {
2393             /* a ping *or* a sequenced packet can count */
2394             if ((p->length > conn->peer->maxPacketSize)) {
2395                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2396                      (p->header.flags & RX_REQUEST_ACK)) &&
2397                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2398                     conn->lastPingSize = p->length;
2399                     conn->lastPingSizeSer = serial + i;
2400                 } else if ((p->header.seq != 0) &&
2401                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2402                     conn->lastPacketSize = p->length;
2403                     conn->lastPacketSizeSeq = p->header.seq;
2404                 }
2405             }
2406         }
2407     }
2408     MUTEX_EXIT(&conn->conn_data_lock);
2409
2410
2411     /* This stuff should be revamped, I think, so that most, if not
2412      * all, of the header stuff is always added here.  We could
2413      * probably do away with the encode/decode routines. XXXXX */
2414
2415     jp = NULL;
2416     length = RX_HEADER_SIZE;
2417     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2418     wirevec[0].iov_len = RX_HEADER_SIZE;
2419     for (i = 0; i < len; i++) {
2420         p = list[i];
2421
2422         /* The whole 3.5 jumbogram scheme relies on packets fitting
2423          * in a single packet buffer. */
2424         if (p->niovecs > 2) {
2425             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2426         }
2427
2428         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2429          * in this chunk.  */
2430         if (i < len - 1) {
2431             if (p->length != RX_JUMBOBUFFERSIZE) {
2432                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2433             }
2434             p->header.flags |= RX_JUMBO_PACKET;
2435             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2436             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2437         } else {
2438             wirevec[i + 1].iov_len = p->length;
2439             length += p->length;
2440         }
2441         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2442         if (jp != NULL) {
2443             /* Convert jumbo packet header to network byte order */
2444             temp = (afs_uint32) (p->header.flags) << 24;
2445             temp |= (afs_uint32) (p->header.spare);
2446             *(afs_uint32 *) jp = htonl(temp);
2447         }
2448         jp = (struct rx_jumboHeader *)
2449             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2450
2451         /* Stamp each packet with a unique serial number.  The serial
2452          * number is maintained on a connection basis because some types
2453          * of security may be based on the serial number of the packet,
2454          * and security is handled on a per authenticated-connection
2455          * basis. */
2456         /* Pre-increment, to guarantee no zero serial number; a zero
2457          * serial number means the packet was never sent. */
2458         p->header.serial = ++serial;
2459         /* This is so we can adjust retransmit time-outs better in the face of
2460          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2461          */
2462         if (p->firstSerial == 0) {
2463             p->firstSerial = p->header.serial;
2464         }
2465 #ifdef RXDEBUG
2466         /* If an output tracer function is defined, call it with the packet and
2467          * network address.  Note this function may modify its arguments. */
2468         if (rx_almostSent) {
2469             int drop = (*rx_almostSent) (p, &addr);
2470             /* drop packet if return value is non-zero? */
2471             if (drop)
2472                 deliveryType = 'D';     /* Drop the packet */
2473         }
2474 #endif
2475
2476         /* Get network byte order header */
2477         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2478                                          * touch ALL the fields */
2479     }
2480
2481     /* Send the packet out on the same socket that related packets are being
2482      * received on */
2483     socket =
2484         (conn->type ==
2485          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2486
2487 #ifdef RXDEBUG
2488     /* Possibly drop this packet,  for testing purposes */
2489     if ((deliveryType == 'D')
2490         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2491             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2492         deliveryType = 'D';     /* Drop the packet */
2493     } else {
2494         deliveryType = 'S';     /* Send the packet */
2495 #endif /* RXDEBUG */
2496
2497         /* Loop until the packet is sent.  We'd prefer just to use a
2498          * blocking socket, but unfortunately the interface doesn't
2499          * allow us to have the socket block in send mode, and not
2500          * block in receive mode */
2501 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2502         waslocked = ISAFS_GLOCK();
2503         if (!istack && waslocked)
2504             AFS_GUNLOCK();
2505 #endif
2506         if ((code =
2507              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2508                          istack)) != 0) {
2509             /* send failed, so let's hurry up the resend, eh? */
2510             if (rx_stats_active)
2511                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2512             for (i = 0; i < len; i++) {
2513                 p = list[i];
2514                 p->retryTime = p->timeSent;     /* resend it very soon */
2515                 clock_Addmsec(&(p->retryTime),
2516                               10 + (((afs_uint32) p->backoff) << 8));
2517             }
2518             /* Some systems are nice and tell us right away that we cannot
2519              * reach this recipient by returning an error code.
2520              * So, when this happens let's "down" the host NOW so
2521              * we don't sit around waiting for this host to timeout later.
2522              */
2523             if (call &&
2524 #ifdef AFS_NT40_ENV
2525                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2526 #elif defined(AFS_LINUX20_ENV)
2527                 code == -ENETUNREACH
2528 #elif defined(AFS_DARWIN_ENV)
2529                 code == EHOSTUNREACH
2530 #else
2531                 0
2532 #endif
2533                 )
2534                 call->lastReceiveTime = 0;
2535         }
2536 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2537         if (!istack && waslocked)
2538             AFS_GLOCK();
2539 #endif
2540 #ifdef RXDEBUG
2541     }
2542
2543     assert(p != NULL);
2544
2545     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2546           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2547           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2548           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2549
2550 #endif
2551     if (rx_stats_active) {
2552         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2553         MUTEX_ENTER(&peer->peer_lock);
2554         hadd32(peer->bytesSent, p->length);
2555         MUTEX_EXIT(&peer->peer_lock);
2556     }
2557 }
2558
2559
2560 /* Send a "special" packet to the peer connection.  If call is
2561  * specified, then the packet is directed to a specific call channel
2562  * associated with the connection, otherwise it is directed to the
2563  * connection only. Uses optionalPacket if it is supplied, rather than
2564  * allocating a new packet buffer.  Nbytes is the length of the data
2565  * portion of the packet.  If data is non-null, nbytes of data are
2566  * copied into the packet.  Type is the type of the packet, as defined
2567  * in rx.h.  Bug: there's a lot of duplication between this and other
2568  * routines.  This needs to be cleaned up. */
2569 struct rx_packet *
2570 rxi_SendSpecial(struct rx_call *call,
2571                 struct rx_connection *conn,
2572                 struct rx_packet *optionalPacket, int type, char *data,
2573                 int nbytes, int istack)
2574 {
2575     /* Some of the following stuff should be common code for all
2576      * packet sends (it's repeated elsewhere) */
2577     struct rx_packet *p;
2578     unsigned int i = 0;
2579     int savelen = 0, saven = 0;
2580     int channel, callNumber;
2581     if (call) {
2582         channel = call->channel;
2583         callNumber = *call->callNumber;
2584         /* BUSY packets refer to the next call on this connection */
2585         if (type == RX_PACKET_TYPE_BUSY) {
2586             callNumber++;
2587         }
2588     } else {
2589         channel = 0;
2590         callNumber = 0;
2591     }
2592     p = optionalPacket;
2593     if (!p) {
2594         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2595         if (!p)
2596             osi_Panic("rxi_SendSpecial failure");
2597     }
2598
2599     if (nbytes != -1)
2600         p->length = nbytes;
2601     else
2602         nbytes = p->length;
2603     p->header.serviceId = conn->serviceId;
2604     p->header.securityIndex = conn->securityIndex;
2605     p->header.cid = (conn->cid | channel);
2606     p->header.callNumber = callNumber;
2607     p->header.seq = 0;
2608     p->header.epoch = conn->epoch;
2609     p->header.type = type;
2610     p->header.flags = 0;
2611     if (conn->type == RX_CLIENT_CONNECTION)
2612         p->header.flags |= RX_CLIENT_INITIATED;
2613     if (data)
2614         rx_packetwrite(p, 0, nbytes, data);
2615
2616     for (i = 1; i < p->niovecs; i++) {
2617         if (nbytes <= p->wirevec[i].iov_len) {
2618             savelen = p->wirevec[i].iov_len;
2619             saven = p->niovecs;
2620             p->wirevec[i].iov_len = nbytes;
2621             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2622         } else
2623             nbytes -= p->wirevec[i].iov_len;
2624     }
2625
2626     if (call)
2627         rxi_Send(call, p, istack);
2628     else
2629         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2630     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2631         /* really need to do this, but it seems safer this way, given that  */
2632         /* sneaky optionalPacket... */
2633         p->wirevec[i - 1].iov_len = savelen;
2634         p->niovecs = saven;
2635     }
2636     if (!optionalPacket)
2637         rxi_FreePacket(p);
2638     return optionalPacket;
2639 }
2640
2641
2642 /* Encode the packet's header (from the struct header in the packet to
2643  * the net byte order representation in the wire representation of the
2644  * packet, which is what is actually sent out on the wire) */
2645 void
2646 rxi_EncodePacketHeader(struct rx_packet *p)
2647 {
2648     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2649
2650     memset(buf, 0, RX_HEADER_SIZE);
2651     *buf++ = htonl(p->header.epoch);
2652     *buf++ = htonl(p->header.cid);
2653     *buf++ = htonl(p->header.callNumber);
2654     *buf++ = htonl(p->header.seq);
2655     *buf++ = htonl(p->header.serial);
2656     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2657                    | (((afs_uint32) p->header.flags) << 16)
2658                    | (p->header.userStatus << 8) | p->header.securityIndex);
2659     /* Note: top 16 bits of this next word were reserved */
2660     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2661 }
2662
2663 /* Decode the packet's header (from net byte order to a struct header) */
2664 void
2665 rxi_DecodePacketHeader(struct rx_packet *p)
2666 {
2667     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2668     afs_uint32 temp;
2669
2670     p->header.epoch = ntohl(*buf);
2671     buf++;
2672     p->header.cid = ntohl(*buf);
2673     buf++;
2674     p->header.callNumber = ntohl(*buf);
2675     buf++;
2676     p->header.seq = ntohl(*buf);
2677     buf++;
2678     p->header.serial = ntohl(*buf);
2679     buf++;
2680
2681     temp = ntohl(*buf);
2682     buf++;
2683
2684     /* C will truncate byte fields to bytes for me */
2685     p->header.type = temp >> 24;
2686     p->header.flags = temp >> 16;
2687     p->header.userStatus = temp >> 8;
2688     p->header.securityIndex = temp >> 0;
2689
2690     temp = ntohl(*buf);
2691     buf++;
2692
2693     p->header.serviceId = (temp & 0xffff);
2694     p->header.spare = temp >> 16;
2695     /* Note: top 16 bits of this last word are the security checksum */
2696 }
2697
2698 void
2699 rxi_PrepareSendPacket(struct rx_call *call,
2700                       struct rx_packet *p, int last)
2701 {
2702     struct rx_connection *conn = call->conn;
2703     unsigned int i;
2704     afs_int32 len;              /* len must be a signed type; it can go negative */
2705
2706     p->flags &= ~RX_PKTFLAG_ACKED;
2707     p->header.cid = (conn->cid | call->channel);
2708     p->header.serviceId = conn->serviceId;
2709     p->header.securityIndex = conn->securityIndex;
2710
2711     /* No data packets on call 0. Where do these come from? */
2712     if (*call->callNumber == 0)
2713         *call->callNumber = 1;
2714
2715     p->header.callNumber = *call->callNumber;
2716     p->header.seq = call->tnext++;
2717     p->header.epoch = conn->epoch;
2718     p->header.type = RX_PACKET_TYPE_DATA;
2719     p->header.flags = 0;
2720     p->header.spare = 0;
2721     if (conn->type == RX_CLIENT_CONNECTION)
2722         p->header.flags |= RX_CLIENT_INITIATED;
2723
2724     if (last)
2725         p->header.flags |= RX_LAST_PACKET;
2726
2727     clock_Zero(&p->retryTime);  /* Never yet transmitted */
2728     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2729     p->header.serial = 0;       /* Another way of saying never transmitted... */
2730     p->backoff = 0;
2731
2732     /* Now that we're sure this is the last data on the call, make sure
2733      * that the "length" and the sum of the iov_lens matches. */
2734     len = p->length + call->conn->securityHeaderSize;
2735
2736     for (i = 1; i < p->niovecs && len > 0; i++) {
2737         len -= p->wirevec[i].iov_len;
2738     }
2739     if (len > 0) {
2740         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2741     } else if (i < p->niovecs) {
2742         /* Free any extra elements in the wirevec */
2743 #if defined(RX_ENABLE_TSFPQ)
2744         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2745 #else /* !RX_ENABLE_TSFPQ */
2746         MUTEX_ENTER(&rx_freePktQ_lock);
2747         rxi_FreeDataBufsNoLock(p, i);
2748         MUTEX_EXIT(&rx_freePktQ_lock);
2749 #endif /* !RX_ENABLE_TSFPQ */
2750
2751         p->niovecs = i;
2752     }
2753     if (len)
2754         p->wirevec[i - 1].iov_len += len;
2755     RXS_PreparePacket(conn->securityObject, call, p);
2756 }
2757
2758 /* Given an interface MTU size, calculate an adjusted MTU size that
2759  * will make efficient use of the RX buffers when the peer is sending
2760  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2761 int
2762 rxi_AdjustIfMTU(int mtu)
2763 {
2764     int adjMTU;
2765     int frags;
2766
2767     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2768         return mtu;
2769     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2770     if (mtu <= adjMTU) {
2771         return mtu;
2772     }
2773     mtu -= adjMTU;
2774     if (mtu <= 0) {
2775         return adjMTU;
2776     }
2777     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2778     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2779 }
2780
2781 /* Given an interface MTU size, and the peer's advertised max receive
2782  * size, calculate an adjisted maxMTU size that makes efficient use
2783  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2784 int
2785 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2786 {
2787     int maxMTU = mtu * rxi_nSendFrags;
2788     maxMTU = MIN(maxMTU, peerMaxMTU);
2789     return rxi_AdjustIfMTU(maxMTU);
2790 }
2791
2792 /* Given a packet size, figure out how many datagram packet will fit.
2793  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2794  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2795  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2796 int
2797 rxi_AdjustDgramPackets(int frags, int mtu)
2798 {
2799     int maxMTU;
2800     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2801         return 1;
2802     }
2803     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2804     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2805     /* subtract the size of the first and last packets */
2806     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2807     if (maxMTU < 0) {
2808         return 1;
2809     }
2810     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2811 }
2812
2813 #ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * outputFile - destination for the dump.  Written with fprintf(),
 *              except on AFS_NT40_ENV builds, where each line is first
 *              formatted into a local buffer and then passed, together
 *              with outputFile, to WriteFile().
 * cookie     - caller-supplied string placed at the start of every line.
 *
 * Walks the rx_mallocedP list (linked through allNextp) while holding
 * rx_freePktQ_lock.  Always returns 0; the body is compiled out unless
 * RXDEBUG_PACKET is defined.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* receives WriteFile()'s byte count; unused */
    char output[2048];
    /* NOTE(review): sprintf() into output[] is unbounded, so a very long
     * cookie could overrun the buffer - confirm callers keep it short. */
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* One line per packet.  %p requires a void pointer argument. */
        RXDPRINTF(RXDPRINTOUT,
                  "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, "
                  "retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, "
                  "backoff=%u, length=%u  header: epoch=%u, cid=%u, callNum=%u, "
                  "seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, "
                  "securityIndex=%u, serviceId=%u\r\n",
                  cookie, (void *)p, p->packetId,
                  p->firstSent.sec, p->firstSent.usec,
                  p->timeSent.sec, p->timeSent.usec,
                  p->retryTime.sec, p->retryTime.usec,
                  p->firstSerial, p->niovecs, (afs_uint32)p->flags,
                  (afs_uint32)p->backoff, (afs_uint32)p->length,
                  p->header.epoch, p->header.cid, p->header.callNumber,
                  p->header.seq, p->header.serial,
                  (afs_uint32)p->header.type, (afs_uint32)p->header.flags,
                  (afs_uint32)p->header.userStatus,
                  (afs_uint32)p->header.securityIndex,
                  (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;

    /* Keep the helper macros local to this function. */
#undef RXDPRINTF
#undef RXDPRINTOUT
#endif /* RXDEBUG_PACKET */
    return 0;
}
2862 #endif