src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 #ifdef KERNEL
  14 # if defined(UKERNEL)
  15 #  include "afs/sysincludes.h"
  16 #  include "afsincludes.h"
  17 #  include "rx_kcommon.h"
  18 # else /* defined(UKERNEL) */
  19 #  ifdef RX_KERNEL_TRACE
  20 #   include "rx_kcommon.h"
  21 #  endif
  22 #  include "h/types.h"
  23 #  ifndef AFS_LINUX20_ENV
  24 #   include "h/systm.h"
  25 #  endif
  26 #  if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  27 #   include "afs/sysincludes.h"
  28 #  endif
  29 #  if defined(AFS_OBSD_ENV)
  30 #   include "h/proc.h"
  31 #  endif
  32 #  include "h/socket.h"
  33 #  if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  34 #   if  !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  35 #    include "sys/mount.h"              /* it gets pulled in by something later anyway */
  36 #   endif
  37 #   include "h/mbuf.h"
  38 #  endif
  39 #  include "netinet/in.h"
  40 #  include "afs/afs_osi.h"
  41 #  include "rx_kmutex.h"
  42 # endif /* defined(UKERNEL) */
  43 #else /* KERNEL */
  44 # include <roken.h>
  45 # include <assert.h>
  46 # if defined(AFS_NT40_ENV)
  47 #  ifndef EWOULDBLOCK
  48 #   define EWOULDBLOCK WSAEWOULDBLOCK
  49 #  endif
  50 #  include "rx_user.h"
  51 #  include "rx_xmit_nt.h"
  52 # endif
  53 # include <lwp.h>
  54 #endif /* KERNEL */
  55
  56 #ifdef  AFS_SUN5_ENV
  57 # include <sys/sysmacros.h>
  58 #endif
  59
  60 #include "rx.h"
  61 #include "rx_clock.h"
  62 #include "rx_queue.h"
  63 #include "rx_packet.h"
  64 #include "rx_atomic.h"
  65 #include "rx_globals.h"
  66 #include "rx_internal.h"
  67 #include "rx_stats.h"
  68
  69 #include "rx_conn.h"
  70
  71 #ifdef RX_LOCKS_DB
  72 /* rxdb_fileID is used to identify the lock location, along with line#. */
  73 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  74 #endif /* RX_LOCKS_DB */
  75 static struct rx_packet *rx_mallocedP = 0;
  76 #ifdef RXDEBUG_PACKET
  77 static afs_uint32       rx_packet_id = 0;
  78 #endif
  79
  80 extern char cml_version_number[];
  81
  82 static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);
  83
  84 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  85                                 afs_uint32 ahost, short aport,
  86                                 afs_int32 istack);
  87
  88 #ifdef RX_ENABLE_TSFPQ
  89 static int
  90 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
  91 #else
  92 static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
  93                                    afs_uint32 first,
  94                                    struct rx_queue * q);
  95 #endif
  96
  97 /* some rules about packets:
  98  * 1.  When a packet is allocated, the final iov_buf contains room for
  99  * a security trailer, but iov_len masks that fact.  If the security
 100  * package wants to add the trailer, it may do so, and then extend
 101  * iov_len appropriately.  For this reason, packet's niovecs and
 102  * iov_len fields should be accurate before calling PreparePacket.
 103 */
 104
 105 /* Preconditions:
 106  *        all packet buffers (iov_base) are integral multiples of
 107  *        the word size.
 108  *        offset is an integral multiple of the word size.
 109  */
 110 afs_int32
 111 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 112 {
 113     unsigned int i;
 114     size_t l;
 115     for (l = 0, i = 1; i < packet->niovecs; i++) {
 116         if (l + packet->wirevec[i].iov_len > offset) {
 117             return
 118                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 119                                  (offset - l)));
 120         }
 121         l += packet->wirevec[i].iov_len;
 122     }
 123
 124     return 0;
 125 }
 126
 127 /* Preconditions:
 128  *        all packet buffers (iov_base) are integral multiples of the word size.
 129  *        offset is an integral multiple of the word size.
 130  */
 131 afs_int32
 132 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 133 {
 134     unsigned int i;
 135     size_t l;
 136     for (l = 0, i = 1; i < packet->niovecs; i++) {
 137         if (l + packet->wirevec[i].iov_len > offset) {
 138             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 139                              (offset - l))) = data;
 140             return 0;
 141         }
 142         l += packet->wirevec[i].iov_len;
 143     }
 144
 145     return 0;
 146 }
 147
 148 /* Preconditions:
 149  *        all packet buffers (iov_base) are integral multiples of the
 150  *        word size.
 151  *        offset is an integral multiple of the word size.
 152  * Packet Invariants:
 153  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 154  */
 155 afs_int32
 156 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 157                   char *out)
 158 {
 159     unsigned int i, j, l, r;
 160     for (l = 0, i = 1; i < packet->niovecs; i++) {
 161         if (l + packet->wirevec[i].iov_len > offset) {
 162             break;
 163         }
 164         l += packet->wirevec[i].iov_len;
 165     }
 166
 167     /* i is the iovec which contains the first little bit of data in which we
 168      * are interested.  l is the total length of everything prior to this iovec.
 169      * j is the number of bytes we can safely copy out of this iovec.
 170      * offset only applies to the first iovec.
 171      */
 172     r = resid;
 173     while ((r > 0) && (i < packet->niovecs)) {
 174         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 175         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 176         r -= j;
 177         out += j;
 178         l += packet->wirevec[i].iov_len;
 179         offset = l;
 180         i++;
 181     }
 182
 183     return (r ? (resid - r) : resid);
 184 }
 185
 186
 187 /* Preconditions:
 188  *        all packet buffers (iov_base) are integral multiples of the
 189  *        word size.
 190  *        offset is an integral multiple of the word size.
 191  */
 192 afs_int32
 193 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 194 {
 195     unsigned int i, j, l, o, r;
 196     char *b;
 197
 198     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 199         if (l + packet->wirevec[i].iov_len > o) {
 200             break;
 201         }
 202         l += packet->wirevec[i].iov_len;
 203     }
 204
 205     /* i is the iovec which contains the first little bit of data in which we
 206      * are interested.  l is the total length of everything prior to this iovec.
 207      * j is the number of bytes we can safely copy out of this iovec.
 208      * offset only applies to the first iovec.
 209      */
 210     r = resid;
 211     while ((r > 0) && (i <= RX_MAXWVECS)) {
 212         if (i >= packet->niovecs)
 213             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 214                 break;
 215
 216         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 217         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 218         memcpy(b, in, j);
 219         r -= j;
 220         in += j;
 221         l += packet->wirevec[i].iov_len;
 222         offset = l;
 223         i++;
 224     }
 225
 226     return (r ? (resid - r) : resid);
 227 }
 228
 229 int
 230 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 231 {
 232     struct rx_packet *p, *np;
 233
 234     num_pkts = AllocPacketBufs(class, num_pkts, q);
 235
 236     for (queue_Scan(q, p, np, rx_packet)) {
 237         RX_PACKET_IOV_FULLINIT(p);
 238     }
 239
 240     return num_pkts;
 241 }
 242
 243 #ifdef RX_ENABLE_TSFPQ
 244 static int
 245 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 246 {
 247     struct rx_ts_info_t * rx_ts_info;
 248     int transfer;
 249     SPLVAR;
 250
 251     RX_TS_INFO_GET(rx_ts_info);
 252
 253     transfer = num_pkts - rx_ts_info->_FPQ.len;
 254     if (transfer > 0) {
 255         NETPRI;
 256         MUTEX_ENTER(&rx_freePktQ_lock);
 257         transfer = MAX(transfer, rx_TSFPQGlobSize);
 258         if (transfer > rx_nFreePackets) {
 259             /* alloc enough for us, plus a few globs for other threads */
 260             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 261         }
 262
 263         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 264
 265         MUTEX_EXIT(&rx_freePktQ_lock);
 266         USERPRI;
 267     }
 268
 269     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 270
 271     return num_pkts;
 272 }
 273 #else /* RX_ENABLE_TSFPQ */
 274 static int
 275 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 276 {
 277     struct rx_packet *c;
 278     int i;
 279 #ifdef KERNEL
 280     int overq = 0;
 281 #endif
 282     SPLVAR;
 283
 284     NETPRI;
 285
 286     MUTEX_ENTER(&rx_freePktQ_lock);
 287
 288 #ifdef KERNEL
 289     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 290          num_pkts--, overq++);
 291
 292     if (overq) {
 293         rxi_NeedMorePackets = TRUE;
 294         if (rx_stats_active) {
 295             switch (class) {
 296             case RX_PACKET_CLASS_RECEIVE:
 297                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 298                 break;
 299             case RX_PACKET_CLASS_SEND:
 300                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 301                 break;
 302             case RX_PACKET_CLASS_SPECIAL:
 303                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 304                 break;
 305             case RX_PACKET_CLASS_RECV_CBUF:
 306                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 307                 break;
 308             case RX_PACKET_CLASS_SEND_CBUF:
 309                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 310                 break;
 311             }
 312         }
 313     }
 314
 315     if (rx_nFreePackets < num_pkts)
 316         num_pkts = rx_nFreePackets;
 317
 318     if (!num_pkts) {
 319         rxi_NeedMorePackets = TRUE;
 320         goto done;
 321     }
 322 #else /* KERNEL */
 323     if (rx_nFreePackets < num_pkts) {
 324         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 325     }
 326 #endif /* KERNEL */
 327
 328     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 329          i < num_pkts;
 330          i++, c=queue_Next(c, rx_packet)) {
 331         RX_FPQ_MARK_USED(c);
 332     }
 333
 334     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 335
 336     rx_nFreePackets -= num_pkts;
 337
 338 #ifdef KERNEL
 339   done:
 340 #endif
 341     MUTEX_EXIT(&rx_freePktQ_lock);
 342
 343     USERPRI;
 344     return num_pkts;
 345 }
 346 #endif /* RX_ENABLE_TSFPQ */
 347
/*
 * Free a queue of packets, releasing any continuation buffers attached
 * to each packet along the way.
 */
 351 #ifdef RX_ENABLE_TSFPQ
 352 /* num_pkts=0 means queue length is unknown */
 353 int
 354 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 355 {
 356     struct rx_ts_info_t * rx_ts_info;
 357     struct rx_packet *c, *nc;
 358     SPLVAR;
 359
 360     osi_Assert(num_pkts >= 0);
 361     RX_TS_INFO_GET(rx_ts_info);
 362
 363     if (!num_pkts) {
 364         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 365             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 366         }
 367     } else {
 368         for (queue_Scan(q, c, nc, rx_packet)) {
 369             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 370         }
 371     }
 372
 373     if (num_pkts) {
 374         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 375     }
 376
 377     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 378         NETPRI;
 379         MUTEX_ENTER(&rx_freePktQ_lock);
 380
 381         RX_TS_FPQ_LTOG(rx_ts_info);
 382
 383         /* Wakeup anyone waiting for packets */
 384         rxi_PacketsUnWait();
 385
 386         MUTEX_EXIT(&rx_freePktQ_lock);
 387         USERPRI;
 388     }
 389
 390     return num_pkts;
 391 }
 392 #else /* RX_ENABLE_TSFPQ */
 393 /* num_pkts=0 means queue length is unknown */
 394 int
 395 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 396 {
 397     struct rx_queue cbs;
 398     struct rx_packet *p, *np;
 399     int qlen = 0;
 400     SPLVAR;
 401
 402     osi_Assert(num_pkts >= 0);
 403     queue_Init(&cbs);
 404
 405     if (!num_pkts) {
 406         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 407             if (p->niovecs > 2) {
 408                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 409             }
 410             RX_FPQ_MARK_FREE(p);
 411         }
 412         if (!num_pkts)
 413             return 0;
 414     } else {
 415         for (queue_Scan(q, p, np, rx_packet)) {
 416             if (p->niovecs > 2) {
 417                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 418             }
 419             RX_FPQ_MARK_FREE(p);
 420         }
 421     }
 422
 423     if (qlen) {
 424         queue_SpliceAppend(q, &cbs);
 425         qlen += num_pkts;
 426     } else
 427         qlen = num_pkts;
 428
 429     NETPRI;
 430     MUTEX_ENTER(&rx_freePktQ_lock);
 431
 432     queue_SpliceAppend(&rx_freePacketQueue, q);
 433     rx_nFreePackets += qlen;
 434
 435     /* Wakeup anyone waiting for packets */
 436     rxi_PacketsUnWait();
 437
 438     MUTEX_EXIT(&rx_freePktQ_lock);
 439     USERPRI;
 440
 441     return num_pkts;
 442 }
 443 #endif /* RX_ENABLE_TSFPQ */
 444
 445 /* this one is kind of awful.
 446  * In rxkad, the packet has been all shortened, and everything, ready for
 447  * sending.  All of a sudden, we discover we need some of that space back.
 448  * This isn't terribly general, because it knows that the packets are only
 449  * rounded up to the EBS (userdata + security header).
 450  */
 451 int
 452 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 453 {
 454     int i;
 455     i = p->niovecs - 1;
 456     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 457         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 458             p->wirevec[i].iov_len += nb;
 459             return 0;
 460         }
 461     } else {
 462         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 463             p->wirevec[i].iov_len += nb;
 464             return 0;
 465         }
 466     }
 467
 468     return 0;
 469 }
 470
 471 /* get sufficient space to store nb bytes of data (or more), and hook
 472  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 473  * returns the number of bytes >0 which it failed to come up with.
 474  * Don't need to worry about locking on packet, since only
 475  * one thread can manipulate one at a time. Locking on continution
 476  * packets is handled by AllocPacketBufs */
 477 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 478 int
 479 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 480 {
 481     int i, nv;
 482     struct rx_queue q;
 483     struct rx_packet *cb, *ncb;
 484
 485     /* compute the number of cbuf's we need */
 486     nv = nb / RX_CBUFFERSIZE;
 487     if ((nv * RX_CBUFFERSIZE) < nb)
 488         nv++;
 489     if ((nv + p->niovecs) > RX_MAXWVECS)
 490         nv = RX_MAXWVECS - p->niovecs;
 491     if (nv < 1)
 492         return nb;
 493
 494     /* allocate buffers */
 495     queue_Init(&q);
 496     nv = AllocPacketBufs(class, nv, &q);
 497
 498     /* setup packet iovs */
 499     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 500         queue_Remove(cb);
 501         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 502         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 503     }
 504
 505     nb -= (nv * RX_CBUFFERSIZE);
 506     p->length += (nv * RX_CBUFFERSIZE);
 507     p->niovecs += nv;
 508
 509     return nb;
 510 }
 511
 512 /* Add more packet buffers */
 513 #ifdef RX_ENABLE_TSFPQ
 514 void
 515 rxi_MorePackets(int apackets)
 516 {
 517     struct rx_packet *p, *e;
 518     struct rx_ts_info_t * rx_ts_info;
 519     int getme;
 520     SPLVAR;
 521
 522     getme = apackets * sizeof(struct rx_packet);
 523     p = (struct rx_packet *)osi_Alloc(getme);
 524     osi_Assert(p);
 525
 526     PIN(p, getme);              /* XXXXX */
 527     memset(p, 0, getme);
 528     RX_TS_INFO_GET(rx_ts_info);
 529
 530     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 531     /* TSFPQ patch also needs to keep track of total packets */
 532
 533     MUTEX_ENTER(&rx_packets_mutex);
 534     rx_nPackets += apackets;
 535     RX_TS_FPQ_COMPUTE_LIMITS;
 536     MUTEX_EXIT(&rx_packets_mutex);
 537
 538     for (e = p + apackets; p < e; p++) {
 539         RX_PACKET_IOV_INIT(p);
 540         p->niovecs = 2;
 541
 542         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 543
 544         NETPRI;
 545         MUTEX_ENTER(&rx_freePktQ_lock);
 546 #ifdef RXDEBUG_PACKET
 547         p->packetId = rx_packet_id++;
 548         p->allNextp = rx_mallocedP;
 549 #endif /* RXDEBUG_PACKET */
 550         rx_mallocedP = p;
 551         MUTEX_EXIT(&rx_freePktQ_lock);
 552         USERPRI;
 553     }
 554     rx_ts_info->_FPQ.delta += apackets;
 555
 556     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 557         NETPRI;
 558         MUTEX_ENTER(&rx_freePktQ_lock);
 559
 560         RX_TS_FPQ_LTOG(rx_ts_info);
 561         rxi_NeedMorePackets = FALSE;
 562         rxi_PacketsUnWait();
 563
 564         MUTEX_EXIT(&rx_freePktQ_lock);
 565         USERPRI;
 566     }
 567 }
 568 #else /* RX_ENABLE_TSFPQ */
 569 void
 570 rxi_MorePackets(int apackets)
 571 {
 572     struct rx_packet *p, *e;
 573     int getme;
 574     SPLVAR;
 575
 576     getme = apackets * sizeof(struct rx_packet);
 577     p = (struct rx_packet *)osi_Alloc(getme);
 578     osi_Assert(p);
 579
 580     PIN(p, getme);              /* XXXXX */
 581     memset(p, 0, getme);
 582     NETPRI;
 583     MUTEX_ENTER(&rx_freePktQ_lock);
 584
 585     for (e = p + apackets; p < e; p++) {
 586         RX_PACKET_IOV_INIT(p);
 587 #ifdef RX_TRACK_PACKETS
 588         p->flags |= RX_PKTFLAG_FREE;
 589 #endif
 590         p->niovecs = 2;
 591
 592         queue_Append(&rx_freePacketQueue, p);
 593 #ifdef RXDEBUG_PACKET
 594         p->packetId = rx_packet_id++;
 595         p->allNextp = rx_mallocedP;
 596 #endif /* RXDEBUG_PACKET */
 597         rx_mallocedP = p;
 598     }
 599
 600     rx_nPackets += apackets;
 601     rx_nFreePackets += apackets;
 602     rxi_NeedMorePackets = FALSE;
 603     rxi_PacketsUnWait();
 604
 605     MUTEX_EXIT(&rx_freePktQ_lock);
 606     USERPRI;
 607 }
 608 #endif /* RX_ENABLE_TSFPQ */
 609
 610 #ifdef RX_ENABLE_TSFPQ
 611 void
 612 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 613 {
 614     struct rx_packet *p, *e;
 615     struct rx_ts_info_t * rx_ts_info;
 616     int getme;
 617     SPLVAR;
 618
 619     getme = apackets * sizeof(struct rx_packet);
 620     p = (struct rx_packet *)osi_Alloc(getme);
 621
 622     PIN(p, getme);              /* XXXXX */
 623     memset(p, 0, getme);
 624     RX_TS_INFO_GET(rx_ts_info);
 625
 626     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 627     /* TSFPQ patch also needs to keep track of total packets */
 628     MUTEX_ENTER(&rx_packets_mutex);
 629     rx_nPackets += apackets;
 630     RX_TS_FPQ_COMPUTE_LIMITS;
 631     MUTEX_EXIT(&rx_packets_mutex);
 632
 633     for (e = p + apackets; p < e; p++) {
 634         RX_PACKET_IOV_INIT(p);
 635         p->niovecs = 2;
 636         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 637
 638         NETPRI;
 639         MUTEX_ENTER(&rx_freePktQ_lock);
 640 #ifdef RXDEBUG_PACKET
 641         p->packetId = rx_packet_id++;
 642         p->allNextp = rx_mallocedP;
 643 #endif /* RXDEBUG_PACKET */
 644         rx_mallocedP = p;
 645         MUTEX_EXIT(&rx_freePktQ_lock);
 646         USERPRI;
 647     }
 648     rx_ts_info->_FPQ.delta += apackets;
 649
 650     if (flush_global &&
 651         (num_keep_local < apackets)) {
 652         NETPRI;
 653         MUTEX_ENTER(&rx_freePktQ_lock);
 654
 655         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 656         rxi_NeedMorePackets = FALSE;
 657         rxi_PacketsUnWait();
 658
 659         MUTEX_EXIT(&rx_freePktQ_lock);
 660         USERPRI;
 661     }
 662 }
 663 #endif /* RX_ENABLE_TSFPQ */
 664
 665 #ifndef KERNEL
 666 /* Add more packet buffers */
 667 void
 668 rxi_MorePacketsNoLock(int apackets)
 669 {
 670 #ifdef RX_ENABLE_TSFPQ
 671     struct rx_ts_info_t * rx_ts_info;
 672 #endif /* RX_ENABLE_TSFPQ */
 673     struct rx_packet *p, *e;
 674     int getme;
 675
 676     /* allocate enough packets that 1/4 of the packets will be able
 677      * to hold maximal amounts of data */
 678     apackets += (apackets / 4)
 679         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 680     do {
 681         getme = apackets * sizeof(struct rx_packet);
 682         p = (struct rx_packet *)osi_Alloc(getme);
 683         if (p == NULL) {
 684             apackets -= apackets / 4;
 685             osi_Assert(apackets > 0);
 686         }
 687     } while(p == NULL);
 688     memset(p, 0, getme);
 689
 690 #ifdef RX_ENABLE_TSFPQ
 691     RX_TS_INFO_GET(rx_ts_info);
 692     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 693 #endif /* RX_ENABLE_TSFPQ */
 694
 695     for (e = p + apackets; p < e; p++) {
 696         RX_PACKET_IOV_INIT(p);
 697 #ifdef RX_TRACK_PACKETS
 698         p->flags |= RX_PKTFLAG_FREE;
 699 #endif
 700         p->niovecs = 2;
 701
 702         queue_Append(&rx_freePacketQueue, p);
 703 #ifdef RXDEBUG_PACKET
 704         p->packetId = rx_packet_id++;
 705         p->allNextp = rx_mallocedP;
 706 #endif /* RXDEBUG_PACKET */
 707         rx_mallocedP = p;
 708     }
 709
 710     rx_nFreePackets += apackets;
 711     MUTEX_ENTER(&rx_packets_mutex);
 712     rx_nPackets += apackets;
 713 #ifdef RX_ENABLE_TSFPQ
 714     RX_TS_FPQ_COMPUTE_LIMITS;
 715 #endif /* RX_ENABLE_TSFPQ */
 716     MUTEX_EXIT(&rx_packets_mutex);
 717     rxi_NeedMorePackets = FALSE;
 718     rxi_PacketsUnWait();
 719 }
 720 #endif /* !KERNEL */
 721
 722 void
 723 rxi_FreeAllPackets(void)
 724 {
 725     /* must be called at proper interrupt level, etcetera */
 726     /* MTUXXX need to free all Packets */
 727     osi_Free(rx_mallocedP,
 728              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 729     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 730 }
 731
 732 #ifdef RX_ENABLE_TSFPQ
 733 void
 734 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 735 {
 736     struct rx_ts_info_t * rx_ts_info;
 737     int xfer;
 738     SPLVAR;
 739
 740     RX_TS_INFO_GET(rx_ts_info);
 741
 742     if (num_keep_local != rx_ts_info->_FPQ.len) {
 743         NETPRI;
 744         MUTEX_ENTER(&rx_freePktQ_lock);
 745         if (num_keep_local < rx_ts_info->_FPQ.len) {
 746             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 747             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 748             rxi_PacketsUnWait();
 749         } else {
 750             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 751             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 752                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 753             if (rx_nFreePackets < xfer) {
 754                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 755             }
 756             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 757         }
 758         MUTEX_EXIT(&rx_freePktQ_lock);
 759         USERPRI;
 760     }
 761 }
 762
 763 void
 764 rxi_FlushLocalPacketsTSFPQ(void)
 765 {
 766     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 767 }
 768 #endif /* RX_ENABLE_TSFPQ */
 769
 770 /* Allocate more packets iff we need more continuation buffers */
 771 /* In kernel, can't page in memory with interrupts disabled, so we
 772  * don't use the event mechanism. */
 773 void
 774 rx_CheckPackets(void)
 775 {
 776     if (rxi_NeedMorePackets) {
 777         rxi_MorePackets(rx_maxSendWindow);
 778     }
 779 }
 780
 781 /* In the packet freeing routine below, the assumption is that
 782    we want all of the packets to be used equally frequently, so that we
 783    don't get packet buffers paging out.  It would be just as valid to
 784    assume that we DO want them to page out if not many are being used.
 785    In any event, we assume the former, and append the packets to the end
 786    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
 793
 794 /* Actually free the packet p. */
 795 #ifdef RX_ENABLE_TSFPQ
 796 void
 797 rxi_FreePacketNoLock(struct rx_packet *p)
 798 {
 799     struct rx_ts_info_t * rx_ts_info;
 800     dpf(("Free %"AFS_PTR_FMT"\n", p));
 801
 802     RX_TS_INFO_GET(rx_ts_info);
 803     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 804     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 805         RX_TS_FPQ_LTOG(rx_ts_info);
 806     }
 807 }
 808 #else /* RX_ENABLE_TSFPQ */
 809 void
 810 rxi_FreePacketNoLock(struct rx_packet *p)
 811 {
 812     dpf(("Free %"AFS_PTR_FMT"\n", p));
 813
 814     RX_FPQ_MARK_FREE(p);
 815     rx_nFreePackets++;
 816     queue_Append(&rx_freePacketQueue, p);
 817 }
 818 #endif /* RX_ENABLE_TSFPQ */
 819
 820 #ifdef RX_ENABLE_TSFPQ
 821 void
 822 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 823 {
 824     struct rx_ts_info_t * rx_ts_info;
 825     dpf(("Free %"AFS_PTR_FMT"\n", p));
 826
 827     RX_TS_INFO_GET(rx_ts_info);
 828     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 829
 830     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 831         NETPRI;
 832         MUTEX_ENTER(&rx_freePktQ_lock);
 833
 834         RX_TS_FPQ_LTOG(rx_ts_info);
 835
 836         /* Wakeup anyone waiting for packets */
 837         rxi_PacketsUnWait();
 838
 839         MUTEX_EXIT(&rx_freePktQ_lock);
 840         USERPRI;
 841     }
 842 }
 843 #endif /* RX_ENABLE_TSFPQ */
 844
 845 /*
 846  * free continuation buffers off a packet into a queue
 847  *
 848  * [IN] p      -- packet from which continuation buffers will be freed
 849  * [IN] first  -- iovec offset of first continuation buffer to free
 850  * [IN] q      -- queue into which continuation buffers will be chained
 851  *
 852  * returns:
 853  *   number of continuation buffers freed
 854  */
 855 #ifndef RX_ENABLE_TSFPQ
 856 static int
 857 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 858 {
 859     struct iovec *iov;
 860     struct rx_packet * cb;
 861     int count = 0;
 862
 863     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 864         iov = &p->wirevec[first];
 865         if (!iov->iov_base)
 866             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 867         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 868         RX_FPQ_MARK_FREE(cb);
 869         queue_Append(q, cb);
 870     }
 871     p->length = 0;
 872     p->niovecs = 0;
 873
 874     return count;
 875 }
 876 #endif
 877
 878 /*
 879  * free packet continuation buffers into the global free packet pool
 880  *
 881  * [IN] p      -- packet from which to free continuation buffers
 882  * [IN] first  -- iovec offset of first continuation buffer to free
 883  *
 884  * returns:
 885  *   zero always
 886  */
 887 int
 888 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 889 {
 890     struct iovec *iov;
 891
 892     for (first = MAX(2, first); first < p->niovecs; first++) {
 893         iov = &p->wirevec[first];
 894         if (!iov->iov_base)
 895             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 896         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 897     }
 898     p->length = 0;
 899     p->niovecs = 0;
 900
 901     return 0;
 902 }
 903
 904 #ifdef RX_ENABLE_TSFPQ
 905 /*
 906  * free packet continuation buffers into the thread-local free pool
 907  *
 908  * [IN] p             -- packet from which continuation buffers will be freed
 909  * [IN] first         -- iovec offset of first continuation buffer to free
 910  *                       any value less than 2, the min number of iovecs,
 911  *                       is treated as if it is 2.
 912  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 913  *                       global free pool before returning
 914  *
 915  * returns:
 916  *   zero always
 917  */
 918 static int
 919 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 920 {
 921     struct iovec *iov;
 922     struct rx_ts_info_t * rx_ts_info;
 923
 924     RX_TS_INFO_GET(rx_ts_info);
 925
 926     for (first = MAX(2, first); first < p->niovecs; first++) {
 927         iov = &p->wirevec[first];
 928         if (!iov->iov_base)
 929             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 930         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 931     }
 932     p->length = 0;
 933     p->niovecs = 0;
 934
 935     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 936         NETPRI;
 937         MUTEX_ENTER(&rx_freePktQ_lock);
 938
 939         RX_TS_FPQ_LTOG(rx_ts_info);
 940
 941         /* Wakeup anyone waiting for packets */
 942         rxi_PacketsUnWait();
 943
 944         MUTEX_EXIT(&rx_freePktQ_lock);
 945         USERPRI;
 946     }
 947     return 0;
 948 }
 949 #endif /* RX_ENABLE_TSFPQ */
 950
 951 int rxi_nBadIovecs = 0;
 952
 953 /* rxi_RestoreDataBufs
 954  *
 955  * Restore the correct sizes to the iovecs. Called when reusing a packet
 956  * for reading off the wire.
 957  */
 958 void
 959 rxi_RestoreDataBufs(struct rx_packet *p)
 960 {
 961     unsigned int i;
 962     struct iovec *iov = &p->wirevec[2];
 963
 964     RX_PACKET_IOV_INIT(p);
 965
 966     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 967         if (!iov->iov_base) {
 968             rxi_nBadIovecs++;
 969             p->niovecs = i;
 970             break;
 971         }
 972         iov->iov_len = RX_CBUFFERSIZE;
 973     }
 974 }
 975
 976 #ifdef RX_ENABLE_TSFPQ
 977 int
 978 rxi_TrimDataBufs(struct rx_packet *p, int first)
 979 {
 980     int length;
 981     struct iovec *iov, *end;
 982     struct rx_ts_info_t * rx_ts_info;
 983     SPLVAR;
 984
 985     if (first != 1)
 986         osi_Panic("TrimDataBufs 1: first must be 1");
 987
 988     /* Skip over continuation buffers containing message data */
 989     iov = &p->wirevec[2];
 990     end = iov + (p->niovecs - 2);
 991     length = p->length - p->wirevec[1].iov_len;
 992     for (; iov < end && length > 0; iov++) {
 993         if (!iov->iov_base)
 994             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
 995         length -= iov->iov_len;
 996     }
 997
 998     /* iov now points to the first empty data buffer. */
 999     if (iov >= end)
1000         return 0;
1001
1002     RX_TS_INFO_GET(rx_ts_info);
1003     for (; iov < end; iov++) {
1004         if (!iov->iov_base)
1005             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1006         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1007         p->niovecs--;
1008     }
1009     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1010         NETPRI;
1011         MUTEX_ENTER(&rx_freePktQ_lock);
1012
1013         RX_TS_FPQ_LTOG(rx_ts_info);
1014         rxi_PacketsUnWait();
1015
1016         MUTEX_EXIT(&rx_freePktQ_lock);
1017         USERPRI;
1018     }
1019
1020     return 0;
1021 }
1022 #else /* RX_ENABLE_TSFPQ */
1023 int
1024 rxi_TrimDataBufs(struct rx_packet *p, int first)
1025 {
1026     int length;
1027     struct iovec *iov, *end;
1028     SPLVAR;
1029
1030     if (first != 1)
1031         osi_Panic("TrimDataBufs 1: first must be 1");
1032
1033     /* Skip over continuation buffers containing message data */
1034     iov = &p->wirevec[2];
1035     end = iov + (p->niovecs - 2);
1036     length = p->length - p->wirevec[1].iov_len;
1037     for (; iov < end && length > 0; iov++) {
1038         if (!iov->iov_base)
1039             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1040         length -= iov->iov_len;
1041     }
1042
1043     /* iov now points to the first empty data buffer. */
1044     if (iov >= end)
1045         return 0;
1046
1047     NETPRI;
1048     MUTEX_ENTER(&rx_freePktQ_lock);
1049
1050     for (; iov < end; iov++) {
1051         if (!iov->iov_base)
1052             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1053         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1054         p->niovecs--;
1055     }
1056     rxi_PacketsUnWait();
1057
1058     MUTEX_EXIT(&rx_freePktQ_lock);
1059     USERPRI;
1060
1061     return 0;
1062 }
1063 #endif /* RX_ENABLE_TSFPQ */
1064
1065 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1066  * remove it yourself first if you call this routine. */
1067 #ifdef RX_ENABLE_TSFPQ
/*
 * rxi_FreePacket (RX_ENABLE_TSFPQ build)
 *
 * Release packet p: first its data buffers via rxi_FreeDataBufsTSFPQ,
 * then the packet itself via rxi_FreePacketTSFPQ.  No lock is taken in
 * this function.
 */
void
rxi_FreePacket(struct rx_packet *p)
{
    /* NOTE(review): the 2 presumably is the first continuation iovec index
     * and the 0 presumably suppresses a flush to the global queue here,
     * leaving that to the call below - confirm against
     * rxi_FreeDataBufsTSFPQ. */
    rxi_FreeDataBufsTSFPQ(p, 2, 0);
    rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
}
1074 #else /* RX_ENABLE_TSFPQ */
/*
 * rxi_FreePacket (build without RX_ENABLE_TSFPQ)
 *
 * Release packet p and its data buffers while holding rx_freePktQ_lock
 * (bracketed by NETPRI/USERPRI), then wake anyone waiting for packets
 * before dropping the lock.
 */
void
rxi_FreePacket(struct rx_packet *p)
{
    SPLVAR;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    /* NOTE(review): the 2 presumably is the first continuation iovec
     * index - confirm against rxi_FreeDataBufsNoLock. */
    rxi_FreeDataBufsNoLock(p, 2);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
1091 #endif /* RX_ENABLE_TSFPQ */
1092
1093 /* rxi_AllocPacket sets up p->length so it reflects the number of
1094  * bytes in the packet at this point, **not including** the header.
1095  * The header is absolutely necessary, besides, this is the way the
1096  * length field is usually used */
1097 #ifdef RX_ENABLE_TSFPQ
1098 struct rx_packet *
1099 rxi_AllocPacketNoLock(int class)
1100 {
1101     struct rx_packet *p;
1102     struct rx_ts_info_t * rx_ts_info;
1103
1104     RX_TS_INFO_GET(rx_ts_info);
1105
1106 #ifdef KERNEL
1107     if (rxi_OverQuota(class)) {
1108         rxi_NeedMorePackets = TRUE;
1109         if (rx_stats_active) {
1110             switch (class) {
1111             case RX_PACKET_CLASS_RECEIVE:
1112                 rx_atomic_inc(rx_stats.receivePktAllocFailures);
1113                 break;
1114             case RX_PACKET_CLASS_SEND:
1115                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1116                 break;
1117             case RX_PACKET_CLASS_SPECIAL:
1118                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1119                 break;
1120             case RX_PACKET_CLASS_RECV_CBUF:
1121                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1122                 break;
1123             case RX_PACKET_CLASS_SEND_CBUF:
1124                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1125                 break;
1126             }
1127         }
1128         return (struct rx_packet *)0;
1129     }
1130 #endif /* KERNEL */
1131
1132     if (rx_stats_active)
1133         rx_atomic_inc(&rx_stats.packetRequests);
1134     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1135
1136 #ifdef KERNEL
1137         if (queue_IsEmpty(&rx_freePacketQueue))
1138             osi_Panic("rxi_AllocPacket error");
1139 #else /* KERNEL */
1140         if (queue_IsEmpty(&rx_freePacketQueue))
1141             rxi_MorePacketsNoLock(rx_maxSendWindow);
1142 #endif /* KERNEL */
1143
1144
1145         RX_TS_FPQ_GTOL(rx_ts_info);
1146     }
1147
1148     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1149
1150     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1151
1152
1153     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1154      * order to truncate outbound packets.  In the near future, may need
1155      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1156      */
1157     RX_PACKET_IOV_FULLINIT(p);
1158     return p;
1159 }
1160 #else /* RX_ENABLE_TSFPQ */
1161 struct rx_packet *
1162 rxi_AllocPacketNoLock(int class)
1163 {
1164     struct rx_packet *p;
1165
1166 #ifdef KERNEL
1167     if (rxi_OverQuota(class)) {
1168         rxi_NeedMorePackets = TRUE;
1169         if (rx_stats_active) {
1170             switch (class) {
1171             case RX_PACKET_CLASS_RECEIVE:
1172                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1173                 break;
1174             case RX_PACKET_CLASS_SEND:
1175                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1176                 break;
1177             case RX_PACKET_CLASS_SPECIAL:
1178                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1179                 break;
1180             case RX_PACKET_CLASS_RECV_CBUF:
1181                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1182                 break;
1183             case RX_PACKET_CLASS_SEND_CBUF:
1184                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1185                 break;
1186             }
1187         }
1188         return (struct rx_packet *)0;
1189     }
1190 #endif /* KERNEL */
1191
1192     if (rx_stats_active)
1193         rx_atomic_inc(&rx_stats.packetRequests);
1194
1195 #ifdef KERNEL
1196     if (queue_IsEmpty(&rx_freePacketQueue))
1197         osi_Panic("rxi_AllocPacket error");
1198 #else /* KERNEL */
1199     if (queue_IsEmpty(&rx_freePacketQueue))
1200         rxi_MorePacketsNoLock(rx_maxSendWindow);
1201 #endif /* KERNEL */
1202
1203     rx_nFreePackets--;
1204     p = queue_First(&rx_freePacketQueue, rx_packet);
1205     queue_Remove(p);
1206     RX_FPQ_MARK_USED(p);
1207
1208     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1209
1210
1211     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1212      * order to truncate outbound packets.  In the near future, may need
1213      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1214      */
1215     RX_PACKET_IOV_FULLINIT(p);
1216     return p;
1217 }
1218 #endif /* RX_ENABLE_TSFPQ */
1219
1220 #ifdef RX_ENABLE_TSFPQ
1221 struct rx_packet *
1222 rxi_AllocPacketTSFPQ(int class, int pull_global)
1223 {
1224     struct rx_packet *p;
1225     struct rx_ts_info_t * rx_ts_info;
1226
1227     RX_TS_INFO_GET(rx_ts_info);
1228
1229     if (rx_stats_active)
1230         rx_atomic_inc(&rx_stats.packetRequests);
1231     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1232         MUTEX_ENTER(&rx_freePktQ_lock);
1233
1234         if (queue_IsEmpty(&rx_freePacketQueue))
1235             rxi_MorePacketsNoLock(rx_maxSendWindow);
1236
1237         RX_TS_FPQ_GTOL(rx_ts_info);
1238
1239         MUTEX_EXIT(&rx_freePktQ_lock);
1240     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1241         return NULL;
1242     }
1243
1244     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1245
1246     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1247
1248     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1249      * order to truncate outbound packets.  In the near future, may need
1250      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1251      */
1252     RX_PACKET_IOV_FULLINIT(p);
1253     return p;
1254 }
1255 #endif /* RX_ENABLE_TSFPQ */
1256
1257 #ifdef RX_ENABLE_TSFPQ
1258 struct rx_packet *
1259 rxi_AllocPacket(int class)
1260 {
1261     struct rx_packet *p;
1262
1263     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1264     return p;
1265 }
1266 #else /* RX_ENABLE_TSFPQ */
1267 struct rx_packet *
1268 rxi_AllocPacket(int class)
1269 {
1270     struct rx_packet *p;
1271
1272     MUTEX_ENTER(&rx_freePktQ_lock);
1273     p = rxi_AllocPacketNoLock(class);
1274     MUTEX_EXIT(&rx_freePktQ_lock);
1275     return p;
1276 }
1277 #endif /* RX_ENABLE_TSFPQ */
1278
/* This guy comes up with as many buffers as it {takes,can get} given
 * the MTU for this call. It also sets the packet length before
 * returning.  caution: this is often called at NETPRI
 * Called with call locked.
 *
 * call - the call to allocate for; its MTU bounds the packet size and
 *        its connection supplies the security header/trailer sizes.
 * want - number of payload bytes the caller would like room for.
 *
 * Returns a packet whose length has had the security overhead (delta)
 * subtracted, or NULL if the call has an error set or the allocated
 * packet cannot hold more than the security overhead.
 *
 * When no packet is available, call->lock is dropped while sleeping and
 * re-acquired afterwards; a reference on the call is held across the
 * sleep.
 */
struct rx_packet *
rxi_AllocSendPacket(struct rx_call *call, int want)
{
    struct rx_packet *p = (struct rx_packet *)0;
    int mud;
    unsigned delta;

    SPLVAR;
    /* mud: largest data length that fits in the call's MTU after the
     * rx header. */
    mud = call->MTU - RX_HEADER_SIZE;
    /* delta: bytes the security layer needs for its header and trailer. */
    delta =
        rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
        rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));

#ifdef RX_ENABLE_TSFPQ
    /* Fast path: try the thread-local queue only (pull_global == 0).  On
     * success the result is returned directly, without the wait loop. */
    if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
        want += delta;
        want = MIN(want, mud);

        /* Result deliberately ignored: a shortfall is handled by using
         * whatever length the packet ended up with. */
        if ((unsigned)want > p->length)
            (void)rxi_AllocDataBuf(p, (want - p->length),
                                   RX_PACKET_CLASS_SEND_CBUF);

        if (p->length > mud)
            p->length = mud;

        /* No room left for user data once security overhead is
         * reserved: give the packet back. */
        if (delta >= p->length) {
            rxi_FreePacket(p);
            p = NULL;
        } else {
            p->length -= delta;
        }
        return p;
    }
#endif /* RX_ENABLE_TSFPQ */

    while (!(call->error)) {
        MUTEX_ENTER(&rx_freePktQ_lock);
        /* if an error occurred, or we get the packet we want, we're done */
        if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
            MUTEX_EXIT(&rx_freePktQ_lock);

            want += delta;
            want = MIN(want, mud);

            if ((unsigned)want > p->length)
                (void)rxi_AllocDataBuf(p, (want - p->length),
                                       RX_PACKET_CLASS_SEND_CBUF);

            if (p->length > mud)
                p->length = mud;

            if (delta >= p->length) {
                rxi_FreePacket(p);
                p = NULL;
            } else {
                p->length -= delta;
            }
            break;
        }

        /* no error occurred, and we didn't get a packet, so we sleep.
         * At this point, we assume that packets will be returned
         * sooner or later, as packets are acknowledged, and so we
         * just wait.  */
        NETPRI;
        call->flags |= RX_CALL_WAIT_PACKETS;
        /* Hold a reference on the call while its lock is released. */
        MUTEX_ENTER(&rx_refcnt_mutex);
        CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
        MUTEX_EXIT(&rx_refcnt_mutex);
        MUTEX_EXIT(&call->lock);
        rx_waitingForPackets = 1;

#ifdef  RX_ENABLE_LOCKS
        /* rx_freePktQ_lock is still held here and is passed to the wait. */
        CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
#else
        osi_rxSleep(&rx_waitingForPackets);
#endif
        /* Release the queue lock before taking call->lock again. */
        MUTEX_EXIT(&rx_freePktQ_lock);
        MUTEX_ENTER(&call->lock);
        MUTEX_ENTER(&rx_refcnt_mutex);
        CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
        MUTEX_EXIT(&rx_refcnt_mutex);
        call->flags &= ~RX_CALL_WAIT_PACKETS;
        USERPRI;
    }

    return p;
}
1372
1373 #ifndef KERNEL
1374 #ifdef AFS_NT40_ENV
1375 /* Windows does not use file descriptors. */
1376 #define CountFDs(amax) 0
1377 #else
/* Count how many of the descriptors 0 .. amax-1 are currently in use.
 * A descriptor is considered in use when fstat() succeeds on it. */
static int
CountFDs(int amax)
{
    struct stat sb;
    int inUse = 0;
    int fd = 0;

    while (fd < amax) {
        if (fstat(fd, &sb) == 0) {
            inUse++;
        }
        fd++;
    }

    return inUse;
}
1394 #endif /* AFS_NT40_ENV */
1395 #else /* KERNEL */
1396
1397 #define CountFDs(amax) amax
1398
1399 #endif /* KERNEL */
1400
1401 #if !defined(KERNEL) || defined(UKERNEL)
1402
1403 /* This function reads a single packet from the interface into the
1404  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1405  * (host,port) of the sender are stored in the supplied variables, and
1406  * the data length of the packet is stored in the packet structure.
1407  * The header is decoded. */
1408 int
1409 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1410                u_short * port)
1411 {
1412     struct sockaddr_in from;
1413     unsigned int nbytes;
1414     afs_int32 rlen;
1415     afs_uint32 tlen, savelen;
1416     struct msghdr msg;
1417     rx_computelen(p, tlen);
1418     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1419
1420     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1421     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1422                                  * it once in order to avoid races.  */
1423     tlen = rlen - tlen;
1424     if (tlen > 0) {
1425         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1426         if (tlen > 0) {
1427             tlen = rlen - tlen;
1428         } else
1429             tlen = rlen;
1430     } else
1431         tlen = rlen;
1432
1433     /* Extend the last iovec for padding, it's just to make sure that the
1434      * read doesn't return more data than we expect, and is done to get around
1435      * our problems caused by the lack of a length field in the rx header.
1436      * Use the extra buffer that follows the localdata in each packet
1437      * structure. */
1438     savelen = p->wirevec[p->niovecs - 1].iov_len;
1439     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1440
1441     memset(&msg, 0, sizeof(msg));
1442     msg.msg_name = (char *)&from;
1443     msg.msg_namelen = sizeof(struct sockaddr_in);
1444     msg.msg_iov = p->wirevec;
1445     msg.msg_iovlen = p->niovecs;
1446     nbytes = rxi_Recvmsg(socket, &msg, 0);
1447
1448     /* restore the vec to its correct state */
1449     p->wirevec[p->niovecs - 1].iov_len = savelen;
1450
1451     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1452     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1453         if (nbytes < 0 && errno == EWOULDBLOCK) {
1454             if (rx_stats_active)
1455                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1456         } else if (nbytes <= 0) {
1457             if (rx_stats_active) {
1458                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1459                 rx_stats.bogusHost = from.sin_addr.s_addr;
1460             }
1461             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1462                  ntohs(from.sin_port), nbytes));
1463         }
1464         return 0;
1465     }
1466 #ifdef RXDEBUG
1467     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1468                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1469         rxi_DecodePacketHeader(p);
1470
1471         *host = from.sin_addr.s_addr;
1472         *port = from.sin_port;
1473
1474         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1475               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1476               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1477               p->length));
1478 #ifdef RX_TRIMDATABUFS
1479         rxi_TrimDataBufs(p, 1);
1480 #endif
1481         return 0;
1482     }
1483 #endif
1484     else {
1485         /* Extract packet header. */
1486         rxi_DecodePacketHeader(p);
1487
1488         *host = from.sin_addr.s_addr;
1489         *port = from.sin_port;
1490         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1491             if (rx_stats_active) {
1492                 struct rx_peer *peer;
1493                 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1494                 /*
1495                  * Try to look up this peer structure.  If it doesn't exist,
1496                  * don't create a new one -
1497                  * we don't keep count of the bytes sent/received if a peer
1498                  * structure doesn't already exist.
1499                  *
1500                  * The peer/connection cleanup code assumes that there is 1 peer
1501                  * per connection.  If we actually created a peer structure here
1502                  * and this packet was an rxdebug packet, the peer structure would
1503                  * never be cleaned up.
1504                  */
1505                 peer = rxi_FindPeer(*host, *port, 0, 0);
1506                 /* Since this may not be associated with a connection,
1507                  * it may have no refCount, meaning we could race with
1508                  * ReapConnections
1509                  */
1510                 if (peer && (peer->refCount > 0)) {
1511                     MUTEX_ENTER(&peer->peer_lock);
1512                     hadd32(peer->bytesReceived, p->length);
1513                     MUTEX_EXIT(&peer->peer_lock);
1514                 }
1515             }
1516         }
1517
1518 #ifdef RX_TRIMDATABUFS
1519         /* Free any empty packet buffers at the end of this packet */
1520         rxi_TrimDataBufs(p, 1);
1521 #endif
1522         return 1;
1523     }
1524 }
1525
1526 #endif /* !KERNEL || UKERNEL */
1527
1528 /* This function splits off the first packet in a jumbo packet.
1529  * As of AFS 3.5, jumbograms contain more than one fixed size
1530  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1531  * last packet header. All packets (except the last) are padded to
1532  * fall on RX_CBUFFERSIZE boundaries.
1533  * HACK: We store the length of the first n-1 packets in the
1534  * last two pad bytes. */
1535
1536 struct rx_packet *
1537 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1538                      int first)
1539 {
1540     struct rx_packet *np;
1541     struct rx_jumboHeader *jp;
1542     int niov, i;
1543     struct iovec *iov;
1544     int length;
1545     afs_uint32 temp;
1546
1547     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1548      * bytes in length. All but the first packet are preceded by
1549      * an abbreviated four byte header. The length of the last packet
1550      * is calculated from the size of the jumbogram. */
1551     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1552
1553     if ((int)p->length < length) {
1554         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1555         return NULL;
1556     }
1557     niov = p->niovecs - 2;
1558     if (niov < 1) {
1559         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1560         return NULL;
1561     }
1562     iov = &p->wirevec[2];
1563     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1564
1565     /* Get a pointer to the abbreviated packet header */
1566     jp = (struct rx_jumboHeader *)
1567         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1568
1569     /* Set up the iovecs for the next packet */
1570     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1571     np->wirevec[0].iov_len = sizeof(struct rx_header);
1572     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1573     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1574     np->niovecs = niov + 1;
1575     for (i = 2, iov++; i <= niov; i++, iov++) {
1576         np->wirevec[i] = *iov;
1577     }
1578     np->length = p->length - length;
1579     p->length = RX_JUMBOBUFFERSIZE;
1580     p->niovecs = 2;
1581
1582     /* Convert the jumbo packet header to host byte order */
1583     temp = ntohl(*(afs_uint32 *) jp);
1584     jp->flags = (u_char) (temp >> 24);
1585     jp->cksum = (u_short) (temp);
1586
1587     /* Fill in the packet header */
1588     np->header = p->header;
1589     np->header.serial = p->header.serial + 1;
1590     np->header.seq = p->header.seq + 1;
1591     np->header.flags = jp->flags;
1592     np->header.spare = jp->cksum;
1593
1594     return np;
1595 }
1596
1597 #ifndef KERNEL
1598 /* Send a udp datagram */
1599 int
1600 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1601             int length, int istack)
1602 {
1603     struct msghdr msg;
1604         int ret;
1605
1606     memset(&msg, 0, sizeof(msg));
1607     msg.msg_iov = dvec;
1608     msg.msg_iovlen = nvecs;
1609     msg.msg_name = addr;
1610     msg.msg_namelen = sizeof(struct sockaddr_in);
1611
1612     ret = rxi_Sendmsg(socket, &msg, 0);
1613
1614     return ret;
1615 }
1616 #elif !defined(UKERNEL)
1617 /*
1618  * message receipt is done in rxk_input or rx_put.
1619  */
1620
1621 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1622 /*
1623  * Copy an mblock to the contiguous area pointed to by cp.
1624  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1625  * but it doesn't really.
1626  * Returns the number of bytes not transferred.
1627  * The message is NOT changed.
1628  */
1629 static int
1630 cpytoc(mblk_t * mp, int off, int len, char *cp)
1631 {
1632     int n;
1633
1634     for (; mp && len > 0; mp = mp->b_cont) {
1635         if (mp->b_datap->db_type != M_DATA) {
1636             return -1;
1637         }
1638         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1639         memcpy(cp, (char *)mp->b_rptr, n);
1640         cp += n;
1641         len -= n;
1642         mp->b_rptr += n;
1643     }
1644     return (len);
1645 }
1646
1647 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1648  * but it doesn't really.
1649  * This sucks, anyway, do it like m_cpy.... below
1650  */
1651 static int
1652 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1653            int niovs)
1654 {
1655     int m, n, o, t, i;
1656
1657     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1658         if (mp->b_datap->db_type != M_DATA) {
1659             return -1;
1660         }
1661         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1662         len -= n;
1663         while (n) {
1664             if (!t) {
1665                 o = 0;
1666                 i++;
1667                 t = iovs[i].iov_len;
1668             }
1669             m = MIN(n, t);
1670             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1671             mp->b_rptr += m;
1672             o += m;
1673             t -= m;
1674             n -= m;
1675         }
1676     }
1677     return (len);
1678 }
1679
1680 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1681 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1682 #else
1683 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1684 static int
1685 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1686 {
1687     caddr_t p1, p2;
1688     unsigned int l1, l2, i, t;
1689
1690     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1691         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1692
1693     while (off && m)
1694         if (m->m_len <= off) {
1695             off -= m->m_len;
1696             m = m->m_next;
1697             continue;
1698         } else
1699             break;
1700
1701     if (m == NULL)
1702         return len;
1703
1704     p1 = mtod(m, caddr_t) + off;
1705     l1 = m->m_len - off;
1706     i = 0;
1707     p2 = iovs[0].iov_base;
1708     l2 = iovs[0].iov_len;
1709
1710     while (len) {
1711         t = MIN(l1, MIN(l2, (unsigned int)len));
1712         memcpy(p2, p1, t);
1713         p1 += t;
1714         p2 += t;
1715         l1 -= t;
1716         l2 -= t;
1717         len -= t;
1718         if (!l1) {
1719             m = m->m_next;
1720             if (!m)
1721                 break;
1722             p1 = mtod(m, caddr_t);
1723             l1 = m->m_len;
1724         }
1725         if (!l2) {
1726             if (++i >= niovs)
1727                 break;
1728             p2 = iovs[i].iov_base;
1729             l2 = iovs[i].iov_len;
1730         }
1731
1732     }
1733
1734     return len;
1735 }
1736 #endif /* LINUX */
1737 #endif /* AFS_SUN5_ENV */
1738
1739 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
/*
 * rx_mb_to_packet
 *
 * Copy data from the kernel message buffer amb into the iovecs of packet
 * phandle using m_cpytoiovec, then hand amb to the supplied free routine.
 *
 * amb      - source buffer chain (mblk_t on SUN5/HPUX110, struct mbuf
 *            otherwise).
 * free     - callback invoked on amb once the copy attempt is done; it is
 *            called whether or not the copy succeeded.
 * hdr_len  - passed to m_cpytoiovec as the offset to start copying at
 *            (on SUN5/HPUX110 m_cpytoiovec maps to cpytoiovec, which
 *            ignores it).
 * data_len - number of bytes to copy.
 * phandle  - destination packet; its wirevec/niovecs describe the space.
 *
 * Returns the value of m_cpytoiovec.
 *
 * The NBSD build uses an ANSI prototype; all other builds use the
 * old-style (K&R) parameter declarations below.
 */
#if defined(AFS_NBSD_ENV)
int
rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
#else
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
#endif /* AFS_NBSD_ENV */
{
    int code;

    code =
        m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
                     phandle->niovecs);
    /* The buffer is consumed here regardless of the copy result. */
    (*free) (amb);

    return code;
}
1765 #endif /* LINUX */
1766 #endif /*KERNEL && !UKERNEL */
1767
1768
1769 /* send a response to a debug packet */
1770
1771 struct rx_packet *
1772 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1773                        afs_uint32 ahost, short aport, int istack)
1774 {
1775     struct rx_debugIn tin;
1776     afs_int32 tl;
1777     struct rx_serverQueueEntry *np, *nqe;
1778
1779     /*
1780      * Only respond to client-initiated Rx debug packets,
1781      * and clear the client flag in the response.
1782      */
1783     if (ap->header.flags & RX_CLIENT_INITIATED) {
1784         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1785         rxi_EncodePacketHeader(ap);
1786     } else {
1787         return ap;
1788     }
1789
1790     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1791     /* all done with packet, now set length to the truth, so we can
1792      * reuse this packet */
1793     rx_computelen(ap, ap->length);
1794
1795     tin.type = ntohl(tin.type);
1796     tin.index = ntohl(tin.index);
1797     switch (tin.type) {
1798     case RX_DEBUGI_GETSTATS:{
1799             struct rx_debugStats tstat;
1800
1801             /* get basic stats */
1802             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1803             tstat.version = RX_DEBUGI_VERSION;
1804 #ifndef RX_ENABLE_LOCKS
1805             tstat.waitingForPackets = rx_waitingForPackets;
1806 #endif
1807             MUTEX_ENTER(&rx_serverPool_lock);
1808             tstat.nFreePackets = htonl(rx_nFreePackets);
1809             tstat.nPackets = htonl(rx_nPackets);
1810             tstat.callsExecuted = htonl(rxi_nCalls);
1811             tstat.packetReclaims = htonl(rx_packetReclaims);
1812             tstat.usedFDs = CountFDs(64);
1813             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1814             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1815             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1816                         tstat.idleThreads);
1817             MUTEX_EXIT(&rx_serverPool_lock);
1818             tstat.idleThreads = htonl(tstat.idleThreads);
1819             tl = sizeof(struct rx_debugStats) - ap->length;
1820             if (tl > 0)
1821                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1822
1823             if (tl <= 0) {
1824                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1825                                (char *)&tstat);
1826                 ap->length = sizeof(struct rx_debugStats);
1827                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1828                 rx_computelen(ap, ap->length);
1829             }
1830             break;
1831         }
1832
1833     case RX_DEBUGI_GETALLCONN:
1834     case RX_DEBUGI_GETCONN:{
1835             unsigned int i, j;
1836             struct rx_connection *tc;
1837             struct rx_call *tcall;
1838             struct rx_debugConn tconn;
1839             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1840
1841
1842             tl = sizeof(struct rx_debugConn) - ap->length;
1843             if (tl > 0)
1844                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1845             if (tl > 0)
1846                 return ap;
1847
1848             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1849             /* get N'th (maybe) "interesting" connection info */
1850             for (i = 0; i < rx_hashTableSize; i++) {
1851 #if !defined(KERNEL)
1852                 /* the time complexity of the algorithm used here
                 * exponentially increases with the number of connections.
1854                  */
1855 #ifdef AFS_PTHREAD_ENV
1856                 pthread_yield();
1857 #else
1858                 (void)IOMGR_Poll();
1859 #endif
1860 #endif
1861                 MUTEX_ENTER(&rx_connHashTable_lock);
1862                 /* We might be slightly out of step since we are not
1863                  * locking each call, but this is only debugging output.
1864                  */
1865                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1866                     if ((all || rxi_IsConnInteresting(tc))
1867                         && tin.index-- <= 0) {
1868                         tconn.host = tc->peer->host;
1869                         tconn.port = tc->peer->port;
1870                         tconn.cid = htonl(tc->cid);
1871                         tconn.epoch = htonl(tc->epoch);
1872                         tconn.serial = htonl(tc->serial);
1873                         for (j = 0; j < RX_MAXCALLS; j++) {
1874                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1875                             if ((tcall = tc->call[j])) {
1876                                 tconn.callState[j] = tcall->state;
1877                                 tconn.callMode[j] = tcall->mode;
1878                                 tconn.callFlags[j] = tcall->flags;
1879                                 if (queue_IsNotEmpty(&tcall->rq))
1880                                     tconn.callOther[j] |= RX_OTHER_IN;
1881                                 if (queue_IsNotEmpty(&tcall->tq))
1882                                     tconn.callOther[j] |= RX_OTHER_OUT;
1883                             } else
1884                                 tconn.callState[j] = RX_STATE_NOTINIT;
1885                         }
1886
1887                         tconn.natMTU = htonl(tc->peer->natMTU);
1888                         tconn.error = htonl(tc->error);
1889                         tconn.flags = tc->flags;
1890                         tconn.type = tc->type;
1891                         tconn.securityIndex = tc->securityIndex;
1892                         if (tc->securityObject) {
1893                             RXS_GetStats(tc->securityObject, tc,
1894                                          &tconn.secStats);
1895 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1896 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1897                             DOHTONL(flags);
1898                             DOHTONL(expires);
1899                             DOHTONL(packetsReceived);
1900                             DOHTONL(packetsSent);
1901                             DOHTONL(bytesReceived);
1902                             DOHTONL(bytesSent);
1903                             for (i = 0;
1904                                  i <
1905                                  sizeof(tconn.secStats.spares) /
1906                                  sizeof(short); i++)
1907                                 DOHTONS(spares[i]);
1908                             for (i = 0;
1909                                  i <
1910                                  sizeof(tconn.secStats.sparel) /
1911                                  sizeof(afs_int32); i++)
1912                                 DOHTONL(sparel[i]);
1913                         }
1914
1915                         MUTEX_EXIT(&rx_connHashTable_lock);
1916                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1917                                        (char *)&tconn);
1918                         tl = ap->length;
1919                         ap->length = sizeof(struct rx_debugConn);
1920                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1921                                             istack);
1922                         ap->length = tl;
1923                         return ap;
1924                     }
1925                 }
1926                 MUTEX_EXIT(&rx_connHashTable_lock);
1927             }
1928             /* if we make it here, there are no interesting packets */
1929             tconn.cid = htonl(0xffffffff);      /* means end */
1930             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1931                            (char *)&tconn);
1932             tl = ap->length;
1933             ap->length = sizeof(struct rx_debugConn);
1934             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1935             ap->length = tl;
1936             break;
1937         }
1938
1939         /*
1940          * Pass back all the peer structures we have available
1941          */
1942
1943     case RX_DEBUGI_GETPEER:{
1944             unsigned int i;
1945             struct rx_peer *tp;
1946             struct rx_debugPeer tpeer;
1947
1948
1949             tl = sizeof(struct rx_debugPeer) - ap->length;
1950             if (tl > 0)
1951                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1952             if (tl > 0)
1953                 return ap;
1954
1955             memset(&tpeer, 0, sizeof(tpeer));
1956             for (i = 0; i < rx_hashTableSize; i++) {
1957 #if !defined(KERNEL)
                /* the time complexity of the algorithm used here
                 * exponentially increases with the number of peers.
                 *
                 * Yielding after processing each hash table entry
                 * and dropping rx_peerHashTable_lock
                 * also increases the risk that we will miss a new
                 * entry - but we are willing to live with this
                 * limitation since this is meant for debugging only
                 */
1967 #ifdef AFS_PTHREAD_ENV
1968                 pthread_yield();
1969 #else
1970                 (void)IOMGR_Poll();
1971 #endif
1972 #endif
1973                 MUTEX_ENTER(&rx_peerHashTable_lock);
1974                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1975                     if (tin.index-- <= 0) {
1976                         tp->refCount++;
1977                         MUTEX_EXIT(&rx_peerHashTable_lock);
1978
1979                         MUTEX_ENTER(&tp->peer_lock);
1980                         tpeer.host = tp->host;
1981                         tpeer.port = tp->port;
1982                         tpeer.ifMTU = htons(tp->ifMTU);
1983                         tpeer.idleWhen = htonl(tp->idleWhen);
1984                         tpeer.refCount = htons(tp->refCount);
1985                         tpeer.burstSize = tp->burstSize;
1986                         tpeer.burst = tp->burst;
1987                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1988                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1989                         tpeer.rtt = htonl(tp->rtt);
1990                         tpeer.rtt_dev = htonl(tp->rtt_dev);
1991                         tpeer.nSent = htonl(tp->nSent);
1992                         tpeer.reSends = htonl(tp->reSends);
1993                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
1994                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
1995                         tpeer.natMTU = htons(tp->natMTU);
1996                         tpeer.maxMTU = htons(tp->maxMTU);
1997                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1998                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1999                         tpeer.MTU = htons(tp->MTU);
2000                         tpeer.cwind = htons(tp->cwind);
2001                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2002                         tpeer.congestSeq = htons(tp->congestSeq);
2003                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2004                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2005                         tpeer.bytesReceived.high =
2006                             htonl(tp->bytesReceived.high);
2007                         tpeer.bytesReceived.low =
2008                             htonl(tp->bytesReceived.low);
2009                         MUTEX_EXIT(&tp->peer_lock);
2010
2011                         MUTEX_ENTER(&rx_peerHashTable_lock);
2012                         tp->refCount--;
2013                         MUTEX_EXIT(&rx_peerHashTable_lock);
2014
2015                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2016                                        (char *)&tpeer);
2017                         tl = ap->length;
2018                         ap->length = sizeof(struct rx_debugPeer);
2019                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2020                                             istack);
2021                         ap->length = tl;
2022                         return ap;
2023                     }
2024                 }
2025                 MUTEX_EXIT(&rx_peerHashTable_lock);
2026             }
2027             /* if we make it here, there are no interesting packets */
2028             tpeer.host = htonl(0xffffffff);     /* means end */
2029             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2030                            (char *)&tpeer);
2031             tl = ap->length;
2032             ap->length = sizeof(struct rx_debugPeer);
2033             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2034             ap->length = tl;
2035             break;
2036         }
2037
2038     case RX_DEBUGI_RXSTATS:{
2039             int i;
2040             afs_int32 *s;
2041
2042             tl = sizeof(rx_stats) - ap->length;
2043             if (tl > 0)
2044                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2045             if (tl > 0)
2046                 return ap;
2047
            /* Since it's all int32s, convert to network order with a loop. */
2049         if (rx_stats_active)
2050             MUTEX_ENTER(&rx_stats_mutex);
2051             s = (afs_int32 *) & rx_stats;
2052             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2053                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2054
2055             tl = ap->length;
2056             ap->length = sizeof(rx_stats);
2057         if (rx_stats_active)
2058             MUTEX_EXIT(&rx_stats_mutex);
2059             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2060             ap->length = tl;
2061             break;
2062         }
2063
2064     default:
2065         /* error response packet */
2066         tin.type = htonl(RX_DEBUGI_BADTYPE);
2067         tin.index = tin.type;
2068         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2069         tl = ap->length;
2070         ap->length = sizeof(struct rx_debugIn);
2071         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2072         ap->length = tl;
2073         break;
2074     }
2075     return ap;
2076 }
2077
2078 struct rx_packet *
2079 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2080                          afs_uint32 ahost, short aport, int istack)
2081 {
2082     afs_int32 tl;
2083
2084     /*
2085      * Only respond to client-initiated version requests, and
2086      * clear that flag in the response.
2087      */
2088     if (ap->header.flags & RX_CLIENT_INITIATED) {
2089         char buf[66];
2090
2091         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2092         rxi_EncodePacketHeader(ap);
2093         memset(buf, 0, sizeof(buf));
2094         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2095         rx_packetwrite(ap, 0, 65, buf);
2096         tl = ap->length;
2097         ap->length = 65;
2098         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2099         ap->length = tl;
2100     }
2101
2102     return ap;
2103 }
2104
2105
2106 /* send a debug packet back to the sender */
2107 static void
2108 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2109                     afs_uint32 ahost, short aport, afs_int32 istack)
2110 {
2111     struct sockaddr_in taddr;
2112     unsigned int i, nbytes, savelen = 0;
2113     int saven = 0;
2114 #ifdef KERNEL
2115     int waslocked = ISAFS_GLOCK();
2116 #endif
2117
2118     taddr.sin_family = AF_INET;
2119     taddr.sin_port = aport;
2120     taddr.sin_addr.s_addr = ahost;
2121 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2122     taddr.sin_len = sizeof(struct sockaddr_in);
2123 #endif
2124
2125     /* We need to trim the niovecs. */
2126     nbytes = apacket->length;
2127     for (i = 1; i < apacket->niovecs; i++) {
2128         if (nbytes <= apacket->wirevec[i].iov_len) {
2129             savelen = apacket->wirevec[i].iov_len;
2130             saven = apacket->niovecs;
2131             apacket->wirevec[i].iov_len = nbytes;
2132             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2133         } else
2134             nbytes -= apacket->wirevec[i].iov_len;
2135     }
2136 #ifdef KERNEL
2137 #ifdef RX_KERNEL_TRACE
2138     if (ICL_SETACTIVE(afs_iclSetp)) {
2139         if (!waslocked)
2140             AFS_GLOCK();
2141         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2142                    "before osi_NetSend()");
2143         AFS_GUNLOCK();
2144     } else
2145 #else
2146     if (waslocked)
2147         AFS_GUNLOCK();
2148 #endif
2149 #endif
2150     /* debug packets are not reliably delivered, hence the cast below. */
2151     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2152                       apacket->length + RX_HEADER_SIZE, istack);
2153 #ifdef KERNEL
2154 #ifdef RX_KERNEL_TRACE
2155     if (ICL_SETACTIVE(afs_iclSetp)) {
2156         AFS_GLOCK();
2157         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2158                    "after osi_NetSend()");
2159         if (!waslocked)
2160             AFS_GUNLOCK();
2161     } else
2162 #else
2163     if (waslocked)
2164         AFS_GLOCK();
2165 #endif
2166 #endif
2167     if (saven) {                /* means we truncated the packet above. */
2168         apacket->wirevec[i - 1].iov_len = savelen;
2169         apacket->niovecs = saven;
2170     }
2171
2172 }
2173
2174 /* Send the packet to appropriate destination for the specified
2175  * call.  The header is first encoded and placed in the packet.
2176  */
2177 void
2178 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2179                struct rx_packet *p, int istack)
2180 {
2181 #if defined(KERNEL)
2182     int waslocked;
2183 #endif
2184     int code;
2185     struct sockaddr_in addr;
2186     struct rx_peer *peer = conn->peer;
2187     osi_socket socket;
2188 #ifdef RXDEBUG
2189     char deliveryType = 'S';
2190 #endif
2191     /* The address we're sending the packet to */
2192     memset(&addr, 0, sizeof(addr));
2193     addr.sin_family = AF_INET;
2194     addr.sin_port = peer->port;
2195     addr.sin_addr.s_addr = peer->host;
2196
2197     /* This stuff should be revamped, I think, so that most, if not
2198      * all, of the header stuff is always added here.  We could
2199      * probably do away with the encode/decode routines. XXXXX */
2200
2201     /* Stamp each packet with a unique serial number.  The serial
2202      * number is maintained on a connection basis because some types
2203      * of security may be based on the serial number of the packet,
2204      * and security is handled on a per authenticated-connection
2205      * basis. */
2206     /* Pre-increment, to guarantee no zero serial number; a zero
2207      * serial number means the packet was never sent. */
2208     MUTEX_ENTER(&conn->conn_data_lock);
2209     p->header.serial = ++conn->serial;
2210     if (p->length > conn->peer->maxPacketSize) {
2211         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2212             (p->header.flags & RX_REQUEST_ACK)) {
2213             conn->lastPingSize = p->length;
2214             conn->lastPingSizeSer = p->header.serial;
2215         } else if (p->header.seq != 0) {
2216             conn->lastPacketSize = p->length;
2217             conn->lastPacketSizeSeq = p->header.seq;
2218         }
2219     }
2220     MUTEX_EXIT(&conn->conn_data_lock);
2221     /* This is so we can adjust retransmit time-outs better in the face of
2222      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2223      */
2224     if (p->firstSerial == 0) {
2225         p->firstSerial = p->header.serial;
2226     }
2227 #ifdef RXDEBUG
2228     /* If an output tracer function is defined, call it with the packet and
2229      * network address.  Note this function may modify its arguments. */
2230     if (rx_almostSent) {
2231         int drop = (*rx_almostSent) (p, &addr);
2232         /* drop packet if return value is non-zero? */
2233         if (drop)
2234             deliveryType = 'D'; /* Drop the packet */
2235     }
2236 #endif
2237
2238     /* Get network byte order header */
2239     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2240                                  * touch ALL the fields */
2241
2242     /* Send the packet out on the same socket that related packets are being
2243      * received on */
2244     socket =
2245         (conn->type ==
2246          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2247
2248 #ifdef RXDEBUG
2249     /* Possibly drop this packet,  for testing purposes */
2250     if ((deliveryType == 'D')
2251         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2252             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2253         deliveryType = 'D';     /* Drop the packet */
2254     } else {
2255         deliveryType = 'S';     /* Send the packet */
2256 #endif /* RXDEBUG */
2257
2258         /* Loop until the packet is sent.  We'd prefer just to use a
2259          * blocking socket, but unfortunately the interface doesn't
2260          * allow us to have the socket block in send mode, and not
2261          * block in receive mode */
2262 #ifdef KERNEL
2263         waslocked = ISAFS_GLOCK();
2264 #ifdef RX_KERNEL_TRACE
2265         if (ICL_SETACTIVE(afs_iclSetp)) {
2266             if (!waslocked)
2267                 AFS_GLOCK();
2268             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2269                        "before osi_NetSend()");
2270             AFS_GUNLOCK();
2271         } else
2272 #else
2273         if (waslocked)
2274             AFS_GUNLOCK();
2275 #endif
2276 #endif
2277         if ((code =
2278              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2279                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2280             /* send failed, so let's hurry up the resend, eh? */
2281             if (rx_stats_active)
2282                 rx_atomic_inc(&rx_stats.netSendFailures);
2283             p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2284
2285             /* Some systems are nice and tell us right away that we cannot
2286              * reach this recipient by returning an error code.
2287              * So, when this happens let's "down" the host NOW so
2288              * we don't sit around waiting for this host to timeout later.
2289              */
2290             if (call &&
2291 #ifdef AFS_NT40_ENV
2292                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2293 #elif defined(AFS_LINUX20_ENV)
2294                 code == -ENETUNREACH
2295 #elif defined(AFS_DARWIN_ENV)
2296                 code == EHOSTUNREACH
2297 #else
2298                 0
2299 #endif
2300                 )
2301                 call->lastReceiveTime = 0;
2302         }
2303 #ifdef KERNEL
2304 #ifdef RX_KERNEL_TRACE
2305         if (ICL_SETACTIVE(afs_iclSetp)) {
2306             AFS_GLOCK();
2307             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2308                        "after osi_NetSend()");
2309             if (!waslocked)
2310                 AFS_GUNLOCK();
2311         } else
2312 #else
2313         if (waslocked)
2314             AFS_GLOCK();
2315 #endif
2316 #endif
2317 #ifdef RXDEBUG
2318     }
2319     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2320           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2321           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2322           p->header.seq, p->header.flags, p, p->length));
2323 #endif
2324     if (rx_stats_active) {
2325         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2326         MUTEX_ENTER(&peer->peer_lock);
2327         hadd32(peer->bytesSent, p->length);
2328         MUTEX_EXIT(&peer->peer_lock);
2329     }
2330 }
2331
2332 /* Send a list of packets to appropriate destination for the specified
2333  * connection.  The headers are first encoded and placed in the packets.
2334  */
2335 void
2336 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2337                    struct rx_packet **list, int len, int istack)
2338 {
2339 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2340     int waslocked;
2341 #endif
2342     struct sockaddr_in addr;
2343     struct rx_peer *peer = conn->peer;
2344     osi_socket socket;
2345     struct rx_packet *p = NULL;
2346     struct iovec wirevec[RX_MAXIOVECS];
2347     int i, length, code;
2348     afs_uint32 serial;
2349     afs_uint32 temp;
2350     struct rx_jumboHeader *jp;
2351 #ifdef RXDEBUG
2352     char deliveryType = 'S';
2353 #endif
2354     /* The address we're sending the packet to */
2355     addr.sin_family = AF_INET;
2356     addr.sin_port = peer->port;
2357     addr.sin_addr.s_addr = peer->host;
2358
2359     if (len + 1 > RX_MAXIOVECS) {
2360         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2361     }
2362
2363     /*
2364      * Stamp the packets in this jumbogram with consecutive serial numbers
2365      */
2366     MUTEX_ENTER(&conn->conn_data_lock);
2367     serial = conn->serial;
2368     conn->serial += len;
2369     for (i = 0; i < len; i++) {
2370         p = list[i];
2371         if (p->length > conn->peer->maxPacketSize) {
2372             /* a ping *or* a sequenced packet can count */
2373             if ((p->length > conn->peer->maxPacketSize)) {
2374                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2375                      (p->header.flags & RX_REQUEST_ACK)) &&
2376                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2377                     conn->lastPingSize = p->length;
2378                     conn->lastPingSizeSer = serial + i;
2379                 } else if ((p->header.seq != 0) &&
2380                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2381                     conn->lastPacketSize = p->length;
2382                     conn->lastPacketSizeSeq = p->header.seq;
2383                 }
2384             }
2385         }
2386     }
2387     MUTEX_EXIT(&conn->conn_data_lock);
2388
2389
2390     /* This stuff should be revamped, I think, so that most, if not
2391      * all, of the header stuff is always added here.  We could
2392      * probably do away with the encode/decode routines. XXXXX */
2393
2394     jp = NULL;
2395     length = RX_HEADER_SIZE;
2396     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2397     wirevec[0].iov_len = RX_HEADER_SIZE;
2398     for (i = 0; i < len; i++) {
2399         p = list[i];
2400
2401         /* The whole 3.5 jumbogram scheme relies on packets fitting
2402          * in a single packet buffer. */
2403         if (p->niovecs > 2) {
2404             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2405         }
2406
2407         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2408          * in this chunk.  */
2409         if (i < len - 1) {
2410             if (p->length != RX_JUMBOBUFFERSIZE) {
2411                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2412             }
2413             p->header.flags |= RX_JUMBO_PACKET;
2414             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2415             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2416         } else {
2417             wirevec[i + 1].iov_len = p->length;
2418             length += p->length;
2419         }
2420         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2421         if (jp != NULL) {
2422             /* Convert jumbo packet header to network byte order */
2423             temp = (afs_uint32) (p->header.flags) << 24;
2424             temp |= (afs_uint32) (p->header.spare);
2425             *(afs_uint32 *) jp = htonl(temp);
2426         }
2427         jp = (struct rx_jumboHeader *)
2428             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2429
2430         /* Stamp each packet with a unique serial number.  The serial
2431          * number is maintained on a connection basis because some types
2432          * of security may be based on the serial number of the packet,
2433          * and security is handled on a per authenticated-connection
2434          * basis. */
2435         /* Pre-increment, to guarantee no zero serial number; a zero
2436          * serial number means the packet was never sent. */
2437         p->header.serial = ++serial;
2438         /* This is so we can adjust retransmit time-outs better in the face of
2439          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2440          */
2441         if (p->firstSerial == 0) {
2442             p->firstSerial = p->header.serial;
2443         }
2444 #ifdef RXDEBUG
2445         /* If an output tracer function is defined, call it with the packet and
2446          * network address.  Note this function may modify its arguments. */
2447         if (rx_almostSent) {
2448             int drop = (*rx_almostSent) (p, &addr);
2449             /* drop packet if return value is non-zero? */
2450             if (drop)
2451                 deliveryType = 'D';     /* Drop the packet */
2452         }
2453 #endif
2454
2455         /* Get network byte order header */
2456         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2457                                          * touch ALL the fields */
2458     }
2459
2460     /* Send the packet out on the same socket that related packets are being
2461      * received on */
2462     socket =
2463         (conn->type ==
2464          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2465
2466 #ifdef RXDEBUG
2467     /* Possibly drop this packet,  for testing purposes */
2468     if ((deliveryType == 'D')
2469         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2470             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2471         deliveryType = 'D';     /* Drop the packet */
2472     } else {
2473         deliveryType = 'S';     /* Send the packet */
2474 #endif /* RXDEBUG */
2475
2476         /* Loop until the packet is sent.  We'd prefer just to use a
2477          * blocking socket, but unfortunately the interface doesn't
2478          * allow us to have the socket block in send mode, and not
2479          * block in receive mode */
2480 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2481         waslocked = ISAFS_GLOCK();
2482         if (!istack && waslocked)
2483             AFS_GUNLOCK();
2484 #endif
2485         if ((code =
2486              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2487                          istack)) != 0) {
2488             /* send failed, so let's hurry up the resend, eh? */
2489             if (rx_stats_active)
2490                 rx_atomic_inc(&rx_stats.netSendFailures);
2491             for (i = 0; i < len; i++) {
2492                 p = list[i];
2493                 p->flags &= ~RX_PKTFLAG_SENT;  /* resend it very soon */
2494             }
2495             /* Some systems are nice and tell us right away that we cannot
2496              * reach this recipient by returning an error code.
2497              * So, when this happens let's "down" the host NOW so
2498              * we don't sit around waiting for this host to timeout later.
2499              */
2500             if (call &&
2501 #ifdef AFS_NT40_ENV
2502                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2503 #elif defined(AFS_LINUX20_ENV)
2504                 code == -ENETUNREACH
2505 #elif defined(AFS_DARWIN_ENV)
2506                 code == EHOSTUNREACH
2507 #else
2508                 0
2509 #endif
2510                 )
2511                 call->lastReceiveTime = 0;
2512         }
2513 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2514         if (!istack && waslocked)
2515             AFS_GLOCK();
2516 #endif
2517 #ifdef RXDEBUG
2518     }
2519
2520     osi_Assert(p != NULL);
2521
2522     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2523           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2524           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2525           p->header.seq, p->header.flags, p, p->length));
2526
2527 #endif
2528     if (rx_stats_active) {
2529         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2530         MUTEX_ENTER(&peer->peer_lock);
2531         hadd32(peer->bytesSent, p->length);
2532         MUTEX_EXIT(&peer->peer_lock);
2533     }
2534 }
2535
2536
2537 /* Send a "special" packet to the peer connection.  If call is
2538  * specified, then the packet is directed to a specific call channel
2539  * associated with the connection, otherwise it is directed to the
2540  * connection only. Uses optionalPacket if it is supplied, rather than
2541  * allocating a new packet buffer.  Nbytes is the length of the data
2542  * portion of the packet.  If data is non-null, nbytes of data are
2543  * copied into the packet.  Type is the type of the packet, as defined
2544  * in rx.h.  Bug: there's a lot of duplication between this and other
2545  * routines.  This needs to be cleaned up. */
2546 struct rx_packet *
2547 rxi_SendSpecial(struct rx_call *call,
2548                 struct rx_connection *conn,
2549                 struct rx_packet *optionalPacket, int type, char *data,
2550                 int nbytes, int istack)
2551 {
2552     /* Some of the following stuff should be common code for all
2553      * packet sends (it's repeated elsewhere) */
2554     struct rx_packet *p;
2555     unsigned int i = 0;
2556     int savelen = 0, saven = 0;
2557     int channel, callNumber;
2558     if (call) {
2559         channel = call->channel;
2560         callNumber = *call->callNumber;
2561         /* BUSY packets refer to the next call on this connection */
2562         if (type == RX_PACKET_TYPE_BUSY) {
2563             callNumber++;
2564         }
2565     } else {
2566         channel = 0;
2567         callNumber = 0;
2568     }
2569     p = optionalPacket;
2570     if (!p) {
2571         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2572         if (!p)
2573             osi_Panic("rxi_SendSpecial failure");
2574     }
2575
2576     if (nbytes != -1)
2577         p->length = nbytes;
2578     else
2579         nbytes = p->length;
2580     p->header.serviceId = conn->serviceId;
2581     p->header.securityIndex = conn->securityIndex;
2582     p->header.cid = (conn->cid | channel);
2583     p->header.callNumber = callNumber;
2584     p->header.seq = 0;
2585     p->header.epoch = conn->epoch;
2586     p->header.type = type;
2587     p->header.flags = 0;
2588     if (conn->type == RX_CLIENT_CONNECTION)
2589         p->header.flags |= RX_CLIENT_INITIATED;
2590     if (data)
2591         rx_packetwrite(p, 0, nbytes, data);
2592
2593     for (i = 1; i < p->niovecs; i++) {
2594         if (nbytes <= p->wirevec[i].iov_len) {
2595             savelen = p->wirevec[i].iov_len;
2596             saven = p->niovecs;
2597             p->wirevec[i].iov_len = nbytes;
2598             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2599         } else
2600             nbytes -= p->wirevec[i].iov_len;
2601     }
2602
2603     if (call)
2604         rxi_Send(call, p, istack);
2605     else
2606         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2607     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2608         /* really need to do this, but it seems safer this way, given that  */
2609         /* sneaky optionalPacket... */
2610         p->wirevec[i - 1].iov_len = savelen;
2611         p->niovecs = saven;
2612     }
2613     if (!optionalPacket)
2614         rxi_FreePacket(p);
2615     return optionalPacket;
2616 }
2617
2618
2619 /* Encode the packet's header (from the struct header in the packet to
2620  * the net byte order representation in the wire representation of the
2621  * packet, which is what is actually sent out on the wire) */
2622 void
2623 rxi_EncodePacketHeader(struct rx_packet *p)
2624 {
2625     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2626
2627     memset(buf, 0, RX_HEADER_SIZE);
2628     *buf++ = htonl(p->header.epoch);
2629     *buf++ = htonl(p->header.cid);
2630     *buf++ = htonl(p->header.callNumber);
2631     *buf++ = htonl(p->header.seq);
2632     *buf++ = htonl(p->header.serial);
2633     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2634                    | (((afs_uint32) p->header.flags) << 16)
2635                    | (p->header.userStatus << 8) | p->header.securityIndex);
2636     /* Note: top 16 bits of this next word were reserved */
2637     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2638 }
2639
2640 /* Decode the packet's header (from net byte order to a struct header) */
2641 void
2642 rxi_DecodePacketHeader(struct rx_packet *p)
2643 {
2644     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2645     afs_uint32 temp;
2646
2647     p->header.epoch = ntohl(*buf);
2648     buf++;
2649     p->header.cid = ntohl(*buf);
2650     buf++;
2651     p->header.callNumber = ntohl(*buf);
2652     buf++;
2653     p->header.seq = ntohl(*buf);
2654     buf++;
2655     p->header.serial = ntohl(*buf);
2656     buf++;
2657
2658     temp = ntohl(*buf);
2659     buf++;
2660
2661     /* C will truncate byte fields to bytes for me */
2662     p->header.type = temp >> 24;
2663     p->header.flags = temp >> 16;
2664     p->header.userStatus = temp >> 8;
2665     p->header.securityIndex = temp >> 0;
2666
2667     temp = ntohl(*buf);
2668     buf++;
2669
2670     p->header.serviceId = (temp & 0xffff);
2671     p->header.spare = temp >> 16;
2672     /* Note: top 16 bits of this last word are the security checksum */
2673 }
2674
2675 /*
2676  * LOCKS HELD: called with call->lock held.
2677  *
2678  * PrepareSendPacket is the only place in the code that
2679  * can increment call->tnext.  This could become an atomic
2680  * in the future.  Beyond that there is nothing in this
2681  * function that requires the call being locked.  This
2682  * function can only be called by the application thread.
2683  */
2684 void
2685 rxi_PrepareSendPacket(struct rx_call *call,
2686                       struct rx_packet *p, int last)
2687 {
2688     struct rx_connection *conn = call->conn;
2689     afs_uint32 seq = call->tnext++;
2690     unsigned int i;
2691     afs_int32 len;              /* len must be a signed type; it can go negative */
2692
2693     /* No data packets on call 0. Where do these come from? */
2694     if (*call->callNumber == 0)
2695         *call->callNumber = 1;
2696
2697     MUTEX_EXIT(&call->lock);
2698     p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
2699
2700     p->header.cid = (conn->cid | call->channel);
2701     p->header.serviceId = conn->serviceId;
2702     p->header.securityIndex = conn->securityIndex;
2703
2704     p->header.callNumber = *call->callNumber;
2705     p->header.seq = seq;
2706     p->header.epoch = conn->epoch;
2707     p->header.type = RX_PACKET_TYPE_DATA;
2708     p->header.flags = 0;
2709     p->header.spare = 0;
2710     if (conn->type == RX_CLIENT_CONNECTION)
2711         p->header.flags |= RX_CLIENT_INITIATED;
2712
2713     if (last)
2714         p->header.flags |= RX_LAST_PACKET;
2715
2716     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2717     p->header.serial = 0;       /* Another way of saying never transmitted... */
2718
2719     /* Now that we're sure this is the last data on the call, make sure
2720      * that the "length" and the sum of the iov_lens matches. */
2721     len = p->length + call->conn->securityHeaderSize;
2722
2723     for (i = 1; i < p->niovecs && len > 0; i++) {
2724         len -= p->wirevec[i].iov_len;
2725     }
2726     if (len > 0) {
2727         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2728     } else if (i < p->niovecs) {
2729         /* Free any extra elements in the wirevec */
2730 #if defined(RX_ENABLE_TSFPQ)
2731         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2732 #else /* !RX_ENABLE_TSFPQ */
2733         MUTEX_ENTER(&rx_freePktQ_lock);
2734         rxi_FreeDataBufsNoLock(p, i);
2735         MUTEX_EXIT(&rx_freePktQ_lock);
2736 #endif /* !RX_ENABLE_TSFPQ */
2737
2738         p->niovecs = i;
2739     }
2740     if (len)
2741         p->wirevec[i - 1].iov_len += len;
2742     RXS_PreparePacket(conn->securityObject, call, p);
2743     MUTEX_ENTER(&call->lock);
2744 }
2745
2746 /* Given an interface MTU size, calculate an adjusted MTU size that
2747  * will make efficient use of the RX buffers when the peer is sending
2748  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2749 int
2750 rxi_AdjustIfMTU(int mtu)
2751 {
2752     int adjMTU;
2753     int frags;
2754
2755     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2756         return mtu;
2757     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2758     if (mtu <= adjMTU) {
2759         return mtu;
2760     }
2761     mtu -= adjMTU;
2762     if (mtu <= 0) {
2763         return adjMTU;
2764     }
2765     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2766     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2767 }
2768
2769 /* Given an interface MTU size, and the peer's advertised max receive
2770  * size, calculate an adjisted maxMTU size that makes efficient use
2771  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2772 int
2773 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2774 {
2775     int maxMTU = mtu * rxi_nSendFrags;
2776     maxMTU = MIN(maxMTU, peerMaxMTU);
2777     return rxi_AdjustIfMTU(maxMTU);
2778 }
2779
2780 /* Given a packet size, figure out how many datagram packet will fit.
2781  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2782  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2783  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2784 int
2785 rxi_AdjustDgramPackets(int frags, int mtu)
2786 {
2787     int maxMTU;
2788     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2789         return 1;
2790     }
2791     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2792     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2793     /* subtract the size of the first and last packets */
2794     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2795     if (maxMTU < 0) {
2796         return 1;
2797     }
2798     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2799 }
2800
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * When RXDEBUG_PACKET is defined, a start line, one line per packet on
 * the rx_mallocedP list (linked through allNextp), and an end line are
 * emitted, each prefixed with cookie.  The walk is done with
 * rx_freePktQ_lock held, inside NETPRI/USERPRI.  When RXDEBUG_PACKET is
 * not defined nothing is written.
 *
 * With AFS_NT40_ENV each line is formatted into a local buffer and
 * written with WriteFile(); otherwise it is written with fprintf().
 * NOTE(review): the NT path hands outputFile straight to WriteFile(),
 * so callers there presumably pass a Win32 file handle -- confirm.
 *
 * outputFile - destination for the dump
 * cookie     - string printed at the start of every line
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* byte count from WriteFile(); not examined */
    char output[2048];
    /* NOTE(review): sprintf() does not bound its output; a very long
     * cookie could overrun output[].  Consider a bounded formatter. */
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void pointer argument, hence the cast on p */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif