src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 #ifdef KERNEL
  14 # if defined(UKERNEL)
  15 #  include "afs/sysincludes.h"
  16 #  include "afsincludes.h"
  17 #  include "rx_kcommon.h"
  18 # else /* defined(UKERNEL) */
  19 #  ifdef RX_KERNEL_TRACE
  20 #   include "rx_kcommon.h"
  21 #  endif
  22 #  include "h/types.h"
  23 #  ifndef AFS_LINUX20_ENV
  24 #   include "h/systm.h"
  25 #  endif
  26 #  if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  27 #   include "afs/sysincludes.h"
  28 #  endif
  29 #  if defined(AFS_OBSD_ENV)
  30 #   include "h/proc.h"
  31 #  endif
  32 #  include "h/socket.h"
  33 #  if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  34 #   if  !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  35 #    include "sys/mount.h"              /* it gets pulled in by something later anyway */
  36 #   endif
  37 #   include "h/mbuf.h"
  38 #  endif
  39 #  include "netinet/in.h"
  40 #  include "afs/afs_osi.h"
  41 #  include "rx_kmutex.h"
  42 # endif /* defined(UKERNEL) */
  43 #else /* KERNEL */
  44 # include <roken.h>
  45 # include <assert.h>
  46 # if defined(AFS_NT40_ENV)
  47 #  ifndef EWOULDBLOCK
  48 #   define EWOULDBLOCK WSAEWOULDBLOCK
  49 #  endif
  50 #  include "rx_user.h"
  51 #  include "rx_xmit_nt.h"
  52 # endif
  53 # include <lwp.h>
  54 #endif /* KERNEL */
  55
  56 #ifdef  AFS_SUN5_ENV
  57 # include <sys/sysmacros.h>
  58 #endif
  59
  60 #include "rx.h"
  61 #include "rx_clock.h"
  62 #include "rx_queue.h"
  63 #include "rx_packet.h"
  64 #include "rx_atomic.h"
  65 #include "rx_globals.h"
  66 #include "rx_internal.h"
  67 #include "rx_stats.h"
  68
  69 #ifdef RX_LOCKS_DB
  70 /* rxdb_fileID is used to identify the lock location, along with line#. */
  71 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  72 #endif /* RX_LOCKS_DB */
  73 static struct rx_packet *rx_mallocedP = 0;
  74 #ifdef RXDEBUG_PACKET
  75 static afs_uint32       rx_packet_id = 0;
  76 #endif
  77
  78 extern char cml_version_number[];
  79
  80 static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);
  81
  82 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  83                                 afs_uint32 ahost, short aport,
  84                                 afs_int32 istack);
  85
  86 #ifdef RX_ENABLE_TSFPQ
  87 static int
  88 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
  89 #else
  90 static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
  91                                    afs_uint32 first,
  92                                    struct rx_queue * q);
  93 #endif
  94
  95 /* some rules about packets:
  96  * 1.  When a packet is allocated, the final iov_buf contains room for
  97  * a security trailer, but iov_len masks that fact.  If the security
  98  * package wants to add the trailer, it may do so, and then extend
  99  * iov_len appropriately.  For this reason, packet's niovecs and
 100  * iov_len fields should be accurate before calling PreparePacket.
 101 */
 102
 103 /* Preconditions:
 104  *        all packet buffers (iov_base) are integral multiples of
 105  *        the word size.
 106  *        offset is an integral multiple of the word size.
 107  */
 108 afs_int32
 109 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 110 {
 111     unsigned int i;
 112     size_t l;
 113     for (l = 0, i = 1; i < packet->niovecs; i++) {
 114         if (l + packet->wirevec[i].iov_len > offset) {
 115             return
 116                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 117                                  (offset - l)));
 118         }
 119         l += packet->wirevec[i].iov_len;
 120     }
 121
 122     return 0;
 123 }
 124
 125 /* Preconditions:
 126  *        all packet buffers (iov_base) are integral multiples of the word size.
 127  *        offset is an integral multiple of the word size.
 128  */
 129 afs_int32
 130 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 131 {
 132     unsigned int i;
 133     size_t l;
 134     for (l = 0, i = 1; i < packet->niovecs; i++) {
 135         if (l + packet->wirevec[i].iov_len > offset) {
 136             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 137                              (offset - l))) = data;
 138             return 0;
 139         }
 140         l += packet->wirevec[i].iov_len;
 141     }
 142
 143     return 0;
 144 }
 145
 146 /* Preconditions:
 147  *        all packet buffers (iov_base) are integral multiples of the
 148  *        word size.
 149  *        offset is an integral multiple of the word size.
 150  * Packet Invariants:
 151  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 152  */
 153 afs_int32
 154 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 155                   char *out)
 156 {
 157     unsigned int i, j, l, r;
 158     for (l = 0, i = 1; i < packet->niovecs; i++) {
 159         if (l + packet->wirevec[i].iov_len > offset) {
 160             break;
 161         }
 162         l += packet->wirevec[i].iov_len;
 163     }
 164
 165     /* i is the iovec which contains the first little bit of data in which we
 166      * are interested.  l is the total length of everything prior to this iovec.
 167      * j is the number of bytes we can safely copy out of this iovec.
 168      * offset only applies to the first iovec.
 169      */
 170     r = resid;
 171     while ((r > 0) && (i < packet->niovecs)) {
 172         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 173         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 174         r -= j;
 175         out += j;
 176         l += packet->wirevec[i].iov_len;
 177         offset = l;
 178         i++;
 179     }
 180
 181     return (r ? (resid - r) : resid);
 182 }
 183
 184
 185 /* Preconditions:
 186  *        all packet buffers (iov_base) are integral multiples of the
 187  *        word size.
 188  *        offset is an integral multiple of the word size.
 189  */
 190 afs_int32
 191 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 192 {
 193     unsigned int i, j, l, o, r;
 194     char *b;
 195
 196     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 197         if (l + packet->wirevec[i].iov_len > o) {
 198             break;
 199         }
 200         l += packet->wirevec[i].iov_len;
 201     }
 202
 203     /* i is the iovec which contains the first little bit of data in which we
 204      * are interested.  l is the total length of everything prior to this iovec.
 205      * j is the number of bytes we can safely copy out of this iovec.
 206      * offset only applies to the first iovec.
 207      */
 208     r = resid;
 209     while ((r > 0) && (i <= RX_MAXWVECS)) {
 210         if (i >= packet->niovecs)
 211             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 212                 break;
 213
 214         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 215         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 216         memcpy(b, in, j);
 217         r -= j;
 218         in += j;
 219         l += packet->wirevec[i].iov_len;
 220         offset = l;
 221         i++;
 222     }
 223
 224     return (r ? (resid - r) : resid);
 225 }
 226
 227 int
 228 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 229 {
 230     struct rx_packet *p, *np;
 231
 232     num_pkts = AllocPacketBufs(class, num_pkts, q);
 233
 234     for (queue_Scan(q, p, np, rx_packet)) {
 235         RX_PACKET_IOV_FULLINIT(p);
 236     }
 237
 238     return num_pkts;
 239 }
 240
 241 #ifdef RX_ENABLE_TSFPQ
 242 static int
 243 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 244 {
 245     struct rx_ts_info_t * rx_ts_info;
 246     int transfer;
 247     SPLVAR;
 248
 249     RX_TS_INFO_GET(rx_ts_info);
 250
 251     transfer = num_pkts - rx_ts_info->_FPQ.len;
 252     if (transfer > 0) {
 253         NETPRI;
 254         MUTEX_ENTER(&rx_freePktQ_lock);
 255         transfer = MAX(transfer, rx_TSFPQGlobSize);
 256         if (transfer > rx_nFreePackets) {
 257             /* alloc enough for us, plus a few globs for other threads */
 258             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 259         }
 260
 261         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 262
 263         MUTEX_EXIT(&rx_freePktQ_lock);
 264         USERPRI;
 265     }
 266
 267     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 268
 269     return num_pkts;
 270 }
 271 #else /* RX_ENABLE_TSFPQ */
 272 static int
 273 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 274 {
 275     struct rx_packet *c;
 276     int i;
 277 #ifdef KERNEL
 278     int overq = 0;
 279 #endif
 280     SPLVAR;
 281
 282     NETPRI;
 283
 284     MUTEX_ENTER(&rx_freePktQ_lock);
 285
 286 #ifdef KERNEL
 287     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 288          num_pkts--, overq++);
 289
 290     if (overq) {
 291         rxi_NeedMorePackets = TRUE;
 292         if (rx_stats_active) {
 293             switch (class) {
 294             case RX_PACKET_CLASS_RECEIVE:
 295                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 296                 break;
 297             case RX_PACKET_CLASS_SEND:
 298                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 299                 break;
 300             case RX_PACKET_CLASS_SPECIAL:
 301                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 302                 break;
 303             case RX_PACKET_CLASS_RECV_CBUF:
 304                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 305                 break;
 306             case RX_PACKET_CLASS_SEND_CBUF:
 307                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 308                 break;
 309             }
 310         }
 311     }
 312
 313     if (rx_nFreePackets < num_pkts)
 314         num_pkts = rx_nFreePackets;
 315
 316     if (!num_pkts) {
 317         rxi_NeedMorePackets = TRUE;
 318         goto done;
 319     }
 320 #else /* KERNEL */
 321     if (rx_nFreePackets < num_pkts) {
 322         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 323     }
 324 #endif /* KERNEL */
 325
 326     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 327          i < num_pkts;
 328          i++, c=queue_Next(c, rx_packet)) {
 329         RX_FPQ_MARK_USED(c);
 330     }
 331
 332     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 333
 334     rx_nFreePackets -= num_pkts;
 335
 336 #ifdef KERNEL
 337   done:
 338 #endif
 339     MUTEX_EXIT(&rx_freePktQ_lock);
 340
 341     USERPRI;
 342     return num_pkts;
 343 }
 344 #endif /* RX_ENABLE_TSFPQ */
 345
 346 /*
 347  * Free a packet currently used as a continuation buffer
 348  */
 349 #ifdef RX_ENABLE_TSFPQ
 350 /* num_pkts=0 means queue length is unknown */
 351 int
 352 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 353 {
 354     struct rx_ts_info_t * rx_ts_info;
 355     struct rx_packet *c, *nc;
 356     SPLVAR;
 357
 358     osi_Assert(num_pkts >= 0);
 359     RX_TS_INFO_GET(rx_ts_info);
 360
 361     if (!num_pkts) {
 362         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 363             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 364         }
 365     } else {
 366         for (queue_Scan(q, c, nc, rx_packet)) {
 367             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 368         }
 369     }
 370
 371     if (num_pkts) {
 372         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 373     }
 374
 375     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 376         NETPRI;
 377         MUTEX_ENTER(&rx_freePktQ_lock);
 378
 379         RX_TS_FPQ_LTOG(rx_ts_info);
 380
 381         /* Wakeup anyone waiting for packets */
 382         rxi_PacketsUnWait();
 383
 384         MUTEX_EXIT(&rx_freePktQ_lock);
 385         USERPRI;
 386     }
 387
 388     return num_pkts;
 389 }
 390 #else /* RX_ENABLE_TSFPQ */
 391 /* num_pkts=0 means queue length is unknown */
 392 int
 393 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 394 {
 395     struct rx_queue cbs;
 396     struct rx_packet *p, *np;
 397     int qlen = 0;
 398     SPLVAR;
 399
 400     osi_Assert(num_pkts >= 0);
 401     queue_Init(&cbs);
 402
 403     if (!num_pkts) {
 404         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 405             if (p->niovecs > 2) {
 406                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 407             }
 408             RX_FPQ_MARK_FREE(p);
 409         }
 410         if (!num_pkts)
 411             return 0;
 412     } else {
 413         for (queue_Scan(q, p, np, rx_packet)) {
 414             if (p->niovecs > 2) {
 415                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 416             }
 417             RX_FPQ_MARK_FREE(p);
 418         }
 419     }
 420
 421     if (qlen) {
 422         queue_SpliceAppend(q, &cbs);
 423         qlen += num_pkts;
 424     } else
 425         qlen = num_pkts;
 426
 427     NETPRI;
 428     MUTEX_ENTER(&rx_freePktQ_lock);
 429
 430     queue_SpliceAppend(&rx_freePacketQueue, q);
 431     rx_nFreePackets += qlen;
 432
 433     /* Wakeup anyone waiting for packets */
 434     rxi_PacketsUnWait();
 435
 436     MUTEX_EXIT(&rx_freePktQ_lock);
 437     USERPRI;
 438
 439     return num_pkts;
 440 }
 441 #endif /* RX_ENABLE_TSFPQ */
 442
 443 /* this one is kind of awful.
 444  * In rxkad, the packet has been all shortened, and everything, ready for
 445  * sending.  All of a sudden, we discover we need some of that space back.
 446  * This isn't terribly general, because it knows that the packets are only
 447  * rounded up to the EBS (userdata + security header).
 448  */
 449 int
 450 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 451 {
 452     int i;
 453     i = p->niovecs - 1;
 454     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 455         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 456             p->wirevec[i].iov_len += nb;
 457             return 0;
 458         }
 459     } else {
 460         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 461             p->wirevec[i].iov_len += nb;
 462             return 0;
 463         }
 464     }
 465
 466     return 0;
 467 }
 468
 469 /* get sufficient space to store nb bytes of data (or more), and hook
 470  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 471  * returns the number of bytes >0 which it failed to come up with.
 472  * Don't need to worry about locking on packet, since only
 473  * one thread can manipulate one at a time. Locking on continution
 474  * packets is handled by AllocPacketBufs */
 475 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 476 int
 477 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 478 {
 479     int i, nv;
 480     struct rx_queue q;
 481     struct rx_packet *cb, *ncb;
 482
 483     /* compute the number of cbuf's we need */
 484     nv = nb / RX_CBUFFERSIZE;
 485     if ((nv * RX_CBUFFERSIZE) < nb)
 486         nv++;
 487     if ((nv + p->niovecs) > RX_MAXWVECS)
 488         nv = RX_MAXWVECS - p->niovecs;
 489     if (nv < 1)
 490         return nb;
 491
 492     /* allocate buffers */
 493     queue_Init(&q);
 494     nv = AllocPacketBufs(class, nv, &q);
 495
 496     /* setup packet iovs */
 497     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 498         queue_Remove(cb);
 499         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 500         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 501     }
 502
 503     nb -= (nv * RX_CBUFFERSIZE);
 504     p->length += (nv * RX_CBUFFERSIZE);
 505     p->niovecs += nv;
 506
 507     return nb;
 508 }
 509
 510 /* Add more packet buffers */
 511 #ifdef RX_ENABLE_TSFPQ
 512 void
 513 rxi_MorePackets(int apackets)
 514 {
 515     struct rx_packet *p, *e;
 516     struct rx_ts_info_t * rx_ts_info;
 517     int getme;
 518     SPLVAR;
 519
 520     getme = apackets * sizeof(struct rx_packet);
 521     p = (struct rx_packet *)osi_Alloc(getme);
 522     osi_Assert(p);
 523
 524     PIN(p, getme);              /* XXXXX */
 525     memset(p, 0, getme);
 526     RX_TS_INFO_GET(rx_ts_info);
 527
 528     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 529     /* TSFPQ patch also needs to keep track of total packets */
 530
 531     MUTEX_ENTER(&rx_packets_mutex);
 532     rx_nPackets += apackets;
 533     RX_TS_FPQ_COMPUTE_LIMITS;
 534     MUTEX_EXIT(&rx_packets_mutex);
 535
 536     for (e = p + apackets; p < e; p++) {
 537         RX_PACKET_IOV_INIT(p);
 538         p->niovecs = 2;
 539
 540         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 541
 542         NETPRI;
 543         MUTEX_ENTER(&rx_freePktQ_lock);
 544 #ifdef RXDEBUG_PACKET
 545         p->packetId = rx_packet_id++;
 546         p->allNextp = rx_mallocedP;
 547 #endif /* RXDEBUG_PACKET */
 548         rx_mallocedP = p;
 549         MUTEX_EXIT(&rx_freePktQ_lock);
 550         USERPRI;
 551     }
 552     rx_ts_info->_FPQ.delta += apackets;
 553
 554     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 555         NETPRI;
 556         MUTEX_ENTER(&rx_freePktQ_lock);
 557
 558         RX_TS_FPQ_LTOG(rx_ts_info);
 559         rxi_NeedMorePackets = FALSE;
 560         rxi_PacketsUnWait();
 561
 562         MUTEX_EXIT(&rx_freePktQ_lock);
 563         USERPRI;
 564     }
 565 }
 566 #else /* RX_ENABLE_TSFPQ */
 567 void
 568 rxi_MorePackets(int apackets)
 569 {
 570     struct rx_packet *p, *e;
 571     int getme;
 572     SPLVAR;
 573
 574     getme = apackets * sizeof(struct rx_packet);
 575     p = (struct rx_packet *)osi_Alloc(getme);
 576     osi_Assert(p);
 577
 578     PIN(p, getme);              /* XXXXX */
 579     memset(p, 0, getme);
 580     NETPRI;
 581     MUTEX_ENTER(&rx_freePktQ_lock);
 582
 583     for (e = p + apackets; p < e; p++) {
 584         RX_PACKET_IOV_INIT(p);
 585 #ifdef RX_TRACK_PACKETS
 586         p->flags |= RX_PKTFLAG_FREE;
 587 #endif
 588         p->niovecs = 2;
 589
 590         queue_Append(&rx_freePacketQueue, p);
 591 #ifdef RXDEBUG_PACKET
 592         p->packetId = rx_packet_id++;
 593         p->allNextp = rx_mallocedP;
 594 #endif /* RXDEBUG_PACKET */
 595         rx_mallocedP = p;
 596     }
 597
 598     rx_nPackets += apackets;
 599     rx_nFreePackets += apackets;
 600     rxi_NeedMorePackets = FALSE;
 601     rxi_PacketsUnWait();
 602
 603     MUTEX_EXIT(&rx_freePktQ_lock);
 604     USERPRI;
 605 }
 606 #endif /* RX_ENABLE_TSFPQ */
 607
 608 #ifdef RX_ENABLE_TSFPQ
 609 void
 610 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 611 {
 612     struct rx_packet *p, *e;
 613     struct rx_ts_info_t * rx_ts_info;
 614     int getme;
 615     SPLVAR;
 616
 617     getme = apackets * sizeof(struct rx_packet);
 618     p = (struct rx_packet *)osi_Alloc(getme);
 619
 620     PIN(p, getme);              /* XXXXX */
 621     memset(p, 0, getme);
 622     RX_TS_INFO_GET(rx_ts_info);
 623
 624     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 625     /* TSFPQ patch also needs to keep track of total packets */
 626     MUTEX_ENTER(&rx_packets_mutex);
 627     rx_nPackets += apackets;
 628     RX_TS_FPQ_COMPUTE_LIMITS;
 629     MUTEX_EXIT(&rx_packets_mutex);
 630
 631     for (e = p + apackets; p < e; p++) {
 632         RX_PACKET_IOV_INIT(p);
 633         p->niovecs = 2;
 634         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 635
 636         NETPRI;
 637         MUTEX_ENTER(&rx_freePktQ_lock);
 638 #ifdef RXDEBUG_PACKET
 639         p->packetId = rx_packet_id++;
 640         p->allNextp = rx_mallocedP;
 641 #endif /* RXDEBUG_PACKET */
 642         rx_mallocedP = p;
 643         MUTEX_EXIT(&rx_freePktQ_lock);
 644         USERPRI;
 645     }
 646     rx_ts_info->_FPQ.delta += apackets;
 647
 648     if (flush_global &&
 649         (num_keep_local < apackets)) {
 650         NETPRI;
 651         MUTEX_ENTER(&rx_freePktQ_lock);
 652
 653         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 654         rxi_NeedMorePackets = FALSE;
 655         rxi_PacketsUnWait();
 656
 657         MUTEX_EXIT(&rx_freePktQ_lock);
 658         USERPRI;
 659     }
 660 }
 661 #endif /* RX_ENABLE_TSFPQ */
 662
 663 #ifndef KERNEL
 664 /* Add more packet buffers */
 665 void
 666 rxi_MorePacketsNoLock(int apackets)
 667 {
 668 #ifdef RX_ENABLE_TSFPQ
 669     struct rx_ts_info_t * rx_ts_info;
 670 #endif /* RX_ENABLE_TSFPQ */
 671     struct rx_packet *p, *e;
 672     int getme;
 673
 674     /* allocate enough packets that 1/4 of the packets will be able
 675      * to hold maximal amounts of data */
 676     apackets += (apackets / 4)
 677         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 678     do {
 679         getme = apackets * sizeof(struct rx_packet);
 680         p = (struct rx_packet *)osi_Alloc(getme);
 681         if (p == NULL) {
 682             apackets -= apackets / 4;
 683             osi_Assert(apackets > 0);
 684         }
 685     } while(p == NULL);
 686     memset(p, 0, getme);
 687
 688 #ifdef RX_ENABLE_TSFPQ
 689     RX_TS_INFO_GET(rx_ts_info);
 690     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 691 #endif /* RX_ENABLE_TSFPQ */
 692
 693     for (e = p + apackets; p < e; p++) {
 694         RX_PACKET_IOV_INIT(p);
 695 #ifdef RX_TRACK_PACKETS
 696         p->flags |= RX_PKTFLAG_FREE;
 697 #endif
 698         p->niovecs = 2;
 699
 700         queue_Append(&rx_freePacketQueue, p);
 701 #ifdef RXDEBUG_PACKET
 702         p->packetId = rx_packet_id++;
 703         p->allNextp = rx_mallocedP;
 704 #endif /* RXDEBUG_PACKET */
 705         rx_mallocedP = p;
 706     }
 707
 708     rx_nFreePackets += apackets;
 709     MUTEX_ENTER(&rx_packets_mutex);
 710     rx_nPackets += apackets;
 711 #ifdef RX_ENABLE_TSFPQ
 712     RX_TS_FPQ_COMPUTE_LIMITS;
 713 #endif /* RX_ENABLE_TSFPQ */
 714     MUTEX_EXIT(&rx_packets_mutex);
 715     rxi_NeedMorePackets = FALSE;
 716     rxi_PacketsUnWait();
 717 }
 718 #endif /* !KERNEL */
 719
 720 void
 721 rxi_FreeAllPackets(void)
 722 {
 723     /* must be called at proper interrupt level, etcetera */
 724     /* MTUXXX need to free all Packets */
 725     osi_Free(rx_mallocedP,
 726              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 727     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 728 }
 729
 730 #ifdef RX_ENABLE_TSFPQ
 731 void
 732 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 733 {
 734     struct rx_ts_info_t * rx_ts_info;
 735     int xfer;
 736     SPLVAR;
 737
 738     RX_TS_INFO_GET(rx_ts_info);
 739
 740     if (num_keep_local != rx_ts_info->_FPQ.len) {
 741         NETPRI;
 742         MUTEX_ENTER(&rx_freePktQ_lock);
 743         if (num_keep_local < rx_ts_info->_FPQ.len) {
 744             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 745             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 746             rxi_PacketsUnWait();
 747         } else {
 748             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 749             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 750                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 751             if (rx_nFreePackets < xfer) {
 752                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 753             }
 754             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 755         }
 756         MUTEX_EXIT(&rx_freePktQ_lock);
 757         USERPRI;
 758     }
 759 }
 760
 761 void
 762 rxi_FlushLocalPacketsTSFPQ(void)
 763 {
 764     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 765 }
 766 #endif /* RX_ENABLE_TSFPQ */
 767
 768 /* Allocate more packets iff we need more continuation buffers */
 769 /* In kernel, can't page in memory with interrupts disabled, so we
 770  * don't use the event mechanism. */
 771 void
 772 rx_CheckPackets(void)
 773 {
 774     if (rxi_NeedMorePackets) {
 775         rxi_MorePackets(rx_maxSendWindow);
 776     }
 777 }
 778
 779 /* In the packet freeing routine below, the assumption is that
 780    we want all of the packets to be used equally frequently, so that we
 781    don't get packet buffers paging out.  It would be just as valid to
 782    assume that we DO want them to page out if not many are being used.
 783    In any event, we assume the former, and append the packets to the end
 784    of the free list.  */
 785 /* This explanation is bogus.  The free list doesn't remain in any kind of
  786    useful order for long: the packets in use get pretty much randomly scattered
 787    across all the pages.  In order to permit unused {packets,bufs} to page out, they
 788    must be stored so that packets which are adjacent in memory are adjacent in the
 789    free list.  An array springs rapidly to mind.
 790    */
 791
 792 /* Actually free the packet p. */
 793 #ifdef RX_ENABLE_TSFPQ
 794 void
 795 rxi_FreePacketNoLock(struct rx_packet *p)
 796 {
 797     struct rx_ts_info_t * rx_ts_info;
 798     dpf(("Free %"AFS_PTR_FMT"\n", p));
 799
 800     RX_TS_INFO_GET(rx_ts_info);
 801     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 802     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 803         RX_TS_FPQ_LTOG(rx_ts_info);
 804     }
 805 }
 806 #else /* RX_ENABLE_TSFPQ */
 807 void
 808 rxi_FreePacketNoLock(struct rx_packet *p)
 809 {
 810     dpf(("Free %"AFS_PTR_FMT"\n", p));
 811
 812     RX_FPQ_MARK_FREE(p);
 813     rx_nFreePackets++;
 814     queue_Append(&rx_freePacketQueue, p);
 815 }
 816 #endif /* RX_ENABLE_TSFPQ */
 817
 818 #ifdef RX_ENABLE_TSFPQ
 819 void
 820 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 821 {
 822     struct rx_ts_info_t * rx_ts_info;
 823     dpf(("Free %"AFS_PTR_FMT"\n", p));
 824
 825     RX_TS_INFO_GET(rx_ts_info);
 826     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 827
 828     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 829         NETPRI;
 830         MUTEX_ENTER(&rx_freePktQ_lock);
 831
 832         RX_TS_FPQ_LTOG(rx_ts_info);
 833
 834         /* Wakeup anyone waiting for packets */
 835         rxi_PacketsUnWait();
 836
 837         MUTEX_EXIT(&rx_freePktQ_lock);
 838         USERPRI;
 839     }
 840 }
 841 #endif /* RX_ENABLE_TSFPQ */
 842
 843 /*
 844  * free continuation buffers off a packet into a queue
 845  *
 846  * [IN] p      -- packet from which continuation buffers will be freed
 847  * [IN] first  -- iovec offset of first continuation buffer to free
 848  * [IN] q      -- queue into which continuation buffers will be chained
 849  *
 850  * returns:
 851  *   number of continuation buffers freed
 852  */
 853 #ifndef RX_ENABLE_TSFPQ
 854 static int
 855 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 856 {
 857     struct iovec *iov;
 858     struct rx_packet * cb;
 859     int count = 0;
 860
 861     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 862         iov = &p->wirevec[first];
 863         if (!iov->iov_base)
 864             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 865         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 866         RX_FPQ_MARK_FREE(cb);
 867         queue_Append(q, cb);
 868     }
 869     p->length = 0;
 870     p->niovecs = 0;
 871
 872     return count;
 873 }
 874 #endif
 875
 876 /*
 877  * free packet continuation buffers into the global free packet pool
 878  *
 879  * [IN] p      -- packet from which to free continuation buffers
 880  * [IN] first  -- iovec offset of first continuation buffer to free
 881  *
 882  * returns:
 883  *   zero always
 884  */
 885 int
 886 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 887 {
 888     struct iovec *iov;
 889
 890     for (first = MAX(2, first); first < p->niovecs; first++) {
 891         iov = &p->wirevec[first];
 892         if (!iov->iov_base)
 893             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 894         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 895     }
 896     p->length = 0;
 897     p->niovecs = 0;
 898
 899     return 0;
 900 }
 901
 902 #ifdef RX_ENABLE_TSFPQ
 903 /*
 904  * free packet continuation buffers into the thread-local free pool
 905  *
 906  * [IN] p             -- packet from which continuation buffers will be freed
 907  * [IN] first         -- iovec offset of first continuation buffer to free
 908  *                       any value less than 2, the min number of iovecs,
 909  *                       is treated as if it is 2.
 910  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 911  *                       global free pool before returning
 912  *
 913  * returns:
 914  *   zero always
 915  */
 916 static int
 917 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 918 {
 919     struct iovec *iov;
 920     struct rx_ts_info_t * rx_ts_info;
 921
 922     RX_TS_INFO_GET(rx_ts_info);
 923
 924     for (first = MAX(2, first); first < p->niovecs; first++) {
 925         iov = &p->wirevec[first];
 926         if (!iov->iov_base)
 927             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 928         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 929     }
 930     p->length = 0;
 931     p->niovecs = 0;
 932
 933     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 934         NETPRI;
 935         MUTEX_ENTER(&rx_freePktQ_lock);
 936
 937         RX_TS_FPQ_LTOG(rx_ts_info);
 938
 939         /* Wakeup anyone waiting for packets */
 940         rxi_PacketsUnWait();
 941
 942         MUTEX_EXIT(&rx_freePktQ_lock);
 943         USERPRI;
 944     }
 945     return 0;
 946 }
 947 #endif /* RX_ENABLE_TSFPQ */
 948
 949 int rxi_nBadIovecs = 0;
 950
 951 /* rxi_RestoreDataBufs
 952  *
 953  * Restore the correct sizes to the iovecs. Called when reusing a packet
 954  * for reading off the wire.
 955  */
 956 void
 957 rxi_RestoreDataBufs(struct rx_packet *p)
 958 {
 959     unsigned int i;
 960     struct iovec *iov = &p->wirevec[2];
 961
 962     RX_PACKET_IOV_INIT(p);
 963
 964     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 965         if (!iov->iov_base) {
 966             rxi_nBadIovecs++;
 967             p->niovecs = i;
 968             break;
 969         }
 970         iov->iov_len = RX_CBUFFERSIZE;
 971     }
 972 }
 973
 974 #ifdef RX_ENABLE_TSFPQ
 975 int
 976 rxi_TrimDataBufs(struct rx_packet *p, int first)
 977 {
 978     int length;
 979     struct iovec *iov, *end;
 980     struct rx_ts_info_t * rx_ts_info;
 981     SPLVAR;
 982
 983     if (first != 1)
 984         osi_Panic("TrimDataBufs 1: first must be 1");
 985
 986     /* Skip over continuation buffers containing message data */
 987     iov = &p->wirevec[2];
 988     end = iov + (p->niovecs - 2);
 989     length = p->length - p->wirevec[1].iov_len;
 990     for (; iov < end && length > 0; iov++) {
 991         if (!iov->iov_base)
 992             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
 993         length -= iov->iov_len;
 994     }
 995
 996     /* iov now points to the first empty data buffer. */
 997     if (iov >= end)
 998         return 0;
 999
1000     RX_TS_INFO_GET(rx_ts_info);
1001     for (; iov < end; iov++) {
1002         if (!iov->iov_base)
1003             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1004         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1005         p->niovecs--;
1006     }
1007     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1008         NETPRI;
1009         MUTEX_ENTER(&rx_freePktQ_lock);
1010
1011         RX_TS_FPQ_LTOG(rx_ts_info);
1012         rxi_PacketsUnWait();
1013
1014         MUTEX_EXIT(&rx_freePktQ_lock);
1015         USERPRI;
1016     }
1017
1018     return 0;
1019 }
1020 #else /* RX_ENABLE_TSFPQ */
1021 int
1022 rxi_TrimDataBufs(struct rx_packet *p, int first)
1023 {
1024     int length;
1025     struct iovec *iov, *end;
1026     SPLVAR;
1027
1028     if (first != 1)
1029         osi_Panic("TrimDataBufs 1: first must be 1");
1030
1031     /* Skip over continuation buffers containing message data */
1032     iov = &p->wirevec[2];
1033     end = iov + (p->niovecs - 2);
1034     length = p->length - p->wirevec[1].iov_len;
1035     for (; iov < end && length > 0; iov++) {
1036         if (!iov->iov_base)
1037             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1038         length -= iov->iov_len;
1039     }
1040
1041     /* iov now points to the first empty data buffer. */
1042     if (iov >= end)
1043         return 0;
1044
1045     NETPRI;
1046     MUTEX_ENTER(&rx_freePktQ_lock);
1047
1048     for (; iov < end; iov++) {
1049         if (!iov->iov_base)
1050             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1051         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1052         p->niovecs--;
1053     }
1054     rxi_PacketsUnWait();
1055
1056     MUTEX_EXIT(&rx_freePktQ_lock);
1057     USERPRI;
1058
1059     return 0;
1060 }
1061 #endif /* RX_ENABLE_TSFPQ */
1062
1063 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1064  * remove it yourself first if you call this routine. */
1065 #ifdef RX_ENABLE_TSFPQ
1066 void
1067 rxi_FreePacket(struct rx_packet *p)
1068 {
1069     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1070     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1071 }
1072 #else /* RX_ENABLE_TSFPQ */
1073 void
1074 rxi_FreePacket(struct rx_packet *p)
1075 {
1076     SPLVAR;
1077
1078     NETPRI;
1079     MUTEX_ENTER(&rx_freePktQ_lock);
1080
1081     rxi_FreeDataBufsNoLock(p, 2);
1082     rxi_FreePacketNoLock(p);
1083     /* Wakeup anyone waiting for packets */
1084     rxi_PacketsUnWait();
1085
1086     MUTEX_EXIT(&rx_freePktQ_lock);
1087     USERPRI;
1088 }
1089 #endif /* RX_ENABLE_TSFPQ */
1090
1091 /* rxi_AllocPacket sets up p->length so it reflects the number of
1092  * bytes in the packet at this point, **not including** the header.
1093  * The header is absolutely necessary, besides, this is the way the
1094  * length field is usually used */
1095 #ifdef RX_ENABLE_TSFPQ
1096 struct rx_packet *
1097 rxi_AllocPacketNoLock(int class)
1098 {
1099     struct rx_packet *p;
1100     struct rx_ts_info_t * rx_ts_info;
1101
1102     RX_TS_INFO_GET(rx_ts_info);
1103
1104 #ifdef KERNEL
1105     if (rxi_OverQuota(class)) {
1106         rxi_NeedMorePackets = TRUE;
1107         if (rx_stats_active) {
1108             switch (class) {
1109             case RX_PACKET_CLASS_RECEIVE:
1110                 rx_atomic_inc(rx_stats.receivePktAllocFailures);
1111                 break;
1112             case RX_PACKET_CLASS_SEND:
1113                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1114                 break;
1115             case RX_PACKET_CLASS_SPECIAL:
1116                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1117                 break;
1118             case RX_PACKET_CLASS_RECV_CBUF:
1119                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1120                 break;
1121             case RX_PACKET_CLASS_SEND_CBUF:
1122                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1123                 break;
1124             }
1125         }
1126         return (struct rx_packet *)0;
1127     }
1128 #endif /* KERNEL */
1129
1130     if (rx_stats_active)
1131         rx_atomic_inc(&rx_stats.packetRequests);
1132     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1133
1134 #ifdef KERNEL
1135         if (queue_IsEmpty(&rx_freePacketQueue))
1136             osi_Panic("rxi_AllocPacket error");
1137 #else /* KERNEL */
1138         if (queue_IsEmpty(&rx_freePacketQueue))
1139             rxi_MorePacketsNoLock(rx_maxSendWindow);
1140 #endif /* KERNEL */
1141
1142
1143         RX_TS_FPQ_GTOL(rx_ts_info);
1144     }
1145
1146     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1147
1148     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1149
1150
1151     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1152      * order to truncate outbound packets.  In the near future, may need
1153      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1154      */
1155     RX_PACKET_IOV_FULLINIT(p);
1156     return p;
1157 }
1158 #else /* RX_ENABLE_TSFPQ */
1159 struct rx_packet *
1160 rxi_AllocPacketNoLock(int class)
1161 {
1162     struct rx_packet *p;
1163
1164 #ifdef KERNEL
1165     if (rxi_OverQuota(class)) {
1166         rxi_NeedMorePackets = TRUE;
1167         if (rx_stats_active) {
1168             switch (class) {
1169             case RX_PACKET_CLASS_RECEIVE:
1170                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1171                 break;
1172             case RX_PACKET_CLASS_SEND:
1173                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1174                 break;
1175             case RX_PACKET_CLASS_SPECIAL:
1176                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1177                 break;
1178             case RX_PACKET_CLASS_RECV_CBUF:
1179                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1180                 break;
1181             case RX_PACKET_CLASS_SEND_CBUF:
1182                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1183                 break;
1184             }
1185         }
1186         return (struct rx_packet *)0;
1187     }
1188 #endif /* KERNEL */
1189
1190     if (rx_stats_active)
1191         rx_atomic_inc(&rx_stats.packetRequests);
1192
1193 #ifdef KERNEL
1194     if (queue_IsEmpty(&rx_freePacketQueue))
1195         osi_Panic("rxi_AllocPacket error");
1196 #else /* KERNEL */
1197     if (queue_IsEmpty(&rx_freePacketQueue))
1198         rxi_MorePacketsNoLock(rx_maxSendWindow);
1199 #endif /* KERNEL */
1200
1201     rx_nFreePackets--;
1202     p = queue_First(&rx_freePacketQueue, rx_packet);
1203     queue_Remove(p);
1204     RX_FPQ_MARK_USED(p);
1205
1206     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1207
1208
1209     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1210      * order to truncate outbound packets.  In the near future, may need
1211      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1212      */
1213     RX_PACKET_IOV_FULLINIT(p);
1214     return p;
1215 }
1216 #endif /* RX_ENABLE_TSFPQ */
1217
1218 #ifdef RX_ENABLE_TSFPQ
1219 struct rx_packet *
1220 rxi_AllocPacketTSFPQ(int class, int pull_global)
1221 {
1222     struct rx_packet *p;
1223     struct rx_ts_info_t * rx_ts_info;
1224
1225     RX_TS_INFO_GET(rx_ts_info);
1226
1227     if (rx_stats_active)
1228         rx_atomic_inc(&rx_stats.packetRequests);
1229     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1230         MUTEX_ENTER(&rx_freePktQ_lock);
1231
1232         if (queue_IsEmpty(&rx_freePacketQueue))
1233             rxi_MorePacketsNoLock(rx_maxSendWindow);
1234
1235         RX_TS_FPQ_GTOL(rx_ts_info);
1236
1237         MUTEX_EXIT(&rx_freePktQ_lock);
1238     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1239         return NULL;
1240     }
1241
1242     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1243
1244     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1245
1246     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1247      * order to truncate outbound packets.  In the near future, may need
1248      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1249      */
1250     RX_PACKET_IOV_FULLINIT(p);
1251     return p;
1252 }
1253 #endif /* RX_ENABLE_TSFPQ */
1254
1255 #ifdef RX_ENABLE_TSFPQ
1256 struct rx_packet *
1257 rxi_AllocPacket(int class)
1258 {
1259     struct rx_packet *p;
1260
1261     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1262     return p;
1263 }
1264 #else /* RX_ENABLE_TSFPQ */
1265 struct rx_packet *
1266 rxi_AllocPacket(int class)
1267 {
1268     struct rx_packet *p;
1269
1270     MUTEX_ENTER(&rx_freePktQ_lock);
1271     p = rxi_AllocPacketNoLock(class);
1272     MUTEX_EXIT(&rx_freePktQ_lock);
1273     return p;
1274 }
1275 #endif /* RX_ENABLE_TSFPQ */
1276
1277 /* This guy comes up with as many buffers as it {takes,can get} given
1278  * the MTU for this call. It also sets the packet length before
1279  * returning.  caution: this is often called at NETPRI
1280  * Called with call locked.
1281  */
1282 struct rx_packet *
1283 rxi_AllocSendPacket(struct rx_call *call, int want)
1284 {
1285     struct rx_packet *p = (struct rx_packet *)0;
1286     int mud;
1287     unsigned delta;
1288
1289     SPLVAR;
1290     mud = call->MTU - RX_HEADER_SIZE;
1291     delta =
1292         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1293         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1294
1295 #ifdef RX_ENABLE_TSFPQ
1296     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1297         want += delta;
1298         want = MIN(want, mud);
1299
1300         if ((unsigned)want > p->length)
1301             (void)rxi_AllocDataBuf(p, (want - p->length),
1302                                    RX_PACKET_CLASS_SEND_CBUF);
1303
1304         if (p->length > mud)
1305             p->length = mud;
1306
1307         if (delta >= p->length) {
1308             rxi_FreePacket(p);
1309             p = NULL;
1310         } else {
1311             p->length -= delta;
1312         }
1313         return p;
1314     }
1315 #endif /* RX_ENABLE_TSFPQ */
1316
1317     while (!(call->error)) {
1318         MUTEX_ENTER(&rx_freePktQ_lock);
1319         /* if an error occurred, or we get the packet we want, we're done */
1320         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1321             MUTEX_EXIT(&rx_freePktQ_lock);
1322
1323             want += delta;
1324             want = MIN(want, mud);
1325
1326             if ((unsigned)want > p->length)
1327                 (void)rxi_AllocDataBuf(p, (want - p->length),
1328                                        RX_PACKET_CLASS_SEND_CBUF);
1329
1330             if (p->length > mud)
1331                 p->length = mud;
1332
1333             if (delta >= p->length) {
1334                 rxi_FreePacket(p);
1335                 p = NULL;
1336             } else {
1337                 p->length -= delta;
1338             }
1339             break;
1340         }
1341
1342         /* no error occurred, and we didn't get a packet, so we sleep.
1343          * At this point, we assume that packets will be returned
1344          * sooner or later, as packets are acknowledged, and so we
1345          * just wait.  */
1346         NETPRI;
1347         call->flags |= RX_CALL_WAIT_PACKETS;
1348         MUTEX_ENTER(&rx_refcnt_mutex);
1349         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1350         MUTEX_EXIT(&rx_refcnt_mutex);
1351         MUTEX_EXIT(&call->lock);
1352         rx_waitingForPackets = 1;
1353
1354 #ifdef  RX_ENABLE_LOCKS
1355         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1356 #else
1357         osi_rxSleep(&rx_waitingForPackets);
1358 #endif
1359         MUTEX_EXIT(&rx_freePktQ_lock);
1360         MUTEX_ENTER(&call->lock);
1361         MUTEX_ENTER(&rx_refcnt_mutex);
1362         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1363         MUTEX_EXIT(&rx_refcnt_mutex);
1364         call->flags &= ~RX_CALL_WAIT_PACKETS;
1365         USERPRI;
1366     }
1367
1368     return p;
1369 }
1370
#ifndef KERNEL
#ifdef AFS_NT40_ENV
/* Windows does not use file descriptors. */
#define CountFDs(amax) 0
#else
/*
 * Count the used FDs: the number of descriptors in [0, amax) on which
 * fstat() succeeds.  Returns 0 when amax <= 0.
 */
static int
CountFDs(int amax)
{
    struct stat sbuf;
    int fd;
    int nopen = 0;

    for (fd = 0; fd < amax; fd++) {
        if (fstat(fd, &sbuf) == 0)
            nopen++;
    }
    return nopen;
}
#endif /* AFS_NT40_ENV */
#else /* KERNEL */

/* Kernel builds do not probe descriptors; the limit itself is reported.
 * The argument is parenthesised so that any expression expands safely. */
#define CountFDs(amax) (amax)

#endif /* KERNEL */
1398
1399 #if !defined(KERNEL) || defined(UKERNEL)
1400
1401 /* This function reads a single packet from the interface into the
1402  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1403  * (host,port) of the sender are stored in the supplied variables, and
1404  * the data length of the packet is stored in the packet structure.
1405  * The header is decoded. */
1406 int
1407 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1408                u_short * port)
1409 {
1410     struct sockaddr_in from;
1411     unsigned int nbytes;
1412     afs_int32 rlen;
1413     afs_uint32 tlen, savelen;
1414     struct msghdr msg;
1415     rx_computelen(p, tlen);
1416     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1417
1418     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1419     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1420                                  * it once in order to avoid races.  */
1421     tlen = rlen - tlen;
1422     if (tlen > 0) {
1423         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1424         if (tlen > 0) {
1425             tlen = rlen - tlen;
1426         } else
1427             tlen = rlen;
1428     } else
1429         tlen = rlen;
1430
1431     /* Extend the last iovec for padding, it's just to make sure that the
1432      * read doesn't return more data than we expect, and is done to get around
1433      * our problems caused by the lack of a length field in the rx header.
1434      * Use the extra buffer that follows the localdata in each packet
1435      * structure. */
1436     savelen = p->wirevec[p->niovecs - 1].iov_len;
1437     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1438
1439     memset(&msg, 0, sizeof(msg));
1440     msg.msg_name = (char *)&from;
1441     msg.msg_namelen = sizeof(struct sockaddr_in);
1442     msg.msg_iov = p->wirevec;
1443     msg.msg_iovlen = p->niovecs;
1444     nbytes = rxi_Recvmsg(socket, &msg, 0);
1445
1446     /* restore the vec to its correct state */
1447     p->wirevec[p->niovecs - 1].iov_len = savelen;
1448
1449     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1450     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1451         if (nbytes < 0 && errno == EWOULDBLOCK) {
1452             if (rx_stats_active)
1453                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1454         } else if (nbytes <= 0) {
1455             if (rx_stats_active) {
1456                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1457                 rx_stats.bogusHost = from.sin_addr.s_addr;
1458             }
1459             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1460                  ntohs(from.sin_port), nbytes));
1461         }
1462         return 0;
1463     }
1464 #ifdef RXDEBUG
1465     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1466                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1467         rxi_DecodePacketHeader(p);
1468
1469         *host = from.sin_addr.s_addr;
1470         *port = from.sin_port;
1471
1472         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1473               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1474               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1475               p->length));
1476 #ifdef RX_TRIMDATABUFS
1477         rxi_TrimDataBufs(p, 1);
1478 #endif
1479         return 0;
1480     }
1481 #endif
1482     else {
1483         /* Extract packet header. */
1484         rxi_DecodePacketHeader(p);
1485
1486         *host = from.sin_addr.s_addr;
1487         *port = from.sin_port;
1488         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1489             if (rx_stats_active) {
1490                 struct rx_peer *peer;
1491                 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1492                 /*
1493                  * Try to look up this peer structure.  If it doesn't exist,
1494                  * don't create a new one -
1495                  * we don't keep count of the bytes sent/received if a peer
1496                  * structure doesn't already exist.
1497                  *
1498                  * The peer/connection cleanup code assumes that there is 1 peer
1499                  * per connection.  If we actually created a peer structure here
1500                  * and this packet was an rxdebug packet, the peer structure would
1501                  * never be cleaned up.
1502                  */
1503                 peer = rxi_FindPeer(*host, *port, 0, 0);
1504                 /* Since this may not be associated with a connection,
1505                  * it may have no refCount, meaning we could race with
1506                  * ReapConnections
1507                  */
1508                 if (peer && (peer->refCount > 0)) {
1509                     MUTEX_ENTER(&peer->peer_lock);
1510                     hadd32(peer->bytesReceived, p->length);
1511                     MUTEX_EXIT(&peer->peer_lock);
1512                 }
1513             }
1514         }
1515
1516 #ifdef RX_TRIMDATABUFS
1517         /* Free any empty packet buffers at the end of this packet */
1518         rxi_TrimDataBufs(p, 1);
1519 #endif
1520         return 1;
1521     }
1522 }
1523
1524 #endif /* !KERNEL || UKERNEL */
1525
1526 /* This function splits off the first packet in a jumbo packet.
1527  * As of AFS 3.5, jumbograms contain more than one fixed size
1528  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1529  * last packet header. All packets (except the last) are padded to
1530  * fall on RX_CBUFFERSIZE boundaries.
1531  * HACK: We store the length of the first n-1 packets in the
1532  * last two pad bytes. */
1533
1534 struct rx_packet *
1535 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1536                      int first)
1537 {
1538     struct rx_packet *np;
1539     struct rx_jumboHeader *jp;
1540     int niov, i;
1541     struct iovec *iov;
1542     int length;
1543     afs_uint32 temp;
1544
1545     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1546      * bytes in length. All but the first packet are preceded by
1547      * an abbreviated four byte header. The length of the last packet
1548      * is calculated from the size of the jumbogram. */
1549     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1550
1551     if ((int)p->length < length) {
1552         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1553         return NULL;
1554     }
1555     niov = p->niovecs - 2;
1556     if (niov < 1) {
1557         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1558         return NULL;
1559     }
1560     iov = &p->wirevec[2];
1561     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1562
1563     /* Get a pointer to the abbreviated packet header */
1564     jp = (struct rx_jumboHeader *)
1565         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1566
1567     /* Set up the iovecs for the next packet */
1568     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1569     np->wirevec[0].iov_len = sizeof(struct rx_header);
1570     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1571     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1572     np->niovecs = niov + 1;
1573     for (i = 2, iov++; i <= niov; i++, iov++) {
1574         np->wirevec[i] = *iov;
1575     }
1576     np->length = p->length - length;
1577     p->length = RX_JUMBOBUFFERSIZE;
1578     p->niovecs = 2;
1579
1580     /* Convert the jumbo packet header to host byte order */
1581     temp = ntohl(*(afs_uint32 *) jp);
1582     jp->flags = (u_char) (temp >> 24);
1583     jp->cksum = (u_short) (temp);
1584
1585     /* Fill in the packet header */
1586     np->header = p->header;
1587     np->header.serial = p->header.serial + 1;
1588     np->header.seq = p->header.seq + 1;
1589     np->header.flags = jp->flags;
1590     np->header.spare = jp->cksum;
1591
1592     return np;
1593 }
1594
1595 #ifndef KERNEL
1596 /* Send a udp datagram */
1597 int
1598 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1599             int length, int istack)
1600 {
1601     struct msghdr msg;
1602         int ret;
1603
1604     memset(&msg, 0, sizeof(msg));
1605     msg.msg_iov = dvec;
1606     msg.msg_iovlen = nvecs;
1607     msg.msg_name = addr;
1608     msg.msg_namelen = sizeof(struct sockaddr_in);
1609
1610     ret = rxi_Sendmsg(socket, &msg, 0);
1611
1612     return ret;
1613 }
1614 #elif !defined(UKERNEL)
1615 /*
1616  * message receipt is done in rxk_input or rx_put.
1617  */
1618
1619 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1620 /*
1621  * Copy an mblock to the contiguous area pointed to by cp.
1622  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1623  * but it doesn't really.
1624  * Returns the number of bytes not transferred.
1625  * The message is NOT changed.
1626  */
1627 static int
1628 cpytoc(mblk_t * mp, int off, int len, char *cp)
1629 {
1630     int n;
1631
1632     for (; mp && len > 0; mp = mp->b_cont) {
1633         if (mp->b_datap->db_type != M_DATA) {
1634             return -1;
1635         }
1636         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1637         memcpy(cp, (char *)mp->b_rptr, n);
1638         cp += n;
1639         len -= n;
1640         mp->b_rptr += n;
1641     }
1642     return (len);
1643 }
1644
1645 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1646  * but it doesn't really.
1647  * This sucks, anyway, do it like m_cpy.... below
1648  */
1649 static int
1650 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1651            int niovs)
1652 {
1653     int m, n, o, t, i;
1654
1655     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1656         if (mp->b_datap->db_type != M_DATA) {
1657             return -1;
1658         }
1659         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1660         len -= n;
1661         while (n) {
1662             if (!t) {
1663                 o = 0;
1664                 i++;
1665                 t = iovs[i].iov_len;
1666             }
1667             m = MIN(n, t);
1668             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1669             mp->b_rptr += m;
1670             o += m;
1671             t -= m;
1672             n -= m;
1673         }
1674     }
1675     return (len);
1676 }
1677
1678 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1679 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1680 #else
1681 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1682 static int
1683 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1684 {
1685     caddr_t p1, p2;
1686     unsigned int l1, l2, i, t;
1687
1688     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1689         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1690
1691     while (off && m)
1692         if (m->m_len <= off) {
1693             off -= m->m_len;
1694             m = m->m_next;
1695             continue;
1696         } else
1697             break;
1698
1699     if (m == NULL)
1700         return len;
1701
1702     p1 = mtod(m, caddr_t) + off;
1703     l1 = m->m_len - off;
1704     i = 0;
1705     p2 = iovs[0].iov_base;
1706     l2 = iovs[0].iov_len;
1707
1708     while (len) {
1709         t = MIN(l1, MIN(l2, (unsigned int)len));
1710         memcpy(p2, p1, t);
1711         p1 += t;
1712         p2 += t;
1713         l1 -= t;
1714         l2 -= t;
1715         len -= t;
1716         if (!l1) {
1717             m = m->m_next;
1718             if (!m)
1719                 break;
1720             p1 = mtod(m, caddr_t);
1721             l1 = m->m_len;
1722         }
1723         if (!l2) {
1724             if (++i >= niovs)
1725                 break;
1726             p2 = iovs[i].iov_base;
1727             l2 = iovs[i].iov_len;
1728         }
1729
1730     }
1731
1732     return len;
1733 }
1734 #endif /* LINUX */
1735 #endif /* AFS_SUN5_ENV */
1736
1737 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1738 #if defined(AFS_NBSD_ENV)
1739 int
1740 rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
1741 #else
1742 int
1743 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1744 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1745      mblk_t *amb;
1746 #else
1747      struct mbuf *amb;
1748 #endif
1749      void (*free) ();
1750      struct rx_packet *phandle;
1751      int hdr_len, data_len;
1752 #endif /* AFS_NBSD_ENV */
1753 {
1754     int code;
1755
1756     code =
1757         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1758                      phandle->niovecs);
1759     (*free) (amb);
1760
1761     return code;
1762 }
1763 #endif /* LINUX */
1764 #endif /*KERNEL && !UKERNEL */
1765
1766
1767 /* send a response to a debug packet */
1768
1769 struct rx_packet *
1770 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1771                        afs_uint32 ahost, short aport, int istack)
1772 {
1773     struct rx_debugIn tin;
1774     afs_int32 tl;
1775     struct rx_serverQueueEntry *np, *nqe;
1776
1777     /*
1778      * Only respond to client-initiated Rx debug packets,
1779      * and clear the client flag in the response.
1780      */
1781     if (ap->header.flags & RX_CLIENT_INITIATED) {
1782         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1783         rxi_EncodePacketHeader(ap);
1784     } else {
1785         return ap;
1786     }
1787
1788     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1789     /* all done with packet, now set length to the truth, so we can
1790      * reuse this packet */
1791     rx_computelen(ap, ap->length);
1792
1793     tin.type = ntohl(tin.type);
1794     tin.index = ntohl(tin.index);
1795     switch (tin.type) {
1796     case RX_DEBUGI_GETSTATS:{
1797             struct rx_debugStats tstat;
1798
1799             /* get basic stats */
1800             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1801             tstat.version = RX_DEBUGI_VERSION;
1802 #ifndef RX_ENABLE_LOCKS
1803             tstat.waitingForPackets = rx_waitingForPackets;
1804 #endif
1805             MUTEX_ENTER(&rx_serverPool_lock);
1806             tstat.nFreePackets = htonl(rx_nFreePackets);
1807             tstat.nPackets = htonl(rx_nPackets);
1808             tstat.callsExecuted = htonl(rxi_nCalls);
1809             tstat.packetReclaims = htonl(rx_packetReclaims);
1810             tstat.usedFDs = CountFDs(64);
1811             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1812             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1813             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1814                         tstat.idleThreads);
1815             MUTEX_EXIT(&rx_serverPool_lock);
1816             tstat.idleThreads = htonl(tstat.idleThreads);
1817             tl = sizeof(struct rx_debugStats) - ap->length;
1818             if (tl > 0)
1819                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1820
1821             if (tl <= 0) {
1822                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1823                                (char *)&tstat);
1824                 ap->length = sizeof(struct rx_debugStats);
1825                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1826                 rx_computelen(ap, ap->length);
1827             }
1828             break;
1829         }
1830
1831     case RX_DEBUGI_GETALLCONN:
1832     case RX_DEBUGI_GETCONN:{
1833             unsigned int i, j;
1834             struct rx_connection *tc;
1835             struct rx_call *tcall;
1836             struct rx_debugConn tconn;
1837             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1838
1839
1840             tl = sizeof(struct rx_debugConn) - ap->length;
1841             if (tl > 0)
1842                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1843             if (tl > 0)
1844                 return ap;
1845
1846             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1847             /* get N'th (maybe) "interesting" connection info */
1848             for (i = 0; i < rx_hashTableSize; i++) {
1849 #if !defined(KERNEL)
1850                 /* the time complexity of the algorithm used here
1851                  * exponentially increses with the number of connections.
1852                  */
1853 #ifdef AFS_PTHREAD_ENV
1854                 pthread_yield();
1855 #else
1856                 (void)IOMGR_Poll();
1857 #endif
1858 #endif
1859                 MUTEX_ENTER(&rx_connHashTable_lock);
1860                 /* We might be slightly out of step since we are not
1861                  * locking each call, but this is only debugging output.
1862                  */
1863                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1864                     if ((all || rxi_IsConnInteresting(tc))
1865                         && tin.index-- <= 0) {
1866                         tconn.host = tc->peer->host;
1867                         tconn.port = tc->peer->port;
1868                         tconn.cid = htonl(tc->cid);
1869                         tconn.epoch = htonl(tc->epoch);
1870                         tconn.serial = htonl(tc->serial);
1871                         for (j = 0; j < RX_MAXCALLS; j++) {
1872                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1873                             if ((tcall = tc->call[j])) {
1874                                 tconn.callState[j] = tcall->state;
1875                                 tconn.callMode[j] = tcall->mode;
1876                                 tconn.callFlags[j] = tcall->flags;
1877                                 if (queue_IsNotEmpty(&tcall->rq))
1878                                     tconn.callOther[j] |= RX_OTHER_IN;
1879                                 if (queue_IsNotEmpty(&tcall->tq))
1880                                     tconn.callOther[j] |= RX_OTHER_OUT;
1881                             } else
1882                                 tconn.callState[j] = RX_STATE_NOTINIT;
1883                         }
1884
1885                         tconn.natMTU = htonl(tc->peer->natMTU);
1886                         tconn.error = htonl(tc->error);
1887                         tconn.flags = tc->flags;
1888                         tconn.type = tc->type;
1889                         tconn.securityIndex = tc->securityIndex;
1890                         if (tc->securityObject) {
1891                             RXS_GetStats(tc->securityObject, tc,
1892                                          &tconn.secStats);
1893 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1894 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1895                             DOHTONL(flags);
1896                             DOHTONL(expires);
1897                             DOHTONL(packetsReceived);
1898                             DOHTONL(packetsSent);
1899                             DOHTONL(bytesReceived);
1900                             DOHTONL(bytesSent);
1901                             for (i = 0;
1902                                  i <
1903                                  sizeof(tconn.secStats.spares) /
1904                                  sizeof(short); i++)
1905                                 DOHTONS(spares[i]);
1906                             for (i = 0;
1907                                  i <
1908                                  sizeof(tconn.secStats.sparel) /
1909                                  sizeof(afs_int32); i++)
1910                                 DOHTONL(sparel[i]);
1911                         }
1912
1913                         MUTEX_EXIT(&rx_connHashTable_lock);
1914                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1915                                        (char *)&tconn);
1916                         tl = ap->length;
1917                         ap->length = sizeof(struct rx_debugConn);
1918                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1919                                             istack);
1920                         ap->length = tl;
1921                         return ap;
1922                     }
1923                 }
1924                 MUTEX_EXIT(&rx_connHashTable_lock);
1925             }
1926             /* if we make it here, there are no interesting packets */
1927             tconn.cid = htonl(0xffffffff);      /* means end */
1928             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1929                            (char *)&tconn);
1930             tl = ap->length;
1931             ap->length = sizeof(struct rx_debugConn);
1932             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1933             ap->length = tl;
1934             break;
1935         }
1936
1937         /*
1938          * Pass back all the peer structures we have available
1939          */
1940
1941     case RX_DEBUGI_GETPEER:{
1942             unsigned int i;
1943             struct rx_peer *tp;
1944             struct rx_debugPeer tpeer;
1945
1946
1947             tl = sizeof(struct rx_debugPeer) - ap->length;
1948             if (tl > 0)
1949                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1950             if (tl > 0)
1951                 return ap;
1952
1953             memset(&tpeer, 0, sizeof(tpeer));
1954             for (i = 0; i < rx_hashTableSize; i++) {
1955 #if !defined(KERNEL)
1956                 /* the time complexity of the algorithm used here
1957                  * exponentially increases with the number of peers.
1958                  *
1959                  * Yielding after processing each hash table entry
1960                  * and dropping rx_peerHashTable_lock.
1961                  * also increases the risk that we will miss a new
1962                  * entry - but we are willing to live with this
1963                  * limitation since this is meant for debugging only
1964                  */
1965 #ifdef AFS_PTHREAD_ENV
1966                 pthread_yield();
1967 #else
1968                 (void)IOMGR_Poll();
1969 #endif
1970 #endif
1971                 MUTEX_ENTER(&rx_peerHashTable_lock);
1972                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1973                     if (tin.index-- <= 0) {
1974                         tp->refCount++;
1975                         MUTEX_EXIT(&rx_peerHashTable_lock);
1976
1977                         MUTEX_ENTER(&tp->peer_lock);
1978                         tpeer.host = tp->host;
1979                         tpeer.port = tp->port;
1980                         tpeer.ifMTU = htons(tp->ifMTU);
1981                         tpeer.idleWhen = htonl(tp->idleWhen);
1982                         tpeer.refCount = htons(tp->refCount);
1983                         tpeer.burstSize = tp->burstSize;
1984                         tpeer.burst = tp->burst;
1985                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1986                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1987                         tpeer.rtt = htonl(tp->rtt);
1988                         tpeer.rtt_dev = htonl(tp->rtt_dev);
1989                         tpeer.nSent = htonl(tp->nSent);
1990                         tpeer.reSends = htonl(tp->reSends);
1991                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
1992                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
1993                         tpeer.natMTU = htons(tp->natMTU);
1994                         tpeer.maxMTU = htons(tp->maxMTU);
1995                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1996                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1997                         tpeer.MTU = htons(tp->MTU);
1998                         tpeer.cwind = htons(tp->cwind);
1999                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2000                         tpeer.congestSeq = htons(tp->congestSeq);
2001                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2002                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2003                         tpeer.bytesReceived.high =
2004                             htonl(tp->bytesReceived.high);
2005                         tpeer.bytesReceived.low =
2006                             htonl(tp->bytesReceived.low);
2007                         MUTEX_EXIT(&tp->peer_lock);
2008
2009                         MUTEX_ENTER(&rx_peerHashTable_lock);
2010                         tp->refCount--;
2011                         MUTEX_EXIT(&rx_peerHashTable_lock);
2012
2013                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2014                                        (char *)&tpeer);
2015                         tl = ap->length;
2016                         ap->length = sizeof(struct rx_debugPeer);
2017                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2018                                             istack);
2019                         ap->length = tl;
2020                         return ap;
2021                     }
2022                 }
2023                 MUTEX_EXIT(&rx_peerHashTable_lock);
2024             }
2025             /* if we make it here, there are no interesting packets */
2026             tpeer.host = htonl(0xffffffff);     /* means end */
2027             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2028                            (char *)&tpeer);
2029             tl = ap->length;
2030             ap->length = sizeof(struct rx_debugPeer);
2031             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2032             ap->length = tl;
2033             break;
2034         }
2035
2036     case RX_DEBUGI_RXSTATS:{
2037             int i;
2038             afs_int32 *s;
2039
2040             tl = sizeof(rx_stats) - ap->length;
2041             if (tl > 0)
2042                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2043             if (tl > 0)
2044                 return ap;
2045
2046             /* Since its all int32s convert to network order with a loop. */
2047         if (rx_stats_active)
2048             MUTEX_ENTER(&rx_stats_mutex);
2049             s = (afs_int32 *) & rx_stats;
2050             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2051                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2052
2053             tl = ap->length;
2054             ap->length = sizeof(rx_stats);
2055         if (rx_stats_active)
2056             MUTEX_EXIT(&rx_stats_mutex);
2057             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2058             ap->length = tl;
2059             break;
2060         }
2061
2062     default:
2063         /* error response packet */
2064         tin.type = htonl(RX_DEBUGI_BADTYPE);
2065         tin.index = tin.type;
2066         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2067         tl = ap->length;
2068         ap->length = sizeof(struct rx_debugIn);
2069         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2070         ap->length = tl;
2071         break;
2072     }
2073     return ap;
2074 }
2075
2076 struct rx_packet *
2077 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2078                          afs_uint32 ahost, short aport, int istack)
2079 {
2080     afs_int32 tl;
2081
2082     /*
2083      * Only respond to client-initiated version requests, and
2084      * clear that flag in the response.
2085      */
2086     if (ap->header.flags & RX_CLIENT_INITIATED) {
2087         char buf[66];
2088
2089         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2090         rxi_EncodePacketHeader(ap);
2091         memset(buf, 0, sizeof(buf));
2092         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2093         rx_packetwrite(ap, 0, 65, buf);
2094         tl = ap->length;
2095         ap->length = 65;
2096         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2097         ap->length = tl;
2098     }
2099
2100     return ap;
2101 }
2102
2103
2104 /* send a debug packet back to the sender */
2105 static void
2106 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2107                     afs_uint32 ahost, short aport, afs_int32 istack)
2108 {
2109     struct sockaddr_in taddr;
2110     unsigned int i, nbytes, savelen = 0;
2111     int saven = 0;
2112 #ifdef KERNEL
2113     int waslocked = ISAFS_GLOCK();
2114 #endif
2115
2116     taddr.sin_family = AF_INET;
2117     taddr.sin_port = aport;
2118     taddr.sin_addr.s_addr = ahost;
2119 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2120     taddr.sin_len = sizeof(struct sockaddr_in);
2121 #endif
2122
2123     /* We need to trim the niovecs. */
2124     nbytes = apacket->length;
2125     for (i = 1; i < apacket->niovecs; i++) {
2126         if (nbytes <= apacket->wirevec[i].iov_len) {
2127             savelen = apacket->wirevec[i].iov_len;
2128             saven = apacket->niovecs;
2129             apacket->wirevec[i].iov_len = nbytes;
2130             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2131         } else
2132             nbytes -= apacket->wirevec[i].iov_len;
2133     }
2134 #ifdef KERNEL
2135 #ifdef RX_KERNEL_TRACE
2136     if (ICL_SETACTIVE(afs_iclSetp)) {
2137         if (!waslocked)
2138             AFS_GLOCK();
2139         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2140                    "before osi_NetSend()");
2141         AFS_GUNLOCK();
2142     } else
2143 #else
2144     if (waslocked)
2145         AFS_GUNLOCK();
2146 #endif
2147 #endif
2148     /* debug packets are not reliably delivered, hence the cast below. */
2149     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2150                       apacket->length + RX_HEADER_SIZE, istack);
2151 #ifdef KERNEL
2152 #ifdef RX_KERNEL_TRACE
2153     if (ICL_SETACTIVE(afs_iclSetp)) {
2154         AFS_GLOCK();
2155         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2156                    "after osi_NetSend()");
2157         if (!waslocked)
2158             AFS_GUNLOCK();
2159     } else
2160 #else
2161     if (waslocked)
2162         AFS_GLOCK();
2163 #endif
2164 #endif
2165     if (saven) {                /* means we truncated the packet above. */
2166         apacket->wirevec[i - 1].iov_len = savelen;
2167         apacket->niovecs = saven;
2168     }
2169
2170 }
2171
2172 /* Send the packet to appropriate destination for the specified
2173  * call.  The header is first encoded and placed in the packet.
2174  */
2175 void
2176 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2177                struct rx_packet *p, int istack)
2178 {
2179 #if defined(KERNEL)
2180     int waslocked;
2181 #endif
2182     int code;
2183     struct sockaddr_in addr;
2184     struct rx_peer *peer = conn->peer;
2185     osi_socket socket;
2186 #ifdef RXDEBUG
2187     char deliveryType = 'S';
2188 #endif
2189     /* The address we're sending the packet to */
2190     memset(&addr, 0, sizeof(addr));
2191     addr.sin_family = AF_INET;
2192     addr.sin_port = peer->port;
2193     addr.sin_addr.s_addr = peer->host;
2194
2195     /* This stuff should be revamped, I think, so that most, if not
2196      * all, of the header stuff is always added here.  We could
2197      * probably do away with the encode/decode routines. XXXXX */
2198
2199     /* Stamp each packet with a unique serial number.  The serial
2200      * number is maintained on a connection basis because some types
2201      * of security may be based on the serial number of the packet,
2202      * and security is handled on a per authenticated-connection
2203      * basis. */
2204     /* Pre-increment, to guarantee no zero serial number; a zero
2205      * serial number means the packet was never sent. */
2206     MUTEX_ENTER(&conn->conn_data_lock);
2207     p->header.serial = ++conn->serial;
2208     if (p->length > conn->peer->maxPacketSize) {
2209         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2210             (p->header.flags & RX_REQUEST_ACK)) {
2211             conn->lastPingSize = p->length;
2212             conn->lastPingSizeSer = p->header.serial;
2213         } else if (p->header.seq != 0) {
2214             conn->lastPacketSize = p->length;
2215             conn->lastPacketSizeSeq = p->header.seq;
2216         }
2217     }
2218     MUTEX_EXIT(&conn->conn_data_lock);
2219     /* This is so we can adjust retransmit time-outs better in the face of
2220      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2221      */
2222     if (p->firstSerial == 0) {
2223         p->firstSerial = p->header.serial;
2224     }
2225 #ifdef RXDEBUG
2226     /* If an output tracer function is defined, call it with the packet and
2227      * network address.  Note this function may modify its arguments. */
2228     if (rx_almostSent) {
2229         int drop = (*rx_almostSent) (p, &addr);
2230         /* drop packet if return value is non-zero? */
2231         if (drop)
2232             deliveryType = 'D'; /* Drop the packet */
2233     }
2234 #endif
2235
2236     /* Get network byte order header */
2237     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2238                                  * touch ALL the fields */
2239
2240     /* Send the packet out on the same socket that related packets are being
2241      * received on */
2242     socket =
2243         (conn->type ==
2244          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2245
2246 #ifdef RXDEBUG
2247     /* Possibly drop this packet,  for testing purposes */
2248     if ((deliveryType == 'D')
2249         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2250             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2251         deliveryType = 'D';     /* Drop the packet */
2252     } else {
2253         deliveryType = 'S';     /* Send the packet */
2254 #endif /* RXDEBUG */
2255
2256         /* Loop until the packet is sent.  We'd prefer just to use a
2257          * blocking socket, but unfortunately the interface doesn't
2258          * allow us to have the socket block in send mode, and not
2259          * block in receive mode */
2260 #ifdef KERNEL
2261         waslocked = ISAFS_GLOCK();
2262 #ifdef RX_KERNEL_TRACE
2263         if (ICL_SETACTIVE(afs_iclSetp)) {
2264             if (!waslocked)
2265                 AFS_GLOCK();
2266             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2267                        "before osi_NetSend()");
2268             AFS_GUNLOCK();
2269         } else
2270 #else
2271         if (waslocked)
2272             AFS_GUNLOCK();
2273 #endif
2274 #endif
2275         if ((code =
2276              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2277                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2278             /* send failed, so let's hurry up the resend, eh? */
2279             if (rx_stats_active)
2280                 rx_atomic_inc(&rx_stats.netSendFailures);
2281             p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2282
2283             /* Some systems are nice and tell us right away that we cannot
2284              * reach this recipient by returning an error code.
2285              * So, when this happens let's "down" the host NOW so
2286              * we don't sit around waiting for this host to timeout later.
2287              */
2288             if (call &&
2289 #ifdef AFS_NT40_ENV
2290                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2291 #elif defined(AFS_LINUX20_ENV)
2292                 code == -ENETUNREACH
2293 #elif defined(AFS_DARWIN_ENV)
2294                 code == EHOSTUNREACH
2295 #else
2296                 0
2297 #endif
2298                 )
2299                 call->lastReceiveTime = 0;
2300         }
2301 #ifdef KERNEL
2302 #ifdef RX_KERNEL_TRACE
2303         if (ICL_SETACTIVE(afs_iclSetp)) {
2304             AFS_GLOCK();
2305             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2306                        "after osi_NetSend()");
2307             if (!waslocked)
2308                 AFS_GUNLOCK();
2309         } else
2310 #else
2311         if (waslocked)
2312             AFS_GLOCK();
2313 #endif
2314 #endif
2315 #ifdef RXDEBUG
2316     }
2317     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2318           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2319           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2320           p->header.seq, p->header.flags, p, p->length));
2321 #endif
2322     if (rx_stats_active) {
2323         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2324         MUTEX_ENTER(&peer->peer_lock);
2325         hadd32(peer->bytesSent, p->length);
2326         MUTEX_EXIT(&peer->peer_lock);
2327     }
2328 }
2329
2330 /* Send a list of packets to appropriate destination for the specified
2331  * connection.  The headers are first encoded and placed in the packets.
2332  */
2333 void
2334 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2335                    struct rx_packet **list, int len, int istack)
2336 {
2337 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2338     int waslocked;
2339 #endif
2340     struct sockaddr_in addr;
2341     struct rx_peer *peer = conn->peer;
2342     osi_socket socket;
2343     struct rx_packet *p = NULL;
2344     struct iovec wirevec[RX_MAXIOVECS];
2345     int i, length, code;
2346     afs_uint32 serial;
2347     afs_uint32 temp;
2348     struct rx_jumboHeader *jp;
2349 #ifdef RXDEBUG
2350     char deliveryType = 'S';
2351 #endif
2352     /* The address we're sending the packet to */
2353     addr.sin_family = AF_INET;
2354     addr.sin_port = peer->port;
2355     addr.sin_addr.s_addr = peer->host;
2356
2357     if (len + 1 > RX_MAXIOVECS) {
2358         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2359     }
2360
2361     /*
2362      * Stamp the packets in this jumbogram with consecutive serial numbers
2363      */
2364     MUTEX_ENTER(&conn->conn_data_lock);
2365     serial = conn->serial;
2366     conn->serial += len;
2367     for (i = 0; i < len; i++) {
2368         p = list[i];
2369         if (p->length > conn->peer->maxPacketSize) {
2370             /* a ping *or* a sequenced packet can count */
2371             if ((p->length > conn->peer->maxPacketSize)) {
2372                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2373                      (p->header.flags & RX_REQUEST_ACK)) &&
2374                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2375                     conn->lastPingSize = p->length;
2376                     conn->lastPingSizeSer = serial + i;
2377                 } else if ((p->header.seq != 0) &&
2378                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2379                     conn->lastPacketSize = p->length;
2380                     conn->lastPacketSizeSeq = p->header.seq;
2381                 }
2382             }
2383         }
2384     }
2385     MUTEX_EXIT(&conn->conn_data_lock);
2386
2387
2388     /* This stuff should be revamped, I think, so that most, if not
2389      * all, of the header stuff is always added here.  We could
2390      * probably do away with the encode/decode routines. XXXXX */
2391
2392     jp = NULL;
2393     length = RX_HEADER_SIZE;
2394     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2395     wirevec[0].iov_len = RX_HEADER_SIZE;
2396     for (i = 0; i < len; i++) {
2397         p = list[i];
2398
2399         /* The whole 3.5 jumbogram scheme relies on packets fitting
2400          * in a single packet buffer. */
2401         if (p->niovecs > 2) {
2402             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2403         }
2404
2405         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2406          * in this chunk.  */
2407         if (i < len - 1) {
2408             if (p->length != RX_JUMBOBUFFERSIZE) {
2409                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2410             }
2411             p->header.flags |= RX_JUMBO_PACKET;
2412             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2413             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2414         } else {
2415             wirevec[i + 1].iov_len = p->length;
2416             length += p->length;
2417         }
2418         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2419         if (jp != NULL) {
2420             /* Convert jumbo packet header to network byte order */
2421             temp = (afs_uint32) (p->header.flags) << 24;
2422             temp |= (afs_uint32) (p->header.spare);
2423             *(afs_uint32 *) jp = htonl(temp);
2424         }
2425         jp = (struct rx_jumboHeader *)
2426             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2427
2428         /* Stamp each packet with a unique serial number.  The serial
2429          * number is maintained on a connection basis because some types
2430          * of security may be based on the serial number of the packet,
2431          * and security is handled on a per authenticated-connection
2432          * basis. */
2433         /* Pre-increment, to guarantee no zero serial number; a zero
2434          * serial number means the packet was never sent. */
2435         p->header.serial = ++serial;
2436         /* This is so we can adjust retransmit time-outs better in the face of
2437          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2438          */
2439         if (p->firstSerial == 0) {
2440             p->firstSerial = p->header.serial;
2441         }
2442 #ifdef RXDEBUG
2443         /* If an output tracer function is defined, call it with the packet and
2444          * network address.  Note this function may modify its arguments. */
2445         if (rx_almostSent) {
2446             int drop = (*rx_almostSent) (p, &addr);
2447             /* drop packet if return value is non-zero? */
2448             if (drop)
2449                 deliveryType = 'D';     /* Drop the packet */
2450         }
2451 #endif
2452
2453         /* Get network byte order header */
2454         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2455                                          * touch ALL the fields */
2456     }
2457
2458     /* Send the packet out on the same socket that related packets are being
2459      * received on */
2460     socket =
2461         (conn->type ==
2462          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2463
2464 #ifdef RXDEBUG
2465     /* Possibly drop this packet,  for testing purposes */
2466     if ((deliveryType == 'D')
2467         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2468             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2469         deliveryType = 'D';     /* Drop the packet */
2470     } else {
2471         deliveryType = 'S';     /* Send the packet */
2472 #endif /* RXDEBUG */
2473
2474         /* Loop until the packet is sent.  We'd prefer just to use a
2475          * blocking socket, but unfortunately the interface doesn't
2476          * allow us to have the socket block in send mode, and not
2477          * block in receive mode */
2478 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2479         waslocked = ISAFS_GLOCK();
2480         if (!istack && waslocked)
2481             AFS_GUNLOCK();
2482 #endif
2483         if ((code =
2484              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2485                          istack)) != 0) {
2486             /* send failed, so let's hurry up the resend, eh? */
2487             if (rx_stats_active)
2488                 rx_atomic_inc(&rx_stats.netSendFailures);
2489             for (i = 0; i < len; i++) {
2490                 p = list[i];
2491                 p->flags &= ~RX_PKTFLAG_SENT;  /* resend it very soon */
2492             }
2493             /* Some systems are nice and tell us right away that we cannot
2494              * reach this recipient by returning an error code.
2495              * So, when this happens let's "down" the host NOW so
2496              * we don't sit around waiting for this host to timeout later.
2497              */
2498             if (call &&
2499 #ifdef AFS_NT40_ENV
2500                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2501 #elif defined(AFS_LINUX20_ENV)
2502                 code == -ENETUNREACH
2503 #elif defined(AFS_DARWIN_ENV)
2504                 code == EHOSTUNREACH
2505 #else
2506                 0
2507 #endif
2508                 )
2509                 call->lastReceiveTime = 0;
2510         }
2511 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2512         if (!istack && waslocked)
2513             AFS_GLOCK();
2514 #endif
2515 #ifdef RXDEBUG
2516     }
2517
2518     osi_Assert(p != NULL);
2519
2520     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2521           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2522           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2523           p->header.seq, p->header.flags, p, p->length));
2524
2525 #endif
2526     if (rx_stats_active) {
2527         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2528         MUTEX_ENTER(&peer->peer_lock);
2529         hadd32(peer->bytesSent, p->length);
2530         MUTEX_EXIT(&peer->peer_lock);
2531     }
2532 }
2533
2534
2535 /* Send a "special" packet to the peer connection.  If call is
2536  * specified, then the packet is directed to a specific call channel
2537  * associated with the connection, otherwise it is directed to the
2538  * connection only. Uses optionalPacket if it is supplied, rather than
2539  * allocating a new packet buffer.  Nbytes is the length of the data
2540  * portion of the packet.  If data is non-null, nbytes of data are
2541  * copied into the packet.  Type is the type of the packet, as defined
2542  * in rx.h.  Bug: there's a lot of duplication between this and other
2543  * routines.  This needs to be cleaned up. */
2544 struct rx_packet *
2545 rxi_SendSpecial(struct rx_call *call,
2546                 struct rx_connection *conn,
2547                 struct rx_packet *optionalPacket, int type, char *data,
2548                 int nbytes, int istack)
2549 {
2550     /* Some of the following stuff should be common code for all
2551      * packet sends (it's repeated elsewhere) */
2552     struct rx_packet *p;
2553     unsigned int i = 0;
2554     int savelen = 0, saven = 0;
2555     int channel, callNumber;
2556     if (call) {
2557         channel = call->channel;
2558         callNumber = *call->callNumber;
2559         /* BUSY packets refer to the next call on this connection */
2560         if (type == RX_PACKET_TYPE_BUSY) {
2561             callNumber++;
2562         }
2563     } else {
2564         channel = 0;
2565         callNumber = 0;
2566     }
2567     p = optionalPacket;
2568     if (!p) {
2569         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2570         if (!p)
2571             osi_Panic("rxi_SendSpecial failure");
2572     }
2573
2574     if (nbytes != -1)
2575         p->length = nbytes;
2576     else
2577         nbytes = p->length;
2578     p->header.serviceId = conn->serviceId;
2579     p->header.securityIndex = conn->securityIndex;
2580     p->header.cid = (conn->cid | channel);
2581     p->header.callNumber = callNumber;
2582     p->header.seq = 0;
2583     p->header.epoch = conn->epoch;
2584     p->header.type = type;
2585     p->header.flags = 0;
2586     if (conn->type == RX_CLIENT_CONNECTION)
2587         p->header.flags |= RX_CLIENT_INITIATED;
2588     if (data)
2589         rx_packetwrite(p, 0, nbytes, data);
2590
2591     for (i = 1; i < p->niovecs; i++) {
2592         if (nbytes <= p->wirevec[i].iov_len) {
2593             savelen = p->wirevec[i].iov_len;
2594             saven = p->niovecs;
2595             p->wirevec[i].iov_len = nbytes;
2596             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2597         } else
2598             nbytes -= p->wirevec[i].iov_len;
2599     }
2600
2601     if (call)
2602         rxi_Send(call, p, istack);
2603     else
2604         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2605     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2606         /* really need to do this, but it seems safer this way, given that  */
2607         /* sneaky optionalPacket... */
2608         p->wirevec[i - 1].iov_len = savelen;
2609         p->niovecs = saven;
2610     }
2611     if (!optionalPacket)
2612         rxi_FreePacket(p);
2613     return optionalPacket;
2614 }
2615
2616
2617 /* Encode the packet's header (from the struct header in the packet to
2618  * the net byte order representation in the wire representation of the
2619  * packet, which is what is actually sent out on the wire) */
2620 void
2621 rxi_EncodePacketHeader(struct rx_packet *p)
2622 {
2623     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2624
2625     memset(buf, 0, RX_HEADER_SIZE);
2626     *buf++ = htonl(p->header.epoch);
2627     *buf++ = htonl(p->header.cid);
2628     *buf++ = htonl(p->header.callNumber);
2629     *buf++ = htonl(p->header.seq);
2630     *buf++ = htonl(p->header.serial);
2631     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2632                    | (((afs_uint32) p->header.flags) << 16)
2633                    | (p->header.userStatus << 8) | p->header.securityIndex);
2634     /* Note: top 16 bits of this next word were reserved */
2635     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2636 }
2637
2638 /* Decode the packet's header (from net byte order to a struct header) */
2639 void
2640 rxi_DecodePacketHeader(struct rx_packet *p)
2641 {
2642     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2643     afs_uint32 temp;
2644
2645     p->header.epoch = ntohl(*buf);
2646     buf++;
2647     p->header.cid = ntohl(*buf);
2648     buf++;
2649     p->header.callNumber = ntohl(*buf);
2650     buf++;
2651     p->header.seq = ntohl(*buf);
2652     buf++;
2653     p->header.serial = ntohl(*buf);
2654     buf++;
2655
2656     temp = ntohl(*buf);
2657     buf++;
2658
2659     /* C will truncate byte fields to bytes for me */
2660     p->header.type = temp >> 24;
2661     p->header.flags = temp >> 16;
2662     p->header.userStatus = temp >> 8;
2663     p->header.securityIndex = temp >> 0;
2664
2665     temp = ntohl(*buf);
2666     buf++;
2667
2668     p->header.serviceId = (temp & 0xffff);
2669     p->header.spare = temp >> 16;
2670     /* Note: top 16 bits of this last word are the security checksum */
2671 }
2672
2673 /*
2674  * LOCKS HELD: called with call->lock held.
2675  *
2676  * PrepareSendPacket is the only place in the code that
2677  * can increment call->tnext.  This could become an atomic
2678  * in the future.  Beyond that there is nothing in this
2679  * function that requires the call being locked.  This
2680  * function can only be called by the application thread.
2681  */
2682 void
2683 rxi_PrepareSendPacket(struct rx_call *call,
2684                       struct rx_packet *p, int last)
2685 {
2686     struct rx_connection *conn = call->conn;
2687     afs_uint32 seq = call->tnext++;
2688     unsigned int i;
2689     afs_int32 len;              /* len must be a signed type; it can go negative */
2690
2691     /* No data packets on call 0. Where do these come from? */
2692     if (*call->callNumber == 0)
2693         *call->callNumber = 1;
2694
2695     MUTEX_EXIT(&call->lock);
2696     p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
2697
2698     p->header.cid = (conn->cid | call->channel);
2699     p->header.serviceId = conn->serviceId;
2700     p->header.securityIndex = conn->securityIndex;
2701
2702     p->header.callNumber = *call->callNumber;
2703     p->header.seq = seq;
2704     p->header.epoch = conn->epoch;
2705     p->header.type = RX_PACKET_TYPE_DATA;
2706     p->header.flags = 0;
2707     p->header.spare = 0;
2708     if (conn->type == RX_CLIENT_CONNECTION)
2709         p->header.flags |= RX_CLIENT_INITIATED;
2710
2711     if (last)
2712         p->header.flags |= RX_LAST_PACKET;
2713
2714     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2715     p->header.serial = 0;       /* Another way of saying never transmitted... */
2716
2717     /* Now that we're sure this is the last data on the call, make sure
2718      * that the "length" and the sum of the iov_lens matches. */
2719     len = p->length + call->conn->securityHeaderSize;
2720
2721     for (i = 1; i < p->niovecs && len > 0; i++) {
2722         len -= p->wirevec[i].iov_len;
2723     }
2724     if (len > 0) {
2725         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2726     } else if (i < p->niovecs) {
2727         /* Free any extra elements in the wirevec */
2728 #if defined(RX_ENABLE_TSFPQ)
2729         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2730 #else /* !RX_ENABLE_TSFPQ */
2731         MUTEX_ENTER(&rx_freePktQ_lock);
2732         rxi_FreeDataBufsNoLock(p, i);
2733         MUTEX_EXIT(&rx_freePktQ_lock);
2734 #endif /* !RX_ENABLE_TSFPQ */
2735
2736         p->niovecs = i;
2737     }
2738     if (len)
2739         p->wirevec[i - 1].iov_len += len;
2740     RXS_PreparePacket(conn->securityObject, call, p);
2741     MUTEX_ENTER(&call->lock);
2742 }
2743
2744 /* Given an interface MTU size, calculate an adjusted MTU size that
2745  * will make efficient use of the RX buffers when the peer is sending
2746  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2747 int
2748 rxi_AdjustIfMTU(int mtu)
2749 {
2750     int adjMTU;
2751     int frags;
2752
2753     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2754         return mtu;
2755     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2756     if (mtu <= adjMTU) {
2757         return mtu;
2758     }
2759     mtu -= adjMTU;
2760     if (mtu <= 0) {
2761         return adjMTU;
2762     }
2763     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2764     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2765 }
2766
2767 /* Given an interface MTU size, and the peer's advertised max receive
2768  * size, calculate an adjisted maxMTU size that makes efficient use
2769  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2770 int
2771 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2772 {
2773     int maxMTU = mtu * rxi_nSendFrags;
2774     maxMTU = MIN(maxMTU, peerMaxMTU);
2775     return rxi_AdjustIfMTU(maxMTU);
2776 }
2777
2778 /* Given a packet size, figure out how many datagram packet will fit.
2779  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2780  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2781  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2782 int
2783 rxi_AdjustDgramPackets(int frags, int mtu)
2784 {
2785     int maxMTU;
2786     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2787         return 1;
2788     }
2789     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2790     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2791     /* subtract the size of the first and last packets */
2792     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2793     if (maxMTU < 0) {
2794         return 1;
2795     }
2796     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2797 }
2798
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * outputFile - destination of the dump.  Without AFS_NT40_ENV it is
 *              written with fprintf.  With AFS_NT40_ENV each line is
 *              formatted into a local buffer and passed to WriteFile.
 *              NOTE(review): in that case outputFile is handed to
 *              WriteFile as-is, so callers presumably pass a Win32
 *              HANDLE -- confirm against the caller.
 * cookie     - string prefixed to every line of output.
 *
 * The walk over the rx_mallocedP list is done with rx_freePktQ_lock held.
 * When RXDEBUG_PACKET is not defined the function does nothing.
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* bytes-written out parameter for WriteFile */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    /* One line per packet on the list of allocated packets. */
    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void * argument, hence the explicit cast of p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif