src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 #ifdef KERNEL
  14 # if defined(UKERNEL)
  15 #  include "afs/sysincludes.h"
  16 #  include "afsincludes.h"
  17 #  include "rx_kcommon.h"
  18 # else /* defined(UKERNEL) */
  19 #  ifdef RX_KERNEL_TRACE
  20 #   include "rx_kcommon.h"
  21 #  endif
  22 #  include "h/types.h"
  23 #  ifndef AFS_LINUX20_ENV
  24 #   include "h/systm.h"
  25 #  endif
  26 #  if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  27 #   include "afs/sysincludes.h"
  28 #  endif
  29 #  if defined(AFS_OBSD_ENV)
  30 #   include "h/proc.h"
  31 #  endif
  32 #  include "h/socket.h"
  33 #  if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  34 #   if  !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  35 #    include "sys/mount.h"              /* it gets pulled in by something later anyway */
  36 #   endif
  37 #   include "h/mbuf.h"
  38 #  endif
  39 #  include "netinet/in.h"
  40 #  include "afs/afs_osi.h"
  41 #  include "rx_kmutex.h"
  42 # endif /* defined(UKERNEL) */
  43 #else /* KERNEL */
  44 # include <roken.h>
  45 # include <assert.h>
  46 # if defined(AFS_NT40_ENV)
  47 #  ifndef EWOULDBLOCK
  48 #   define EWOULDBLOCK WSAEWOULDBLOCK
  49 #  endif
  50 #  include "rx_user.h"
  51 #  include "rx_xmit_nt.h"
  52 # endif
  53 # include <lwp.h>
  54 #endif /* KERNEL */
  55
  56 #ifdef  AFS_SUN5_ENV
  57 # include <sys/sysmacros.h>
  58 #endif
  59
  60 #include "rx.h"
  61 #include "rx_clock.h"
  62 #include "rx_queue.h"
  63 #include "rx_packet.h"
  64 #include "rx_atomic.h"
  65 #include "rx_globals.h"
  66 #include "rx_internal.h"
  67 #include "rx_stats.h"
  68
  69 #ifdef RX_LOCKS_DB
  70 /* rxdb_fileID is used to identify the lock location, along with line#. */
  71 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  72 #endif /* RX_LOCKS_DB */
  73 static struct rx_packet *rx_mallocedP = 0;
  74 #ifdef RXDEBUG_PACKET
  75 static afs_uint32       rx_packet_id = 0;
  76 #endif
  77
  78 extern char cml_version_number[];
  79
  80 static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);
  81
  82 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  83                                 afs_uint32 ahost, short aport,
  84                                 afs_int32 istack);
  85
  86 #ifdef RX_ENABLE_TSFPQ
  87 static int
  88 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
  89 #else
  90 static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
  91                                    afs_uint32 first,
  92                                    struct rx_queue * q);
  93 #endif
  94
  95 /* some rules about packets:
  96  * 1.  When a packet is allocated, the final iov_buf contains room for
  97  * a security trailer, but iov_len masks that fact.  If the security
  98  * package wants to add the trailer, it may do so, and then extend
  99  * iov_len appropriately.  For this reason, packet's niovecs and
 100  * iov_len fields should be accurate before calling PreparePacket.
 101 */
 102
 103 /* Preconditions:
 104  *        all packet buffers (iov_base) are integral multiples of
 105  *        the word size.
 106  *        offset is an integral multiple of the word size.
 107  */
 108 afs_int32
 109 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 110 {
 111     unsigned int i;
 112     size_t l;
 113     for (l = 0, i = 1; i < packet->niovecs; i++) {
 114         if (l + packet->wirevec[i].iov_len > offset) {
 115             return
 116                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 117                                  (offset - l)));
 118         }
 119         l += packet->wirevec[i].iov_len;
 120     }
 121
 122     return 0;
 123 }
 124
 125 /* Preconditions:
 126  *        all packet buffers (iov_base) are integral multiples of the word size.
 127  *        offset is an integral multiple of the word size.
 128  */
 129 afs_int32
 130 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 131 {
 132     unsigned int i;
 133     size_t l;
 134     for (l = 0, i = 1; i < packet->niovecs; i++) {
 135         if (l + packet->wirevec[i].iov_len > offset) {
 136             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 137                              (offset - l))) = data;
 138             return 0;
 139         }
 140         l += packet->wirevec[i].iov_len;
 141     }
 142
 143     return 0;
 144 }
 145
 146 /* Preconditions:
 147  *        all packet buffers (iov_base) are integral multiples of the
 148  *        word size.
 149  *        offset is an integral multiple of the word size.
 150  * Packet Invariants:
 151  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 152  */
 153 afs_int32
 154 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 155                   char *out)
 156 {
 157     unsigned int i, j, l, r;
 158     for (l = 0, i = 1; i < packet->niovecs; i++) {
 159         if (l + packet->wirevec[i].iov_len > offset) {
 160             break;
 161         }
 162         l += packet->wirevec[i].iov_len;
 163     }
 164
 165     /* i is the iovec which contains the first little bit of data in which we
 166      * are interested.  l is the total length of everything prior to this iovec.
 167      * j is the number of bytes we can safely copy out of this iovec.
 168      * offset only applies to the first iovec.
 169      */
 170     r = resid;
 171     while ((r > 0) && (i < packet->niovecs)) {
 172         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 173         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 174         r -= j;
 175         out += j;
 176         l += packet->wirevec[i].iov_len;
 177         offset = l;
 178         i++;
 179     }
 180
 181     return (r ? (resid - r) : resid);
 182 }
 183
 184
 185 /* Preconditions:
 186  *        all packet buffers (iov_base) are integral multiples of the
 187  *        word size.
 188  *        offset is an integral multiple of the word size.
 189  */
 190 afs_int32
 191 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 192 {
 193     unsigned int i, j, l, o, r;
 194     char *b;
 195
 196     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 197         if (l + packet->wirevec[i].iov_len > o) {
 198             break;
 199         }
 200         l += packet->wirevec[i].iov_len;
 201     }
 202
 203     /* i is the iovec which contains the first little bit of data in which we
 204      * are interested.  l is the total length of everything prior to this iovec.
 205      * j is the number of bytes we can safely copy out of this iovec.
 206      * offset only applies to the first iovec.
 207      */
 208     r = resid;
 209     while ((r > 0) && (i <= RX_MAXWVECS)) {
 210         if (i >= packet->niovecs)
 211             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 212                 break;
 213
 214         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 215         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 216         memcpy(b, in, j);
 217         r -= j;
 218         in += j;
 219         l += packet->wirevec[i].iov_len;
 220         offset = l;
 221         i++;
 222     }
 223
 224     return (r ? (resid - r) : resid);
 225 }
 226
 227 int
 228 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 229 {
 230     struct rx_packet *p, *np;
 231
 232     num_pkts = AllocPacketBufs(class, num_pkts, q);
 233
 234     for (queue_Scan(q, p, np, rx_packet)) {
 235         RX_PACKET_IOV_FULLINIT(p);
 236     }
 237
 238     return num_pkts;
 239 }
 240
 241 #ifdef RX_ENABLE_TSFPQ
 242 static int
 243 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 244 {
 245     struct rx_ts_info_t * rx_ts_info;
 246     int transfer;
 247     SPLVAR;
 248
 249     RX_TS_INFO_GET(rx_ts_info);
 250
 251     transfer = num_pkts - rx_ts_info->_FPQ.len;
 252     if (transfer > 0) {
 253         NETPRI;
 254         MUTEX_ENTER(&rx_freePktQ_lock);
 255         transfer = MAX(transfer, rx_TSFPQGlobSize);
 256         if (transfer > rx_nFreePackets) {
 257             /* alloc enough for us, plus a few globs for other threads */
 258             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 259         }
 260
 261         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 262
 263         MUTEX_EXIT(&rx_freePktQ_lock);
 264         USERPRI;
 265     }
 266
 267     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 268
 269     return num_pkts;
 270 }
 271 #else /* RX_ENABLE_TSFPQ */
 272 static int
 273 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 274 {
 275     struct rx_packet *c;
 276     int i;
 277 #ifdef KERNEL
 278     int overq = 0;
 279 #endif
 280     SPLVAR;
 281
 282     NETPRI;
 283
 284     MUTEX_ENTER(&rx_freePktQ_lock);
 285
 286 #ifdef KERNEL
 287     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 288          num_pkts--, overq++);
 289
 290     if (overq) {
 291         rxi_NeedMorePackets = TRUE;
 292         if (rx_stats_active) {
 293             switch (class) {
 294             case RX_PACKET_CLASS_RECEIVE:
 295                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 296                 break;
 297             case RX_PACKET_CLASS_SEND:
 298                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 299                 break;
 300             case RX_PACKET_CLASS_SPECIAL:
 301                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 302                 break;
 303             case RX_PACKET_CLASS_RECV_CBUF:
 304                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 305                 break;
 306             case RX_PACKET_CLASS_SEND_CBUF:
 307                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 308                 break;
 309             }
 310         }
 311     }
 312
 313     if (rx_nFreePackets < num_pkts)
 314         num_pkts = rx_nFreePackets;
 315
 316     if (!num_pkts) {
 317         rxi_NeedMorePackets = TRUE;
 318         goto done;
 319     }
 320 #else /* KERNEL */
 321     if (rx_nFreePackets < num_pkts) {
 322         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 323     }
 324 #endif /* KERNEL */
 325
 326     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 327          i < num_pkts;
 328          i++, c=queue_Next(c, rx_packet)) {
 329         RX_FPQ_MARK_USED(c);
 330     }
 331
 332     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 333
 334     rx_nFreePackets -= num_pkts;
 335
 336 #ifdef KERNEL
 337   done:
 338 #endif
 339     MUTEX_EXIT(&rx_freePktQ_lock);
 340
 341     USERPRI;
 342     return num_pkts;
 343 }
 344 #endif /* RX_ENABLE_TSFPQ */
 345
/*
 * Free every packet on a queue, together with any continuation buffers
 * attached to those packets
 */
 349 #ifdef RX_ENABLE_TSFPQ
 350 /* num_pkts=0 means queue length is unknown */
 351 int
 352 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 353 {
 354     struct rx_ts_info_t * rx_ts_info;
 355     struct rx_packet *c, *nc;
 356     SPLVAR;
 357
 358     osi_Assert(num_pkts >= 0);
 359     RX_TS_INFO_GET(rx_ts_info);
 360
 361     if (!num_pkts) {
 362         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 363             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 364         }
 365     } else {
 366         for (queue_Scan(q, c, nc, rx_packet)) {
 367             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 368         }
 369     }
 370
 371     if (num_pkts) {
 372         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 373     }
 374
 375     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 376         NETPRI;
 377         MUTEX_ENTER(&rx_freePktQ_lock);
 378
 379         RX_TS_FPQ_LTOG(rx_ts_info);
 380
 381         /* Wakeup anyone waiting for packets */
 382         rxi_PacketsUnWait();
 383
 384         MUTEX_EXIT(&rx_freePktQ_lock);
 385         USERPRI;
 386     }
 387
 388     return num_pkts;
 389 }
 390 #else /* RX_ENABLE_TSFPQ */
 391 /* num_pkts=0 means queue length is unknown */
 392 int
 393 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 394 {
 395     struct rx_queue cbs;
 396     struct rx_packet *p, *np;
 397     int qlen = 0;
 398     SPLVAR;
 399
 400     osi_Assert(num_pkts >= 0);
 401     queue_Init(&cbs);
 402
 403     if (!num_pkts) {
 404         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 405             if (p->niovecs > 2) {
 406                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 407             }
 408             RX_FPQ_MARK_FREE(p);
 409         }
 410         if (!num_pkts)
 411             return 0;
 412     } else {
 413         for (queue_Scan(q, p, np, rx_packet)) {
 414             if (p->niovecs > 2) {
 415                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 416             }
 417             RX_FPQ_MARK_FREE(p);
 418         }
 419     }
 420
 421     if (qlen) {
 422         queue_SpliceAppend(q, &cbs);
 423         qlen += num_pkts;
 424     } else
 425         qlen = num_pkts;
 426
 427     NETPRI;
 428     MUTEX_ENTER(&rx_freePktQ_lock);
 429
 430     queue_SpliceAppend(&rx_freePacketQueue, q);
 431     rx_nFreePackets += qlen;
 432
 433     /* Wakeup anyone waiting for packets */
 434     rxi_PacketsUnWait();
 435
 436     MUTEX_EXIT(&rx_freePktQ_lock);
 437     USERPRI;
 438
 439     return num_pkts;
 440 }
 441 #endif /* RX_ENABLE_TSFPQ */
 442
 443 /* this one is kind of awful.
 444  * In rxkad, the packet has been all shortened, and everything, ready for
 445  * sending.  All of a sudden, we discover we need some of that space back.
 446  * This isn't terribly general, because it knows that the packets are only
 447  * rounded up to the EBS (userdata + security header).
 448  */
 449 int
 450 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 451 {
 452     int i;
 453     i = p->niovecs - 1;
 454     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 455         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 456             p->wirevec[i].iov_len += nb;
 457             return 0;
 458         }
 459     } else {
 460         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 461             p->wirevec[i].iov_len += nb;
 462             return 0;
 463         }
 464     }
 465
 466     return 0;
 467 }
 468
 469 /* get sufficient space to store nb bytes of data (or more), and hook
 470  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 471  * returns the number of bytes >0 which it failed to come up with.
 472  * Don't need to worry about locking on packet, since only
 473  * one thread can manipulate one at a time. Locking on continution
 474  * packets is handled by AllocPacketBufs */
 475 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 476 int
 477 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 478 {
 479     int i, nv;
 480     struct rx_queue q;
 481     struct rx_packet *cb, *ncb;
 482
 483     /* compute the number of cbuf's we need */
 484     nv = nb / RX_CBUFFERSIZE;
 485     if ((nv * RX_CBUFFERSIZE) < nb)
 486         nv++;
 487     if ((nv + p->niovecs) > RX_MAXWVECS)
 488         nv = RX_MAXWVECS - p->niovecs;
 489     if (nv < 1)
 490         return nb;
 491
 492     /* allocate buffers */
 493     queue_Init(&q);
 494     nv = AllocPacketBufs(class, nv, &q);
 495
 496     /* setup packet iovs */
 497     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 498         queue_Remove(cb);
 499         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 500         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 501     }
 502
 503     nb -= (nv * RX_CBUFFERSIZE);
 504     p->length += (nv * RX_CBUFFERSIZE);
 505     p->niovecs += nv;
 506
 507     return nb;
 508 }
 509
 510 /* Add more packet buffers */
 511 #ifdef RX_ENABLE_TSFPQ
 512 void
 513 rxi_MorePackets(int apackets)
 514 {
 515     struct rx_packet *p, *e;
 516     struct rx_ts_info_t * rx_ts_info;
 517     int getme;
 518     SPLVAR;
 519
 520     getme = apackets * sizeof(struct rx_packet);
 521     p = (struct rx_packet *)osi_Alloc(getme);
 522     osi_Assert(p);
 523
 524     PIN(p, getme);              /* XXXXX */
 525     memset(p, 0, getme);
 526     RX_TS_INFO_GET(rx_ts_info);
 527
 528     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 529     /* TSFPQ patch also needs to keep track of total packets */
 530
 531     MUTEX_ENTER(&rx_packets_mutex);
 532     rx_nPackets += apackets;
 533     RX_TS_FPQ_COMPUTE_LIMITS;
 534     MUTEX_EXIT(&rx_packets_mutex);
 535
 536     for (e = p + apackets; p < e; p++) {
 537         RX_PACKET_IOV_INIT(p);
 538         p->niovecs = 2;
 539
 540         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 541
 542         NETPRI;
 543         MUTEX_ENTER(&rx_freePktQ_lock);
 544 #ifdef RXDEBUG_PACKET
 545         p->packetId = rx_packet_id++;
 546         p->allNextp = rx_mallocedP;
 547 #endif /* RXDEBUG_PACKET */
 548         rx_mallocedP = p;
 549         MUTEX_EXIT(&rx_freePktQ_lock);
 550         USERPRI;
 551     }
 552     rx_ts_info->_FPQ.delta += apackets;
 553
 554     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 555         NETPRI;
 556         MUTEX_ENTER(&rx_freePktQ_lock);
 557
 558         RX_TS_FPQ_LTOG(rx_ts_info);
 559         rxi_NeedMorePackets = FALSE;
 560         rxi_PacketsUnWait();
 561
 562         MUTEX_EXIT(&rx_freePktQ_lock);
 563         USERPRI;
 564     }
 565 }
 566 #else /* RX_ENABLE_TSFPQ */
 567 void
 568 rxi_MorePackets(int apackets)
 569 {
 570     struct rx_packet *p, *e;
 571     int getme;
 572     SPLVAR;
 573
 574     getme = apackets * sizeof(struct rx_packet);
 575     p = (struct rx_packet *)osi_Alloc(getme);
 576     osi_Assert(p);
 577
 578     PIN(p, getme);              /* XXXXX */
 579     memset(p, 0, getme);
 580     NETPRI;
 581     MUTEX_ENTER(&rx_freePktQ_lock);
 582
 583     for (e = p + apackets; p < e; p++) {
 584         RX_PACKET_IOV_INIT(p);
 585 #ifdef RX_TRACK_PACKETS
 586         p->flags |= RX_PKTFLAG_FREE;
 587 #endif
 588         p->niovecs = 2;
 589
 590         queue_Append(&rx_freePacketQueue, p);
 591 #ifdef RXDEBUG_PACKET
 592         p->packetId = rx_packet_id++;
 593         p->allNextp = rx_mallocedP;
 594 #endif /* RXDEBUG_PACKET */
 595         rx_mallocedP = p;
 596     }
 597
 598     rx_nPackets += apackets;
 599     rx_nFreePackets += apackets;
 600     rxi_NeedMorePackets = FALSE;
 601     rxi_PacketsUnWait();
 602
 603     MUTEX_EXIT(&rx_freePktQ_lock);
 604     USERPRI;
 605 }
 606 #endif /* RX_ENABLE_TSFPQ */
 607
 608 #ifdef RX_ENABLE_TSFPQ
 609 void
 610 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 611 {
 612     struct rx_packet *p, *e;
 613     struct rx_ts_info_t * rx_ts_info;
 614     int getme;
 615     SPLVAR;
 616
 617     getme = apackets * sizeof(struct rx_packet);
 618     p = (struct rx_packet *)osi_Alloc(getme);
 619
 620     PIN(p, getme);              /* XXXXX */
 621     memset(p, 0, getme);
 622     RX_TS_INFO_GET(rx_ts_info);
 623
 624     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 625     /* TSFPQ patch also needs to keep track of total packets */
 626     MUTEX_ENTER(&rx_packets_mutex);
 627     rx_nPackets += apackets;
 628     RX_TS_FPQ_COMPUTE_LIMITS;
 629     MUTEX_EXIT(&rx_packets_mutex);
 630
 631     for (e = p + apackets; p < e; p++) {
 632         RX_PACKET_IOV_INIT(p);
 633         p->niovecs = 2;
 634         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 635
 636         NETPRI;
 637         MUTEX_ENTER(&rx_freePktQ_lock);
 638 #ifdef RXDEBUG_PACKET
 639         p->packetId = rx_packet_id++;
 640         p->allNextp = rx_mallocedP;
 641 #endif /* RXDEBUG_PACKET */
 642         rx_mallocedP = p;
 643         MUTEX_EXIT(&rx_freePktQ_lock);
 644         USERPRI;
 645     }
 646     rx_ts_info->_FPQ.delta += apackets;
 647
 648     if (flush_global &&
 649         (num_keep_local < apackets)) {
 650         NETPRI;
 651         MUTEX_ENTER(&rx_freePktQ_lock);
 652
 653         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 654         rxi_NeedMorePackets = FALSE;
 655         rxi_PacketsUnWait();
 656
 657         MUTEX_EXIT(&rx_freePktQ_lock);
 658         USERPRI;
 659     }
 660 }
 661 #endif /* RX_ENABLE_TSFPQ */
 662
 663 #ifndef KERNEL
 664 /* Add more packet buffers */
 665 void
 666 rxi_MorePacketsNoLock(int apackets)
 667 {
 668 #ifdef RX_ENABLE_TSFPQ
 669     struct rx_ts_info_t * rx_ts_info;
 670 #endif /* RX_ENABLE_TSFPQ */
 671     struct rx_packet *p, *e;
 672     int getme;
 673
 674     /* allocate enough packets that 1/4 of the packets will be able
 675      * to hold maximal amounts of data */
 676     apackets += (apackets / 4)
 677         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 678     do {
 679         getme = apackets * sizeof(struct rx_packet);
 680         p = (struct rx_packet *)osi_Alloc(getme);
 681         if (p == NULL) {
 682             apackets -= apackets / 4;
 683             osi_Assert(apackets > 0);
 684         }
 685     } while(p == NULL);
 686     memset(p, 0, getme);
 687
 688 #ifdef RX_ENABLE_TSFPQ
 689     RX_TS_INFO_GET(rx_ts_info);
 690     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 691 #endif /* RX_ENABLE_TSFPQ */
 692
 693     for (e = p + apackets; p < e; p++) {
 694         RX_PACKET_IOV_INIT(p);
 695 #ifdef RX_TRACK_PACKETS
 696         p->flags |= RX_PKTFLAG_FREE;
 697 #endif
 698         p->niovecs = 2;
 699
 700         queue_Append(&rx_freePacketQueue, p);
 701 #ifdef RXDEBUG_PACKET
 702         p->packetId = rx_packet_id++;
 703         p->allNextp = rx_mallocedP;
 704 #endif /* RXDEBUG_PACKET */
 705         rx_mallocedP = p;
 706     }
 707
 708     rx_nFreePackets += apackets;
 709     MUTEX_ENTER(&rx_packets_mutex);
 710     rx_nPackets += apackets;
 711 #ifdef RX_ENABLE_TSFPQ
 712     RX_TS_FPQ_COMPUTE_LIMITS;
 713 #endif /* RX_ENABLE_TSFPQ */
 714     MUTEX_EXIT(&rx_packets_mutex);
 715     rxi_NeedMorePackets = FALSE;
 716     rxi_PacketsUnWait();
 717 }
 718 #endif /* !KERNEL */
 719
 720 void
 721 rxi_FreeAllPackets(void)
 722 {
 723     /* must be called at proper interrupt level, etcetera */
 724     /* MTUXXX need to free all Packets */
 725     osi_Free(rx_mallocedP,
 726              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 727     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 728 }
 729
 730 #ifdef RX_ENABLE_TSFPQ
 731 void
 732 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 733 {
 734     struct rx_ts_info_t * rx_ts_info;
 735     int xfer;
 736     SPLVAR;
 737
 738     RX_TS_INFO_GET(rx_ts_info);
 739
 740     if (num_keep_local != rx_ts_info->_FPQ.len) {
 741         NETPRI;
 742         MUTEX_ENTER(&rx_freePktQ_lock);
 743         if (num_keep_local < rx_ts_info->_FPQ.len) {
 744             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 745             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 746             rxi_PacketsUnWait();
 747         } else {
 748             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 749             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 750                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 751             if (rx_nFreePackets < xfer) {
 752                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 753             }
 754             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 755         }
 756         MUTEX_EXIT(&rx_freePktQ_lock);
 757         USERPRI;
 758     }
 759 }
 760
 761 void
 762 rxi_FlushLocalPacketsTSFPQ(void)
 763 {
 764     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 765 }
 766 #endif /* RX_ENABLE_TSFPQ */
 767
 768 /* Allocate more packets iff we need more continuation buffers */
 769 /* In kernel, can't page in memory with interrupts disabled, so we
 770  * don't use the event mechanism. */
 771 void
 772 rx_CheckPackets(void)
 773 {
 774     if (rxi_NeedMorePackets) {
 775         rxi_MorePackets(rx_maxSendWindow);
 776     }
 777 }
 778
 779 /* In the packet freeing routine below, the assumption is that
 780    we want all of the packets to be used equally frequently, so that we
 781    don't get packet buffers paging out.  It would be just as valid to
 782    assume that we DO want them to page out if not many are being used.
 783    In any event, we assume the former, and append the packets to the end
 784    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
 791
 792 /* Actually free the packet p. */
 793 #ifdef RX_ENABLE_TSFPQ
 794 void
 795 rxi_FreePacketNoLock(struct rx_packet *p)
 796 {
 797     struct rx_ts_info_t * rx_ts_info;
 798     dpf(("Free %"AFS_PTR_FMT"\n", p));
 799
 800     RX_TS_INFO_GET(rx_ts_info);
 801     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 802     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 803         RX_TS_FPQ_LTOG(rx_ts_info);
 804     }
 805 }
 806 #else /* RX_ENABLE_TSFPQ */
 807 void
 808 rxi_FreePacketNoLock(struct rx_packet *p)
 809 {
 810     dpf(("Free %"AFS_PTR_FMT"\n", p));
 811
 812     RX_FPQ_MARK_FREE(p);
 813     rx_nFreePackets++;
 814     queue_Append(&rx_freePacketQueue, p);
 815 }
 816 #endif /* RX_ENABLE_TSFPQ */
 817
 818 #ifdef RX_ENABLE_TSFPQ
 819 void
 820 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 821 {
 822     struct rx_ts_info_t * rx_ts_info;
 823     dpf(("Free %"AFS_PTR_FMT"\n", p));
 824
 825     RX_TS_INFO_GET(rx_ts_info);
 826     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 827
 828     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 829         NETPRI;
 830         MUTEX_ENTER(&rx_freePktQ_lock);
 831
 832         RX_TS_FPQ_LTOG(rx_ts_info);
 833
 834         /* Wakeup anyone waiting for packets */
 835         rxi_PacketsUnWait();
 836
 837         MUTEX_EXIT(&rx_freePktQ_lock);
 838         USERPRI;
 839     }
 840 }
 841 #endif /* RX_ENABLE_TSFPQ */
 842
 843 /*
 844  * free continuation buffers off a packet into a queue
 845  *
 846  * [IN] p      -- packet from which continuation buffers will be freed
 847  * [IN] first  -- iovec offset of first continuation buffer to free
 848  * [IN] q      -- queue into which continuation buffers will be chained
 849  *
 850  * returns:
 851  *   number of continuation buffers freed
 852  */
 853 #ifndef RX_ENABLE_TSFPQ
 854 static int
 855 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 856 {
 857     struct iovec *iov;
 858     struct rx_packet * cb;
 859     int count = 0;
 860
 861     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 862         iov = &p->wirevec[first];
 863         if (!iov->iov_base)
 864             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 865         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 866         RX_FPQ_MARK_FREE(cb);
 867         queue_Append(q, cb);
 868     }
 869     p->length = 0;
 870     p->niovecs = 0;
 871
 872     return count;
 873 }
 874 #endif
 875
 876 /*
 877  * free packet continuation buffers into the global free packet pool
 878  *
 879  * [IN] p      -- packet from which to free continuation buffers
 880  * [IN] first  -- iovec offset of first continuation buffer to free
 881  *
 882  * returns:
 883  *   zero always
 884  */
 885 int
 886 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 887 {
 888     struct iovec *iov;
 889
 890     for (first = MAX(2, first); first < p->niovecs; first++) {
 891         iov = &p->wirevec[first];
 892         if (!iov->iov_base)
 893             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 894         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 895     }
 896     p->length = 0;
 897     p->niovecs = 0;
 898
 899     return 0;
 900 }
 901
 902 #ifdef RX_ENABLE_TSFPQ
 903 /*
 904  * free packet continuation buffers into the thread-local free pool
 905  *
 906  * [IN] p             -- packet from which continuation buffers will be freed
 907  * [IN] first         -- iovec offset of first continuation buffer to free
 908  *                       any value less than 2, the min number of iovecs,
 909  *                       is treated as if it is 2.
 910  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 911  *                       global free pool before returning
 912  *
 913  * returns:
 914  *   zero always
 915  */
 916 static int
 917 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 918 {
 919     struct iovec *iov;
 920     struct rx_ts_info_t * rx_ts_info;
 921
 922     RX_TS_INFO_GET(rx_ts_info);
 923
 924     for (first = MAX(2, first); first < p->niovecs; first++) {
 925         iov = &p->wirevec[first];
 926         if (!iov->iov_base)
 927             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 928         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 929     }
 930     p->length = 0;
 931     p->niovecs = 0;
 932
 933     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 934         NETPRI;
 935         MUTEX_ENTER(&rx_freePktQ_lock);
 936
 937         RX_TS_FPQ_LTOG(rx_ts_info);
 938
 939         /* Wakeup anyone waiting for packets */
 940         rxi_PacketsUnWait();
 941
 942         MUTEX_EXIT(&rx_freePktQ_lock);
 943         USERPRI;
 944     }
 945     return 0;
 946 }
 947 #endif /* RX_ENABLE_TSFPQ */
 948
 949 int rxi_nBadIovecs = 0;
 950
 951 /* rxi_RestoreDataBufs
 952  *
 953  * Restore the correct sizes to the iovecs. Called when reusing a packet
 954  * for reading off the wire.
 955  */
 956 void
 957 rxi_RestoreDataBufs(struct rx_packet *p)
 958 {
 959     unsigned int i;
 960     struct iovec *iov = &p->wirevec[2];
 961
 962     RX_PACKET_IOV_INIT(p);
 963
 964     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 965         if (!iov->iov_base) {
 966             rxi_nBadIovecs++;
 967             p->niovecs = i;
 968             break;
 969         }
 970         iov->iov_len = RX_CBUFFERSIZE;
 971     }
 972 }
 973
 974 #ifdef RX_ENABLE_TSFPQ
 975 int
 976 rxi_TrimDataBufs(struct rx_packet *p, int first)
 977 {
 978     int length;
 979     struct iovec *iov, *end;
 980     struct rx_ts_info_t * rx_ts_info;
 981     SPLVAR;
 982
 983     if (first != 1)
 984         osi_Panic("TrimDataBufs 1: first must be 1");
 985
 986     /* Skip over continuation buffers containing message data */
 987     iov = &p->wirevec[2];
 988     end = iov + (p->niovecs - 2);
 989     length = p->length - p->wirevec[1].iov_len;
 990     for (; iov < end && length > 0; iov++) {
 991         if (!iov->iov_base)
 992             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
 993         length -= iov->iov_len;
 994     }
 995
 996     /* iov now points to the first empty data buffer. */
 997     if (iov >= end)
 998         return 0;
 999
1000     RX_TS_INFO_GET(rx_ts_info);
1001     for (; iov < end; iov++) {
1002         if (!iov->iov_base)
1003             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1004         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1005         p->niovecs--;
1006     }
1007     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1008         NETPRI;
1009         MUTEX_ENTER(&rx_freePktQ_lock);
1010
1011         RX_TS_FPQ_LTOG(rx_ts_info);
1012         rxi_PacketsUnWait();
1013
1014         MUTEX_EXIT(&rx_freePktQ_lock);
1015         USERPRI;
1016     }
1017
1018     return 0;
1019 }
1020 #else /* RX_ENABLE_TSFPQ */
1021 int
1022 rxi_TrimDataBufs(struct rx_packet *p, int first)
1023 {
1024     int length;
1025     struct iovec *iov, *end;
1026     SPLVAR;
1027
1028     if (first != 1)
1029         osi_Panic("TrimDataBufs 1: first must be 1");
1030
1031     /* Skip over continuation buffers containing message data */
1032     iov = &p->wirevec[2];
1033     end = iov + (p->niovecs - 2);
1034     length = p->length - p->wirevec[1].iov_len;
1035     for (; iov < end && length > 0; iov++) {
1036         if (!iov->iov_base)
1037             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1038         length -= iov->iov_len;
1039     }
1040
1041     /* iov now points to the first empty data buffer. */
1042     if (iov >= end)
1043         return 0;
1044
1045     NETPRI;
1046     MUTEX_ENTER(&rx_freePktQ_lock);
1047
1048     for (; iov < end; iov++) {
1049         if (!iov->iov_base)
1050             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1051         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1052         p->niovecs--;
1053     }
1054     rxi_PacketsUnWait();
1055
1056     MUTEX_EXIT(&rx_freePktQ_lock);
1057     USERPRI;
1058
1059     return 0;
1060 }
1061 #endif /* RX_ENABLE_TSFPQ */
1062
1063 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1064  * remove it yourself first if you call this routine. */
1065 #ifdef RX_ENABLE_TSFPQ
/*
 * RX_ENABLE_TSFPQ build of rxi_FreePacket.
 *
 * Releases p in two steps: first rxi_FreeDataBufsTSFPQ(p, 2, 0), then
 * rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL).  No lock is taken in
 * this function itself.
 *
 * NOTE(review): the literal arguments 2 and 0 presumably mean "start at
 * wirevec[2]" and "do not flush to the global queue" -- confirm against
 * rxi_FreeDataBufsTSFPQ.
 */
void
rxi_FreePacket(struct rx_packet *p)
{
    rxi_FreeDataBufsTSFPQ(p, 2, 0);
    rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
}
1072 #else /* RX_ENABLE_TSFPQ */
/*
 * Build without RX_ENABLE_TSFPQ.
 *
 * Releases p while holding rx_freePktQ_lock (bracketed by
 * NETPRI/USERPRI): its data buffers are released first via
 * rxi_FreeDataBufsNoLock(p, 2), then the packet itself via
 * rxi_FreePacketNoLock, and finally rxi_PacketsUnWait is called before
 * the lock is dropped.
 */
void
rxi_FreePacket(struct rx_packet *p)
{
    SPLVAR;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    /* Data buffers go back before the packet that references them. */
    rxi_FreeDataBufsNoLock(p, 2);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
1089 #endif /* RX_ENABLE_TSFPQ */
1090
1091 /* rxi_AllocPacket sets up p->length so it reflects the number of
1092  * bytes in the packet at this point, **not including** the header.
1093  * The header is absolutely necessary, besides, this is the way the
1094  * length field is usually used */
1095 #ifdef RX_ENABLE_TSFPQ
1096 struct rx_packet *
1097 rxi_AllocPacketNoLock(int class)
1098 {
1099     struct rx_packet *p;
1100     struct rx_ts_info_t * rx_ts_info;
1101
1102     RX_TS_INFO_GET(rx_ts_info);
1103
1104 #ifdef KERNEL
1105     if (rxi_OverQuota(class)) {
1106         rxi_NeedMorePackets = TRUE;
1107         if (rx_stats_active) {
1108             switch (class) {
1109             case RX_PACKET_CLASS_RECEIVE:
1110                 rx_atomic_inc(rx_stats.receivePktAllocFailures);
1111                 break;
1112             case RX_PACKET_CLASS_SEND:
1113                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1114                 break;
1115             case RX_PACKET_CLASS_SPECIAL:
1116                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1117                 break;
1118             case RX_PACKET_CLASS_RECV_CBUF:
1119                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1120                 break;
1121             case RX_PACKET_CLASS_SEND_CBUF:
1122                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1123                 break;
1124             }
1125         }
1126         return (struct rx_packet *)0;
1127     }
1128 #endif /* KERNEL */
1129
1130     if (rx_stats_active)
1131         rx_atomic_inc(&rx_stats.packetRequests);
1132     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1133
1134 #ifdef KERNEL
1135         if (queue_IsEmpty(&rx_freePacketQueue))
1136             osi_Panic("rxi_AllocPacket error");
1137 #else /* KERNEL */
1138         if (queue_IsEmpty(&rx_freePacketQueue))
1139             rxi_MorePacketsNoLock(rx_maxSendWindow);
1140 #endif /* KERNEL */
1141
1142
1143         RX_TS_FPQ_GTOL(rx_ts_info);
1144     }
1145
1146     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1147
1148     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1149
1150
1151     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1152      * order to truncate outbound packets.  In the near future, may need
1153      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1154      */
1155     RX_PACKET_IOV_FULLINIT(p);
1156     return p;
1157 }
1158 #else /* RX_ENABLE_TSFPQ */
1159 struct rx_packet *
1160 rxi_AllocPacketNoLock(int class)
1161 {
1162     struct rx_packet *p;
1163
1164 #ifdef KERNEL
1165     if (rxi_OverQuota(class)) {
1166         rxi_NeedMorePackets = TRUE;
1167         if (rx_stats_active) {
1168             switch (class) {
1169             case RX_PACKET_CLASS_RECEIVE:
1170                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1171                 break;
1172             case RX_PACKET_CLASS_SEND:
1173                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1174                 break;
1175             case RX_PACKET_CLASS_SPECIAL:
1176                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1177                 break;
1178             case RX_PACKET_CLASS_RECV_CBUF:
1179                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1180                 break;
1181             case RX_PACKET_CLASS_SEND_CBUF:
1182                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1183                 break;
1184             }
1185         }
1186         return (struct rx_packet *)0;
1187     }
1188 #endif /* KERNEL */
1189
1190     if (rx_stats_active)
1191         rx_atomic_inc(&rx_stats.packetRequests);
1192
1193 #ifdef KERNEL
1194     if (queue_IsEmpty(&rx_freePacketQueue))
1195         osi_Panic("rxi_AllocPacket error");
1196 #else /* KERNEL */
1197     if (queue_IsEmpty(&rx_freePacketQueue))
1198         rxi_MorePacketsNoLock(rx_maxSendWindow);
1199 #endif /* KERNEL */
1200
1201     rx_nFreePackets--;
1202     p = queue_First(&rx_freePacketQueue, rx_packet);
1203     queue_Remove(p);
1204     RX_FPQ_MARK_USED(p);
1205
1206     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1207
1208
1209     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1210      * order to truncate outbound packets.  In the near future, may need
1211      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1212      */
1213     RX_PACKET_IOV_FULLINIT(p);
1214     return p;
1215 }
1216 #endif /* RX_ENABLE_TSFPQ */
1217
1218 #ifdef RX_ENABLE_TSFPQ
1219 struct rx_packet *
1220 rxi_AllocPacketTSFPQ(int class, int pull_global)
1221 {
1222     struct rx_packet *p;
1223     struct rx_ts_info_t * rx_ts_info;
1224
1225     RX_TS_INFO_GET(rx_ts_info);
1226
1227     if (rx_stats_active)
1228         rx_atomic_inc(&rx_stats.packetRequests);
1229     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1230         MUTEX_ENTER(&rx_freePktQ_lock);
1231
1232         if (queue_IsEmpty(&rx_freePacketQueue))
1233             rxi_MorePacketsNoLock(rx_maxSendWindow);
1234
1235         RX_TS_FPQ_GTOL(rx_ts_info);
1236
1237         MUTEX_EXIT(&rx_freePktQ_lock);
1238     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1239         return NULL;
1240     }
1241
1242     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1243
1244     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1245
1246     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1247      * order to truncate outbound packets.  In the near future, may need
1248      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1249      */
1250     RX_PACKET_IOV_FULLINIT(p);
1251     return p;
1252 }
1253 #endif /* RX_ENABLE_TSFPQ */
1254
1255 #ifdef RX_ENABLE_TSFPQ
1256 struct rx_packet *
1257 rxi_AllocPacket(int class)
1258 {
1259     struct rx_packet *p;
1260
1261     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1262     return p;
1263 }
1264 #else /* RX_ENABLE_TSFPQ */
1265 struct rx_packet *
1266 rxi_AllocPacket(int class)
1267 {
1268     struct rx_packet *p;
1269
1270     MUTEX_ENTER(&rx_freePktQ_lock);
1271     p = rxi_AllocPacketNoLock(class);
1272     MUTEX_EXIT(&rx_freePktQ_lock);
1273     return p;
1274 }
1275 #endif /* RX_ENABLE_TSFPQ */
1276
/* This guy comes up with as many buffers as it {takes,can get} given
 * the MTU for this call. It also sets the packet length before
 * returning.  caution: this is often called at NETPRI
 * Called with call locked.
 *
 * call - the call the packet is for; call->MTU bounds the packet size
 *        and call->error ends the wait loop.
 * want - number of data bytes the caller would like; the security
 *        header and maximum trailer sizes of the call's connection are
 *        added on top, and the sum is capped at MTU - RX_HEADER_SIZE.
 *
 * Returns a packet whose length has had the security overhead
 * subtracted, or NULL if the call has an error or if the packet
 * obtained cannot hold more than the security overhead (that packet is
 * freed again).  While waiting for a free packet, call->lock is
 * released and re-acquired.
 */
struct rx_packet *
rxi_AllocSendPacket(struct rx_call *call, int want)
{
    struct rx_packet *p = (struct rx_packet *)0;
    int mud;
    unsigned delta;

    SPLVAR;
    /* mud: largest payload that fits in the call's MTU after the header */
    mud = call->MTU - RX_HEADER_SIZE;
    /* delta: bytes the security layer reserves (header + max trailer) */
    delta =
        rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
        rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));

#ifdef RX_ENABLE_TSFPQ
    /* Fast path: try the thread-specific queue without pulling from the
     * global one (pull_global == 0); fall through to the loop below if
     * nothing is available there. */
    if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
        want += delta;
        want = MIN(want, mud);

        /* Result deliberately ignored: we use however much we got. */
        if ((unsigned)want > p->length)
            (void)rxi_AllocDataBuf(p, (want - p->length),
                                   RX_PACKET_CLASS_SEND_CBUF);

        if (p->length > mud)
            p->length = mud;

        /* No room left for user data once security overhead is removed */
        if (delta >= p->length) {
            rxi_FreePacket(p);
            p = NULL;
        } else {
            p->length -= delta;
        }
        return p;
    }
#endif /* RX_ENABLE_TSFPQ */

    while (!(call->error)) {
        MUTEX_ENTER(&rx_freePktQ_lock);
        /* if an error occurred, or we get the packet we want, we're done */
        if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
            MUTEX_EXIT(&rx_freePktQ_lock);

            want += delta;
            want = MIN(want, mud);

            /* Result deliberately ignored: we use however much we got. */
            if ((unsigned)want > p->length)
                (void)rxi_AllocDataBuf(p, (want - p->length),
                                       RX_PACKET_CLASS_SEND_CBUF);

            if (p->length > mud)
                p->length = mud;

            /* No room left for user data once security overhead is removed */
            if (delta >= p->length) {
                rxi_FreePacket(p);
                p = NULL;
            } else {
                p->length -= delta;
            }
            break;
        }

        /* no error occurred, and we didn't get a packet, so we sleep.
         * At this point, we assume that packets will be returned
         * sooner or later, as packets are acknowledged, and so we
         * just wait.  */
        NETPRI;
        call->flags |= RX_CALL_WAIT_PACKETS;
        /* Take a reference on the call before its lock is released. */
        MUTEX_ENTER(&rx_refcnt_mutex);
        CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
        MUTEX_EXIT(&rx_refcnt_mutex);
        MUTEX_EXIT(&call->lock);
        rx_waitingForPackets = 1;

        /* rx_freePktQ_lock is still held from the top of the loop here */
#ifdef  RX_ENABLE_LOCKS
        CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
#else
        osi_rxSleep(&rx_waitingForPackets);
#endif
        MUTEX_EXIT(&rx_freePktQ_lock);
        /* Re-take the call lock, then drop the reference taken above. */
        MUTEX_ENTER(&call->lock);
        MUTEX_ENTER(&rx_refcnt_mutex);
        CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
        MUTEX_EXIT(&rx_refcnt_mutex);
        call->flags &= ~RX_CALL_WAIT_PACKETS;
        USERPRI;
    }

    return p;
}
1370
#ifndef KERNEL
#ifdef AFS_NT40_ENV
/* Windows does not use file descriptors. */
#define CountFDs(amax) 0
#else
/*
 * Count how many descriptor numbers in the range [0, amax) are in use.
 * A descriptor is counted when fstat() on it returns 0.
 */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd;
    int nopen = 0;

    for (fd = 0; fd < amax; fd++) {
        if (fstat(fd, &sb) == 0)
            nopen++;
    }
    return nopen;
}
#endif /* AFS_NT40_ENV */
#else /* KERNEL */

#define CountFDs(amax) amax

#endif /* KERNEL */
1398
1399 #if !defined(KERNEL) || defined(UKERNEL)
1400
1401 /* This function reads a single packet from the interface into the
1402  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1403  * (host,port) of the sender are stored in the supplied variables, and
1404  * the data length of the packet is stored in the packet structure.
1405  * The header is decoded. */
1406 int
1407 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1408                u_short * port)
1409 {
1410     struct sockaddr_in from;
1411     unsigned int nbytes;
1412     afs_int32 rlen;
1413     afs_uint32 tlen, savelen;
1414     struct msghdr msg;
1415     rx_computelen(p, tlen);
1416     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1417
1418     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1419     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1420                                  * it once in order to avoid races.  */
1421     tlen = rlen - tlen;
1422     if (tlen > 0) {
1423         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1424         if (tlen > 0) {
1425             tlen = rlen - tlen;
1426         } else
1427             tlen = rlen;
1428     } else
1429         tlen = rlen;
1430
1431     /* Extend the last iovec for padding, it's just to make sure that the
1432      * read doesn't return more data than we expect, and is done to get around
1433      * our problems caused by the lack of a length field in the rx header.
1434      * Use the extra buffer that follows the localdata in each packet
1435      * structure. */
1436     savelen = p->wirevec[p->niovecs - 1].iov_len;
1437     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1438
1439     memset(&msg, 0, sizeof(msg));
1440     msg.msg_name = (char *)&from;
1441     msg.msg_namelen = sizeof(struct sockaddr_in);
1442     msg.msg_iov = p->wirevec;
1443     msg.msg_iovlen = p->niovecs;
1444     nbytes = rxi_Recvmsg(socket, &msg, 0);
1445
1446     /* restore the vec to its correct state */
1447     p->wirevec[p->niovecs - 1].iov_len = savelen;
1448
1449     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1450     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1451         if (nbytes < 0 && errno == EWOULDBLOCK) {
1452             if (rx_stats_active)
1453                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1454         } else if (nbytes <= 0) {
1455             if (rx_stats_active) {
1456                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1457                 rx_stats.bogusHost = from.sin_addr.s_addr;
1458             }
1459             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1460                  ntohs(from.sin_port), nbytes));
1461         }
1462         return 0;
1463     }
1464 #ifdef RXDEBUG
1465     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1466                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1467         rxi_DecodePacketHeader(p);
1468
1469         *host = from.sin_addr.s_addr;
1470         *port = from.sin_port;
1471
1472         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1473               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1474               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1475               p->length));
1476 #ifdef RX_TRIMDATABUFS
1477         rxi_TrimDataBufs(p, 1);
1478 #endif
1479         return 0;
1480     }
1481 #endif
1482     else {
1483         /* Extract packet header. */
1484         rxi_DecodePacketHeader(p);
1485
1486         *host = from.sin_addr.s_addr;
1487         *port = from.sin_port;
1488         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1489             if (rx_stats_active) {
1490                 struct rx_peer *peer;
1491                 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1492                 /*
1493                  * Try to look up this peer structure.  If it doesn't exist,
1494                  * don't create a new one -
1495                  * we don't keep count of the bytes sent/received if a peer
1496                  * structure doesn't already exist.
1497                  *
1498                  * The peer/connection cleanup code assumes that there is 1 peer
1499                  * per connection.  If we actually created a peer structure here
1500                  * and this packet was an rxdebug packet, the peer structure would
1501                  * never be cleaned up.
1502                  */
1503                 peer = rxi_FindPeer(*host, *port, 0, 0);
1504                 /* Since this may not be associated with a connection,
1505                  * it may have no refCount, meaning we could race with
1506                  * ReapConnections
1507                  */
1508                 if (peer && (peer->refCount > 0)) {
1509                     MUTEX_ENTER(&peer->peer_lock);
1510                     hadd32(peer->bytesReceived, p->length);
1511                     MUTEX_EXIT(&peer->peer_lock);
1512                 }
1513             }
1514         }
1515
1516 #ifdef RX_TRIMDATABUFS
1517         /* Free any empty packet buffers at the end of this packet */
1518         rxi_TrimDataBufs(p, 1);
1519 #endif
1520         return 1;
1521     }
1522 }
1523
1524 #endif /* !KERNEL || UKERNEL */
1525
1526 /* This function splits off the first packet in a jumbo packet.
1527  * As of AFS 3.5, jumbograms contain more than one fixed size
1528  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1529  * last packet header. All packets (except the last) are padded to
1530  * fall on RX_CBUFFERSIZE boundaries.
1531  * HACK: We store the length of the first n-1 packets in the
1532  * last two pad bytes. */
1533
1534 struct rx_packet *
1535 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1536                      int first)
1537 {
1538     struct rx_packet *np;
1539     struct rx_jumboHeader *jp;
1540     int niov, i;
1541     struct iovec *iov;
1542     int length;
1543     afs_uint32 temp;
1544
1545     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1546      * bytes in length. All but the first packet are preceded by
1547      * an abbreviated four byte header. The length of the last packet
1548      * is calculated from the size of the jumbogram. */
1549     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1550
1551     if ((int)p->length < length) {
1552         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1553         return NULL;
1554     }
1555     niov = p->niovecs - 2;
1556     if (niov < 1) {
1557         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1558         return NULL;
1559     }
1560     iov = &p->wirevec[2];
1561     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1562
1563     /* Get a pointer to the abbreviated packet header */
1564     jp = (struct rx_jumboHeader *)
1565         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1566
1567     /* Set up the iovecs for the next packet */
1568     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1569     np->wirevec[0].iov_len = sizeof(struct rx_header);
1570     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1571     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1572     np->niovecs = niov + 1;
1573     for (i = 2, iov++; i <= niov; i++, iov++) {
1574         np->wirevec[i] = *iov;
1575     }
1576     np->length = p->length - length;
1577     p->length = RX_JUMBOBUFFERSIZE;
1578     p->niovecs = 2;
1579
1580     /* Convert the jumbo packet header to host byte order */
1581     temp = ntohl(*(afs_uint32 *) jp);
1582     jp->flags = (u_char) (temp >> 24);
1583     jp->cksum = (u_short) (temp);
1584
1585     /* Fill in the packet header */
1586     np->header = p->header;
1587     np->header.serial = p->header.serial + 1;
1588     np->header.seq = p->header.seq + 1;
1589     np->header.flags = jp->flags;
1590     np->header.spare = jp->cksum;
1591
1592     return np;
1593 }
1594
1595 #ifndef KERNEL
1596 /* Send a udp datagram */
1597 int
1598 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1599             int length, int istack)
1600 {
1601     struct msghdr msg;
1602         int ret;
1603
1604     memset(&msg, 0, sizeof(msg));
1605     msg.msg_iov = dvec;
1606     msg.msg_iovlen = nvecs;
1607     msg.msg_name = addr;
1608     msg.msg_namelen = sizeof(struct sockaddr_in);
1609
1610     ret = rxi_Sendmsg(socket, &msg, 0);
1611
1612     return ret;
1613 }
1614 #elif !defined(UKERNEL)
1615 /*
1616  * message receipt is done in rxk_input or rx_put.
1617  */
1618
1619 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1620 /*
1621  * Copy an mblock to the contiguous area pointed to by cp.
1622  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1623  * but it doesn't really.
1624  * Returns the number of bytes not transferred.
1625  * The message is NOT changed.
1626  */
1627 static int
1628 cpytoc(mblk_t * mp, int off, int len, char *cp)
1629 {
1630     int n;
1631
1632     for (; mp && len > 0; mp = mp->b_cont) {
1633         if (mp->b_datap->db_type != M_DATA) {
1634             return -1;
1635         }
1636         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1637         memcpy(cp, (char *)mp->b_rptr, n);
1638         cp += n;
1639         len -= n;
1640         mp->b_rptr += n;
1641     }
1642     return (len);
1643 }
1644
1645 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1646  * but it doesn't really.
1647  * This sucks, anyway, do it like m_cpy.... below
1648  */
1649 static int
1650 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1651            int niovs)
1652 {
1653     int m, n, o, t, i;
1654
1655     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1656         if (mp->b_datap->db_type != M_DATA) {
1657             return -1;
1658         }
1659         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1660         len -= n;
1661         while (n) {
1662             if (!t) {
1663                 o = 0;
1664                 i++;
1665                 t = iovs[i].iov_len;
1666             }
1667             m = MIN(n, t);
1668             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1669             mp->b_rptr += m;
1670             o += m;
1671             t -= m;
1672             n -= m;
1673         }
1674     }
1675     return (len);
1676 }
1677
/* In this branch of the build the m_cpyto* names used by the rest of
 * the file map straight onto the mblk_t copy routines defined above. */
#define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
#define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1680 #else
1681 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
/*
 * Copy len bytes from the mbuf chain m, starting off bytes into the
 * chain, into the niovs iovecs at iovs.
 *
 * Panics if m or iovs is NULL or if off or len is negative.
 *
 * Returns the number of bytes that were not copied: 0 on a complete
 * copy, more if the mbuf chain or the iovecs ran out first (including
 * the case where off lies beyond the end of the chain, which returns
 * len unchanged).
 */
static int
m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
{
    caddr_t p1, p2;             /* source (mbuf) and destination (iovec) cursors */
    unsigned int l1, l2, i, t;  /* bytes left in current mbuf / iovec, iovec index, chunk size */

    if (m == NULL || off < 0 || len < 0 || iovs == NULL)
        osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */

    /* Skip whole mbufs until off falls inside the current one. */
    while (off && m)
        if (m->m_len <= off) {
            off -= m->m_len;
            m = m->m_next;
            continue;
        } else
            break;

    /* off was at or past the end of the chain: nothing to copy. */
    if (m == NULL)
        return len;

    p1 = mtod(m, caddr_t) + off;
    l1 = m->m_len - off;
    i = 0;
    p2 = iovs[0].iov_base;
    l2 = iovs[0].iov_len;

    while (len) {
        /* Copy as much as fits in both the current mbuf and iovec. */
        t = MIN(l1, MIN(l2, (unsigned int)len));
        memcpy(p2, p1, t);
        p1 += t;
        p2 += t;
        l1 -= t;
        l2 -= t;
        len -= t;
        if (!l1) {
            /* Current mbuf exhausted: move to the next, or stop. */
            m = m->m_next;
            if (!m)
                break;
            p1 = mtod(m, caddr_t);
            l1 = m->m_len;
        }
        if (!l2) {
            /* Current iovec full: move to the next, or stop. */
            if (++i >= niovs)
                break;
            p2 = iovs[i].iov_base;
            l2 = iovs[i].iov_len;
        }

    }

    return len;
}
1734 #endif /* LINUX */
1735 #endif /* AFS_SUN5_ENV */
1736
1737 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
/*
 * Copy data_len bytes, starting hdr_len bytes into the message amb,
 * into the iovecs of the packet phandle by way of m_cpytoiovec, then
 * hand amb to the supplied free routine.
 *
 * amb is an mblk_t chain on AFS_SUN5_ENV / AFS_HPUX110_ENV and an mbuf
 * chain elsewhere.  free is always called, whatever the copy returned.
 *
 * Returns the value returned by m_cpytoiovec.
 */
int
rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
#if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
     mblk_t *amb;
#else
     struct mbuf *amb;
#endif
     void (*free) ();
     struct rx_packet *phandle;
     int hdr_len, data_len;
{
    int code;

    code =
        m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
                     phandle->niovecs);
    /* The message is released here regardless of the copy's outcome. */
    (*free) (amb);

    return code;
}
1758 #endif /* LINUX */
1759 #endif /*KERNEL && !UKERNEL */
1760
1761
1762 /* send a response to a debug packet */
1763
1764 struct rx_packet *
1765 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1766                        afs_uint32 ahost, short aport, int istack)
1767 {
1768     struct rx_debugIn tin;
1769     afs_int32 tl;
1770     struct rx_serverQueueEntry *np, *nqe;
1771
1772     /*
1773      * Only respond to client-initiated Rx debug packets,
1774      * and clear the client flag in the response.
1775      */
1776     if (ap->header.flags & RX_CLIENT_INITIATED) {
1777         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1778         rxi_EncodePacketHeader(ap);
1779     } else {
1780         return ap;
1781     }
1782
1783     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1784     /* all done with packet, now set length to the truth, so we can
1785      * reuse this packet */
1786     rx_computelen(ap, ap->length);
1787
1788     tin.type = ntohl(tin.type);
1789     tin.index = ntohl(tin.index);
1790     switch (tin.type) {
1791     case RX_DEBUGI_GETSTATS:{
1792             struct rx_debugStats tstat;
1793
1794             /* get basic stats */
1795             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1796             tstat.version = RX_DEBUGI_VERSION;
1797 #ifndef RX_ENABLE_LOCKS
1798             tstat.waitingForPackets = rx_waitingForPackets;
1799 #endif
1800             MUTEX_ENTER(&rx_serverPool_lock);
1801             tstat.nFreePackets = htonl(rx_nFreePackets);
1802             tstat.nPackets = htonl(rx_nPackets);
1803             tstat.callsExecuted = htonl(rxi_nCalls);
1804             tstat.packetReclaims = htonl(rx_packetReclaims);
1805             tstat.usedFDs = CountFDs(64);
1806             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1807             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1808             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1809                         tstat.idleThreads);
1810             MUTEX_EXIT(&rx_serverPool_lock);
1811             tstat.idleThreads = htonl(tstat.idleThreads);
1812             tl = sizeof(struct rx_debugStats) - ap->length;
1813             if (tl > 0)
1814                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1815
1816             if (tl <= 0) {
1817                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1818                                (char *)&tstat);
1819                 ap->length = sizeof(struct rx_debugStats);
1820                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1821                 rx_computelen(ap, ap->length);
1822             }
1823             break;
1824         }
1825
1826     case RX_DEBUGI_GETALLCONN:
1827     case RX_DEBUGI_GETCONN:{
1828             unsigned int i, j;
1829             struct rx_connection *tc;
1830             struct rx_call *tcall;
1831             struct rx_debugConn tconn;
1832             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1833
1834
1835             tl = sizeof(struct rx_debugConn) - ap->length;
1836             if (tl > 0)
1837                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1838             if (tl > 0)
1839                 return ap;
1840
1841             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1842             /* get N'th (maybe) "interesting" connection info */
1843             for (i = 0; i < rx_hashTableSize; i++) {
1844 #if !defined(KERNEL)
                /* the time complexity of the algorithm used here
                 * exponentially increases with the number of connections.
                 */
1848 #ifdef AFS_PTHREAD_ENV
1849                 pthread_yield();
1850 #else
1851                 (void)IOMGR_Poll();
1852 #endif
1853 #endif
1854                 MUTEX_ENTER(&rx_connHashTable_lock);
1855                 /* We might be slightly out of step since we are not
1856                  * locking each call, but this is only debugging output.
1857                  */
1858                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1859                     if ((all || rxi_IsConnInteresting(tc))
1860                         && tin.index-- <= 0) {
1861                         tconn.host = tc->peer->host;
1862                         tconn.port = tc->peer->port;
1863                         tconn.cid = htonl(tc->cid);
1864                         tconn.epoch = htonl(tc->epoch);
1865                         tconn.serial = htonl(tc->serial);
1866                         for (j = 0; j < RX_MAXCALLS; j++) {
1867                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1868                             if ((tcall = tc->call[j])) {
1869                                 tconn.callState[j] = tcall->state;
1870                                 tconn.callMode[j] = tcall->mode;
1871                                 tconn.callFlags[j] = tcall->flags;
1872                                 if (queue_IsNotEmpty(&tcall->rq))
1873                                     tconn.callOther[j] |= RX_OTHER_IN;
1874                                 if (queue_IsNotEmpty(&tcall->tq))
1875                                     tconn.callOther[j] |= RX_OTHER_OUT;
1876                             } else
1877                                 tconn.callState[j] = RX_STATE_NOTINIT;
1878                         }
1879
1880                         tconn.natMTU = htonl(tc->peer->natMTU);
1881                         tconn.error = htonl(tc->error);
1882                         tconn.flags = tc->flags;
1883                         tconn.type = tc->type;
1884                         tconn.securityIndex = tc->securityIndex;
1885                         if (tc->securityObject) {
1886                             RXS_GetStats(tc->securityObject, tc,
1887                                          &tconn.secStats);
1888 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1889 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1890                             DOHTONL(flags);
1891                             DOHTONL(expires);
1892                             DOHTONL(packetsReceived);
1893                             DOHTONL(packetsSent);
1894                             DOHTONL(bytesReceived);
1895                             DOHTONL(bytesSent);
1896                             for (i = 0;
1897                                  i <
1898                                  sizeof(tconn.secStats.spares) /
1899                                  sizeof(short); i++)
1900                                 DOHTONS(spares[i]);
1901                             for (i = 0;
1902                                  i <
1903                                  sizeof(tconn.secStats.sparel) /
1904                                  sizeof(afs_int32); i++)
1905                                 DOHTONL(sparel[i]);
1906                         }
1907
1908                         MUTEX_EXIT(&rx_connHashTable_lock);
1909                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1910                                        (char *)&tconn);
1911                         tl = ap->length;
1912                         ap->length = sizeof(struct rx_debugConn);
1913                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1914                                             istack);
1915                         ap->length = tl;
1916                         return ap;
1917                     }
1918                 }
1919                 MUTEX_EXIT(&rx_connHashTable_lock);
1920             }
1921             /* if we make it here, there are no interesting packets */
1922             tconn.cid = htonl(0xffffffff);      /* means end */
1923             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1924                            (char *)&tconn);
1925             tl = ap->length;
1926             ap->length = sizeof(struct rx_debugConn);
1927             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1928             ap->length = tl;
1929             break;
1930         }
1931
1932         /*
1933          * Pass back all the peer structures we have available
1934          */
1935
1936     case RX_DEBUGI_GETPEER:{
1937             unsigned int i;
1938             struct rx_peer *tp;
1939             struct rx_debugPeer tpeer;
1940
1941
1942             tl = sizeof(struct rx_debugPeer) - ap->length;
1943             if (tl > 0)
1944                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1945             if (tl > 0)
1946                 return ap;
1947
1948             memset(&tpeer, 0, sizeof(tpeer));
1949             for (i = 0; i < rx_hashTableSize; i++) {
1950 #if !defined(KERNEL)
1951                 /* the time complexity of the algorithm used here
1952                  * exponentially increses with the number of peers.
1953                  *
1954                  * Yielding after processing each hash table entry
1955                  * and dropping rx_peerHashTable_lock.
1956                  * also increases the risk that we will miss a new
1957                  * entry - but we are willing to live with this
1958                  * limitation since this is meant for debugging only
1959                  */
1960 #ifdef AFS_PTHREAD_ENV
1961                 pthread_yield();
1962 #else
1963                 (void)IOMGR_Poll();
1964 #endif
1965 #endif
1966                 MUTEX_ENTER(&rx_peerHashTable_lock);
1967                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1968                     if (tin.index-- <= 0) {
1969                         tp->refCount++;
1970                         MUTEX_EXIT(&rx_peerHashTable_lock);
1971
1972                         MUTEX_ENTER(&tp->peer_lock);
1973                         tpeer.host = tp->host;
1974                         tpeer.port = tp->port;
1975                         tpeer.ifMTU = htons(tp->ifMTU);
1976                         tpeer.idleWhen = htonl(tp->idleWhen);
1977                         tpeer.refCount = htons(tp->refCount);
1978                         tpeer.burstSize = tp->burstSize;
1979                         tpeer.burst = tp->burst;
1980                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1981                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1982                         tpeer.rtt = htonl(tp->rtt);
1983                         tpeer.rtt_dev = htonl(tp->rtt_dev);
1984                         tpeer.timeout.sec = htonl(tp->timeout.sec);
1985                         tpeer.timeout.usec = htonl(tp->timeout.usec);
1986                         tpeer.nSent = htonl(tp->nSent);
1987                         tpeer.reSends = htonl(tp->reSends);
1988                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
1989                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
1990                         tpeer.rateFlag = htonl(tp->rateFlag);
1991                         tpeer.natMTU = htons(tp->natMTU);
1992                         tpeer.maxMTU = htons(tp->maxMTU);
1993                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1994                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1995                         tpeer.MTU = htons(tp->MTU);
1996                         tpeer.cwind = htons(tp->cwind);
1997                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
1998                         tpeer.congestSeq = htons(tp->congestSeq);
1999                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2000                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2001                         tpeer.bytesReceived.high =
2002                             htonl(tp->bytesReceived.high);
2003                         tpeer.bytesReceived.low =
2004                             htonl(tp->bytesReceived.low);
2005                         MUTEX_EXIT(&tp->peer_lock);
2006
2007                         MUTEX_ENTER(&rx_peerHashTable_lock);
2008                         tp->refCount--;
2009                         MUTEX_EXIT(&rx_peerHashTable_lock);
2010
2011                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2012                                        (char *)&tpeer);
2013                         tl = ap->length;
2014                         ap->length = sizeof(struct rx_debugPeer);
2015                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2016                                             istack);
2017                         ap->length = tl;
2018                         return ap;
2019                     }
2020                 }
2021                 MUTEX_EXIT(&rx_peerHashTable_lock);
2022             }
2023             /* if we make it here, there are no interesting packets */
2024             tpeer.host = htonl(0xffffffff);     /* means end */
2025             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2026                            (char *)&tpeer);
2027             tl = ap->length;
2028             ap->length = sizeof(struct rx_debugPeer);
2029             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2030             ap->length = tl;
2031             break;
2032         }
2033
2034     case RX_DEBUGI_RXSTATS:{
2035             int i;
2036             afs_int32 *s;
2037
2038             tl = sizeof(rx_stats) - ap->length;
2039             if (tl > 0)
2040                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2041             if (tl > 0)
2042                 return ap;
2043
2044             /* Since its all int32s convert to network order with a loop. */
2045         if (rx_stats_active)
2046             MUTEX_ENTER(&rx_stats_mutex);
2047             s = (afs_int32 *) & rx_stats;
2048             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2049                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2050
2051             tl = ap->length;
2052             ap->length = sizeof(rx_stats);
2053         if (rx_stats_active)
2054             MUTEX_EXIT(&rx_stats_mutex);
2055             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2056             ap->length = tl;
2057             break;
2058         }
2059
2060     default:
2061         /* error response packet */
2062         tin.type = htonl(RX_DEBUGI_BADTYPE);
2063         tin.index = tin.type;
2064         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2065         tl = ap->length;
2066         ap->length = sizeof(struct rx_debugIn);
2067         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2068         ap->length = tl;
2069         break;
2070     }
2071     return ap;
2072 }
2073
2074 struct rx_packet *
2075 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2076                          afs_uint32 ahost, short aport, int istack)
2077 {
2078     afs_int32 tl;
2079
2080     /*
2081      * Only respond to client-initiated version requests, and
2082      * clear that flag in the response.
2083      */
2084     if (ap->header.flags & RX_CLIENT_INITIATED) {
2085         char buf[66];
2086
2087         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2088         rxi_EncodePacketHeader(ap);
2089         memset(buf, 0, sizeof(buf));
2090         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2091         rx_packetwrite(ap, 0, 65, buf);
2092         tl = ap->length;
2093         ap->length = 65;
2094         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2095         ap->length = tl;
2096     }
2097
2098     return ap;
2099 }
2100
2101
2102 /* send a debug packet back to the sender */
2103 static void
2104 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2105                     afs_uint32 ahost, short aport, afs_int32 istack)
2106 {
2107     struct sockaddr_in taddr;
2108     unsigned int i, nbytes, savelen = 0;
2109     int saven = 0;
2110 #ifdef KERNEL
2111     int waslocked = ISAFS_GLOCK();
2112 #endif
2113
2114     taddr.sin_family = AF_INET;
2115     taddr.sin_port = aport;
2116     taddr.sin_addr.s_addr = ahost;
2117 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2118     taddr.sin_len = sizeof(struct sockaddr_in);
2119 #endif
2120
2121     /* We need to trim the niovecs. */
2122     nbytes = apacket->length;
2123     for (i = 1; i < apacket->niovecs; i++) {
2124         if (nbytes <= apacket->wirevec[i].iov_len) {
2125             savelen = apacket->wirevec[i].iov_len;
2126             saven = apacket->niovecs;
2127             apacket->wirevec[i].iov_len = nbytes;
2128             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2129         } else
2130             nbytes -= apacket->wirevec[i].iov_len;
2131     }
2132 #ifdef KERNEL
2133 #ifdef RX_KERNEL_TRACE
2134     if (ICL_SETACTIVE(afs_iclSetp)) {
2135         if (!waslocked)
2136             AFS_GLOCK();
2137         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2138                    "before osi_NetSend()");
2139         AFS_GUNLOCK();
2140     } else
2141 #else
2142     if (waslocked)
2143         AFS_GUNLOCK();
2144 #endif
2145 #endif
2146     /* debug packets are not reliably delivered, hence the cast below. */
2147     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2148                       apacket->length + RX_HEADER_SIZE, istack);
2149 #ifdef KERNEL
2150 #ifdef RX_KERNEL_TRACE
2151     if (ICL_SETACTIVE(afs_iclSetp)) {
2152         AFS_GLOCK();
2153         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2154                    "after osi_NetSend()");
2155         if (!waslocked)
2156             AFS_GUNLOCK();
2157     } else
2158 #else
2159     if (waslocked)
2160         AFS_GLOCK();
2161 #endif
2162 #endif
2163     if (saven) {                /* means we truncated the packet above. */
2164         apacket->wirevec[i - 1].iov_len = savelen;
2165         apacket->niovecs = saven;
2166     }
2167
2168 }
2169
2170 /* Send the packet to appropriate destination for the specified
2171  * call.  The header is first encoded and placed in the packet.
2172  */
2173 void
2174 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2175                struct rx_packet *p, int istack)
2176 {
2177 #if defined(KERNEL)
2178     int waslocked;
2179 #endif
2180     int code;
2181     struct sockaddr_in addr;
2182     struct rx_peer *peer = conn->peer;
2183     osi_socket socket;
2184 #ifdef RXDEBUG
2185     char deliveryType = 'S';
2186 #endif
2187     /* The address we're sending the packet to */
2188     memset(&addr, 0, sizeof(addr));
2189     addr.sin_family = AF_INET;
2190     addr.sin_port = peer->port;
2191     addr.sin_addr.s_addr = peer->host;
2192
2193     /* This stuff should be revamped, I think, so that most, if not
2194      * all, of the header stuff is always added here.  We could
2195      * probably do away with the encode/decode routines. XXXXX */
2196
2197     /* Stamp each packet with a unique serial number.  The serial
2198      * number is maintained on a connection basis because some types
2199      * of security may be based on the serial number of the packet,
2200      * and security is handled on a per authenticated-connection
2201      * basis. */
2202     /* Pre-increment, to guarantee no zero serial number; a zero
2203      * serial number means the packet was never sent. */
2204     MUTEX_ENTER(&conn->conn_data_lock);
2205     p->header.serial = ++conn->serial;
2206     if (p->length > conn->peer->maxPacketSize) {
2207         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2208             (p->header.flags & RX_REQUEST_ACK)) {
2209             conn->lastPingSize = p->length;
2210             conn->lastPingSizeSer = p->header.serial;
2211         } else if (p->header.seq != 0) {
2212             conn->lastPacketSize = p->length;
2213             conn->lastPacketSizeSeq = p->header.seq;
2214         }
2215     }
2216     MUTEX_EXIT(&conn->conn_data_lock);
2217     /* This is so we can adjust retransmit time-outs better in the face of
2218      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2219      */
2220     if (p->firstSerial == 0) {
2221         p->firstSerial = p->header.serial;
2222     }
2223 #ifdef RXDEBUG
2224     /* If an output tracer function is defined, call it with the packet and
2225      * network address.  Note this function may modify its arguments. */
2226     if (rx_almostSent) {
2227         int drop = (*rx_almostSent) (p, &addr);
2228         /* drop packet if return value is non-zero? */
2229         if (drop)
2230             deliveryType = 'D'; /* Drop the packet */
2231     }
2232 #endif
2233
2234     /* Get network byte order header */
2235     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2236                                  * touch ALL the fields */
2237
2238     /* Send the packet out on the same socket that related packets are being
2239      * received on */
2240     socket =
2241         (conn->type ==
2242          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2243
2244 #ifdef RXDEBUG
2245     /* Possibly drop this packet,  for testing purposes */
2246     if ((deliveryType == 'D')
2247         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2248             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2249         deliveryType = 'D';     /* Drop the packet */
2250     } else {
2251         deliveryType = 'S';     /* Send the packet */
2252 #endif /* RXDEBUG */
2253
2254         /* Loop until the packet is sent.  We'd prefer just to use a
2255          * blocking socket, but unfortunately the interface doesn't
2256          * allow us to have the socket block in send mode, and not
2257          * block in receive mode */
2258 #ifdef KERNEL
2259         waslocked = ISAFS_GLOCK();
2260 #ifdef RX_KERNEL_TRACE
2261         if (ICL_SETACTIVE(afs_iclSetp)) {
2262             if (!waslocked)
2263                 AFS_GLOCK();
2264             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2265                        "before osi_NetSend()");
2266             AFS_GUNLOCK();
2267         } else
2268 #else
2269         if (waslocked)
2270             AFS_GUNLOCK();
2271 #endif
2272 #endif
2273         if ((code =
2274              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2275                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2276             /* send failed, so let's hurry up the resend, eh? */
2277             if (rx_stats_active)
2278                 rx_atomic_inc(&rx_stats.netSendFailures);
2279             p->retryTime = p->timeSent; /* resend it very soon */
2280             clock_Addmsec(&(p->retryTime),
2281                           10 + (((afs_uint32) p->backoff) << 8));
2282             /* Some systems are nice and tell us right away that we cannot
2283              * reach this recipient by returning an error code.
2284              * So, when this happens let's "down" the host NOW so
2285              * we don't sit around waiting for this host to timeout later.
2286              */
2287             if (call &&
2288 #ifdef AFS_NT40_ENV
2289                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2290 #elif defined(AFS_LINUX20_ENV)
2291                 code == -ENETUNREACH
2292 #elif defined(AFS_DARWIN_ENV)
2293                 code == EHOSTUNREACH
2294 #else
2295                 0
2296 #endif
2297                 )
2298                 call->lastReceiveTime = 0;
2299         }
2300 #ifdef KERNEL
2301 #ifdef RX_KERNEL_TRACE
2302         if (ICL_SETACTIVE(afs_iclSetp)) {
2303             AFS_GLOCK();
2304             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2305                        "after osi_NetSend()");
2306             if (!waslocked)
2307                 AFS_GUNLOCK();
2308         } else
2309 #else
2310         if (waslocked)
2311             AFS_GLOCK();
2312 #endif
2313 #endif
2314 #ifdef RXDEBUG
2315     }
2316     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d\n",
2317           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2318           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2319           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2320 #endif
2321     if (rx_stats_active) {
2322         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2323         MUTEX_ENTER(&peer->peer_lock);
2324         hadd32(peer->bytesSent, p->length);
2325         MUTEX_EXIT(&peer->peer_lock);
2326     }
2327 }
2328
2329 /* Send a list of packets to appropriate destination for the specified
2330  * connection.  The headers are first encoded and placed in the packets.
2331  */
2332 void
2333 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2334                    struct rx_packet **list, int len, int istack)
2335 {
2336 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2337     int waslocked;
2338 #endif
2339     struct sockaddr_in addr;
2340     struct rx_peer *peer = conn->peer;
2341     osi_socket socket;
2342     struct rx_packet *p = NULL;
2343     struct iovec wirevec[RX_MAXIOVECS];
2344     int i, length, code;
2345     afs_uint32 serial;
2346     afs_uint32 temp;
2347     struct rx_jumboHeader *jp;
2348 #ifdef RXDEBUG
2349     char deliveryType = 'S';
2350 #endif
2351     /* The address we're sending the packet to */
2352     addr.sin_family = AF_INET;
2353     addr.sin_port = peer->port;
2354     addr.sin_addr.s_addr = peer->host;
2355
2356     if (len + 1 > RX_MAXIOVECS) {
2357         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2358     }
2359
2360     /*
2361      * Stamp the packets in this jumbogram with consecutive serial numbers
2362      */
2363     MUTEX_ENTER(&conn->conn_data_lock);
2364     serial = conn->serial;
2365     conn->serial += len;
2366     for (i = 0; i < len; i++) {
2367         p = list[i];
2368         if (p->length > conn->peer->maxPacketSize) {
2369             /* a ping *or* a sequenced packet can count */
2370             if ((p->length > conn->peer->maxPacketSize)) {
2371                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2372                      (p->header.flags & RX_REQUEST_ACK)) &&
2373                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2374                     conn->lastPingSize = p->length;
2375                     conn->lastPingSizeSer = serial + i;
2376                 } else if ((p->header.seq != 0) &&
2377                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2378                     conn->lastPacketSize = p->length;
2379                     conn->lastPacketSizeSeq = p->header.seq;
2380                 }
2381             }
2382         }
2383     }
2384     MUTEX_EXIT(&conn->conn_data_lock);
2385
2386
2387     /* This stuff should be revamped, I think, so that most, if not
2388      * all, of the header stuff is always added here.  We could
2389      * probably do away with the encode/decode routines. XXXXX */
2390
2391     jp = NULL;
2392     length = RX_HEADER_SIZE;
2393     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2394     wirevec[0].iov_len = RX_HEADER_SIZE;
2395     for (i = 0; i < len; i++) {
2396         p = list[i];
2397
2398         /* The whole 3.5 jumbogram scheme relies on packets fitting
2399          * in a single packet buffer. */
2400         if (p->niovecs > 2) {
2401             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2402         }
2403
2404         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2405          * in this chunk.  */
2406         if (i < len - 1) {
2407             if (p->length != RX_JUMBOBUFFERSIZE) {
2408                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2409             }
2410             p->header.flags |= RX_JUMBO_PACKET;
2411             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2412             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2413         } else {
2414             wirevec[i + 1].iov_len = p->length;
2415             length += p->length;
2416         }
2417         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2418         if (jp != NULL) {
2419             /* Convert jumbo packet header to network byte order */
2420             temp = (afs_uint32) (p->header.flags) << 24;
2421             temp |= (afs_uint32) (p->header.spare);
2422             *(afs_uint32 *) jp = htonl(temp);
2423         }
2424         jp = (struct rx_jumboHeader *)
2425             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2426
2427         /* Stamp each packet with a unique serial number.  The serial
2428          * number is maintained on a connection basis because some types
2429          * of security may be based on the serial number of the packet,
2430          * and security is handled on a per authenticated-connection
2431          * basis. */
2432         /* Pre-increment, to guarantee no zero serial number; a zero
2433          * serial number means the packet was never sent. */
2434         p->header.serial = ++serial;
2435         /* This is so we can adjust retransmit time-outs better in the face of
2436          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2437          */
2438         if (p->firstSerial == 0) {
2439             p->firstSerial = p->header.serial;
2440         }
2441 #ifdef RXDEBUG
2442         /* If an output tracer function is defined, call it with the packet and
2443          * network address.  Note this function may modify its arguments. */
2444         if (rx_almostSent) {
2445             int drop = (*rx_almostSent) (p, &addr);
2446             /* drop packet if return value is non-zero? */
2447             if (drop)
2448                 deliveryType = 'D';     /* Drop the packet */
2449         }
2450 #endif
2451
2452         /* Get network byte order header */
2453         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2454                                          * touch ALL the fields */
2455     }
2456
2457     /* Send the packet out on the same socket that related packets are being
2458      * received on */
2459     socket =
2460         (conn->type ==
2461          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2462
2463 #ifdef RXDEBUG
2464     /* Possibly drop this packet,  for testing purposes */
2465     if ((deliveryType == 'D')
2466         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2467             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2468         deliveryType = 'D';     /* Drop the packet */
2469     } else {
2470         deliveryType = 'S';     /* Send the packet */
2471 #endif /* RXDEBUG */
2472
2473         /* Loop until the packet is sent.  We'd prefer just to use a
2474          * blocking socket, but unfortunately the interface doesn't
2475          * allow us to have the socket block in send mode, and not
2476          * block in receive mode */
2477 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2478         waslocked = ISAFS_GLOCK();
2479         if (!istack && waslocked)
2480             AFS_GUNLOCK();
2481 #endif
2482         if ((code =
2483              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2484                          istack)) != 0) {
2485             /* send failed, so let's hurry up the resend, eh? */
2486             if (rx_stats_active)
2487                 rx_atomic_inc(&rx_stats.netSendFailures);
2488             for (i = 0; i < len; i++) {
2489                 p = list[i];
2490                 p->retryTime = p->timeSent;     /* resend it very soon */
2491                 clock_Addmsec(&(p->retryTime),
2492                               10 + (((afs_uint32) p->backoff) << 8));
2493             }
2494             /* Some systems are nice and tell us right away that we cannot
2495              * reach this recipient by returning an error code.
2496              * So, when this happens let's "down" the host NOW so
2497              * we don't sit around waiting for this host to timeout later.
2498              */
2499             if (call &&
2500 #ifdef AFS_NT40_ENV
2501                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2502 #elif defined(AFS_LINUX20_ENV)
2503                 code == -ENETUNREACH
2504 #elif defined(AFS_DARWIN_ENV)
2505                 code == EHOSTUNREACH
2506 #else
2507                 0
2508 #endif
2509                 )
2510                 call->lastReceiveTime = 0;
2511         }
2512 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2513         if (!istack && waslocked)
2514             AFS_GLOCK();
2515 #endif
2516 #ifdef RXDEBUG
2517     }
2518
2519     osi_Assert(p != NULL);
2520
2521     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d\n",
2522           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2523           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2524           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2525
2526 #endif
2527     if (rx_stats_active) {
2528         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2529         MUTEX_ENTER(&peer->peer_lock);
2530         hadd32(peer->bytesSent, p->length);
2531         MUTEX_EXIT(&peer->peer_lock);
2532     }
2533 }
2534
2535
2536 /* Send a "special" packet to the peer connection.  If call is
2537  * specified, then the packet is directed to a specific call channel
2538  * associated with the connection, otherwise it is directed to the
2539  * connection only. Uses optionalPacket if it is supplied, rather than
2540  * allocating a new packet buffer.  Nbytes is the length of the data
2541  * portion of the packet.  If data is non-null, nbytes of data are
2542  * copied into the packet.  Type is the type of the packet, as defined
2543  * in rx.h.  Bug: there's a lot of duplication between this and other
2544  * routines.  This needs to be cleaned up. */
2545 struct rx_packet *
2546 rxi_SendSpecial(struct rx_call *call,
2547                 struct rx_connection *conn,
2548                 struct rx_packet *optionalPacket, int type, char *data,
2549                 int nbytes, int istack)
2550 {
2551     /* Some of the following stuff should be common code for all
2552      * packet sends (it's repeated elsewhere) */
2553     struct rx_packet *p;
2554     unsigned int i = 0;
2555     int savelen = 0, saven = 0;
2556     int channel, callNumber;
2557     if (call) {
2558         channel = call->channel;
2559         callNumber = *call->callNumber;
2560         /* BUSY packets refer to the next call on this connection */
2561         if (type == RX_PACKET_TYPE_BUSY) {
2562             callNumber++;
2563         }
2564     } else {
2565         channel = 0;
2566         callNumber = 0;
2567     }
2568     p = optionalPacket;
2569     if (!p) {
2570         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2571         if (!p)
2572             osi_Panic("rxi_SendSpecial failure");
2573     }
2574
2575     if (nbytes != -1)
2576         p->length = nbytes;
2577     else
2578         nbytes = p->length;
2579     p->header.serviceId = conn->serviceId;
2580     p->header.securityIndex = conn->securityIndex;
2581     p->header.cid = (conn->cid | channel);
2582     p->header.callNumber = callNumber;
2583     p->header.seq = 0;
2584     p->header.epoch = conn->epoch;
2585     p->header.type = type;
2586     p->header.flags = 0;
2587     if (conn->type == RX_CLIENT_CONNECTION)
2588         p->header.flags |= RX_CLIENT_INITIATED;
2589     if (data)
2590         rx_packetwrite(p, 0, nbytes, data);
2591
2592     for (i = 1; i < p->niovecs; i++) {
2593         if (nbytes <= p->wirevec[i].iov_len) {
2594             savelen = p->wirevec[i].iov_len;
2595             saven = p->niovecs;
2596             p->wirevec[i].iov_len = nbytes;
2597             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2598         } else
2599             nbytes -= p->wirevec[i].iov_len;
2600     }
2601
2602     if (call)
2603         rxi_Send(call, p, istack);
2604     else
2605         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2606     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2607         /* really need to do this, but it seems safer this way, given that  */
2608         /* sneaky optionalPacket... */
2609         p->wirevec[i - 1].iov_len = savelen;
2610         p->niovecs = saven;
2611     }
2612     if (!optionalPacket)
2613         rxi_FreePacket(p);
2614     return optionalPacket;
2615 }
2616
2617
2618 /* Encode the packet's header (from the struct header in the packet to
2619  * the net byte order representation in the wire representation of the
2620  * packet, which is what is actually sent out on the wire) */
2621 void
2622 rxi_EncodePacketHeader(struct rx_packet *p)
2623 {
2624     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2625
2626     memset(buf, 0, RX_HEADER_SIZE);
2627     *buf++ = htonl(p->header.epoch);
2628     *buf++ = htonl(p->header.cid);
2629     *buf++ = htonl(p->header.callNumber);
2630     *buf++ = htonl(p->header.seq);
2631     *buf++ = htonl(p->header.serial);
2632     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2633                    | (((afs_uint32) p->header.flags) << 16)
2634                    | (p->header.userStatus << 8) | p->header.securityIndex);
2635     /* Note: top 16 bits of this next word were reserved */
2636     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2637 }
2638
2639 /* Decode the packet's header (from net byte order to a struct header) */
2640 void
2641 rxi_DecodePacketHeader(struct rx_packet *p)
2642 {
2643     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2644     afs_uint32 temp;
2645
2646     p->header.epoch = ntohl(*buf);
2647     buf++;
2648     p->header.cid = ntohl(*buf);
2649     buf++;
2650     p->header.callNumber = ntohl(*buf);
2651     buf++;
2652     p->header.seq = ntohl(*buf);
2653     buf++;
2654     p->header.serial = ntohl(*buf);
2655     buf++;
2656
2657     temp = ntohl(*buf);
2658     buf++;
2659
2660     /* C will truncate byte fields to bytes for me */
2661     p->header.type = temp >> 24;
2662     p->header.flags = temp >> 16;
2663     p->header.userStatus = temp >> 8;
2664     p->header.securityIndex = temp >> 0;
2665
2666     temp = ntohl(*buf);
2667     buf++;
2668
2669     p->header.serviceId = (temp & 0xffff);
2670     p->header.spare = temp >> 16;
2671     /* Note: top 16 bits of this last word are the security checksum */
2672 }
2673
/*
 * Fill in the header of an outgoing data packet and trim its wirevec so
 * that the iovec lengths match the amount of data being sent.
 *
 * LOCKS HELD: called with call->lock held.  The lock is released while
 * the packet is being prepared and re-acquired before returning.
 *
 * PrepareSendPacket is the only place in the code that
 * can increment call->tnext.  This could become an atomic
 * in the future.  Beyond that there is nothing in this
 * function that requires the call being locked.  This
 * function can only be called by the application thread.
 *
 * call - call the packet belongs to; supplies the connection, the call
 *        number, the channel and the next sequence number.
 * p    - packet to prepare; its header, flags, timers and wirevec are
 *        modified.
 * last - nonzero to mark the packet with RX_LAST_PACKET.
 */
void
rxi_PrepareSendPacket(struct rx_call *call,
                      struct rx_packet *p, int last)
{
    struct rx_connection *conn = call->conn;
    /* Take this packet's sequence number and advance tnext while
     * call->lock is still held. */
    afs_uint32 seq = call->tnext++;
    unsigned int i;
    afs_int32 len;              /* len must be a signed type; it can go negative */

    /* No data packets on call 0. Where do these come from? */
    if (*call->callNumber == 0)
        *call->callNumber = 1;

    /* The call lock is dropped here and taken again just before return. */
    MUTEX_EXIT(&call->lock);
    p->flags &= ~RX_PKTFLAG_ACKED;
    p->header.cid = (conn->cid | call->channel);
    p->header.serviceId = conn->serviceId;
    p->header.securityIndex = conn->securityIndex;

    p->header.callNumber = *call->callNumber;
    p->header.seq = seq;
    p->header.epoch = conn->epoch;
    p->header.type = RX_PACKET_TYPE_DATA;
    p->header.flags = 0;
    p->header.spare = 0;
    if (conn->type == RX_CLIENT_CONNECTION)
        p->header.flags |= RX_CLIENT_INITIATED;

    if (last)
        p->header.flags |= RX_LAST_PACKET;

    clock_Zero(&p->retryTime);  /* Never yet transmitted */
    clock_Zero(&p->firstSent);  /* Never yet transmitted */
    p->header.serial = 0;       /* Another way of saying never transmitted... */
    p->backoff = 0;

    /* Now that we're sure this is the last data on the call, make sure
     * that the "length" and the sum of the iov_lens matches. */
    len = p->length + call->conn->securityHeaderSize;

    /* Walk the data iovecs (index 0 is skipped), consuming len until it is
     * used up.  On exit i is one past the last iovec needed, and len is
     * zero or negative by the amount that iovec overshoots. */
    for (i = 1; i < p->niovecs && len > 0; i++) {
        len -= p->wirevec[i].iov_len;
    }
    if (len > 0) {
        /* The iovecs together are smaller than the data to be sent. */
        osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
    } else if (i < p->niovecs) {
        /* Free any extra elements in the wirevec */
#if defined(RX_ENABLE_TSFPQ)
        rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
#else /* !RX_ENABLE_TSFPQ */
        MUTEX_ENTER(&rx_freePktQ_lock);
        rxi_FreeDataBufsNoLock(p, i);
        MUTEX_EXIT(&rx_freePktQ_lock);
#endif /* !RX_ENABLE_TSFPQ */

        p->niovecs = i;
    }
    /* len is <= 0 here; a negative value shortens the last iovec in use
     * by the overshoot. */
    if (len)
        p->wirevec[i - 1].iov_len += len;
    /* The security object is invoked before call->lock is re-acquired. */
    RXS_PreparePacket(conn->securityObject, call, p);
    MUTEX_ENTER(&call->lock);
}
2745
2746 /* Given an interface MTU size, calculate an adjusted MTU size that
2747  * will make efficient use of the RX buffers when the peer is sending
2748  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2749 int
2750 rxi_AdjustIfMTU(int mtu)
2751 {
2752     int adjMTU;
2753     int frags;
2754
2755     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2756         return mtu;
2757     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2758     if (mtu <= adjMTU) {
2759         return mtu;
2760     }
2761     mtu -= adjMTU;
2762     if (mtu <= 0) {
2763         return adjMTU;
2764     }
2765     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2766     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2767 }
2768
2769 /* Given an interface MTU size, and the peer's advertised max receive
2770  * size, calculate an adjisted maxMTU size that makes efficient use
2771  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2772 int
2773 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2774 {
2775     int maxMTU = mtu * rxi_nSendFrags;
2776     maxMTU = MIN(maxMTU, peerMaxMTU);
2777     return rxi_AdjustIfMTU(maxMTU);
2778 }
2779
2780 /* Given a packet size, figure out how many datagram packet will fit.
2781  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2782  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2783  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2784 int
2785 rxi_AdjustDgramPackets(int frags, int mtu)
2786 {
2787     int maxMTU;
2788     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2789         return 1;
2790     }
2791     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2792     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2793     /* subtract the size of the first and last packets */
2794     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2795     if (maxMTU < 0) {
2796         return 1;
2797     }
2798     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2799 }
2800
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * outputFile - destination of the dump.  Lines are written with fprintf(),
 *              except under AFS_NT40_ENV, where each line is formatted
 *              into a local buffer and passed to WriteFile() with
 *              outputFile as the handle argument.
 * cookie     - caller-supplied string prefixed to every line of output.
 *
 * The rx_mallocedP list is walked between NETPRI/USERPRI with
 * rx_freePktQ_lock held.  When RXDEBUG_PACKET is not defined nothing is
 * written.  Always returns 0.
 *
 * NOTE(review): the AFS_NT40_ENV path formats with an unbounded sprintf()
 * into output[2048]; a sufficiently long cookie could overflow it.
 * NOTE(review): RXDPRINTF and RXDPRINTOUT remain defined after this
 * function.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    int zilch;
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    /* One line per packet on the rx_mallocedP list.  %p requires a
     * void * argument, so the packet pointer is cast explicitly. */
    for (p = rx_mallocedP; p; p = p->allNextp) {
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, backoff=%u, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec, p->retryTime.sec, p->retryTime.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->backoff, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif