2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
10 #include <afsconfig.h>
11 #include <afs/param.h>
15 # include "afs/sysincludes.h"
16 # include "afsincludes.h"
17 # include "rx_kcommon.h"
18 # else /* defined(UKERNEL) */
19 # ifdef RX_KERNEL_TRACE
20 # include "rx_kcommon.h"
23 # ifndef AFS_LINUX20_ENV
26 # if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
27 # include "afs/sysincludes.h"
29 # if defined(AFS_OBSD_ENV)
32 # include "h/socket.h"
33 # if !defined(AFS_SUN5_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
34 # if !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
35 # include "sys/mount.h" /* it gets pulled in by something later anyway */
39 # include "netinet/in.h"
40 # include "afs/afs_osi.h"
41 # include "rx_kmutex.h"
42 # endif /* defined(UKERNEL) */
47 # if defined(AFS_NT40_ENV)
49 # define EWOULDBLOCK WSAEWOULDBLOCK
52 # include "rx_xmit_nt.h"
58 # include <sys/sysmacros.h>
61 #include <opr/queue.h>
65 #include "rx_packet.h"
66 #include "rx_atomic.h"
67 #include "rx_globals.h"
68 #include "rx_internal.h"
76 /* rxdb_fileID is used to identify the lock location, along with line#. */
77 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
78 #endif /* RX_LOCKS_DB */
79 static struct rx_packet *rx_mallocedP = 0;
81 static afs_uint32 rx_packet_id = 0;
84 extern char cml_version_number[];
86 static int AllocPacketBufs(int class, int num_pkts, struct opr_queue *q);
88 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
89 afs_uint32 ahost, short aport,
91 static struct rx_packet *rxi_AllocPacketNoLock(int class);
94 static void rxi_MorePacketsNoLock(int apackets);
97 #ifdef RX_ENABLE_TSFPQ
98 static int rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first,
100 static void rxi_AdjustLocalPacketsTSFPQ(int num_keep_local,
101 int allow_overcommit);
103 static void rxi_FreePacketNoLock(struct rx_packet *p);
104 static int rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first);
105 static int rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first,
106 struct opr_queue * q);
109 extern struct opr_queue rx_idleServerQueue;
111 /* some rules about packets:
112 * 1. When a packet is allocated, the final iov_buf contains room for
113 * a security trailer, but iov_len masks that fact. If the security
114 * package wants to add the trailer, it may do so, and then extend
115 * iov_len appropriately. For this reason, packet's niovecs and
116 * iov_len fields should be accurate before calling PreparePacket.
120 * all packet buffers (iov_base) are integral multiples of
122 * offset is an integral multiple of the word size.
125 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
129 for (l = 0, i = 1; i < packet->niovecs; i++) {
130 if (l + packet->wirevec[i].iov_len > offset) {
132 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
135 l += packet->wirevec[i].iov_len;
142 * all packet buffers (iov_base) are integral multiples of the word size.
143 * offset is an integral multiple of the word size.
146 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
150 for (l = 0, i = 1; i < packet->niovecs; i++) {
151 if (l + packet->wirevec[i].iov_len > offset) {
152 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
153 (offset - l))) = data;
156 l += packet->wirevec[i].iov_len;
163 * all packet buffers (iov_base) are integral multiples of the
165 * offset is an integral multiple of the word size.
167 * all buffers are contiguously arrayed in the iovec from 0..niovecs-1
170 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
173 unsigned int i, j, l, r;
174 for (l = 0, i = 1; i < packet->niovecs; i++) {
175 if (l + packet->wirevec[i].iov_len > offset) {
178 l += packet->wirevec[i].iov_len;
181 /* i is the iovec which contains the first little bit of data in which we
182 * are interested. l is the total length of everything prior to this iovec.
183 * j is the number of bytes we can safely copy out of this iovec.
184 * offset only applies to the first iovec.
187 while ((r > 0) && (i < packet->niovecs)) {
188 j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
189 memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
192 l += packet->wirevec[i].iov_len;
197 return (r ? (resid - r) : resid);
202 * all packet buffers (iov_base) are integral multiples of the
204 * offset is an integral multiple of the word size.
207 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
209 unsigned int i, j, l, o, r;
212 for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
213 if (l + packet->wirevec[i].iov_len > o) {
216 l += packet->wirevec[i].iov_len;
219 /* i is the iovec which contains the first little bit of data in which we
220 * are interested. l is the total length of everything prior to this iovec.
221 * j is the number of bytes we can safely copy out of this iovec.
222 * offset only applies to the first iovec.
225 while ((r > 0) && (i <= RX_MAXWVECS)) {
226 if (i >= packet->niovecs)
227 if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0) /* ++niovecs as a side-effect */
230 b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
231 j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
235 l += packet->wirevec[i].iov_len;
240 return (r ? (resid - r) : resid);
/* Allocate num_pkts packets onto queue q and fully (re)initialise each
 * packet's iovec array.  NOTE(review): this listing has gaps (original
 * line numbers jump), so the body shown here is incomplete. */
244 rxi_AllocPackets(int class, int num_pkts, struct opr_queue * q)
248     num_pkts = AllocPacketBufs(class, num_pkts, q);
250     for (opr_queue_Scan(q, c)) {
251         RX_PACKET_IOV_FULLINIT(opr_queue_Entry(c, struct rx_packet, entry));
257 #ifdef RX_ENABLE_TSFPQ
/* TSFPQ variant: if this thread's local free-packet queue is short, refill it
 * from the global pool (under rx_freePktQ_lock, growing the pool via
 * rxi_MorePacketsNoLock if needed), then check num_pkts packets out onto q.
 * NOTE(review): listing has gaps; body incomplete as shown. */
259 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
261     struct rx_ts_info_t * rx_ts_info;
265     RX_TS_INFO_GET(rx_ts_info);
267     transfer = num_pkts - rx_ts_info->_FPQ.len;
270 MUTEX_ENTER(&rx_freePktQ_lock);
271 transfer = MAX(transfer, rx_TSFPQGlobSize);
272 if (transfer > rx_nFreePackets) {
273     /* alloc enough for us, plus a few globs for other threads */
274     rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
277 RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
279 MUTEX_EXIT(&rx_freePktQ_lock);
283     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
287 #else /* RX_ENABLE_TSFPQ */
/* Non-TSFPQ variant: under rx_freePktQ_lock, trim num_pkts down while the
 * class is over quota (bumping per-class failure stats), clamp to what the
 * global free list can supply, grow the pool if still short, then splice the
 * first num_pkts entries of rx_freePacketQueue onto q.
 * NOTE(review): listing has gaps; body incomplete as shown. */
289 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
300 MUTEX_ENTER(&rx_freePktQ_lock);
303     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
304          num_pkts--, overq++);
307         rxi_NeedMorePackets = TRUE;
308 if (rx_stats_active) {
310     case RX_PACKET_CLASS_RECEIVE:
311         rx_atomic_inc(&rx_stats.receivePktAllocFailures);
313     case RX_PACKET_CLASS_SEND:
314         rx_atomic_inc(&rx_stats.sendPktAllocFailures);
316     case RX_PACKET_CLASS_SPECIAL:
317         rx_atomic_inc(&rx_stats.specialPktAllocFailures);
319     case RX_PACKET_CLASS_RECV_CBUF:
320         rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
322     case RX_PACKET_CLASS_SEND_CBUF:
323         rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
329 if (rx_nFreePackets < num_pkts)
330     num_pkts = rx_nFreePackets;
333     rxi_NeedMorePackets = TRUE;
337 if (rx_nFreePackets < num_pkts) {
338     rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
342 for (i=0, c=opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
344      i++, c=opr_queue_Next(&c->entry, struct rx_packet, entry)) {
348 opr_queue_SplitBeforeAppend(&rx_freePacketQueue, q, &c->entry);
350 rx_nFreePackets -= num_pkts;
355 MUTEX_EXIT(&rx_freePktQ_lock);
360 #endif /* RX_ENABLE_TSFPQ */
363 * Free a packet currently used as a continuation buffer
365 #ifdef RX_ENABLE_TSFPQ
366 /* num_pkts=0 means queue length is unknown */
/* TSFPQ variant: free every packet on q (including each packet's
 * continuation buffers, via rxi_FreeDataBufsTSFPQ) into the thread-local
 * pool; if the local queue then exceeds rx_TSFPQLocalMax, flush the excess
 * to the global pool under rx_freePktQ_lock and wake any waiters.
 * num_pkts==0 means the queue length is unknown.
 * NOTE(review): listing has gaps; body incomplete as shown. */
368 rxi_FreePackets(int num_pkts, struct opr_queue * q)
370     struct rx_ts_info_t * rx_ts_info;
371     struct opr_queue *cursor, *store;
374     osi_Assert(num_pkts >= 0);
375     RX_TS_INFO_GET(rx_ts_info);
378         for (opr_queue_ScanSafe(q, cursor, store)) {
380             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
384         for (opr_queue_ScanSafe(q, cursor, store)) {
385             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
391     RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
394     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
396         MUTEX_ENTER(&rx_freePktQ_lock);
398         RX_TS_FPQ_LTOG(rx_ts_info);
400         /* Wakeup anyone waiting for packets */
403         MUTEX_EXIT(&rx_freePktQ_lock);
409 #else /* RX_ENABLE_TSFPQ */
410 /* num_pkts=0 means queue length is unknown */
/* Non-TSFPQ variant: detach each packet's continuation buffers (iovecs from
 * index 2 up) onto a local queue `cbs`, splice those buffers back onto q,
 * then splice everything onto the global rx_freePacketQueue under
 * rx_freePktQ_lock and wake any waiters.  num_pkts==0 means the queue
 * length is unknown.  NOTE(review): listing has gaps; body incomplete. */
412 rxi_FreePackets(int num_pkts, struct opr_queue *q)
414     struct opr_queue cbs;
415     struct opr_queue *cursor, *store;
419     osi_Assert(num_pkts >= 0);
420     opr_queue_Init(&cbs);
423         for (opr_queue_ScanSafe(q, cursor, store)) {
425                 = opr_queue_Entry(cursor, struct rx_packet, entry);
426             if (p->niovecs > 2) {
427                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
435         for (opr_queue_ScanSafe(q, cursor, store)) {
437                 = opr_queue_Entry(cursor, struct rx_packet, entry);
439             if (p->niovecs > 2) {
440                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
447     opr_queue_SpliceAppend(q, &cbs);
453     MUTEX_ENTER(&rx_freePktQ_lock);
455     opr_queue_SpliceAppend(&rx_freePacketQueue, q);
456     rx_nFreePackets += qlen;
458     /* Wakeup anyone waiting for packets */
461     MUTEX_EXIT(&rx_freePktQ_lock);
466 #endif /* RX_ENABLE_TSFPQ */
468 /* this one is kind of awful.
469 * In rxkad, the packet has been all shortened, and everything, ready for
470 * sending. All of a sudden, we discover we need some of that space back.
471 * This isn't terribly general, because it knows that the packets are only
472 * rounded up to the EBS (userdata + security header).
/* Grow an iovec by nb bytes after rxkad has shortened the packet for
 * sending: the first (localdata) buffer may grow up to RX_FIRSTBUFFERSIZE,
 * continuation buffers up to RX_CBUFFERSIZE.  As noted above, this is not
 * general — it relies on packets being rounded to the EBS.
 * NOTE(review): listing has gaps; body incomplete as shown. */
475 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
479     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
480         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
481             p->wirevec[i].iov_len += nb;
485         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
486             p->wirevec[i].iov_len += nb;
494 /* get sufficient space to store nb bytes of data (or more), and hook
495 * it into the supplied packet. Return nbytes<=0 if successful, otherwise
496 * returns the number of bytes >0 which it failed to come up with.
497 * Don't need to worry about locking on packet, since only
498 * one thread can manipulate one at a time. Locking on continution
499 * packets is handled by AllocPacketBufs */
500 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
/* Attach enough continuation buffers to p to hold at least nb more bytes
 * (capped at RX_MAXWVECS total iovecs).  Returns <=0 on success, otherwise
 * the number of bytes it failed to provide.  Packet-level locking is the
 * caller's problem; buffer-pool locking is handled by AllocPacketBufs.
 * NOTE(review): listing has gaps; body incomplete as shown. */
502 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
505     struct opr_queue q, *cursor, *store;
507     /* compute the number of cbuf's we need */
508     nv = nb / RX_CBUFFERSIZE;
509     if ((nv * RX_CBUFFERSIZE) < nb)
511     if ((nv + p->niovecs) > RX_MAXWVECS)
512         nv = RX_MAXWVECS - p->niovecs;
516     /* allocate buffers */
518     nv = AllocPacketBufs(class, nv, &q);
520     /* setup packet iovs */
522     for (opr_queue_ScanSafe(&q, cursor, store)) {
524         = opr_queue_Entry(cursor, struct rx_packet, entry);
526         opr_queue_Remove(&cb->entry);
527         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
528         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
532     nb -= (nv * RX_CBUFFERSIZE);
533     p->length += (nv * RX_CBUFFERSIZE);
539 /* Add more packet buffers */
540 #ifdef RX_ENABLE_TSFPQ
/* TSFPQ variant: malloc apackets new rx_packet structures, account for them
 * in rx_nPackets (under rx_packets_mutex), initialise each and check it into
 * the thread-local pool, then flush the excess to the global pool if the
 * local queue is over rx_TSFPQLocalMax.
 * NOTE(review): listing has gaps; body incomplete as shown. */
542 rxi_MorePackets(int apackets)
544     struct rx_packet *p, *e;
545     struct rx_ts_info_t * rx_ts_info;
549     getme = apackets * sizeof(struct rx_packet);
550     p = osi_Alloc(getme);
553     PIN(p, getme); /* XXXXX */
555     RX_TS_INFO_GET(rx_ts_info);
557     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
558     /* TSFPQ patch also needs to keep track of total packets */
560     MUTEX_ENTER(&rx_packets_mutex);
561     rx_nPackets += apackets;
562     RX_TS_FPQ_COMPUTE_LIMITS;
563     MUTEX_EXIT(&rx_packets_mutex);
565     for (e = p + apackets; p < e; p++) {
566         RX_PACKET_IOV_INIT(p);
569         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
572         MUTEX_ENTER(&rx_freePktQ_lock);
573 #ifdef RXDEBUG_PACKET
574         p->packetId = rx_packet_id++;
575         p->allNextp = rx_mallocedP;
576 #endif /* RXDEBUG_PACKET */
578         MUTEX_EXIT(&rx_freePktQ_lock);
581     rx_ts_info->_FPQ.delta += apackets;
583     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
585         MUTEX_ENTER(&rx_freePktQ_lock);
587         RX_TS_FPQ_LTOG(rx_ts_info);
588         rxi_NeedMorePackets = FALSE;
591         MUTEX_EXIT(&rx_freePktQ_lock);
595 #else /* RX_ENABLE_TSFPQ */
/* Non-TSFPQ variant: malloc apackets new rx_packet structures and, under
 * rx_freePktQ_lock, initialise each one and append it to the global
 * rx_freePacketQueue, updating rx_nPackets/rx_nFreePackets and clearing
 * the shortage flag.  NOTE(review): listing has gaps; body incomplete. */
597 rxi_MorePackets(int apackets)
599     struct rx_packet *p, *e;
603     getme = apackets * sizeof(struct rx_packet);
604     p = osi_Alloc(getme);
607     PIN(p, getme); /* XXXXX */
610     MUTEX_ENTER(&rx_freePktQ_lock);
612     for (e = p + apackets; p < e; p++) {
613         RX_PACKET_IOV_INIT(p);
614 #ifdef RX_TRACK_PACKETS
615         p->flags |= RX_PKTFLAG_FREE;
619         opr_queue_Append(&rx_freePacketQueue, &p->entry);
620 #ifdef RXDEBUG_PACKET
621         p->packetId = rx_packet_id++;
622         p->allNextp = rx_mallocedP;
623 #endif /* RXDEBUG_PACKET */
627     rx_nPackets += apackets;
628     rx_nFreePackets += apackets;
629     rxi_NeedMorePackets = FALSE;
632     MUTEX_EXIT(&rx_freePktQ_lock);
635 #endif /* RX_ENABLE_TSFPQ */
637 #ifdef RX_ENABLE_TSFPQ
/* Like the TSFPQ rxi_MorePackets, but with explicit control over how many of
 * the new packets stay in this thread's local pool (num_keep_local) and
 * whether the remainder is flushed to the global pool (flush_global).
 * NOTE(review): listing has gaps; body incomplete as shown. */
639 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
641     struct rx_packet *p, *e;
642     struct rx_ts_info_t * rx_ts_info;
646     getme = apackets * sizeof(struct rx_packet);
647     p = osi_Alloc(getme);
649     PIN(p, getme); /* XXXXX */
651     RX_TS_INFO_GET(rx_ts_info);
653     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
654     /* TSFPQ patch also needs to keep track of total packets */
655     MUTEX_ENTER(&rx_packets_mutex);
656     rx_nPackets += apackets;
657     RX_TS_FPQ_COMPUTE_LIMITS;
658     MUTEX_EXIT(&rx_packets_mutex);
660     for (e = p + apackets; p < e; p++) {
661         RX_PACKET_IOV_INIT(p);
663         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
666         MUTEX_ENTER(&rx_freePktQ_lock);
667 #ifdef RXDEBUG_PACKET
668         p->packetId = rx_packet_id++;
669         p->allNextp = rx_mallocedP;
670 #endif /* RXDEBUG_PACKET */
672         MUTEX_EXIT(&rx_freePktQ_lock);
675     rx_ts_info->_FPQ.delta += apackets;
678         (num_keep_local < apackets)) {
680         MUTEX_ENTER(&rx_freePktQ_lock);
682         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
683         rxi_NeedMorePackets = FALSE;
686         MUTEX_EXIT(&rx_freePktQ_lock);
690 #endif /* RX_ENABLE_TSFPQ */
693 /* Add more packet buffers */
/* Grow the global free pool; caller must already hold rx_freePktQ_lock.
 * Over-allocates by an extra 1/4 of apackets' worth of continuation-buffer
 * packets so roughly a quarter of packets can carry maximal jumbo payloads,
 * then appends everything to rx_freePacketQueue and updates the counters
 * under rx_packets_mutex.  NOTE(review): listing has gaps; body incomplete. */
695 rxi_MorePacketsNoLock(int apackets)
697 #ifdef RX_ENABLE_TSFPQ
698     struct rx_ts_info_t * rx_ts_info;
699 #endif /* RX_ENABLE_TSFPQ */
700     struct rx_packet *p, *e;
703     /* allocate enough packets that 1/4 of the packets will be able
704      * to hold maximal amounts of data */
705     apackets += (apackets / 4)
706         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
708         getme = apackets * sizeof(struct rx_packet);
709         p = osi_Alloc(getme);
711             apackets -= apackets / 4;
712             osi_Assert(apackets > 0);
717 #ifdef RX_ENABLE_TSFPQ
718     RX_TS_INFO_GET(rx_ts_info);
719     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
720 #endif /* RX_ENABLE_TSFPQ */
722     for (e = p + apackets; p < e; p++) {
723         RX_PACKET_IOV_INIT(p);
724 #ifdef RX_TRACK_PACKETS
725         p->flags |= RX_PKTFLAG_FREE;
729         opr_queue_Append(&rx_freePacketQueue, &p->entry);
730 #ifdef RXDEBUG_PACKET
731         p->packetId = rx_packet_id++;
732         p->allNextp = rx_mallocedP;
733 #endif /* RXDEBUG_PACKET */
737     rx_nFreePackets += apackets;
738     MUTEX_ENTER(&rx_packets_mutex);
739     rx_nPackets += apackets;
740 #ifdef RX_ENABLE_TSFPQ
741     RX_TS_FPQ_COMPUTE_LIMITS;
742 #endif /* RX_ENABLE_TSFPQ */
743     MUTEX_EXIT(&rx_packets_mutex);
744     rxi_NeedMorePackets = FALSE;
/* Tear down the packet pool.  Note the existing MTUXXX comment: as written
 * this only frees/unpins one allocation's worth via rx_mallocedP, not every
 * packet ever allocated.  NOTE(review): listing has gaps; body incomplete. */
750 rxi_FreeAllPackets(void)
752     /* must be called at proper interrupt level, etcetera */
753     /* MTUXXX need to free all Packets */
754     osi_Free(rx_mallocedP,
755              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
756     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
759 #ifdef RX_ENABLE_TSFPQ
/* Rebalance this thread's local free-packet queue toward num_keep_local
 * entries: flush the surplus to the global pool, or pull the shortfall from
 * it (growing the pool if needed).  Unless allow_overcommit is set, the
 * local queue is capped at rx_TSFPQLocalMax.
 * NOTE(review): listing has gaps; body incomplete as shown. */
761 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
763     struct rx_ts_info_t * rx_ts_info;
767     RX_TS_INFO_GET(rx_ts_info);
769     if (num_keep_local != rx_ts_info->_FPQ.len) {
771         MUTEX_ENTER(&rx_freePktQ_lock);
772         if (num_keep_local < rx_ts_info->_FPQ.len) {
773             xfer = rx_ts_info->_FPQ.len - num_keep_local;
774             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
777             xfer = num_keep_local - rx_ts_info->_FPQ.len;
778             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
779                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
780             if (rx_nFreePackets < xfer) {
781                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
783             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
785         MUTEX_EXIT(&rx_freePktQ_lock);
/* Return every thread-local free packet to the global pool. */
void
rxi_FlushLocalPacketsTSFPQ(void)
{
    rxi_AdjustLocalPacketsTSFPQ(0, 0);
}
795 #endif /* RX_ENABLE_TSFPQ */
797 /* Allocate more packets iff we need more continuation buffers */
798 /* In kernel, can't page in memory with interrupts disabled, so we
799 * don't use the event mechanism. */
801 rx_CheckPackets(void)
803 if (rxi_NeedMorePackets) {
804 rxi_MorePackets(rx_maxSendWindow);
808 /* In the packet freeing routine below, the assumption is that
809 we want all of the packets to be used equally frequently, so that we
810 don't get packet buffers paging out. It would be just as valid to
811 assume that we DO want them to page out if not many are being used.
812 In any event, we assume the former, and append the packets to the end
814 /* This explanation is bogus. The free list doesn't remain in any kind of
815 useful order for afs_int32: the packets in use get pretty much randomly scattered
816 across all the pages. In order to permit unused {packets,bufs} to page out, they
817 must be stored so that packets which are adjacent in memory are adjacent in the
818 free list. An array springs rapidly to mind.
821 /* Actually free the packet p. */
822 #ifndef RX_ENABLE_TSFPQ
/* Actually free packet p onto the global free queue; caller must hold
 * rx_freePktQ_lock.  Packets are appended (not prepended) per the comment
 * above about keeping all buffers equally warm.
 * NOTE(review): listing has gaps; body incomplete as shown. */
824 rxi_FreePacketNoLock(struct rx_packet *p)
826     dpf(("Free %"AFS_PTR_FMT"\n", p));
830     opr_queue_Append(&rx_freePacketQueue, &p->entry);
832 #endif /* RX_ENABLE_TSFPQ */
834 #ifdef RX_ENABLE_TSFPQ
/* Free packet p into this thread's local pool; if flush_global is set and
 * the local queue is over rx_TSFPQLocalMax, push the excess to the global
 * pool under rx_freePktQ_lock and wake waiters.
 * NOTE(review): listing has gaps; body incomplete as shown. */
836 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
838     struct rx_ts_info_t * rx_ts_info;
839     dpf(("Free %"AFS_PTR_FMT"\n", p));
841     RX_TS_INFO_GET(rx_ts_info);
842     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
844     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
846         MUTEX_ENTER(&rx_freePktQ_lock);
848         RX_TS_FPQ_LTOG(rx_ts_info);
850         /* Wakeup anyone waiting for packets */
853         MUTEX_EXIT(&rx_freePktQ_lock);
857 #endif /* RX_ENABLE_TSFPQ */
860 * free continuation buffers off a packet into a queue
862 * [IN] p -- packet from which continuation buffers will be freed
863 * [IN] first -- iovec offset of first continuation buffer to free
864 * [IN] q -- queue into which continuation buffers will be chained
867 * number of continuation buffers freed
869 #ifndef RX_ENABLE_TSFPQ
/* Detach p's continuation buffers from iovec index max(2, first) upward,
 * mark each one free and append it to q.  Returns the count freed (per the
 * header comment above).  Panics on a NULL iov_base, which would indicate
 * a corrupt packet.  NOTE(review): listing has gaps; body incomplete. */
871 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct opr_queue * q)
874     struct rx_packet * cb;
877     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
878         iov = &p->wirevec[first];
880             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
881         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
882         RX_FPQ_MARK_FREE(cb);
883         opr_queue_Append(q, &cb->entry);
892 * free packet continuation buffers into the global free packet pool
894 * [IN] p -- packet from which to free continuation buffers
895 * [IN] first -- iovec offset of first continuation buffer to free
/* Free p's continuation buffers (iovec index max(2, first) upward) straight
 * into the global pool; caller must hold rx_freePktQ_lock.
 * NOTE(review): listing has gaps; body incomplete as shown. */
901 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
905     for (first = MAX(2, first); first < p->niovecs; first++) {
906         iov = &p->wirevec[first];
908             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
909         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
920 * free packet continuation buffers into the thread-local free pool
922 * [IN] p -- packet from which continuation buffers will be freed
923 * [IN] first -- iovec offset of first continuation buffer to free
924 * any value less than 2, the min number of iovecs,
925 * is treated as if it is 2.
926 * [IN] flush_global -- if nonzero, we will flush overquota packets to the
927 * global free pool before returning
/* Free p's continuation buffers (iovec index max(2, first) upward) into the
 * thread-local pool; with flush_global set, overquota packets are then
 * pushed to the global pool under rx_freePktQ_lock (see header comment
 * above).  NOTE(review): listing has gaps; body incomplete as shown. */
933 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
936     struct rx_ts_info_t * rx_ts_info;
938     RX_TS_INFO_GET(rx_ts_info);
940     for (first = MAX(2, first); first < p->niovecs; first++) {
941         iov = &p->wirevec[first];
943             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
944         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
949     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
951         MUTEX_ENTER(&rx_freePktQ_lock);
953         RX_TS_FPQ_LTOG(rx_ts_info);
955         /* Wakeup anyone waiting for packets */
958         MUTEX_EXIT(&rx_freePktQ_lock);
963 #endif /* RX_ENABLE_TSFPQ */
965 int rxi_nBadIovecs = 0;
967 /* rxi_RestoreDataBufs
969 * Restore the correct sizes to the iovecs. Called when reusing a packet
970 * for reading off the wire.
/* Reset p's iovec lengths to their full sizes so the packet can be reused
 * for reading off the wire; continuation buffers get RX_CBUFFERSIZE each.
 * A NULL iov_base presumably counts against rxi_nBadIovecs — TODO confirm
 * (the branch body is missing from this listing).
 * NOTE(review): listing has gaps; body incomplete as shown. */
973 rxi_RestoreDataBufs(struct rx_packet *p)
978     RX_PACKET_IOV_INIT(p);
980     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
981         if (!iov->iov_base) {
986         iov->iov_len = RX_CBUFFERSIZE;
990 #ifdef RX_ENABLE_TSFPQ
/* TSFPQ variant: walk past the continuation buffers that still hold message
 * data (p->length bytes beyond wirevec[1]), then check every remaining empty
 * buffer into the thread-local pool, flushing to the global pool if over
 * quota.  `first` must be 1.  NOTE(review): listing has gaps; incomplete. */
992 rxi_TrimDataBufs(struct rx_packet *p, int first)
995     struct iovec *iov, *end;
996     struct rx_ts_info_t * rx_ts_info;
1000         osi_Panic("TrimDataBufs 1: first must be 1");
1002     /* Skip over continuation buffers containing message data */
1003     iov = &p->wirevec[2];
1004     end = iov + (p->niovecs - 2);
1005     length = p->length - p->wirevec[1].iov_len;
1006     for (; iov < end && length > 0; iov++) {
1008             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1009         length -= iov->iov_len;
1012     /* iov now points to the first empty data buffer. */
1016         RX_TS_INFO_GET(rx_ts_info);
1017         for (; iov < end; iov++) {
1019                 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1020             RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1023         if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1025             MUTEX_ENTER(&rx_freePktQ_lock);
1027             RX_TS_FPQ_LTOG(rx_ts_info);
1028             rxi_PacketsUnWait();
1030             MUTEX_EXIT(&rx_freePktQ_lock);
1036 #else /* RX_ENABLE_TSFPQ */
/* Non-TSFPQ variant: same walk as above, but the empty trailing buffers are
 * returned directly to the global pool under rx_freePktQ_lock, with
 * rxi_PacketsUnWait() to wake waiters.  `first` must be 1.
 * NOTE(review): listing has gaps; body incomplete as shown. */
1038 rxi_TrimDataBufs(struct rx_packet *p, int first)
1041     struct iovec *iov, *end;
1045         osi_Panic("TrimDataBufs 1: first must be 1");
1047     /* Skip over continuation buffers containing message data */
1048     iov = &p->wirevec[2];
1049     end = iov + (p->niovecs - 2);
1050     length = p->length - p->wirevec[1].iov_len;
1051     for (; iov < end && length > 0; iov++) {
1053             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1054         length -= iov->iov_len;
1057     /* iov now points to the first empty data buffer. */
1062         MUTEX_ENTER(&rx_freePktQ_lock);
1064         for (; iov < end; iov++) {
1066                 osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1067             rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1070         rxi_PacketsUnWait();
1072         MUTEX_EXIT(&rx_freePktQ_lock);
1077 #endif /* RX_ENABLE_TSFPQ */
1079 /* Free the packet p. P is assumed not to be on any queue, i.e.
1080 * remove it yourself first if you call this routine. */
1081 #ifdef RX_ENABLE_TSFPQ
1083 rxi_FreePacket(struct rx_packet *p)
1085 rxi_FreeDataBufsTSFPQ(p, 2, 0);
1086 rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1088 #else /* RX_ENABLE_TSFPQ */
/* Non-TSFPQ free: under rx_freePktQ_lock, return p's continuation buffers
 * and then p itself to the global pool, waking any packet waiters.  p must
 * not be on any queue.  NOTE(review): listing has gaps; body incomplete. */
1090 rxi_FreePacket(struct rx_packet *p)
1095     MUTEX_ENTER(&rx_freePktQ_lock);
1097     rxi_FreeDataBufsNoLock(p, 2);
1098     rxi_FreePacketNoLock(p);
1099     /* Wakeup anyone waiting for packets */
1100     rxi_PacketsUnWait();
1102     MUTEX_EXIT(&rx_freePktQ_lock);
1105 #endif /* RX_ENABLE_TSFPQ */
1107 /* rxi_AllocPacket sets up p->length so it reflects the number of
1108 * bytes in the packet at this point, **not including** the header.
1109 * The header is absolutely necessary, besides, this is the way the
1110 * length field is usually used */
1111 #ifdef RX_ENABLE_TSFPQ
/* TSFPQ variant: allocate one packet, preferring the thread-local pool and
 * refilling it from the global pool (or growing the pool) when empty.  Per
 * the comment above, p->length reflects the payload size, not the header.
 * NOTE(review): listing has gaps — e.g. only one of the two IsEmpty branches
 * can be live; the surrounding #ifdef lines are missing from this view. */
1112 static struct rx_packet *
1113 rxi_AllocPacketNoLock(int class)
1115     struct rx_packet *p;
1116     struct rx_ts_info_t * rx_ts_info;
1118     RX_TS_INFO_GET(rx_ts_info);
1120     if (rx_stats_active)
1121         rx_atomic_inc(&rx_stats.packetRequests);
1122     if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1125         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1126             osi_Panic("rxi_AllocPacket error");
1128         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1129             rxi_MorePacketsNoLock(rx_maxSendWindow);
1133         RX_TS_FPQ_GTOL(rx_ts_info);
1136     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1138     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1141     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1142      * order to truncate outbound packets.  In the near future, may need
1143      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1145     RX_PACKET_IOV_FULLINIT(p);
1148 #else /* RX_ENABLE_TSFPQ */
/* Non-TSFPQ variant: fail with NULL (after bumping the per-class failure
 * stat and flagging a shortage) when the class is over quota; otherwise take
 * the first packet off the global free queue (panicking or growing the pool
 * when it is empty — only one branch is live, chosen by #ifdefs missing from
 * this listing).  Caller must hold rx_freePktQ_lock.
 * NOTE(review): listing has gaps; body incomplete as shown. */
1149 static struct rx_packet *
1150 rxi_AllocPacketNoLock(int class)
1152     struct rx_packet *p;
1155     if (rxi_OverQuota(class)) {
1156         rxi_NeedMorePackets = TRUE;
1157         if (rx_stats_active) {
1159             case RX_PACKET_CLASS_RECEIVE:
1160                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1162             case RX_PACKET_CLASS_SEND:
1163                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1165             case RX_PACKET_CLASS_SPECIAL:
1166                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1168             case RX_PACKET_CLASS_RECV_CBUF:
1169                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1171             case RX_PACKET_CLASS_SEND_CBUF:
1172                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1176         return (struct rx_packet *)0;
1180     if (rx_stats_active)
1181         rx_atomic_inc(&rx_stats.packetRequests);
1184     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1185         osi_Panic("rxi_AllocPacket error");
1187     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1188         rxi_MorePacketsNoLock(rx_maxSendWindow);
1192     p = opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
1193     opr_queue_Remove(&p->entry);
1194     RX_FPQ_MARK_USED(p);
1196     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1199     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1200      * order to truncate outbound packets.  In the near future, may need
1201      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1203     RX_PACKET_IOV_FULLINIT(p);
1206 #endif /* RX_ENABLE_TSFPQ */
1208 #ifdef RX_ENABLE_TSFPQ
/* Allocate one packet from the thread-local pool.  With pull_global set, an
 * empty local queue triggers a locked refill from the global pool; without
 * it, an empty local queue presumably returns NULL — TODO confirm (that
 * branch's body is missing from this listing).
 * NOTE(review): listing has gaps; body incomplete as shown. */
1209 static struct rx_packet *
1210 rxi_AllocPacketTSFPQ(int class, int pull_global)
1212     struct rx_packet *p;
1213     struct rx_ts_info_t * rx_ts_info;
1215     RX_TS_INFO_GET(rx_ts_info);
1217     if (rx_stats_active)
1218         rx_atomic_inc(&rx_stats.packetRequests);
1219     if (pull_global && opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1220         MUTEX_ENTER(&rx_freePktQ_lock);
1222         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1223             rxi_MorePacketsNoLock(rx_maxSendWindow);
1225         RX_TS_FPQ_GTOL(rx_ts_info);
1227         MUTEX_EXIT(&rx_freePktQ_lock);
1228     } else if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1232     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1234     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1236     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1237      * order to truncate outbound packets.  In the near future, may need
1238      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1240     RX_PACKET_IOV_FULLINIT(p);
1243 #endif /* RX_ENABLE_TSFPQ */
1245 #ifdef RX_ENABLE_TSFPQ
/* Public TSFPQ allocator: thin wrapper that always allows a pull from the
 * global pool.  NOTE(review): listing has gaps; return statement missing
 * from this view. */
1247 rxi_AllocPacket(int class)
1249     struct rx_packet *p;
1251     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1254 #else /* RX_ENABLE_TSFPQ */
/* Public non-TSFPQ allocator: take rx_freePktQ_lock around the NoLock
 * worker.  NOTE(review): listing has gaps; return statement missing from
 * this view. */
1256 rxi_AllocPacket(int class)
1258     struct rx_packet *p;
1260     MUTEX_ENTER(&rx_freePktQ_lock);
1261     p = rxi_AllocPacketNoLock(class);
1262     MUTEX_EXIT(&rx_freePktQ_lock);
1265 #endif /* RX_ENABLE_TSFPQ */
1267 /* This guy comes up with as many buffers as it {takes,can get} given
1268 * the MTU for this call. It also sets the packet length before
1269 * returning. caution: this is often called at NETPRI
1270 * Called with call locked.
/* Allocate a send packet sized for this call: want is clamped to the MTU
 * minus header and security overhead (mud), continuation buffers are added
 * as needed, and any excess length is trimmed back.  If no packet is
 * available, the caller sleeps on rx_waitingForPackets_cv (dropping the call
 * lock around the wait) until packets are freed, per the comment above this
 * routine — often called at NETPRI, with the call locked.
 * NOTE(review): listing has gaps; body incomplete as shown. */
1273 rxi_AllocSendPacket(struct rx_call *call, int want)
1275     struct rx_packet *p = (struct rx_packet *)0;
1280     mud = call->MTU - RX_HEADER_SIZE;
1282         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1283         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1285 #ifdef RX_ENABLE_TSFPQ
1286     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1288         want = MIN(want, mud);
1290         if ((unsigned)want > p->length)
1291             (void)rxi_AllocDataBuf(p, (want - p->length),
1292                                    RX_PACKET_CLASS_SEND_CBUF);
1294         if (p->length > mud)
1297         if (delta >= p->length) {
1305 #endif /* RX_ENABLE_TSFPQ */
1307     while (!(call->error)) {
1308         MUTEX_ENTER(&rx_freePktQ_lock);
1309         /* if an error occurred, or we get the packet we want, we're done */
1310         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1311             MUTEX_EXIT(&rx_freePktQ_lock);
1314             want = MIN(want, mud);
1316             if ((unsigned)want > p->length)
1317                 (void)rxi_AllocDataBuf(p, (want - p->length),
1318                                        RX_PACKET_CLASS_SEND_CBUF);
1320             if (p->length > mud)
1323             if (delta >= p->length) {
1332         /* no error occurred, and we didn't get a packet, so we sleep.
1333          * At this point, we assume that packets will be returned
1334          * sooner or later, as packets are acknowledged, and so we
1337         call->flags |= RX_CALL_WAIT_PACKETS;
1338         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1339         MUTEX_EXIT(&call->lock);
1340         rx_waitingForPackets = 1;
1342 #ifdef RX_ENABLE_LOCKS
1343         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1345         osi_rxSleep(&rx_waitingForPackets);
1347         MUTEX_EXIT(&rx_freePktQ_lock);
1348         MUTEX_ENTER(&call->lock);
1349         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1350         call->flags &= ~RX_CALL_WAIT_PACKETS;
1359 /* Windows does not use file descriptors. */
1360 #define CountFDs(amax) 0
1362 /* count the number of used FDs */
1371 for (i = 0; i < amax; i++) {
1372 code = fstat(i, &tstat);
1378 #endif /* AFS_NT40_ENV */
1381 #define CountFDs(amax) amax
1385 #if !defined(KERNEL) || defined(UKERNEL)
1387 /* This function reads a single packet from the interface into the
1388 * supplied packet buffer (*p). Return 0 if the packet is bogus. The
1389 * (host,port) of the sender are stored in the supplied variables, and
1390 * the data length of the packet is stored in the packet structure.
1391 * The header is decoded. */
/* Read one datagram from `socket` into p via recvmsg over the packet's
 * iovecs, store the sender in *host/*port (network byte order, per the
 * sockaddr_in fields used), decode the Rx header and set the data size.
 * Returns 0 for a bogus/dropped packet (per the comment above).  The last
 * iovec is temporarily extended by RX_EXTRABUFFERSIZE so an oversized
 * datagram cannot overrun — Rx's wire header carries no length field.
 * NOTE(review): listing has gaps; body incomplete as shown. */
1393 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1396     struct sockaddr_in from;
1399     afs_uint32 tlen, savelen;
1401     rx_computelen(p, tlen);
1402     rx_SetDataSize(p, tlen); /* this is the size of the user data area */
1404     tlen += RX_HEADER_SIZE; /* now this is the size of the entire packet */
1405     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1406                                  * it once in order to avoid races.  */
1409         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1417     /* Extend the last iovec for padding, it's just to make sure that the
1418      * read doesn't return more data than we expect, and is done to get around
1419      * our problems caused by the lack of a length field in the rx header.
1420      * Use the extra buffer that follows the localdata in each packet
1422     savelen = p->wirevec[p->niovecs - 1].iov_len;
1423     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1425     memset(&msg, 0, sizeof(msg));
1426     msg.msg_name = (char *)&from;
1427     msg.msg_namelen = sizeof(struct sockaddr_in);
1428     msg.msg_iov = p->wirevec;
1429     msg.msg_iovlen = p->niovecs;
1430     nbytes = rxi_Recvmsg(socket, &msg, 0);
1432     /* restore the vec to its correct state */
1433     p->wirevec[p->niovecs - 1].iov_len = savelen;
1435     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1436     if (nbytes < 0 || (nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
1437         if (nbytes < 0 && errno == EWOULDBLOCK) {
1438             if (rx_stats_active)
1439                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1440         } else if (nbytes <= 0) {
1441             if (rx_stats_active) {
1442                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1443                 rx_stats.bogusHost = from.sin_addr.s_addr;
1445             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1446                  ntohs(from.sin_port), nbytes));
1451     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1452              && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1453         rxi_DecodePacketHeader(p);
1455         *host = from.sin_addr.s_addr;
1456         *port = from.sin_port;
1458         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1459              p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1460              p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1462 #ifdef RX_TRIMDATABUFS
1463         rxi_TrimDataBufs(p, 1);
1469         /* Extract packet header. */
1470         rxi_DecodePacketHeader(p);
1472         *host = from.sin_addr.s_addr;
1473         *port = from.sin_port;
1475             && p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1477             rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1480 #ifdef RX_TRIMDATABUFS
1481         /* Free any empty packet buffers at the end of this packet */
1482         rxi_TrimDataBufs(p, 1);
1488 #endif /* !KERNEL || UKERNEL */
/*
 * rxi_SplitJumboPacket -- peel the first fixed-size packet off the front of
 * a received jumbogram, returning the remainder as a second rx_packet that
 * shares the original's continuation buffers.
 *
 * NOTE(review): the stale line-number prefixes below are non-contiguous, so
 * some statements (e.g. the final return of `np`) are elided from this view;
 * visible code is kept byte-identical.
 */
1490 /* This function splits off the first packet in a jumbo packet.
1491 * As of AFS 3.5, jumbograms contain more than one fixed size
1492 * packet, and the RX_JUMBO_PACKET flag is set in all but the
1493 * last packet header. All packets (except the last) are padded to
1494 * fall on RX_CBUFFERSIZE boundaries.
1495 * HACK: We store the length of the first n-1 packets in the
1496 * last two pad bytes. */
1499 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1502 struct rx_packet *np;
1503 struct rx_jumboHeader *jp;
1509 /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1510 * bytes in length. All but the first packet are preceded by
1511 * an abbreviated four byte header. The length of the last packet
1512 * is calculated from the size of the jumbogram. */
1513 length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
/* Sanity checks: the jumbogram must hold at least one full sub-packet and
 * have a continuation buffer for the remainder. */
1515 if ((int)p->length < length) {
1516 dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1519 niov = p->niovecs - 2;
1521 dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
/* The second continuation buffer becomes the head of the new packet;
 * RX_CBUF_TO_PACKET recovers the owning rx_packet from the buffer address. */
1524 iov = &p->wirevec[2];
1525 np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1527 /* Get a pointer to the abbreviated packet header */
1528 jp = (struct rx_jumboHeader *)
1529 ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1531 /* Set up the iovecs for the next packet */
1532 np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1533 np->wirevec[0].iov_len = sizeof(struct rx_header);
1534 np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1535 np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1536 np->niovecs = niov + 1;
1537 for (i = 2, iov++; i <= niov; i++, iov++) {
1538 np->wirevec[i] = *iov;
/* Split the byte counts: the remainder keeps everything past the first
 * sub-packet; the original is truncated to exactly one jumbo buffer. */
1540 np->length = p->length - length;
1541 p->length = RX_JUMBOBUFFERSIZE;
1544 /* Convert the jumbo packet header to host byte order */
1545 temp = ntohl(*(afs_uint32 *) jp);
1546 jp->flags = (u_char) (temp >> 24);
1547 jp->cksum = (u_short) (temp);
1549 /* Fill in the packet header */
/* The trailing sub-packet inherits the wire header with serial and
 * sequence bumped by one, per the jumbogram numbering convention. */
1550 np->header = p->header;
1551 np->header.serial = p->header.serial + 1;
1552 np->header.seq = p->header.seq + 1;
1553 np->header.flags = jp->flags;
1554 np->header.spare = jp->cksum;
/*
 * osi_NetSend -- user-space transmit path: gather `nvecs` iovecs into one
 * UDP datagram addressed by `addr` (a sockaddr_in) via rxi_Sendmsg.
 *
 * NOTE(review): elided lines here presumably include the msg.msg_iov
 * assignment and the return of rxi_Sendmsg's result -- confirm upstream.
 */
1560 /* Send a udp datagram */
1562 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1563 int length, int istack)
1568 memset(&msg, 0, sizeof(msg));
1570 msg.msg_iovlen = nvecs;
1571 msg.msg_name = addr;
1572 msg.msg_namelen = sizeof(struct sockaddr_in);
1574 ret = rxi_Sendmsg(socket, &msg, 0);
1578 #elif !defined(UKERNEL)
1580 * message receipt is done in rxk_input or rx_put.
1583 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
/*
 * cpytoc -- STREAMS helper (Solaris/HP-UX): linearize an mblk chain into
 * the flat buffer `cp`, walking b_cont links and copying M_DATA blocks.
 *
 * NOTE(review): the per-iteration advance of `cp`/`len` is elided from this
 * view (non-contiguous numbering); visible code kept byte-identical.
 */
1585 * Copy an mblock to the contiguous area pointed to by cp.
1586 * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1587 * but it doesn't really.
1588 * Returns the number of bytes not transferred.
1589 * The message is NOT changed.
1592 cpytoc(mblk_t * mp, int off, int len, char *cp)
1596 for (; mp && len > 0; mp = mp->b_cont) {
/* Only M_DATA blocks carry payload; anything else aborts the copy. */
1597 if (mp->b_datap->db_type != M_DATA) {
1600 n = MIN(len, (mp->b_wptr - mp->b_rptr));
1601 memcpy(cp, (char *)mp->b_rptr, n);
/*
 * cpytoiovec -- STREAMS helper: scatter an mblk chain into an iovec array.
 * `i` indexes the current iovec (starting before the first), `t` tracks the
 * bytes remaining in it, and `o` is presumably the write offset within it.
 *
 * NOTE(review): the inner loop that advances i/o/t and decrements n is
 * largely elided here; visible code kept byte-identical.
 */
1609 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1610 * but it doesn't really.
1611 * This sucks, anyway, do it like m_cpy.... below
1614 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1619 for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1620 if (mp->b_datap->db_type != M_DATA) {
1623 n = MIN(len, (mp->b_wptr - mp->b_rptr));
/* When the current iovec is exhausted (t == 0), step to the next one. */
1629 t = iovs[i].iov_len;
1632 memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1642 #define m_cpytoc(a, b, c, d) cpytoc(a, b, c, d)
1643 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1645 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
/*
 * m_cpytoiovec -- BSD mbuf variant: copy `len` bytes starting `off` bytes
 * into mbuf chain `m` out to the iovec array. p1/l1 track the source mbuf
 * cursor, p2/l2 the destination iovec cursor; each step moves
 * MIN(l1, l2, len) bytes.
 *
 * NOTE(review): the off-skipping loop and the cursor-advance statements are
 * elided from this view; visible code kept byte-identical.
 */
1647 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1650 unsigned int l1, l2, i, t;
1652 if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1653 osi_Panic("m_cpytoiovec"); /* MTUXXX probably don't need this check */
/* Skip whole mbufs until `off` lands inside the current one. */
1656 if (m->m_len <= off) {
1666 p1 = mtod(m, caddr_t) + off;
1667 l1 = m->m_len - off;
1669 p2 = iovs[0].iov_base;
1670 l2 = iovs[0].iov_len;
1673 t = MIN(l1, MIN(l2, (unsigned int)len));
/* Source mbuf exhausted: advance to the next mbuf in the chain. */
1684 p1 = mtod(m, caddr_t);
/* Destination iovec exhausted: advance to the next iovec. */
1690 p2 = iovs[i].iov_base;
1691 l2 = iovs[i].iov_len;
1699 #endif /* AFS_SUN5_ENV */
1701 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1702 #if defined(AFS_NBSD_ENV)
/*
 * rx_mb_to_packet -- copy a received kernel network buffer (mbuf chain, or
 * mblk chain on Solaris/HP-UX) into an rx_packet's wire iovecs, then release
 * the buffer via the caller-supplied `free` routine (elided from this view).
 * ANSI prototype for NetBSD; K&R declaration otherwise.
 */
1704 rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
1707 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1708 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1714 struct rx_packet *phandle;
1715 int hdr_len, data_len;
1716 #endif /* AFS_NBSD_ENV */
/* Skip hdr_len bytes of link/UDP header, scatter data_len payload bytes. */
1721 m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1728 #endif /*KERNEL && !UKERNEL */
/*
 * rxi_ReceiveDebugPacket -- answer an incoming RX_PACKET_TYPE_DEBUG request
 * by filling the same packet `ap` with the requested statistics and sending
 * it back to (ahost, aport) via rxi_SendDebugPacket. The request selector
 * arrives as a struct rx_debugIn at offset 0; tin.type picks the case and
 * tin.index selects which connection/peer to report.
 *
 * NOTE(review): the stale numbering below is non-contiguous -- version
 * checks, braces, `break`s and several `#endif`s are elided from this view;
 * visible code is kept byte-identical.
 */
1731 /* send a response to a debug packet */
1734 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1735 afs_uint32 ahost, short aport, int istack)
1737 struct rx_debugIn tin;
1741 * Only respond to client-initiated Rx debug packets,
1742 * and clear the client flag in the response.
1744 if (ap->header.flags & RX_CLIENT_INITIATED) {
1745 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1746 rxi_EncodePacketHeader(ap);
1751 rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1752 /* all done with packet, now set length to the truth, so we can
1753 * reuse this packet */
1754 rx_computelen(ap, ap->length);
1756 tin.type = ntohl(tin.type);
1757 tin.index = ntohl(tin.index);
/* ---- basic server statistics ---- */
1759 case RX_DEBUGI_GETSTATS:{
1760 struct rx_debugStats tstat;
1762 /* get basic stats */
1763 memset(&tstat, 0, sizeof(tstat)); /* make sure spares are zero */
1764 tstat.version = RX_DEBUGI_VERSION;
1765 #ifndef RX_ENABLE_LOCKS
1766 tstat.waitingForPackets = rx_waitingForPackets;
1768 MUTEX_ENTER(&rx_serverPool_lock);
1769 tstat.nFreePackets = htonl(rx_nFreePackets);
1770 tstat.nPackets = htonl(rx_nPackets);
1771 tstat.callsExecuted = htonl(rxi_nCalls);
1772 tstat.packetReclaims = htonl(rx_packetReclaims);
1773 tstat.usedFDs = CountFDs(64);
1774 tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1775 tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1776 tstat.idleThreads = opr_queue_Count(&rx_idleServerQueue);
1777 MUTEX_EXIT(&rx_serverPool_lock);
1778 tstat.idleThreads = htonl(tstat.idleThreads);
/* Grow the reply packet if the stats struct is bigger than it. */
1779 tl = sizeof(struct rx_debugStats) - ap->length;
1781 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1784 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1786 ap->length = sizeof(struct rx_debugStats);
1787 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1788 rx_computelen(ap, ap->length);
/* ---- per-connection report (one connection per request) ---- */
1793 case RX_DEBUGI_GETALLCONN:
1794 case RX_DEBUGI_GETCONN:{
1796 struct rx_connection *tc;
1797 struct rx_call *tcall;
1798 struct rx_debugConn tconn;
1799 int all = (tin.type == RX_DEBUGI_GETALLCONN);
1802 tl = sizeof(struct rx_debugConn) - ap->length;
1804 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1808 memset(&tconn, 0, sizeof(tconn)); /* make sure spares are zero */
1809 /* get N'th (maybe) "interesting" connection info */
1810 for (i = 0; i < rx_hashTableSize; i++) {
1811 #if !defined(KERNEL)
1812 /* the time complexity of the algorithm used here
1813 * exponentially increses with the number of connections.
1815 #ifdef AFS_PTHREAD_ENV
1821 MUTEX_ENTER(&rx_connHashTable_lock);
1822 /* We might be slightly out of step since we are not
1823 * locking each call, but this is only debugging output.
1825 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1826 if ((all || rxi_IsConnInteresting(tc))
1827 && tin.index-- <= 0) {
1828 tconn.host = tc->peer->host;
1829 tconn.port = tc->peer->port;
1830 tconn.cid = htonl(tc->cid);
1831 tconn.epoch = htonl(tc->epoch);
1832 tconn.serial = htonl(tc->serial);
1833 for (j = 0; j < RX_MAXCALLS; j++) {
1834 tconn.callNumber[j] = htonl(tc->callNumber[j]);
1835 if ((tcall = tc->call[j])) {
1836 tconn.callState[j] = tcall->state;
1837 tconn.callMode[j] = tcall->app.mode;
1838 tconn.callFlags[j] = tcall->flags;
1839 if (!opr_queue_IsEmpty(&tcall->rq))
1840 tconn.callOther[j] |= RX_OTHER_IN;
1841 if (!opr_queue_IsEmpty(&tcall->tq))
1842 tconn.callOther[j] |= RX_OTHER_OUT;
1844 tconn.callState[j] = RX_STATE_NOTINIT;
1847 tconn.natMTU = htonl(tc->peer->natMTU);
1848 tconn.error = htonl(tc->error);
1849 tconn.flags = tc->flags;
1850 tconn.type = tc->type;
1851 tconn.securityIndex = tc->securityIndex;
1852 if (tc->securityObject) {
1853 RXS_GetStats(tc->securityObject, tc,
/* Byte-swap helpers for the embedded security statistics. */
1855 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1856 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1859 DOHTONL(packetsReceived);
1860 DOHTONL(packetsSent);
1861 DOHTONL(bytesReceived);
1865 sizeof(tconn.secStats.spares) /
1870 sizeof(tconn.secStats.sparel) /
1871 sizeof(afs_int32); i++)
1875 MUTEX_EXIT(&rx_connHashTable_lock);
1876 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1879 ap->length = sizeof(struct rx_debugConn);
1880 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1886 MUTEX_EXIT(&rx_connHashTable_lock);
1888 /* if we make it here, there are no interesting packets */
1889 tconn.cid = htonl(0xffffffff); /* means end */
1890 rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1893 ap->length = sizeof(struct rx_debugConn);
1894 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/* ---- per-peer report (one peer per request) ---- */
1900 * Pass back all the peer structures we have available
1903 case RX_DEBUGI_GETPEER:{
1906 struct rx_debugPeer tpeer;
1909 tl = sizeof(struct rx_debugPeer) - ap->length;
1911 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1915 memset(&tpeer, 0, sizeof(tpeer));
1916 for (i = 0; i < rx_hashTableSize; i++) {
1917 #if !defined(KERNEL)
1918 /* the time complexity of the algorithm used here
1919 * exponentially increses with the number of peers.
1921 * Yielding after processing each hash table entry
1922 * and dropping rx_peerHashTable_lock.
1923 * also increases the risk that we will miss a new
1924 * entry - but we are willing to live with this
1925 * limitation since this is meant for debugging only
1927 #ifdef AFS_PTHREAD_ENV
1933 MUTEX_ENTER(&rx_peerHashTable_lock);
1934 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1935 if (tin.index-- <= 0) {
1937 MUTEX_EXIT(&rx_peerHashTable_lock);
1939 MUTEX_ENTER(&tp->peer_lock);
1940 tpeer.host = tp->host;
1941 tpeer.port = tp->port;
1942 tpeer.ifMTU = htons(tp->ifMTU);
1943 tpeer.idleWhen = htonl(tp->idleWhen);
1944 tpeer.refCount = htons(tp->refCount);
1945 tpeer.burstSize = 0;
1947 tpeer.burstWait.sec = 0;
1948 tpeer.burstWait.usec = 0;
1949 tpeer.rtt = htonl(tp->rtt);
1950 tpeer.rtt_dev = htonl(tp->rtt_dev);
1951 tpeer.nSent = htonl(tp->nSent);
1952 tpeer.reSends = htonl(tp->reSends);
1953 tpeer.natMTU = htons(tp->natMTU);
1954 tpeer.maxMTU = htons(tp->maxMTU);
1955 tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1956 tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1957 tpeer.MTU = htons(tp->MTU);
1958 tpeer.cwind = htons(tp->cwind);
1959 tpeer.nDgramPackets = htons(tp->nDgramPackets);
1960 tpeer.congestSeq = htons(tp->congestSeq);
/* 64-bit byte counters are sent as explicit high/low 32-bit halves. */
1961 tpeer.bytesSent.high =
1962 htonl(tp->bytesSent >> 32);
1963 tpeer.bytesSent.low =
1964 htonl(tp->bytesSent & MAX_AFS_UINT32);
1965 tpeer.bytesReceived.high =
1966 htonl(tp->bytesReceived >> 32);
1967 tpeer.bytesReceived.low =
1968 htonl(tp->bytesReceived & MAX_AFS_UINT32);
1969 MUTEX_EXIT(&tp->peer_lock);
1971 MUTEX_ENTER(&rx_peerHashTable_lock);
1973 MUTEX_EXIT(&rx_peerHashTable_lock);
1975 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1978 ap->length = sizeof(struct rx_debugPeer);
1979 rxi_SendDebugPacket(ap, asocket, ahost, aport,
1985 MUTEX_EXIT(&rx_peerHashTable_lock);
1987 /* if we make it here, there are no interesting packets */
1988 tpeer.host = htonl(0xffffffff); /* means end */
1989 rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1992 ap->length = sizeof(struct rx_debugPeer);
1993 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/* ---- raw rx_stats dump ---- */
1998 case RX_DEBUGI_RXSTATS:{
2002 tl = sizeof(rx_stats) - ap->length;
2004 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2008 /* Since its all int32s convert to network order with a loop. */
2009 if (rx_stats_active)
2010 MUTEX_ENTER(&rx_stats_mutex);
2011 s = (afs_int32 *) & rx_stats;
2012 for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2013 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2016 ap->length = sizeof(rx_stats);
2017 if (rx_stats_active)
2018 MUTEX_EXIT(&rx_stats_mutex);
2019 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/* ---- unknown request type: echo back an error marker ---- */
2025 /* error response packet */
2026 tin.type = htonl(RX_DEBUGI_BADTYPE);
2027 tin.index = tin.type;
2028 rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2030 ap->length = sizeof(struct rx_debugIn);
2031 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/*
 * rxi_ReceiveVersionPacket -- answer a version query by writing a 65-byte,
 * NUL-padded version string (cml_version_number minus its 4-char prefix)
 * into the request packet and echoing it back to the sender.
 *
 * NOTE(review): declarations and the packet-length save/restore around the
 * send appear to be elided from this view (non-contiguous numbering).
 */
2039 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2040 afs_uint32 ahost, short aport, int istack)
2045 * Only respond to client-initiated version requests, and
2046 * clear that flag in the response.
2048 if (ap->header.flags & RX_CLIENT_INITIATED) {
2051 ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2052 rxi_EncodePacketHeader(ap);
2053 memset(buf, 0, sizeof(buf));
/* strncpy here is safe: buf was pre-zeroed and the copy leaves the last
 * byte untouched, so the result is always NUL-terminated. */
2054 strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2055 rx_packetwrite(ap, 0, 65, buf);
2058 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
/*
 * rxi_SendDebugPacket -- transmit `apacket` back to (ahost, aport),
 * temporarily trimming the packet's iovec chain so exactly
 * length + RX_HEADER_SIZE bytes go on the wire, then restoring it.
 * Delivery is best-effort: osi_NetSend's result is deliberately ignored.
 *
 * NOTE(review): the `break`/`else` of the trim loop and the
 * KERNEL/RX_KERNEL_TRACE preprocessor wrappers are partially elided from
 * this view; visible code kept byte-identical.
 */
2066 /* send a debug packet back to the sender */
2068 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2069 afs_uint32 ahost, short aport, afs_int32 istack)
2071 struct sockaddr_in taddr;
2072 unsigned int i, nbytes, savelen = 0;
2075 int waslocked = ISAFS_GLOCK();
2078 taddr.sin_family = AF_INET;
2079 taddr.sin_port = aport;
2080 taddr.sin_addr.s_addr = ahost;
2081 memset(&taddr.sin_zero, 0, sizeof(taddr.sin_zero));
2082 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2083 taddr.sin_len = sizeof(struct sockaddr_in);
2086 /* We need to trim the niovecs. */
2087 nbytes = apacket->length;
2088 for (i = 1; i < apacket->niovecs; i++) {
2089 if (nbytes <= apacket->wirevec[i].iov_len) {
/* Remember the original iovec length and count so they can be
 * restored after the send. */
2090 savelen = apacket->wirevec[i].iov_len;
2091 saven = apacket->niovecs;
2092 apacket->wirevec[i].iov_len = nbytes;
2093 apacket->niovecs = i + 1; /* so condition fails because i == niovecs */
2095 nbytes -= apacket->wirevec[i].iov_len;
2098 #ifdef RX_KERNEL_TRACE
2099 if (ICL_SETACTIVE(afs_iclSetp)) {
2102 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2103 "before osi_NetSend()");
2111 /* debug packets are not reliably delivered, hence the cast below. */
2112 (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2113 apacket->length + RX_HEADER_SIZE, istack);
2115 #ifdef RX_KERNEL_TRACE
2116 if (ICL_SETACTIVE(afs_iclSetp)) {
2118 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2119 "after osi_NetSend()");
2128 if (saven) { /* means we truncated the packet above. */
2129 apacket->wirevec[i - 1].iov_len = savelen;
2130 apacket->niovecs = saven;
/*
 * rxi_NetSendError -- react to a synchronous send failure. On platforms
 * that report host/network unreachability immediately (Windows, Linux,
 * Darwin), mark the call's peer as unresponsive by zeroing
 * call->lastReceiveTime so it times out promptly instead of later.
 *
 * NOTE(review): the `down` flag plumbing and the #ifdef AFS_NT40_ENV
 * opener are elided from this view; visible code kept byte-identical.
 */
2136 rxi_NetSendError(struct rx_call *call, int code)
2140 if (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) {
2143 if (code == -WSAEHOSTUNREACH) {
2146 #elif defined(AFS_LINUX20_ENV)
2147 if (code == -ENETUNREACH) {
2150 #elif defined(AFS_DARWIN_ENV)
2151 if (code == EHOSTUNREACH) {
2156 call->lastReceiveTime = 0;
/*
 * rxi_SendPacket -- encode `p`'s header in network byte order, stamp a
 * fresh connection serial number, and transmit the packet to conn->peer
 * over the appropriate socket. Under RXDEBUG the packet may be
 * intentionally dropped for testing; send failures bump stats and clear
 * RX_PKTFLAG_SENT so the packet is retransmitted soon.
 *
 * NOTE(review): numbering below is non-contiguous -- several braces,
 * `#ifdef KERNEL`/RXDEBUG wrappers and the `if ((code = ...` opener are
 * elided from this view; visible code kept byte-identical.
 */
2160 /* Send the packet to appropriate destination for the specified
2161 * call. The header is first encoded and placed in the packet.
2164 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2165 struct rx_packet *p, int istack)
2171 struct sockaddr_in addr;
2172 struct rx_peer *peer = conn->peer;
2175 char deliveryType = 'S';
2177 /* The address we're sending the packet to */
2178 memset(&addr, 0, sizeof(addr));
2179 addr.sin_family = AF_INET;
2180 addr.sin_port = peer->port;
2181 addr.sin_addr.s_addr = peer->host;
2182 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2184 /* This stuff should be revamped, I think, so that most, if not
2185 * all, of the header stuff is always added here. We could
2186 * probably do away with the encode/decode routines. XXXXX */
2188 /* Stamp each packet with a unique serial number. The serial
2189 * number is maintained on a connection basis because some types
2190 * of security may be based on the serial number of the packet,
2191 * and security is handled on a per authenticated-connection
2193 /* Pre-increment, to guarantee no zero serial number; a zero
2194 * serial number means the packet was never sent. */
2195 MUTEX_ENTER(&conn->conn_data_lock);
2196 p->header.serial = ++conn->serial;
/* Track the largest ping / data packet sizes for path-MTU discovery. */
2197 if (p->length > conn->peer->maxPacketSize) {
2198 if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2199 (p->header.flags & RX_REQUEST_ACK)) {
2200 conn->lastPingSize = p->length;
2201 conn->lastPingSizeSer = p->header.serial;
2202 } else if (p->header.seq != 0) {
2203 conn->lastPacketSize = p->length;
2204 conn->lastPacketSizeSeq = p->header.seq;
2207 MUTEX_EXIT(&conn->conn_data_lock);
2208 /* This is so we can adjust retransmit time-outs better in the face of
2209 * rapidly changing round-trip times. RTO estimation is not a la Karn.
2211 if (p->firstSerial == 0) {
2212 p->firstSerial = p->header.serial;
2215 /* If an output tracer function is defined, call it with the packet and
2216 * network address. Note this function may modify its arguments. */
2217 if (rx_almostSent) {
2218 int drop = (*rx_almostSent) (p, &addr);
2219 /* drop packet if return value is non-zero? */
2221 deliveryType = 'D'; /* Drop the packet */
2225 /* Get network byte order header */
2226 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
2227 * touch ALL the fields */
2229 /* Send the packet out on the same socket that related packets are being
2233 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2236 /* Possibly drop this packet, for testing purposes */
2237 if ((deliveryType == 'D')
2238 || ((rx_intentionallyDroppedPacketsPer100 > 0)
2239 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2240 deliveryType = 'D'; /* Drop the packet */
2242 deliveryType = 'S'; /* Send the packet */
2243 #endif /* RXDEBUG */
2245 /* Loop until the packet is sent. We'd prefer just to use a
2246 * blocking socket, but unfortunately the interface doesn't
2247 * allow us to have the socket block in send mode, and not
2248 * block in receive mode */
2250 waslocked = ISAFS_GLOCK();
2251 #ifdef RX_KERNEL_TRACE
2252 if (ICL_SETACTIVE(afs_iclSetp)) {
2255 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2256 "before osi_NetSend()");
2265 osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2266 p->length + RX_HEADER_SIZE, istack)) != 0) {
2267 /* send failed, so let's hurry up the resend, eh? */
2268 if (rx_stats_active)
2269 rx_atomic_inc(&rx_stats.netSendFailures);
2270 p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2272 /* Some systems are nice and tell us right away that we cannot
2273 * reach this recipient by returning an error code.
2274 * So, when this happens let's "down" the host NOW so
2275 * we don't sit around waiting for this host to timeout later.
2278 rxi_NetSendError(call, code);
2282 #ifdef RX_KERNEL_TRACE
2283 if (ICL_SETACTIVE(afs_iclSetp)) {
2285 afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2286 "after osi_NetSend()");
2297 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2298 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2299 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2300 p->header.seq, p->header.flags, p, p->length));
2302 if (rx_stats_active) {
2303 rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2304 MUTEX_ENTER(&peer->peer_lock);
2305 peer->bytesSent += p->length;
2306 MUTEX_EXIT(&peer->peer_lock);
/*
 * rxi_SendPacketList -- transmit `len` packets as one AFS 3.5 jumbogram.
 * A single iovec array is assembled: wirevec[0] is the first packet's RX
 * header, then one iovec per packet payload; all but the last payload is
 * padded to RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE and carries the
 * RX_JUMBO_PACKET flag plus a 4-byte abbreviated header for its successor.
 * Consecutive serial numbers are reserved for the batch in one lock hold.
 *
 * NOTE(review): numbering below is non-contiguous -- loop braces, the
 * `if (i < len - 1)` jumbo/last-packet split, and several preprocessor
 * wrappers are elided from this view; visible code kept byte-identical.
 */
2310 /* Send a list of packets to appropriate destination for the specified
2311 * connection. The headers are first encoded and placed in the packets.
2314 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2315 struct rx_packet **list, int len, int istack)
2317 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2320 struct sockaddr_in addr;
2321 struct rx_peer *peer = conn->peer;
2323 struct rx_packet *p = NULL;
2324 struct iovec wirevec[RX_MAXIOVECS];
2325 int i, length, code;
2328 struct rx_jumboHeader *jp;
2330 char deliveryType = 'S';
2332 /* The address we're sending the packet to */
2333 addr.sin_family = AF_INET;
2334 addr.sin_port = peer->port;
2335 addr.sin_addr.s_addr = peer->host;
2336 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2338 if (len + 1 > RX_MAXIOVECS) {
2339 osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2343 * Stamp the packets in this jumbogram with consecutive serial numbers
2345 MUTEX_ENTER(&conn->conn_data_lock);
2346 serial = conn->serial;
2347 conn->serial += len;
/* Path-MTU bookkeeping over the whole batch, mirroring rxi_SendPacket. */
2348 for (i = 0; i < len; i++) {
2350 /* a ping *or* a sequenced packet can count */
2351 if (p->length > conn->peer->maxPacketSize) {
2352 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2353 (p->header.flags & RX_REQUEST_ACK)) &&
2354 ((i == 0) || (p->length >= conn->lastPingSize))) {
2355 conn->lastPingSize = p->length;
2356 conn->lastPingSizeSer = serial + i;
2357 } else if ((p->header.seq != 0) &&
2358 ((i == 0) || (p->length >= conn->lastPacketSize))) {
2359 conn->lastPacketSize = p->length;
2360 conn->lastPacketSizeSeq = p->header.seq;
2364 MUTEX_EXIT(&conn->conn_data_lock);
2367 /* This stuff should be revamped, I think, so that most, if not
2368 * all, of the header stuff is always added here. We could
2369 * probably do away with the encode/decode routines. XXXXX */
2372 length = RX_HEADER_SIZE;
2373 wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2374 wirevec[0].iov_len = RX_HEADER_SIZE;
2375 for (i = 0; i < len; i++) {
2378 /* The whole 3.5 jumbogram scheme relies on packets fitting
2379 * in a single packet buffer. */
2380 if (p->niovecs > 2) {
2381 osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2384 /* Set the RX_JUMBO_PACKET flags in all but the last packets
2387 if (p->length != RX_JUMBOBUFFERSIZE) {
2388 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2390 p->header.flags |= RX_JUMBO_PACKET;
2391 length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2392 wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2394 wirevec[i + 1].iov_len = p->length;
2395 length += p->length;
2397 wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
/* `jp` points into the PREVIOUS packet's pad area; write this packet's
 * flags/spare there as the abbreviated jumbo header. */
2399 /* Convert jumbo packet header to network byte order */
2400 temp = (afs_uint32) (p->header.flags) << 24;
2401 temp |= (afs_uint32) (p->header.spare);
2402 *(afs_uint32 *) jp = htonl(temp);
2404 jp = (struct rx_jumboHeader *)
2405 ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2407 /* Stamp each packet with a unique serial number. The serial
2408 * number is maintained on a connection basis because some types
2409 * of security may be based on the serial number of the packet,
2410 * and security is handled on a per authenticated-connection
2412 /* Pre-increment, to guarantee no zero serial number; a zero
2413 * serial number means the packet was never sent. */
2414 p->header.serial = ++serial;
2415 /* This is so we can adjust retransmit time-outs better in the face of
2416 * rapidly changing round-trip times. RTO estimation is not a la Karn.
2418 if (p->firstSerial == 0) {
2419 p->firstSerial = p->header.serial;
2422 /* If an output tracer function is defined, call it with the packet and
2423 * network address. Note this function may modify its arguments. */
2424 if (rx_almostSent) {
2425 int drop = (*rx_almostSent) (p, &addr);
2426 /* drop packet if return value is non-zero? */
2428 deliveryType = 'D'; /* Drop the packet */
2432 /* Get network byte order header */
2433 rxi_EncodePacketHeader(p); /* XXX in the event of rexmit, etc, don't need to
2434 * touch ALL the fields */
2437 /* Send the packet out on the same socket that related packets are being
2441 RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2444 /* Possibly drop this packet, for testing purposes */
2445 if ((deliveryType == 'D')
2446 || ((rx_intentionallyDroppedPacketsPer100 > 0)
2447 && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2448 deliveryType = 'D'; /* Drop the packet */
2450 deliveryType = 'S'; /* Send the packet */
2451 #endif /* RXDEBUG */
2453 /* Loop until the packet is sent. We'd prefer just to use a
2454 * blocking socket, but unfortunately the interface doesn't
2455 * allow us to have the socket block in send mode, and not
2456 * block in receive mode */
2457 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2458 waslocked = ISAFS_GLOCK();
2459 if (!istack && waslocked)
2463 osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2465 /* send failed, so let's hurry up the resend, eh? */
2466 if (rx_stats_active)
2467 rx_atomic_inc(&rx_stats.netSendFailures);
2468 for (i = 0; i < len; i++) {
2470 p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2472 /* Some systems are nice and tell us right away that we cannot
2473 * reach this recipient by returning an error code.
2474 * So, when this happens let's "down" the host NOW so
2475 * we don't sit around waiting for this host to timeout later.
2478 rxi_NetSendError(call, code);
2481 #if defined(AFS_SUN5_ENV) && defined(KERNEL)
2482 if (!istack && waslocked)
2488 osi_Assert(p != NULL);
2490 dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2491 deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2492 ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2493 p->header.seq, p->header.flags, p, p->length));
2496 if (rx_stats_active) {
2497 rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2498 MUTEX_ENTER(&peer->peer_lock);
2499 peer->bytesSent += p->length;
2500 MUTEX_EXIT(&peer->peer_lock);
/*
 * rxi_SendRawAbort -- emit an ABORT packet without any call/connection
 * state, echoing identifying fields (epoch, cid, callNumber, serviceId,
 * securityIndex) from the offending `source` packet so the receiver can
 * match it, with the 32-bit error code as the payload.
 */
2504 /* Send a raw abort packet, without any call or connection structures */
2506 rxi_SendRawAbort(osi_socket socket, afs_uint32 host, u_short port,
2507 afs_int32 error, struct rx_packet *source, int istack)
2509 struct rx_header theader;
2510 struct sockaddr_in addr;
2511 struct iovec iov[2];
2513 memset(&theader, 0, sizeof(theader));
2514 theader.epoch = htonl(source->header.epoch);
2515 theader.callNumber = htonl(source->header.callNumber);
2516 theader.serial = htonl(1);
2517 theader.type = RX_PACKET_TYPE_ABORT;
2518 theader.serviceId = htons(source->header.serviceId);
2519 theader.securityIndex = source->header.securityIndex;
2520 theader.cid = htonl(source->header.cid);
2523 * If the abort is being sent in response to a server initiated packet,
2524 * set client_initiated in the abort to ensure it is not associated by
2525 * the receiver with a connection in the opposite direction.
2527 if ((source->header.flags & RX_CLIENT_INITIATED) != RX_CLIENT_INITIATED)
2528 theader.flags |= RX_CLIENT_INITIATED;
/* The error code travels on the wire in network byte order. */
2530 error = htonl(error);
2532 iov[0].iov_base = &theader;
2533 iov[0].iov_len = sizeof(struct rx_header);
2534 iov[1].iov_base = &error;
2535 iov[1].iov_len = sizeof(error);
2537 addr.sin_family = AF_INET;
2538 addr.sin_addr.s_addr = host;
2539 addr.sin_port = port;
2540 memset(&addr.sin_zero, 0, sizeof(addr.sin_zero));
2541 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2542 addr.sin_len = sizeof(struct sockaddr_in);
/* Best-effort: aborts are not retransmitted, result intentionally unused. */
2545 osi_NetSend(socket, &addr, iov, 2,
2546 sizeof(struct rx_header) + sizeof(error), istack);
/*
 * rxi_SendSpecial -- build and send a control ("special") packet of the
 * given `type`, directed at a specific call channel when `call` is non-null
 * or at the connection otherwise. Reuses `optionalPacket` when supplied
 * (returned to the caller); otherwise allocates and frees a fresh packet
 * and returns NULL.
 *
 * NOTE(review): numbering below is non-contiguous -- the call==NULL
 * branch, the nbytes/p->length negotiation, and the trim-loop break are
 * elided from this view; visible code kept byte-identical.
 */
2549 /* Send a "special" packet to the peer connection. If call is
2550 * specified, then the packet is directed to a specific call channel
2551 * associated with the connection, otherwise it is directed to the
2552 * connection only. Uses optionalPacket if it is supplied, rather than
2553 * allocating a new packet buffer. Nbytes is the length of the data
2554 * portion of the packet. If data is non-null, nbytes of data are
2555 * copied into the packet. Type is the type of the packet, as defined
2556 * in rx.h. Bug: there's a lot of duplication between this and other
2557 * routines. This needs to be cleaned up. */
2559 rxi_SendSpecial(struct rx_call *call,
2560 struct rx_connection *conn,
2561 struct rx_packet *optionalPacket, int type, char *data,
2562 int nbytes, int istack)
2564 /* Some of the following stuff should be common code for all
2565 * packet sends (it's repeated elsewhere) */
2566 struct rx_packet *p;
2568 int savelen = 0, saven = 0;
2569 int channel, callNumber;
2571 channel = call->channel;
2572 callNumber = *call->callNumber;
2573 /* BUSY packets refer to the next call on this connection */
2574 if (type == RX_PACKET_TYPE_BUSY) {
2583 p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2585 osi_Panic("rxi_SendSpecial failure");
/* Stamp the outgoing header from the connection/call identity. */
2592 p->header.serviceId = conn->serviceId;
2593 p->header.securityIndex = conn->securityIndex;
2594 p->header.cid = (conn->cid | channel);
2595 p->header.callNumber = callNumber;
2597 p->header.epoch = conn->epoch;
2598 p->header.type = type;
2599 p->header.flags = 0;
2600 if (conn->type == RX_CLIENT_CONNECTION)
2601 p->header.flags |= RX_CLIENT_INITIATED;
2603 rx_packetwrite(p, 0, nbytes, data);
/* Trim the iovec chain to exactly nbytes of payload, remembering the
 * pre-trim values so they can be restored after the send. */
2605 for (i = 1; i < p->niovecs; i++) {
2606 if (nbytes <= p->wirevec[i].iov_len) {
2607 savelen = p->wirevec[i].iov_len;
2609 p->wirevec[i].iov_len = nbytes;
2610 p->niovecs = i + 1; /* so condition fails because i == niovecs */
2612 nbytes -= p->wirevec[i].iov_len;
2616 rxi_Send(call, p, istack);
2618 rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2619 if (saven) { /* means we truncated the packet above. We probably don't */
2620 /* really need to do this, but it seems safer this way, given that */
2621 /* sneaky optionalPacket... */
2622 p->wirevec[i - 1].iov_len = savelen;
2625 if (!optionalPacket)
2627 return optionalPacket;
2631 /* Encode the packet's header (from the struct header in the packet to
2632 * the net byte order representation in the wire representation of the
2633 * packet, which is what is actually sent out on the wire) */
2635 rxi_EncodePacketHeader(struct rx_packet *p)
2637 afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
2639 memset(buf, 0, RX_HEADER_SIZE);
2640 *buf++ = htonl(p->header.epoch);
2641 *buf++ = htonl(p->header.cid);
2642 *buf++ = htonl(p->header.callNumber);
2643 *buf++ = htonl(p->header.seq);
2644 *buf++ = htonl(p->header.serial);
2645 *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2646 | (((afs_uint32) p->header.flags) << 16)
2647 | (p->header.userStatus << 8) | p->header.securityIndex);
2648 /* Note: top 16 bits of this next word were reserved */
2649 *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2652 /* Decode the packet's header (from net byte order to a struct header) */
2654 rxi_DecodePacketHeader(struct rx_packet *p)
2656 afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base); /* MTUXXX */
2659 p->header.epoch = ntohl(*buf);
2661 p->header.cid = ntohl(*buf);
2663 p->header.callNumber = ntohl(*buf);
2665 p->header.seq = ntohl(*buf);
2667 p->header.serial = ntohl(*buf);
2673 /* C will truncate byte fields to bytes for me */
2674 p->header.type = temp >> 24;
2675 p->header.flags = temp >> 16;
2676 p->header.userStatus = temp >> 8;
2677 p->header.securityIndex = temp >> 0;
2682 p->header.serviceId = (temp & 0xffff);
2683 p->header.spare = temp >> 16;
2684 /* Note: top 16 bits of this last word are the security checksum */
2688 * LOCKS HELD: called with call->lock held.
2690 * PrepareSendPacket is the only place in the code that
2691 * can increment call->tnext. This could become an atomic
2692 * in the future. Beyond that there is nothing in this
2693 * function that requires the call being locked. This
2694 * function can only be called by the application thread.
2697 rxi_PrepareSendPacket(struct rx_call *call,
2698 struct rx_packet *p, int last)
2700 struct rx_connection *conn = call->conn;
2701 afs_uint32 seq = call->tnext++;
2703 afs_int32 len; /* len must be a signed type; it can go negative */
2706 /* No data packets on call 0. Where do these come from? */
2707 if (*call->callNumber == 0)
2708 *call->callNumber = 1;
/* call->lock is dropped while the wire header is stamped; it is
 * re-acquired below, just before the security object is invoked. */
2710 MUTEX_EXIT(&call->lock);
/* Clear per-send state; the packet is neither acked nor sent yet. */
2711 p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
/* Stamp the Rx wire header from connection and call state. */
2713 p->header.cid = (conn->cid | call->channel);
2714 p->header.serviceId = conn->serviceId;
2715 p->header.securityIndex = conn->securityIndex;
2717 p->header.callNumber = *call->callNumber;
2718 p->header.seq = seq;
2719 p->header.epoch = conn->epoch;
2720 p->header.type = RX_PACKET_TYPE_DATA;
2721 p->header.flags = 0;
2722 p->header.spare = 0;
2723 if (conn->type == RX_CLIENT_CONNECTION)
2724 p->header.flags |= RX_CLIENT_INITIATED;
/* 'last' marks the final data packet of this call's send stream. */
2727 p->header.flags |= RX_LAST_PACKET;
2729 clock_Zero(&p->firstSent); /* Never yet transmitted */
2730 p->header.serial = 0; /* Another way of saying never transmitted... */
2732 /* Now that we're sure this is the last data on the call, make sure
2733 * that the "length" and the sum of the iov_lens matches. */
2734 len = p->length + call->conn->securityHeaderSize;
/* Walk the data iovecs consuming 'len'; the loop exits with index i
 * one past the last buffer that carries data (len <= 0), or with
 * len > 0 if the iovecs cannot cover the packet length. */
2736 for (i = 1; i < p->niovecs && len > 0; i++) {
2737 len -= p->wirevec[i].iov_len;
/* len still positive here means the iovecs cannot hold the declared
 * packet length -- unrecoverable internal inconsistency. */
2740 osi_Panic("PrepareSendPacket 1\n"); /* MTUXXX */
2741 } else if (i < p->niovecs) {
2742 /* Free any extra elements in the wirevec */
2743 #if defined(RX_ENABLE_TSFPQ)
2744 rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2745 #else /* !RX_ENABLE_TSFPQ */
2746 MUTEX_ENTER(&rx_freePktQ_lock);
2747 rxi_FreeDataBufsNoLock(p, i);
2748 MUTEX_EXIT(&rx_freePktQ_lock);
2749 #endif /* !RX_ENABLE_TSFPQ */
/* len is <= 0 here: shrink the final iovec so the iov_lens sum
 * exactly to the packet length plus the security header. */
2754 p->wirevec[i - 1].iov_len += len;
2755 MUTEX_ENTER(&call->lock);
/* Give the security object a chance to encrypt/checksum the packet. */
2756 code = RXS_PreparePacket(conn->securityObject, call, p);
/* On failure, error the whole connection and send an abort rather
 * than risk transmitting unprotected data. Note the lock dance:
 * call->lock is released around the conn_data_lock critical section
 * and re-taken before asserting the call error. */
2758 MUTEX_EXIT(&call->lock);
2759 rxi_ConnectionError(conn, code);
2760 MUTEX_ENTER(&conn->conn_data_lock);
2761 p = rxi_SendConnectionAbort(conn, p, 0, 0);
2762 MUTEX_EXIT(&conn->conn_data_lock);
2763 MUTEX_ENTER(&call->lock);
2764 /* setting a connection error means all calls for that conn are also
2765 * error'd. if this call does not have an error by now, something is
2766 * very wrong, and we risk sending data in the clear that is supposed
2767 * to be encrypted. */
2768 osi_Assert(call->error);
2772 /* Given an interface MTU size, calculate an adjusted MTU size that
2773 * will make efficient use of the RX buffers when the peer is sending
2774 * either AFS 3.4a jumbograms or AFS 3.5 jumbograms. */
2776 rxi_AdjustIfMTU(int mtu)
2781 if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2783 adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2784 if (mtu <= adjMTU) {
2791 frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2792 return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2795 /* Given an interface MTU size, and the peer's advertised max receive
2796 * size, calculate an adjisted maxMTU size that makes efficient use
2797 * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2799 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2801 int maxMTU = mtu * rxi_nSendFrags;
2802 maxMTU = MIN(maxMTU, peerMaxMTU);
2803 return rxi_AdjustIfMTU(maxMTU);
2806 /* Given a packet size, figure out how many datagram packet will fit.
2807 * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2808 * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2809 * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2811 rxi_AdjustDgramPackets(int frags, int mtu)
2814 if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2817 maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2818 maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2819 /* subtract the size of the first and last packets */
2820 maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2824 return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2829 * This function can be used by the Windows Cache Manager
2830 * to dump the list of all rx packets so that we can determine
2831 * where the packet leakage is.
/* Debug aid: walks the global rx_mallocedP list and prints one line
 * per live packet, prefixed with the caller-supplied 'cookie' tag.
 * NOTE(review): only compiled to a working body under RXDEBUG_PACKET
 * (see #ifdef below); the return statement is not visible in this
 * excerpt -- confirm against the full source. */
2833 int rx_DumpPackets(FILE *outputFile, char *cookie)
2835 #ifdef RXDEBUG_PACKET
2836 struct rx_packet *p;
/* Two output strategies: format into a local buffer and flush with
 * WriteFile (presumably the Windows/NT build -- TODO confirm the
 * elided #ifdef), or write directly to outputFile with fprintf. */
2840 #define RXDPRINTF sprintf
2841 #define RXDPRINTOUT output
2843 #define RXDPRINTF fprintf
2844 #define RXDPRINTOUT outputFile
/* rx_freePktQ_lock protects the rx_mallocedP list during the walk. */
2848 MUTEX_ENTER(&rx_freePktQ_lock);
2849 RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
2851 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
/* One line per packet: identity, timing, flags and full wire header. */
2854 for (p = rx_mallocedP; p; p = p->allNextp) {
2855 RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
2856 cookie, p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
2857 p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
2858 p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
2859 (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
2860 (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
2862 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2866 RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
2868 WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2871 MUTEX_EXIT(&rx_freePktQ_lock);
2873 #endif /* RXDEBUG_PACKET */