src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #include <afs/param.h>
  12
  13 #ifdef KERNEL
  14 # if defined(UKERNEL)
  15 #  include "afs/sysincludes.h"
  16 #  include "afsincludes.h"
  17 #  include "rx_kcommon.h"
  18 # else /* defined(UKERNEL) */
  19 #  ifdef RX_KERNEL_TRACE
  20 #   include "rx_kcommon.h"
  21 #  endif
  22 #  include "h/types.h"
  23 #  ifndef AFS_LINUX20_ENV
  24 #   include "h/systm.h"
  25 #  endif
  26 #  if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV) || defined(AFS_NBSD50_ENV)
  27 #   include "afs/sysincludes.h"
  28 #  endif
  29 #  if defined(AFS_OBSD_ENV)
  30 #   include "h/proc.h"
  31 #  endif
  32 #  include "h/socket.h"
  33 #  if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  34 #   if  !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  35 #    include "sys/mount.h"              /* it gets pulled in by something later anyway */
  36 #   endif
  37 #   include "h/mbuf.h"
  38 #  endif
  39 #  include "netinet/in.h"
  40 #  include "afs/afs_osi.h"
  41 #  include "rx_kmutex.h"
  42 # endif /* defined(UKERNEL) */
  43 #else /* KERNEL */
  44 # include <roken.h>
  45 # include <assert.h>
  46 # include <afs/opr.h>
  47 # if defined(AFS_NT40_ENV)
  48 #  ifndef EWOULDBLOCK
  49 #   define EWOULDBLOCK WSAEWOULDBLOCK
  50 #  endif
  51 #  include "rx_user.h"
  52 #  include "rx_xmit_nt.h"
  53 # endif
  54 # include <lwp.h>
  55 #endif /* KERNEL */
  56
  57 #ifdef  AFS_SUN5_ENV
  58 # include <sys/sysmacros.h>
  59 #endif
  60
  61 #include <opr/queue.h>
  62
  63 #include "rx.h"
  64 #include "rx_clock.h"
  65 #include "rx_packet.h"
  66 #include "rx_atomic.h"
  67 #include "rx_globals.h"
  68 #include "rx_internal.h"
  69 #include "rx_stats.h"
  70
  71 #include "rx_peer.h"
  72 #include "rx_conn.h"
  73 #include "rx_call.h"
  74
  75 #ifdef RX_LOCKS_DB
  76 /* rxdb_fileID is used to identify the lock location, along with line#. */
  77 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
  78 #endif /* RX_LOCKS_DB */
  79 static struct rx_packet *rx_mallocedP = 0;
  80 #ifdef RXDEBUG_PACKET
  81 static afs_uint32       rx_packet_id = 0;
  82 #endif
  83
  84 extern char cml_version_number[];
  85
  86 static int AllocPacketBufs(int class, int num_pkts, struct opr_queue *q);
  87
  88 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  89                                 afs_uint32 ahost, short aport,
  90                                 afs_int32 istack);
  91 static struct rx_packet *rxi_AllocPacketNoLock(int class);
  92
  93 #ifndef KERNEL
  94 static void rxi_MorePacketsNoLock(int apackets);
  95 #endif
  96
  97 #ifdef RX_ENABLE_TSFPQ
  98 static int rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first,
  99                                  int flush_global);
 100 static void rxi_AdjustLocalPacketsTSFPQ(int num_keep_local,
 101                                         int allow_overcommit);
 102 #else
 103 static void rxi_FreePacketNoLock(struct rx_packet *p);
 104 static int rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first);
 105 static int rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first,
 106                                    struct opr_queue * q);
 107 #endif
 108
 109 extern struct opr_queue rx_idleServerQueue;
 110
 111 /* some rules about packets:
 112  * 1.  When a packet is allocated, the final iov_buf contains room for
 113  * a security trailer, but iov_len masks that fact.  If the security
 114  * package wants to add the trailer, it may do so, and then extend
 115  * iov_len appropriately.  For this reason, packet's niovecs and
 116  * iov_len fields should be accurate before calling PreparePacket.
 117 */
 118
 119 /* Preconditions:
 120  *        all packet buffers (iov_base) are integral multiples of
 121  *        the word size.
 122  *        offset is an integral multiple of the word size.
 123  */
 124 afs_int32
 125 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 126 {
 127     unsigned int i;
 128     size_t l;
 129     for (l = 0, i = 1; i < packet->niovecs; i++) {
 130         if (l + packet->wirevec[i].iov_len > offset) {
 131             return
 132                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 133                                  (offset - l)));
 134         }
 135         l += packet->wirevec[i].iov_len;
 136     }
 137
 138     return 0;
 139 }
 140
 141 /* Preconditions:
 142  *        all packet buffers (iov_base) are integral multiples of the word size.
 143  *        offset is an integral multiple of the word size.
 144  */
 145 afs_int32
 146 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 147 {
 148     unsigned int i;
 149     size_t l;
 150     for (l = 0, i = 1; i < packet->niovecs; i++) {
 151         if (l + packet->wirevec[i].iov_len > offset) {
 152             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 153                              (offset - l))) = data;
 154             return 0;
 155         }
 156         l += packet->wirevec[i].iov_len;
 157     }
 158
 159     return 0;
 160 }
 161
 162 /* Preconditions:
 163  *        all packet buffers (iov_base) are integral multiples of the
 164  *        word size.
 165  *        offset is an integral multiple of the word size.
 166  * Packet Invariants:
 167  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 168  */
 169 afs_int32
 170 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 171                   char *out)
 172 {
 173     unsigned int i, j, l, r;
 174     for (l = 0, i = 1; i < packet->niovecs; i++) {
 175         if (l + packet->wirevec[i].iov_len > offset) {
 176             break;
 177         }
 178         l += packet->wirevec[i].iov_len;
 179     }
 180
 181     /* i is the iovec which contains the first little bit of data in which we
 182      * are interested.  l is the total length of everything prior to this iovec.
 183      * j is the number of bytes we can safely copy out of this iovec.
 184      * offset only applies to the first iovec.
 185      */
 186     r = resid;
 187     while ((r > 0) && (i < packet->niovecs)) {
 188         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 189         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 190         r -= j;
 191         out += j;
 192         l += packet->wirevec[i].iov_len;
 193         offset = l;
 194         i++;
 195     }
 196
 197     return (r ? (resid - r) : resid);
 198 }
 199
 200
 201 /* Preconditions:
 202  *        all packet buffers (iov_base) are integral multiples of the
 203  *        word size.
 204  *        offset is an integral multiple of the word size.
 205  */
 206 afs_int32
 207 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 208 {
 209     unsigned int i, j, l, o, r;
 210     char *b;
 211
 212     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 213         if (l + packet->wirevec[i].iov_len > o) {
 214             break;
 215         }
 216         l += packet->wirevec[i].iov_len;
 217     }
 218
 219     /* i is the iovec which contains the first little bit of data in which we
 220      * are interested.  l is the total length of everything prior to this iovec.
 221      * j is the number of bytes we can safely copy out of this iovec.
 222      * offset only applies to the first iovec.
 223      */
 224     r = resid;
 225     while ((r > 0) && (i <= RX_MAXWVECS)) {
 226         if (i >= packet->niovecs)
 227             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 228                 break;
 229
 230         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 231         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 232         memcpy(b, in, j);
 233         r -= j;
 234         in += j;
 235         l += packet->wirevec[i].iov_len;
 236         offset = l;
 237         i++;
 238     }
 239
 240     return (r ? (resid - r) : resid);
 241 }
 242
 243 int
 244 rxi_AllocPackets(int class, int num_pkts, struct opr_queue * q)
 245 {
 246     struct opr_queue *c;
 247
 248     num_pkts = AllocPacketBufs(class, num_pkts, q);
 249
 250     for (opr_queue_Scan(q, c)) {
 251         RX_PACKET_IOV_FULLINIT(opr_queue_Entry(c, struct rx_packet, entry));
 252     }
 253
 254     return num_pkts;
 255 }
 256
 257 #ifdef RX_ENABLE_TSFPQ
 258 static int
 259 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
 260 {
 261     struct rx_ts_info_t * rx_ts_info;
 262     int transfer;
 263     SPLVAR;
 264
 265     RX_TS_INFO_GET(rx_ts_info);
 266
 267     transfer = num_pkts - rx_ts_info->_FPQ.len;
 268     if (transfer > 0) {
 269         NETPRI;
 270         MUTEX_ENTER(&rx_freePktQ_lock);
 271         transfer = MAX(transfer, rx_TSFPQGlobSize);
 272         if (transfer > rx_nFreePackets) {
 273             /* alloc enough for us, plus a few globs for other threads */
 274             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 275         }
 276
 277         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 278
 279         MUTEX_EXIT(&rx_freePktQ_lock);
 280         USERPRI;
 281     }
 282
 283     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 284
 285     return num_pkts;
 286 }
 287 #else /* RX_ENABLE_TSFPQ */
 288 static int
 289 AllocPacketBufs(int class, int num_pkts, struct opr_queue * q)
 290 {
 291     struct rx_packet *c;
 292     int i;
 293 #ifdef KERNEL
 294     int overq = 0;
 295 #endif
 296     SPLVAR;
 297
 298     NETPRI;
 299
 300     MUTEX_ENTER(&rx_freePktQ_lock);
 301
 302 #ifdef KERNEL
 303     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 304          num_pkts--, overq++);
 305
 306     if (overq) {
 307         rxi_NeedMorePackets = TRUE;
 308         if (rx_stats_active) {
 309             switch (class) {
 310             case RX_PACKET_CLASS_RECEIVE:
 311                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
 312                 break;
 313             case RX_PACKET_CLASS_SEND:
 314                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
 315                 break;
 316             case RX_PACKET_CLASS_SPECIAL:
 317                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
 318                 break;
 319             case RX_PACKET_CLASS_RECV_CBUF:
 320                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
 321                 break;
 322             case RX_PACKET_CLASS_SEND_CBUF:
 323                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
 324                 break;
 325             }
 326         }
 327     }
 328
 329     if (rx_nFreePackets < num_pkts)
 330         num_pkts = rx_nFreePackets;
 331
 332     if (!num_pkts) {
 333         rxi_NeedMorePackets = TRUE;
 334         goto done;
 335     }
 336 #else /* KERNEL */
 337     if (rx_nFreePackets < num_pkts) {
 338         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 339     }
 340 #endif /* KERNEL */
 341
 342     for (i=0, c=opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
 343          i < num_pkts;
 344          i++, c=opr_queue_Next(&c->entry, struct rx_packet, entry)) {
 345         RX_FPQ_MARK_USED(c);
 346     }
 347
 348     opr_queue_SplitBeforeAppend(&rx_freePacketQueue, q, &c->entry);
 349
 350     rx_nFreePackets -= num_pkts;
 351
 352 #ifdef KERNEL
 353   done:
 354 #endif
 355     MUTEX_EXIT(&rx_freePktQ_lock);
 356
 357     USERPRI;
 358     return num_pkts;
 359 }
 360 #endif /* RX_ENABLE_TSFPQ */
 361
 362 /*
 363  * Free a packet currently used as a continuation buffer
 364  */
 365 #ifdef RX_ENABLE_TSFPQ
 366 /* num_pkts=0 means queue length is unknown */
 367 int
 368 rxi_FreePackets(int num_pkts, struct opr_queue * q)
 369 {
 370     struct rx_ts_info_t * rx_ts_info;
 371     struct opr_queue *cursor, *store;
 372     SPLVAR;
 373
 374     osi_Assert(num_pkts >= 0);
 375     RX_TS_INFO_GET(rx_ts_info);
 376
 377     if (!num_pkts) {
 378         for (opr_queue_ScanSafe(q, cursor, store)) {
 379             num_pkts++;
 380             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
 381                                                  entry), 2, 0);
 382         }
 383     } else {
 384         for (opr_queue_ScanSafe(q, cursor, store)) {
 385             rxi_FreeDataBufsTSFPQ(opr_queue_Entry(cursor, struct rx_packet,
 386                                                  entry), 2, 0);
 387         }
 388     }
 389
 390     if (num_pkts) {
 391         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 392     }
 393
 394     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 395         NETPRI;
 396         MUTEX_ENTER(&rx_freePktQ_lock);
 397
 398         RX_TS_FPQ_LTOG(rx_ts_info);
 399
 400         /* Wakeup anyone waiting for packets */
 401         rxi_PacketsUnWait();
 402
 403         MUTEX_EXIT(&rx_freePktQ_lock);
 404         USERPRI;
 405     }
 406
 407     return num_pkts;
 408 }
 409 #else /* RX_ENABLE_TSFPQ */
 410 /* num_pkts=0 means queue length is unknown */
 411 int
 412 rxi_FreePackets(int num_pkts, struct opr_queue *q)
 413 {
 414     struct opr_queue cbs;
 415     struct opr_queue *cursor, *store;
 416     int qlen = 0;
 417     SPLVAR;
 418
 419     osi_Assert(num_pkts >= 0);
 420     opr_queue_Init(&cbs);
 421
 422     if (!num_pkts) {
 423         for (opr_queue_ScanSafe(q, cursor, store)) {
 424             struct rx_packet *p
 425                 = opr_queue_Entry(cursor, struct rx_packet, entry);
 426             if (p->niovecs > 2) {
 427                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 428             }
 429             RX_FPQ_MARK_FREE(p);
 430             num_pkts++;
 431         }
 432         if (!num_pkts)
 433             return 0;
 434     } else {
 435         for (opr_queue_ScanSafe(q, cursor, store)) {
 436             struct rx_packet *p
 437                 = opr_queue_Entry(cursor, struct rx_packet, entry);
 438
 439             if (p->niovecs > 2) {
 440                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 441             }
 442             RX_FPQ_MARK_FREE(p);
 443         }
 444     }
 445
 446     if (qlen) {
 447         opr_queue_SpliceAppend(q, &cbs);
 448         qlen += num_pkts;
 449     } else
 450         qlen = num_pkts;
 451
 452     NETPRI;
 453     MUTEX_ENTER(&rx_freePktQ_lock);
 454
 455     opr_queue_SpliceAppend(&rx_freePacketQueue, q);
 456     rx_nFreePackets += qlen;
 457
 458     /* Wakeup anyone waiting for packets */
 459     rxi_PacketsUnWait();
 460
 461     MUTEX_EXIT(&rx_freePktQ_lock);
 462     USERPRI;
 463
 464     return num_pkts;
 465 }
 466 #endif /* RX_ENABLE_TSFPQ */
 467
 468 /* this one is kind of awful.
 469  * In rxkad, the packet has been all shortened, and everything, ready for
 470  * sending.  All of a sudden, we discover we need some of that space back.
 471  * This isn't terribly general, because it knows that the packets are only
 472  * rounded up to the EBS (userdata + security header).
 473  */
 474 int
 475 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 476 {
 477     int i;
 478     i = p->niovecs - 1;
 479     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 480         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 481             p->wirevec[i].iov_len += nb;
 482             return 0;
 483         }
 484     } else {
 485         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 486             p->wirevec[i].iov_len += nb;
 487             return 0;
 488         }
 489     }
 490
 491     return 0;
 492 }
 493
 494 /* get sufficient space to store nb bytes of data (or more), and hook
 495  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 496  * returns the number of bytes >0 which it failed to come up with.
 497  * Don't need to worry about locking on packet, since only
 498  * one thread can manipulate one at a time. Locking on continution
 499  * packets is handled by AllocPacketBufs */
 500 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 501 int
 502 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 503 {
 504     int i, nv;
 505     struct opr_queue q, *cursor, *store;
 506
 507     /* compute the number of cbuf's we need */
 508     nv = nb / RX_CBUFFERSIZE;
 509     if ((nv * RX_CBUFFERSIZE) < nb)
 510         nv++;
 511     if ((nv + p->niovecs) > RX_MAXWVECS)
 512         nv = RX_MAXWVECS - p->niovecs;
 513     if (nv < 1)
 514         return nb;
 515
 516     /* allocate buffers */
 517     opr_queue_Init(&q);
 518     nv = AllocPacketBufs(class, nv, &q);
 519
 520     /* setup packet iovs */
 521     i = p ->niovecs;
 522     for (opr_queue_ScanSafe(&q, cursor, store)) {
 523         struct rx_packet *cb
 524             = opr_queue_Entry(cursor, struct rx_packet, entry);
 525
 526         opr_queue_Remove(&cb->entry);
 527         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 528         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 529         i++;
 530     }
 531
 532     nb -= (nv * RX_CBUFFERSIZE);
 533     p->length += (nv * RX_CBUFFERSIZE);
 534     p->niovecs += nv;
 535
 536     return nb;
 537 }
 538
 539 /* Add more packet buffers */
 540 #ifdef RX_ENABLE_TSFPQ
 541 void
 542 rxi_MorePackets(int apackets)
 543 {
 544     struct rx_packet *p, *e;
 545     struct rx_ts_info_t * rx_ts_info;
 546     int getme;
 547     SPLVAR;
 548
 549     getme = apackets * sizeof(struct rx_packet);
 550     p = osi_Alloc(getme);
 551     osi_Assert(p);
 552
 553     PIN(p, getme);              /* XXXXX */
 554     memset(p, 0, getme);
 555     RX_TS_INFO_GET(rx_ts_info);
 556
 557     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 558     /* TSFPQ patch also needs to keep track of total packets */
 559
 560     MUTEX_ENTER(&rx_packets_mutex);
 561     rx_nPackets += apackets;
 562     RX_TS_FPQ_COMPUTE_LIMITS;
 563     MUTEX_EXIT(&rx_packets_mutex);
 564
 565     for (e = p + apackets; p < e; p++) {
 566         RX_PACKET_IOV_INIT(p);
 567         p->niovecs = 2;
 568
 569         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 570
 571         NETPRI;
 572         MUTEX_ENTER(&rx_freePktQ_lock);
 573 #ifdef RXDEBUG_PACKET
 574         p->packetId = rx_packet_id++;
 575         p->allNextp = rx_mallocedP;
 576 #endif /* RXDEBUG_PACKET */
 577         rx_mallocedP = p;
 578         MUTEX_EXIT(&rx_freePktQ_lock);
 579         USERPRI;
 580     }
 581     rx_ts_info->_FPQ.delta += apackets;
 582
 583     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 584         NETPRI;
 585         MUTEX_ENTER(&rx_freePktQ_lock);
 586
 587         RX_TS_FPQ_LTOG(rx_ts_info);
 588         rxi_NeedMorePackets = FALSE;
 589         rxi_PacketsUnWait();
 590
 591         MUTEX_EXIT(&rx_freePktQ_lock);
 592         USERPRI;
 593     }
 594 }
 595 #else /* RX_ENABLE_TSFPQ */
 596 void
 597 rxi_MorePackets(int apackets)
 598 {
 599     struct rx_packet *p, *e;
 600     int getme;
 601     SPLVAR;
 602
 603     getme = apackets * sizeof(struct rx_packet);
 604     p = osi_Alloc(getme);
 605     osi_Assert(p);
 606
 607     PIN(p, getme);              /* XXXXX */
 608     memset(p, 0, getme);
 609     NETPRI;
 610     MUTEX_ENTER(&rx_freePktQ_lock);
 611
 612     for (e = p + apackets; p < e; p++) {
 613         RX_PACKET_IOV_INIT(p);
 614 #ifdef RX_TRACK_PACKETS
 615         p->flags |= RX_PKTFLAG_FREE;
 616 #endif
 617         p->niovecs = 2;
 618
 619         opr_queue_Append(&rx_freePacketQueue, &p->entry);
 620 #ifdef RXDEBUG_PACKET
 621         p->packetId = rx_packet_id++;
 622         p->allNextp = rx_mallocedP;
 623 #endif /* RXDEBUG_PACKET */
 624         rx_mallocedP = p;
 625     }
 626
 627     rx_nPackets += apackets;
 628     rx_nFreePackets += apackets;
 629     rxi_NeedMorePackets = FALSE;
 630     rxi_PacketsUnWait();
 631
 632     MUTEX_EXIT(&rx_freePktQ_lock);
 633     USERPRI;
 634 }
 635 #endif /* RX_ENABLE_TSFPQ */
 636
 637 #ifdef RX_ENABLE_TSFPQ
 638 void
 639 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 640 {
 641     struct rx_packet *p, *e;
 642     struct rx_ts_info_t * rx_ts_info;
 643     int getme;
 644     SPLVAR;
 645
 646     getme = apackets * sizeof(struct rx_packet);
 647     p = osi_Alloc(getme);
 648
 649     PIN(p, getme);              /* XXXXX */
 650     memset(p, 0, getme);
 651     RX_TS_INFO_GET(rx_ts_info);
 652
 653     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 654     /* TSFPQ patch also needs to keep track of total packets */
 655     MUTEX_ENTER(&rx_packets_mutex);
 656     rx_nPackets += apackets;
 657     RX_TS_FPQ_COMPUTE_LIMITS;
 658     MUTEX_EXIT(&rx_packets_mutex);
 659
 660     for (e = p + apackets; p < e; p++) {
 661         RX_PACKET_IOV_INIT(p);
 662         p->niovecs = 2;
 663         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 664
 665         NETPRI;
 666         MUTEX_ENTER(&rx_freePktQ_lock);
 667 #ifdef RXDEBUG_PACKET
 668         p->packetId = rx_packet_id++;
 669         p->allNextp = rx_mallocedP;
 670 #endif /* RXDEBUG_PACKET */
 671         rx_mallocedP = p;
 672         MUTEX_EXIT(&rx_freePktQ_lock);
 673         USERPRI;
 674     }
 675     rx_ts_info->_FPQ.delta += apackets;
 676
 677     if (flush_global &&
 678         (num_keep_local < apackets)) {
 679         NETPRI;
 680         MUTEX_ENTER(&rx_freePktQ_lock);
 681
 682         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 683         rxi_NeedMorePackets = FALSE;
 684         rxi_PacketsUnWait();
 685
 686         MUTEX_EXIT(&rx_freePktQ_lock);
 687         USERPRI;
 688     }
 689 }
 690 #endif /* RX_ENABLE_TSFPQ */
 691
 692 #ifndef KERNEL
 693 /* Add more packet buffers */
 694 static void
 695 rxi_MorePacketsNoLock(int apackets)
 696 {
 697 #ifdef RX_ENABLE_TSFPQ
 698     struct rx_ts_info_t * rx_ts_info;
 699 #endif /* RX_ENABLE_TSFPQ */
 700     struct rx_packet *p, *e;
 701     int getme;
 702
 703     /* allocate enough packets that 1/4 of the packets will be able
 704      * to hold maximal amounts of data */
 705     apackets += (apackets / 4)
 706         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 707     do {
 708         getme = apackets * sizeof(struct rx_packet);
 709         p = osi_Alloc(getme);
 710         if (p == NULL) {
 711             apackets -= apackets / 4;
 712             osi_Assert(apackets > 0);
 713         }
 714     } while(p == NULL);
 715     memset(p, 0, getme);
 716
 717 #ifdef RX_ENABLE_TSFPQ
 718     RX_TS_INFO_GET(rx_ts_info);
 719     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 720 #endif /* RX_ENABLE_TSFPQ */
 721
 722     for (e = p + apackets; p < e; p++) {
 723         RX_PACKET_IOV_INIT(p);
 724 #ifdef RX_TRACK_PACKETS
 725         p->flags |= RX_PKTFLAG_FREE;
 726 #endif
 727         p->niovecs = 2;
 728
 729         opr_queue_Append(&rx_freePacketQueue, &p->entry);
 730 #ifdef RXDEBUG_PACKET
 731         p->packetId = rx_packet_id++;
 732         p->allNextp = rx_mallocedP;
 733 #endif /* RXDEBUG_PACKET */
 734         rx_mallocedP = p;
 735     }
 736
 737     rx_nFreePackets += apackets;
 738     MUTEX_ENTER(&rx_packets_mutex);
 739     rx_nPackets += apackets;
 740 #ifdef RX_ENABLE_TSFPQ
 741     RX_TS_FPQ_COMPUTE_LIMITS;
 742 #endif /* RX_ENABLE_TSFPQ */
 743     MUTEX_EXIT(&rx_packets_mutex);
 744     rxi_NeedMorePackets = FALSE;
 745     rxi_PacketsUnWait();
 746 }
 747 #endif /* !KERNEL */
 748
 749 void
 750 rxi_FreeAllPackets(void)
 751 {
 752     /* must be called at proper interrupt level, etcetera */
 753     /* MTUXXX need to free all Packets */
 754     osi_Free(rx_mallocedP,
 755              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 756     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 757 }
 758
 759 #ifdef RX_ENABLE_TSFPQ
 760 static void
 761 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 762 {
 763     struct rx_ts_info_t * rx_ts_info;
 764     int xfer;
 765     SPLVAR;
 766
 767     RX_TS_INFO_GET(rx_ts_info);
 768
 769     if (num_keep_local != rx_ts_info->_FPQ.len) {
 770         NETPRI;
 771         MUTEX_ENTER(&rx_freePktQ_lock);
 772         if (num_keep_local < rx_ts_info->_FPQ.len) {
 773             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 774             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 775             rxi_PacketsUnWait();
 776         } else {
 777             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 778             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 779                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 780             if (rx_nFreePackets < xfer) {
 781                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 782             }
 783             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 784         }
 785         MUTEX_EXIT(&rx_freePktQ_lock);
 786         USERPRI;
 787     }
 788 }
 789
 790 void
 791 rxi_FlushLocalPacketsTSFPQ(void)
 792 {
 793     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 794 }
 795 #endif /* RX_ENABLE_TSFPQ */
 796
 797 /* Allocate more packets iff we need more continuation buffers */
 798 /* In kernel, can't page in memory with interrupts disabled, so we
 799  * don't use the event mechanism. */
 800 void
 801 rx_CheckPackets(void)
 802 {
 803     if (rxi_NeedMorePackets) {
 804         rxi_MorePackets(rx_maxSendWindow);
 805     }
 806 }
 807
 808 /* In the packet freeing routine below, the assumption is that
 809    we want all of the packets to be used equally frequently, so that we
 810    don't get packet buffers paging out.  It would be just as valid to
 811    assume that we DO want them to page out if not many are being used.
 812    In any event, we assume the former, and append the packets to the end
 813    of the free list.  */
 814 /* This explanation is bogus.  The free list doesn't remain in any kind of
 815    useful order for afs_int32: the packets in use get pretty much randomly scattered
 816    across all the pages.  In order to permit unused {packets,bufs} to page out, they
 817    must be stored so that packets which are adjacent in memory are adjacent in the
 818    free list.  An array springs rapidly to mind.
 819    */
 820
 821 /* Actually free the packet p. */
 822 #ifndef RX_ENABLE_TSFPQ
 823 static void
 824 rxi_FreePacketNoLock(struct rx_packet *p)
 825 {
 826     dpf(("Free %"AFS_PTR_FMT"\n", p));
 827
 828     RX_FPQ_MARK_FREE(p);
 829     rx_nFreePackets++;
 830     opr_queue_Append(&rx_freePacketQueue, &p->entry);
 831 }
 832 #endif /* RX_ENABLE_TSFPQ */
 833
 834 #ifdef RX_ENABLE_TSFPQ
 835 static void
 836 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 837 {
 838     struct rx_ts_info_t * rx_ts_info;
 839     dpf(("Free %"AFS_PTR_FMT"\n", p));
 840
 841     RX_TS_INFO_GET(rx_ts_info);
 842     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 843
 844     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 845         NETPRI;
 846         MUTEX_ENTER(&rx_freePktQ_lock);
 847
 848         RX_TS_FPQ_LTOG(rx_ts_info);
 849
 850         /* Wakeup anyone waiting for packets */
 851         rxi_PacketsUnWait();
 852
 853         MUTEX_EXIT(&rx_freePktQ_lock);
 854         USERPRI;
 855     }
 856 }
 857 #endif /* RX_ENABLE_TSFPQ */
 858
 859 /*
 860  * free continuation buffers off a packet into a queue
 861  *
 862  * [IN] p      -- packet from which continuation buffers will be freed
 863  * [IN] first  -- iovec offset of first continuation buffer to free
 864  * [IN] q      -- queue into which continuation buffers will be chained
 865  *
 866  * returns:
 867  *   number of continuation buffers freed
 868  */
 869 #ifndef RX_ENABLE_TSFPQ
 870 static int
 871 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct opr_queue * q)
 872 {
 873     struct iovec *iov;
 874     struct rx_packet * cb;
 875     int count = 0;
 876
 877     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 878         iov = &p->wirevec[first];
 879         if (!iov->iov_base)
 880             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 881         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 882         RX_FPQ_MARK_FREE(cb);
 883         opr_queue_Append(q, &cb->entry);
 884     }
 885     p->length = 0;
 886     p->niovecs = 0;
 887
 888     return count;
 889 }
 890
 891 /*
 892  * free packet continuation buffers into the global free packet pool
 893  *
 894  * [IN] p      -- packet from which to free continuation buffers
 895  * [IN] first  -- iovec offset of first continuation buffer to free
 896  *
 897  * returns:
 898  *   zero always
 899  */
 900 static int
 901 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 902 {
 903     struct iovec *iov;
 904
 905     for (first = MAX(2, first); first < p->niovecs; first++) {
 906         iov = &p->wirevec[first];
 907         if (!iov->iov_base)
 908             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 909         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 910     }
 911     p->length = 0;
 912     p->niovecs = 0;
 913
 914     return 0;
 915 }
 916
 917 #else
 918
 919 /*
 920  * free packet continuation buffers into the thread-local free pool
 921  *
 922  * [IN] p             -- packet from which continuation buffers will be freed
 923  * [IN] first         -- iovec offset of first continuation buffer to free
 924  *                       any value less than 2, the min number of iovecs,
 925  *                       is treated as if it is 2.
 926  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 927  *                       global free pool before returning
 928  *
 929  * returns:
 930  *   zero always
 931  */
 932 static int
 933 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 934 {
 935     struct iovec *iov;
 936     struct rx_ts_info_t * rx_ts_info;
 937
 938     RX_TS_INFO_GET(rx_ts_info);
 939
 940     for (first = MAX(2, first); first < p->niovecs; first++) {
 941         iov = &p->wirevec[first];
 942         if (!iov->iov_base)
 943             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 944         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 945     }
 946     p->length = 0;
 947     p->niovecs = 0;
 948
 949     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 950         NETPRI;
 951         MUTEX_ENTER(&rx_freePktQ_lock);
 952
 953         RX_TS_FPQ_LTOG(rx_ts_info);
 954
 955         /* Wakeup anyone waiting for packets */
 956         rxi_PacketsUnWait();
 957
 958         MUTEX_EXIT(&rx_freePktQ_lock);
 959         USERPRI;
 960     }
 961     return 0;
 962 }
 963 #endif /* RX_ENABLE_TSFPQ */
 964
 965 int rxi_nBadIovecs = 0;
 966
 967 /* rxi_RestoreDataBufs
 968  *
 969  * Restore the correct sizes to the iovecs. Called when reusing a packet
 970  * for reading off the wire.
 971  */
 972 void
 973 rxi_RestoreDataBufs(struct rx_packet *p)
 974 {
 975     unsigned int i;
 976     struct iovec *iov;
 977
 978     RX_PACKET_IOV_INIT(p);
 979
 980     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 981         if (!iov->iov_base) {
 982             rxi_nBadIovecs++;
 983             p->niovecs = i;
 984             break;
 985         }
 986         iov->iov_len = RX_CBUFFERSIZE;
 987     }
 988 }
 989
 990 #ifdef RX_ENABLE_TSFPQ
 991 int
 992 rxi_TrimDataBufs(struct rx_packet *p, int first)
 993 {
 994     int length;
 995     struct iovec *iov, *end;
 996     struct rx_ts_info_t * rx_ts_info;
 997     SPLVAR;
 998
 999     if (first != 1)
1000         osi_Panic("TrimDataBufs 1: first must be 1");
1001
1002     /* Skip over continuation buffers containing message data */
1003     iov = &p->wirevec[2];
1004     end = iov + (p->niovecs - 2);
1005     length = p->length - p->wirevec[1].iov_len;
1006     for (; iov < end && length > 0; iov++) {
1007         if (!iov->iov_base)
1008             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1009         length -= iov->iov_len;
1010     }
1011
1012     /* iov now points to the first empty data buffer. */
1013     if (iov >= end)
1014         return 0;
1015
1016     RX_TS_INFO_GET(rx_ts_info);
1017     for (; iov < end; iov++) {
1018         if (!iov->iov_base)
1019             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1020         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1021         p->niovecs--;
1022     }
1023     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1024         NETPRI;
1025         MUTEX_ENTER(&rx_freePktQ_lock);
1026
1027         RX_TS_FPQ_LTOG(rx_ts_info);
1028         rxi_PacketsUnWait();
1029
1030         MUTEX_EXIT(&rx_freePktQ_lock);
1031         USERPRI;
1032     }
1033
1034     return 0;
1035 }
1036 #else /* RX_ENABLE_TSFPQ */
1037 int
1038 rxi_TrimDataBufs(struct rx_packet *p, int first)
1039 {
1040     int length;
1041     struct iovec *iov, *end;
1042     SPLVAR;
1043
1044     if (first != 1)
1045         osi_Panic("TrimDataBufs 1: first must be 1");
1046
1047     /* Skip over continuation buffers containing message data */
1048     iov = &p->wirevec[2];
1049     end = iov + (p->niovecs - 2);
1050     length = p->length - p->wirevec[1].iov_len;
1051     for (; iov < end && length > 0; iov++) {
1052         if (!iov->iov_base)
1053             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1054         length -= iov->iov_len;
1055     }
1056
1057     /* iov now points to the first empty data buffer. */
1058     if (iov >= end)
1059         return 0;
1060
1061     NETPRI;
1062     MUTEX_ENTER(&rx_freePktQ_lock);
1063
1064     for (; iov < end; iov++) {
1065         if (!iov->iov_base)
1066             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1067         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1068         p->niovecs--;
1069     }
1070     rxi_PacketsUnWait();
1071
1072     MUTEX_EXIT(&rx_freePktQ_lock);
1073     USERPRI;
1074
1075     return 0;
1076 }
1077 #endif /* RX_ENABLE_TSFPQ */
1078
1079 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1080  * remove it yourself first if you call this routine. */
1081 #ifdef RX_ENABLE_TSFPQ
1082 void
1083 rxi_FreePacket(struct rx_packet *p)
1084 {
1085     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1086     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1087 }
1088 #else /* RX_ENABLE_TSFPQ */
1089 void
1090 rxi_FreePacket(struct rx_packet *p)
1091 {
1092     SPLVAR;
1093
1094     NETPRI;
1095     MUTEX_ENTER(&rx_freePktQ_lock);
1096
1097     rxi_FreeDataBufsNoLock(p, 2);
1098     rxi_FreePacketNoLock(p);
1099     /* Wakeup anyone waiting for packets */
1100     rxi_PacketsUnWait();
1101
1102     MUTEX_EXIT(&rx_freePktQ_lock);
1103     USERPRI;
1104 }
1105 #endif /* RX_ENABLE_TSFPQ */
1106
1107 /* rxi_AllocPacket sets up p->length so it reflects the number of
1108  * bytes in the packet at this point, **not including** the header.
1109  * The header is absolutely necessary, besides, this is the way the
1110  * length field is usually used */
1111 #ifdef RX_ENABLE_TSFPQ
1112 static struct rx_packet *
1113 rxi_AllocPacketNoLock(int class)
1114 {
1115     struct rx_packet *p;
1116     struct rx_ts_info_t * rx_ts_info;
1117
1118     RX_TS_INFO_GET(rx_ts_info);
1119
1120     if (rx_stats_active)
1121         rx_atomic_inc(&rx_stats.packetRequests);
1122     if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1123
1124 #ifdef KERNEL
1125         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1126             osi_Panic("rxi_AllocPacket error");
1127 #else /* KERNEL */
1128         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1129             rxi_MorePacketsNoLock(rx_maxSendWindow);
1130 #endif /* KERNEL */
1131
1132
1133         RX_TS_FPQ_GTOL(rx_ts_info);
1134     }
1135
1136     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1137
1138     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1139
1140
1141     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1142      * order to truncate outbound packets.  In the near future, may need
1143      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1144      */
1145     RX_PACKET_IOV_FULLINIT(p);
1146     return p;
1147 }
1148 #else /* RX_ENABLE_TSFPQ */
1149 static struct rx_packet *
1150 rxi_AllocPacketNoLock(int class)
1151 {
1152     struct rx_packet *p;
1153
1154 #ifdef KERNEL
1155     if (rxi_OverQuota(class)) {
1156         rxi_NeedMorePackets = TRUE;
1157         if (rx_stats_active) {
1158             switch (class) {
1159             case RX_PACKET_CLASS_RECEIVE:
1160                 rx_atomic_inc(&rx_stats.receivePktAllocFailures);
1161                 break;
1162             case RX_PACKET_CLASS_SEND:
1163                 rx_atomic_inc(&rx_stats.sendPktAllocFailures);
1164                 break;
1165             case RX_PACKET_CLASS_SPECIAL:
1166                 rx_atomic_inc(&rx_stats.specialPktAllocFailures);
1167                 break;
1168             case RX_PACKET_CLASS_RECV_CBUF:
1169                 rx_atomic_inc(&rx_stats.receiveCbufPktAllocFailures);
1170                 break;
1171             case RX_PACKET_CLASS_SEND_CBUF:
1172                 rx_atomic_inc(&rx_stats.sendCbufPktAllocFailures);
1173                 break;
1174             }
1175         }
1176         return (struct rx_packet *)0;
1177     }
1178 #endif /* KERNEL */
1179
1180     if (rx_stats_active)
1181         rx_atomic_inc(&rx_stats.packetRequests);
1182
1183 #ifdef KERNEL
1184     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1185         osi_Panic("rxi_AllocPacket error");
1186 #else /* KERNEL */
1187     if (opr_queue_IsEmpty(&rx_freePacketQueue))
1188         rxi_MorePacketsNoLock(rx_maxSendWindow);
1189 #endif /* KERNEL */
1190
1191     rx_nFreePackets--;
1192     p = opr_queue_First(&rx_freePacketQueue, struct rx_packet, entry);
1193     opr_queue_Remove(&p->entry);
1194     RX_FPQ_MARK_USED(p);
1195
1196     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1197
1198
1199     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1200      * order to truncate outbound packets.  In the near future, may need
1201      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1202      */
1203     RX_PACKET_IOV_FULLINIT(p);
1204     return p;
1205 }
1206 #endif /* RX_ENABLE_TSFPQ */
1207
1208 #ifdef RX_ENABLE_TSFPQ
1209 static struct rx_packet *
1210 rxi_AllocPacketTSFPQ(int class, int pull_global)
1211 {
1212     struct rx_packet *p;
1213     struct rx_ts_info_t * rx_ts_info;
1214
1215     RX_TS_INFO_GET(rx_ts_info);
1216
1217     if (rx_stats_active)
1218         rx_atomic_inc(&rx_stats.packetRequests);
1219     if (pull_global && opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1220         MUTEX_ENTER(&rx_freePktQ_lock);
1221
1222         if (opr_queue_IsEmpty(&rx_freePacketQueue))
1223             rxi_MorePacketsNoLock(rx_maxSendWindow);
1224
1225         RX_TS_FPQ_GTOL(rx_ts_info);
1226
1227         MUTEX_EXIT(&rx_freePktQ_lock);
1228     } else if (opr_queue_IsEmpty(&rx_ts_info->_FPQ.queue)) {
1229         return NULL;
1230     }
1231
1232     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1233
1234     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1235
1236     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1237      * order to truncate outbound packets.  In the near future, may need
1238      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1239      */
1240     RX_PACKET_IOV_FULLINIT(p);
1241     return p;
1242 }
1243 #endif /* RX_ENABLE_TSFPQ */
1244
1245 #ifdef RX_ENABLE_TSFPQ
1246 struct rx_packet *
1247 rxi_AllocPacket(int class)
1248 {
1249     struct rx_packet *p;
1250
1251     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1252     return p;
1253 }
1254 #else /* RX_ENABLE_TSFPQ */
1255 struct rx_packet *
1256 rxi_AllocPacket(int class)
1257 {
1258     struct rx_packet *p;
1259
1260     MUTEX_ENTER(&rx_freePktQ_lock);
1261     p = rxi_AllocPacketNoLock(class);
1262     MUTEX_EXIT(&rx_freePktQ_lock);
1263     return p;
1264 }
1265 #endif /* RX_ENABLE_TSFPQ */
1266
1267 /* This guy comes up with as many buffers as it {takes,can get} given
1268  * the MTU for this call. It also sets the packet length before
1269  * returning.  caution: this is often called at NETPRI
1270  * Called with call locked.
1271  */
1272 struct rx_packet *
1273 rxi_AllocSendPacket(struct rx_call *call, int want)
1274 {
1275     struct rx_packet *p = (struct rx_packet *)0;
1276     int mud;
1277     unsigned delta;
1278
1279     SPLVAR;
1280     mud = call->MTU - RX_HEADER_SIZE;
1281     delta =
1282         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1283         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1284
1285 #ifdef RX_ENABLE_TSFPQ
1286     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1287         want += delta;
1288         want = MIN(want, mud);
1289
1290         if ((unsigned)want > p->length)
1291             (void)rxi_AllocDataBuf(p, (want - p->length),
1292                                    RX_PACKET_CLASS_SEND_CBUF);
1293
1294         if (p->length > mud)
1295             p->length = mud;
1296
1297         if (delta >= p->length) {
1298             rxi_FreePacket(p);
1299             p = NULL;
1300         } else {
1301             p->length -= delta;
1302         }
1303         return p;
1304     }
1305 #endif /* RX_ENABLE_TSFPQ */
1306
1307     while (!(call->error)) {
1308         MUTEX_ENTER(&rx_freePktQ_lock);
1309         /* if an error occurred, or we get the packet we want, we're done */
1310         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1311             MUTEX_EXIT(&rx_freePktQ_lock);
1312
1313             want += delta;
1314             want = MIN(want, mud);
1315
1316             if ((unsigned)want > p->length)
1317                 (void)rxi_AllocDataBuf(p, (want - p->length),
1318                                        RX_PACKET_CLASS_SEND_CBUF);
1319
1320             if (p->length > mud)
1321                 p->length = mud;
1322
1323             if (delta >= p->length) {
1324                 rxi_FreePacket(p);
1325                 p = NULL;
1326             } else {
1327                 p->length -= delta;
1328             }
1329             break;
1330         }
1331
1332         /* no error occurred, and we didn't get a packet, so we sleep.
1333          * At this point, we assume that packets will be returned
1334          * sooner or later, as packets are acknowledged, and so we
1335          * just wait.  */
1336         NETPRI;
1337         call->flags |= RX_CALL_WAIT_PACKETS;
1338         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1339         MUTEX_EXIT(&call->lock);
1340         rx_waitingForPackets = 1;
1341
1342 #ifdef  RX_ENABLE_LOCKS
1343         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1344 #else
1345         osi_rxSleep(&rx_waitingForPackets);
1346 #endif
1347         MUTEX_EXIT(&rx_freePktQ_lock);
1348         MUTEX_ENTER(&call->lock);
1349         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1350         call->flags &= ~RX_CALL_WAIT_PACKETS;
1351         USERPRI;
1352     }
1353
1354     return p;
1355 }
1356
1357 #ifndef KERNEL
1358 #ifdef AFS_NT40_ENV
1359 /* Windows does not use file descriptors. */
1360 #define CountFDs(amax) 0
1361 #else
1362 /* count the number of used FDs */
1363 static int
1364 CountFDs(int amax)
1365 {
1366     struct stat tstat;
1367     int i, code;
1368     int count;
1369
1370     count = 0;
1371     for (i = 0; i < amax; i++) {
1372         code = fstat(i, &tstat);
1373         if (code == 0)
1374             count++;
1375     }
1376     return count;
1377 }
1378 #endif /* AFS_NT40_ENV */
1379 #else /* KERNEL */
1380
1381 #define CountFDs(amax) amax
1382
1383 #endif /* KERNEL */
1384
1385 #if !defined(KERNEL) || defined(UKERNEL)
1386
1387 /* This function reads a single packet from the interface into the
1388  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1389  * (host,port) of the sender are stored in the supplied variables, and
1390  * the data length of the packet is stored in the packet structure.
1391  * The header is decoded. */
1392 int
1393 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1394                u_short * port)
1395 {
1396     struct sockaddr_in from;
1397     int nbytes;
1398     afs_int32 rlen;
1399     afs_uint32 tlen, savelen;
1400     struct msghdr msg;
1401     rx_computelen(p, tlen);
1402     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1403
1404     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1405     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1406                                  * it once in order to avoid races.  */
1407     tlen = rlen - tlen;
1408     if (tlen > 0) {
1409         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1410         if (tlen > 0) {
1411             tlen = rlen - tlen;
1412         } else
1413             tlen = rlen;
1414     } else
1415         tlen = rlen;
1416
1417     /* Extend the last iovec for padding, it's just to make sure that the
1418      * read doesn't return more data than we expect, and is done to get around
1419      * our problems caused by the lack of a length field in the rx header.
1420      * Use the extra buffer that follows the localdata in each packet
1421      * structure. */
1422     savelen = p->wirevec[p->niovecs - 1].iov_len;
1423     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1424
1425     memset(&msg, 0, sizeof(msg));
1426     msg.msg_name = (char *)&from;
1427     msg.msg_namelen = sizeof(struct sockaddr_in);
1428     msg.msg_iov = p->wirevec;
1429     msg.msg_iovlen = p->niovecs;
1430     nbytes = rxi_Recvmsg(socket, &msg, 0);
1431
1432     /* restore the vec to its correct state */
1433     p->wirevec[p->niovecs - 1].iov_len = savelen;
1434
1435     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1436     if (nbytes < 0 || (nbytes > tlen) || (p->length & 0x8000)) { /* Bogus packet */
1437         if (nbytes < 0 && errno == EWOULDBLOCK) {
1438             if (rx_stats_active)
1439                 rx_atomic_inc(&rx_stats.noPacketOnRead);
1440         } else if (nbytes <= 0) {
1441             if (rx_stats_active) {
1442                 rx_atomic_inc(&rx_stats.bogusPacketOnRead);
1443                 rx_stats.bogusHost = from.sin_addr.s_addr;
1444             }
1445             dpf(("B: bogus packet from [%x,%d] nb=%d\n", ntohl(from.sin_addr.s_addr),
1446                  ntohs(from.sin_port), nbytes));
1447         }
1448         return 0;
1449     }
1450 #ifdef RXDEBUG
1451     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1452                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1453         rxi_DecodePacketHeader(p);
1454
1455         *host = from.sin_addr.s_addr;
1456         *port = from.sin_port;
1457
1458         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d\n",
1459               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1460               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1461               p->length));
1462 #ifdef RX_TRIMDATABUFS
1463         rxi_TrimDataBufs(p, 1);
1464 #endif
1465         return 0;
1466     }
1467 #endif
1468     else {
1469         /* Extract packet header. */
1470         rxi_DecodePacketHeader(p);
1471
1472         *host = from.sin_addr.s_addr;
1473         *port = from.sin_port;
1474         if (rx_stats_active
1475             && p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1476
1477                 rx_atomic_inc(&rx_stats.packetsRead[p->header.type - 1]);
1478         }
1479
1480 #ifdef RX_TRIMDATABUFS
1481         /* Free any empty packet buffers at the end of this packet */
1482         rxi_TrimDataBufs(p, 1);
1483 #endif
1484         return 1;
1485     }
1486 }
1487
1488 #endif /* !KERNEL || UKERNEL */
1489
1490 /* This function splits off the first packet in a jumbo packet.
1491  * As of AFS 3.5, jumbograms contain more than one fixed size
1492  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1493  * last packet header. All packets (except the last) are padded to
1494  * fall on RX_CBUFFERSIZE boundaries.
1495  * HACK: We store the length of the first n-1 packets in the
1496  * last two pad bytes. */
1497
1498 struct rx_packet *
1499 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1500                      int first)
1501 {
1502     struct rx_packet *np;
1503     struct rx_jumboHeader *jp;
1504     int niov, i;
1505     struct iovec *iov;
1506     int length;
1507     afs_uint32 temp;
1508
1509     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1510      * bytes in length. All but the first packet are preceded by
1511      * an abbreviated four byte header. The length of the last packet
1512      * is calculated from the size of the jumbogram. */
1513     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1514
1515     if ((int)p->length < length) {
1516         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1517         return NULL;
1518     }
1519     niov = p->niovecs - 2;
1520     if (niov < 1) {
1521         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1522         return NULL;
1523     }
1524     iov = &p->wirevec[2];
1525     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1526
1527     /* Get a pointer to the abbreviated packet header */
1528     jp = (struct rx_jumboHeader *)
1529         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1530
1531     /* Set up the iovecs for the next packet */
1532     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1533     np->wirevec[0].iov_len = sizeof(struct rx_header);
1534     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1535     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1536     np->niovecs = niov + 1;
1537     for (i = 2, iov++; i <= niov; i++, iov++) {
1538         np->wirevec[i] = *iov;
1539     }
1540     np->length = p->length - length;
1541     p->length = RX_JUMBOBUFFERSIZE;
1542     p->niovecs = 2;
1543
1544     /* Convert the jumbo packet header to host byte order */
1545     temp = ntohl(*(afs_uint32 *) jp);
1546     jp->flags = (u_char) (temp >> 24);
1547     jp->cksum = (u_short) (temp);
1548
1549     /* Fill in the packet header */
1550     np->header = p->header;
1551     np->header.serial = p->header.serial + 1;
1552     np->header.seq = p->header.seq + 1;
1553     np->header.flags = jp->flags;
1554     np->header.spare = jp->cksum;
1555
1556     return np;
1557 }
1558
1559 #ifndef KERNEL
1560 /* Send a udp datagram */
1561 int
1562 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1563             int length, int istack)
1564 {
1565     struct msghdr msg;
1566         int ret;
1567
1568     memset(&msg, 0, sizeof(msg));
1569     msg.msg_iov = dvec;
1570     msg.msg_iovlen = nvecs;
1571     msg.msg_name = addr;
1572     msg.msg_namelen = sizeof(struct sockaddr_in);
1573
1574     ret = rxi_Sendmsg(socket, &msg, 0);
1575
1576     return ret;
1577 }
1578 #elif !defined(UKERNEL)
1579 /*
1580  * message receipt is done in rxk_input or rx_put.
1581  */
1582
1583 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1584 /*
1585  * Copy an mblock to the contiguous area pointed to by cp.
1586  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1587  * but it doesn't really.
1588  * Returns the number of bytes not transferred.
1589  * The message is NOT changed.
1590  */
1591 static int
1592 cpytoc(mblk_t * mp, int off, int len, char *cp)
1593 {
1594     int n;
1595
1596     for (; mp && len > 0; mp = mp->b_cont) {
1597         if (mp->b_datap->db_type != M_DATA) {
1598             return -1;
1599         }
1600         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1601         memcpy(cp, (char *)mp->b_rptr, n);
1602         cp += n;
1603         len -= n;
1604         mp->b_rptr += n;
1605     }
1606     return (len);
1607 }
1608
1609 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1610  * but it doesn't really.
1611  * This sucks, anyway, do it like m_cpy.... below
1612  */
1613 static int
1614 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1615            int niovs)
1616 {
1617     int m, n, o, t, i;
1618
1619     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1620         if (mp->b_datap->db_type != M_DATA) {
1621             return -1;
1622         }
1623         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1624         len -= n;
1625         while (n) {
1626             if (!t) {
1627                 o = 0;
1628                 i++;
1629                 t = iovs[i].iov_len;
1630             }
1631             m = MIN(n, t);
1632             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1633             mp->b_rptr += m;
1634             o += m;
1635             t -= m;
1636             n -= m;
1637         }
1638     }
1639     return (len);
1640 }
1641
1642 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1643 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1644 #else
1645 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1646 static int
1647 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1648 {
1649     caddr_t p1, p2;
1650     unsigned int l1, l2, i, t;
1651
1652     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1653         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1654
1655     while (off && m)
1656         if (m->m_len <= off) {
1657             off -= m->m_len;
1658             m = m->m_next;
1659             continue;
1660         } else
1661             break;
1662
1663     if (m == NULL)
1664         return len;
1665
1666     p1 = mtod(m, caddr_t) + off;
1667     l1 = m->m_len - off;
1668     i = 0;
1669     p2 = iovs[0].iov_base;
1670     l2 = iovs[0].iov_len;
1671
1672     while (len) {
1673         t = MIN(l1, MIN(l2, (unsigned int)len));
1674         memcpy(p2, p1, t);
1675         p1 += t;
1676         p2 += t;
1677         l1 -= t;
1678         l2 -= t;
1679         len -= t;
1680         if (!l1) {
1681             m = m->m_next;
1682             if (!m)
1683                 break;
1684             p1 = mtod(m, caddr_t);
1685             l1 = m->m_len;
1686         }
1687         if (!l2) {
1688             if (++i >= niovs)
1689                 break;
1690             p2 = iovs[i].iov_base;
1691             l2 = iovs[i].iov_len;
1692         }
1693
1694     }
1695
1696     return len;
1697 }
1698 #endif /* LINUX */
1699 #endif /* AFS_SUN5_ENV */
1700
1701 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1702 #if defined(AFS_NBSD_ENV)
1703 int
1704 rx_mb_to_packet(struct mbuf *amb, void (*free) (struct mbuf *), int hdr_len, int data_len, struct rx_packet *phandle)
1705 #else
1706 int
1707 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1708 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1709      mblk_t *amb;
1710 #else
1711      struct mbuf *amb;
1712 #endif
1713      void (*free) ();
1714      struct rx_packet *phandle;
1715      int hdr_len, data_len;
1716 #endif /* AFS_NBSD_ENV */
1717 {
1718     int code;
1719
1720     code =
1721         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1722                      phandle->niovecs);
1723     (*free) (amb);
1724
1725     return code;
1726 }
1727 #endif /* LINUX */
1728 #endif /*KERNEL && !UKERNEL */
1729
1730
1731 /* send a response to a debug packet */
1732
1733 struct rx_packet *
1734 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1735                        afs_uint32 ahost, short aport, int istack)
1736 {
1737     struct rx_debugIn tin;
1738     afs_int32 tl;
1739
1740     /*
1741      * Only respond to client-initiated Rx debug packets,
1742      * and clear the client flag in the response.
1743      */
1744     if (ap->header.flags & RX_CLIENT_INITIATED) {
1745         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1746         rxi_EncodePacketHeader(ap);
1747     } else {
1748         return ap;
1749     }
1750
1751     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1752     /* all done with packet, now set length to the truth, so we can
1753      * reuse this packet */
1754     rx_computelen(ap, ap->length);
1755
1756     tin.type = ntohl(tin.type);
1757     tin.index = ntohl(tin.index);
1758     switch (tin.type) {
1759     case RX_DEBUGI_GETSTATS:{
1760             struct rx_debugStats tstat;
1761
1762             /* get basic stats */
1763             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1764             tstat.version = RX_DEBUGI_VERSION;
1765 #ifndef RX_ENABLE_LOCKS
1766             tstat.waitingForPackets = rx_waitingForPackets;
1767 #endif
1768             MUTEX_ENTER(&rx_serverPool_lock);
1769             tstat.nFreePackets = htonl(rx_nFreePackets);
1770             tstat.nPackets = htonl(rx_nPackets);
1771             tstat.callsExecuted = htonl(rxi_nCalls);
1772             tstat.packetReclaims = htonl(rx_packetReclaims);
1773             tstat.usedFDs = CountFDs(64);
1774             tstat.nWaiting = htonl(rx_atomic_read(&rx_nWaiting));
1775             tstat.nWaited = htonl(rx_atomic_read(&rx_nWaited));
1776             tstat.idleThreads = opr_queue_Count(&rx_idleServerQueue);
1777             MUTEX_EXIT(&rx_serverPool_lock);
1778             tstat.idleThreads = htonl(tstat.idleThreads);
1779             tl = sizeof(struct rx_debugStats) - ap->length;
1780             if (tl > 0)
1781                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1782
1783             if (tl <= 0) {
1784                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1785                                (char *)&tstat);
1786                 ap->length = sizeof(struct rx_debugStats);
1787                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1788                 rx_computelen(ap, ap->length);
1789             }
1790             break;
1791         }
1792
1793     case RX_DEBUGI_GETALLCONN:
1794     case RX_DEBUGI_GETCONN:{
1795             unsigned int i, j;
1796             struct rx_connection *tc;
1797             struct rx_call *tcall;
1798             struct rx_debugConn tconn;
1799             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1800
1801
1802             tl = sizeof(struct rx_debugConn) - ap->length;
1803             if (tl > 0)
1804                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1805             if (tl > 0)
1806                 return ap;
1807
1808             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1809             /* get N'th (maybe) "interesting" connection info */
1810             for (i = 0; i < rx_hashTableSize; i++) {
1811 #if !defined(KERNEL)
1812                 /* the time complexity of the algorithm used here
1813                  * exponentially increses with the number of connections.
1814                  */
1815 #ifdef AFS_PTHREAD_ENV
1816                 pthread_yield();
1817 #else
1818                 (void)IOMGR_Poll();
1819 #endif
1820 #endif
1821                 MUTEX_ENTER(&rx_connHashTable_lock);
1822                 /* We might be slightly out of step since we are not
1823                  * locking each call, but this is only debugging output.
1824                  */
1825                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1826                     if ((all || rxi_IsConnInteresting(tc))
1827                         && tin.index-- <= 0) {
1828                         tconn.host = tc->peer->host;
1829                         tconn.port = tc->peer->port;
1830                         tconn.cid = htonl(tc->cid);
1831                         tconn.epoch = htonl(tc->epoch);
1832                         tconn.serial = htonl(tc->serial);
1833                         for (j = 0; j < RX_MAXCALLS; j++) {
1834                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1835                             if ((tcall = tc->call[j])) {
1836                                 tconn.callState[j] = tcall->state;
1837                                 tconn.callMode[j] = tcall->app.mode;
1838                                 tconn.callFlags[j] = tcall->flags;
1839                                 if (!opr_queue_IsEmpty(&tcall->rq))
1840                                     tconn.callOther[j] |= RX_OTHER_IN;
1841                                 if (!opr_queue_IsEmpty(&tcall->tq))
1842                                     tconn.callOther[j] |= RX_OTHER_OUT;
1843                             } else
1844                                 tconn.callState[j] = RX_STATE_NOTINIT;
1845                         }
1846
1847                         tconn.natMTU = htonl(tc->peer->natMTU);
1848                         tconn.error = htonl(tc->error);
1849                         tconn.flags = tc->flags;
1850                         tconn.type = tc->type;
1851                         tconn.securityIndex = tc->securityIndex;
1852                         if (tc->securityObject) {
1853                             RXS_GetStats(tc->securityObject, tc,
1854                                          &tconn.secStats);
1855 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1856 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1857                             DOHTONL(flags);
1858                             DOHTONL(expires);
1859                             DOHTONL(packetsReceived);
1860                             DOHTONL(packetsSent);
1861                             DOHTONL(bytesReceived);
1862                             DOHTONL(bytesSent);
1863                             for (i = 0;
1864                                  i <
1865                                  sizeof(tconn.secStats.spares) /
1866                                  sizeof(short); i++)
1867                                 DOHTONS(spares[i]);
1868                             for (i = 0;
1869                                  i <
1870                                  sizeof(tconn.secStats.sparel) /
1871                                  sizeof(afs_int32); i++)
1872                                 DOHTONL(sparel[i]);
1873                         }
1874
1875                         MUTEX_EXIT(&rx_connHashTable_lock);
1876                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1877                                        (char *)&tconn);
1878                         tl = ap->length;
1879                         ap->length = sizeof(struct rx_debugConn);
1880                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1881                                             istack);
1882                         ap->length = tl;
1883                         return ap;
1884                     }
1885                 }
1886                 MUTEX_EXIT(&rx_connHashTable_lock);
1887             }
1888             /* if we make it here, there are no interesting packets */
1889             tconn.cid = htonl(0xffffffff);      /* means end */
1890             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1891                            (char *)&tconn);
1892             tl = ap->length;
1893             ap->length = sizeof(struct rx_debugConn);
1894             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1895             ap->length = tl;
1896             break;
1897         }
1898
1899         /*
1900          * Pass back all the peer structures we have available
1901          */
1902
1903     case RX_DEBUGI_GETPEER:{
1904             unsigned int i;
1905             struct rx_peer *tp;
1906             struct rx_debugPeer tpeer;
1907
1908
1909             tl = sizeof(struct rx_debugPeer) - ap->length;
1910             if (tl > 0)
1911                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1912             if (tl > 0)
1913                 return ap;
1914
1915             memset(&tpeer, 0, sizeof(tpeer));
1916             for (i = 0; i < rx_hashTableSize; i++) {
1917 #if !defined(KERNEL)
1918                 /* the time complexity of the algorithm used here
1919                  * exponentially increses with the number of peers.
1920                  *
1921                  * Yielding after processing each hash table entry
1922                  * and dropping rx_peerHashTable_lock.
1923                  * also increases the risk that we will miss a new
1924                  * entry - but we are willing to live with this
1925                  * limitation since this is meant for debugging only
1926                  */
1927 #ifdef AFS_PTHREAD_ENV
1928                 pthread_yield();
1929 #else
1930                 (void)IOMGR_Poll();
1931 #endif
1932 #endif
1933                 MUTEX_ENTER(&rx_peerHashTable_lock);
1934                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1935                     if (tin.index-- <= 0) {
1936                         tp->refCount++;
1937                         MUTEX_EXIT(&rx_peerHashTable_lock);
1938
1939                         MUTEX_ENTER(&tp->peer_lock);
1940                         tpeer.host = tp->host;
1941                         tpeer.port = tp->port;
1942                         tpeer.ifMTU = htons(tp->ifMTU);
1943                         tpeer.idleWhen = htonl(tp->idleWhen);
1944                         tpeer.refCount = htons(tp->refCount);
1945                         tpeer.burstSize = 0;
1946                         tpeer.burst = 0;
1947                         tpeer.burstWait.sec = 0;
1948                         tpeer.burstWait.usec = 0;
1949                         tpeer.rtt = htonl(tp->rtt);
1950                         tpeer.rtt_dev = htonl(tp->rtt_dev);
1951                         tpeer.nSent = htonl(tp->nSent);
1952                         tpeer.reSends = htonl(tp->reSends);
1953                         tpeer.natMTU = htons(tp->natMTU);
1954                         tpeer.maxMTU = htons(tp->maxMTU);
1955                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
1956                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
1957                         tpeer.MTU = htons(tp->MTU);
1958                         tpeer.cwind = htons(tp->cwind);
1959                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
1960                         tpeer.congestSeq = htons(tp->congestSeq);
1961                         tpeer.bytesSent.high =
1962                             htonl(tp->bytesSent >> 32);
1963                         tpeer.bytesSent.low =
1964                             htonl(tp->bytesSent & MAX_AFS_UINT32);
1965                         tpeer.bytesReceived.high =
1966                             htonl(tp->bytesReceived >> 32);
1967                         tpeer.bytesReceived.low =
1968                             htonl(tp->bytesReceived & MAX_AFS_UINT32);
1969                         MUTEX_EXIT(&tp->peer_lock);
1970
1971                         MUTEX_ENTER(&rx_peerHashTable_lock);
1972                         tp->refCount--;
1973                         MUTEX_EXIT(&rx_peerHashTable_lock);
1974
1975                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1976                                        (char *)&tpeer);
1977                         tl = ap->length;
1978                         ap->length = sizeof(struct rx_debugPeer);
1979                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1980                                             istack);
1981                         ap->length = tl;
1982                         return ap;
1983                     }
1984                 }
1985                 MUTEX_EXIT(&rx_peerHashTable_lock);
1986             }
1987             /* if we make it here, there are no interesting packets */
1988             tpeer.host = htonl(0xffffffff);     /* means end */
1989             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
1990                            (char *)&tpeer);
1991             tl = ap->length;
1992             ap->length = sizeof(struct rx_debugPeer);
1993             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1994             ap->length = tl;
1995             break;
1996         }
1997
1998     case RX_DEBUGI_RXSTATS:{
1999             int i;
2000             afs_int32 *s;
2001
2002             tl = sizeof(rx_stats) - ap->length;
2003             if (tl > 0)
2004                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2005             if (tl > 0)
2006                 return ap;
2007
2008             /* Since its all int32s convert to network order with a loop. */
2009         if (rx_stats_active)
2010             MUTEX_ENTER(&rx_stats_mutex);
2011             s = (afs_int32 *) & rx_stats;
2012             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2013                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2014
2015             tl = ap->length;
2016             ap->length = sizeof(rx_stats);
2017         if (rx_stats_active)
2018             MUTEX_EXIT(&rx_stats_mutex);
2019             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2020             ap->length = tl;
2021             break;
2022         }
2023
2024     default:
2025         /* error response packet */
2026         tin.type = htonl(RX_DEBUGI_BADTYPE);
2027         tin.index = tin.type;
2028         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2029         tl = ap->length;
2030         ap->length = sizeof(struct rx_debugIn);
2031         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2032         ap->length = tl;
2033         break;
2034     }
2035     return ap;
2036 }
2037
2038 struct rx_packet *
2039 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2040                          afs_uint32 ahost, short aport, int istack)
2041 {
2042     afs_int32 tl;
2043
2044     /*
2045      * Only respond to client-initiated version requests, and
2046      * clear that flag in the response.
2047      */
2048     if (ap->header.flags & RX_CLIENT_INITIATED) {
2049         char buf[66];
2050
2051         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2052         rxi_EncodePacketHeader(ap);
2053         memset(buf, 0, sizeof(buf));
2054         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2055         rx_packetwrite(ap, 0, 65, buf);
2056         tl = ap->length;
2057         ap->length = 65;
2058         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2059         ap->length = tl;
2060     }
2061
2062     return ap;
2063 }
2064
2065
2066 /* send a debug packet back to the sender */
2067 static void
2068 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2069                     afs_uint32 ahost, short aport, afs_int32 istack)
2070 {
2071     struct sockaddr_in taddr;
2072     unsigned int i, nbytes, savelen = 0;
2073     int saven = 0;
2074 #ifdef KERNEL
2075     int waslocked = ISAFS_GLOCK();
2076 #endif
2077
2078     taddr.sin_family = AF_INET;
2079     taddr.sin_port = aport;
2080     taddr.sin_addr.s_addr = ahost;
2081 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2082     taddr.sin_len = sizeof(struct sockaddr_in);
2083 #endif
2084
2085     /* We need to trim the niovecs. */
2086     nbytes = apacket->length;
2087     for (i = 1; i < apacket->niovecs; i++) {
2088         if (nbytes <= apacket->wirevec[i].iov_len) {
2089             savelen = apacket->wirevec[i].iov_len;
2090             saven = apacket->niovecs;
2091             apacket->wirevec[i].iov_len = nbytes;
2092             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2093         } else
2094             nbytes -= apacket->wirevec[i].iov_len;
2095     }
2096 #ifdef KERNEL
2097 #ifdef RX_KERNEL_TRACE
2098     if (ICL_SETACTIVE(afs_iclSetp)) {
2099         if (!waslocked)
2100             AFS_GLOCK();
2101         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2102                    "before osi_NetSend()");
2103         AFS_GUNLOCK();
2104     }
2105 #else
2106     if (waslocked)
2107         AFS_GUNLOCK();
2108 #endif
2109 #endif
2110     /* debug packets are not reliably delivered, hence the cast below. */
2111     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2112                       apacket->length + RX_HEADER_SIZE, istack);
2113 #ifdef KERNEL
2114 #ifdef RX_KERNEL_TRACE
2115     if (ICL_SETACTIVE(afs_iclSetp)) {
2116         AFS_GLOCK();
2117         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2118                    "after osi_NetSend()");
2119         if (!waslocked)
2120             AFS_GUNLOCK();
2121     }
2122 #else
2123     if (waslocked)
2124         AFS_GLOCK();
2125 #endif
2126 #endif
2127     if (saven) {                /* means we truncated the packet above. */
2128         apacket->wirevec[i - 1].iov_len = savelen;
2129         apacket->niovecs = saven;
2130     }
2131
2132 }
2133
2134 static void
2135 rxi_NetSendError(struct rx_call *call, int code)
2136 {
2137     int down = 0;
2138 #ifdef AFS_NT40_ENV
2139     if (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) {
2140         down = 1;
2141     }
2142     if (code == -WSAEHOSTUNREACH) {
2143         down = 1;
2144     }
2145 #elif defined(AFS_LINUX20_ENV)
2146     if (code == -ENETUNREACH) {
2147         down = 1;
2148     }
2149 #elif defined(AFS_DARWIN_ENV)
2150     if (code == EHOSTUNREACH) {
2151         down = 1;
2152     }
2153 #endif
2154     if (down) {
2155         call->lastReceiveTime = 0;
2156     }
2157 }
2158
2159 /* Send the packet to appropriate destination for the specified
2160  * call.  The header is first encoded and placed in the packet.
2161  */
2162 void
2163 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2164                struct rx_packet *p, int istack)
2165 {
2166 #if defined(KERNEL)
2167     int waslocked;
2168 #endif
2169     int code;
2170     struct sockaddr_in addr;
2171     struct rx_peer *peer = conn->peer;
2172     osi_socket socket;
2173 #ifdef RXDEBUG
2174     char deliveryType = 'S';
2175 #endif
2176     /* The address we're sending the packet to */
2177     memset(&addr, 0, sizeof(addr));
2178     addr.sin_family = AF_INET;
2179     addr.sin_port = peer->port;
2180     addr.sin_addr.s_addr = peer->host;
2181
2182     /* This stuff should be revamped, I think, so that most, if not
2183      * all, of the header stuff is always added here.  We could
2184      * probably do away with the encode/decode routines. XXXXX */
2185
2186     /* Stamp each packet with a unique serial number.  The serial
2187      * number is maintained on a connection basis because some types
2188      * of security may be based on the serial number of the packet,
2189      * and security is handled on a per authenticated-connection
2190      * basis. */
2191     /* Pre-increment, to guarantee no zero serial number; a zero
2192      * serial number means the packet was never sent. */
2193     MUTEX_ENTER(&conn->conn_data_lock);
2194     p->header.serial = ++conn->serial;
2195     if (p->length > conn->peer->maxPacketSize) {
2196         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2197             (p->header.flags & RX_REQUEST_ACK)) {
2198             conn->lastPingSize = p->length;
2199             conn->lastPingSizeSer = p->header.serial;
2200         } else if (p->header.seq != 0) {
2201             conn->lastPacketSize = p->length;
2202             conn->lastPacketSizeSeq = p->header.seq;
2203         }
2204     }
2205     MUTEX_EXIT(&conn->conn_data_lock);
2206     /* This is so we can adjust retransmit time-outs better in the face of
2207      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2208      */
2209     if (p->firstSerial == 0) {
2210         p->firstSerial = p->header.serial;
2211     }
2212 #ifdef RXDEBUG
2213     /* If an output tracer function is defined, call it with the packet and
2214      * network address.  Note this function may modify its arguments. */
2215     if (rx_almostSent) {
2216         int drop = (*rx_almostSent) (p, &addr);
2217         /* drop packet if return value is non-zero? */
2218         if (drop)
2219             deliveryType = 'D'; /* Drop the packet */
2220     }
2221 #endif
2222
2223     /* Get network byte order header */
2224     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2225                                  * touch ALL the fields */
2226
2227     /* Send the packet out on the same socket that related packets are being
2228      * received on */
2229     socket =
2230         (conn->type ==
2231          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2232
2233 #ifdef RXDEBUG
2234     /* Possibly drop this packet,  for testing purposes */
2235     if ((deliveryType == 'D')
2236         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2237             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2238         deliveryType = 'D';     /* Drop the packet */
2239     } else {
2240         deliveryType = 'S';     /* Send the packet */
2241 #endif /* RXDEBUG */
2242
2243         /* Loop until the packet is sent.  We'd prefer just to use a
2244          * blocking socket, but unfortunately the interface doesn't
2245          * allow us to have the socket block in send mode, and not
2246          * block in receive mode */
2247 #ifdef KERNEL
2248         waslocked = ISAFS_GLOCK();
2249 #ifdef RX_KERNEL_TRACE
2250         if (ICL_SETACTIVE(afs_iclSetp)) {
2251             if (!waslocked)
2252                 AFS_GLOCK();
2253             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2254                        "before osi_NetSend()");
2255             AFS_GUNLOCK();
2256         }
2257 #else
2258         if (waslocked)
2259             AFS_GUNLOCK();
2260 #endif
2261 #endif
2262         if ((code =
2263              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2264                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2265             /* send failed, so let's hurry up the resend, eh? */
2266             if (rx_stats_active)
2267                 rx_atomic_inc(&rx_stats.netSendFailures);
2268             p->flags &= ~RX_PKTFLAG_SENT; /* resend it very soon */
2269
2270             /* Some systems are nice and tell us right away that we cannot
2271              * reach this recipient by returning an error code.
2272              * So, when this happens let's "down" the host NOW so
2273              * we don't sit around waiting for this host to timeout later.
2274              */
2275             if (call) {
2276                 rxi_NetSendError(call, code);
2277             }
2278         }
2279 #ifdef KERNEL
2280 #ifdef RX_KERNEL_TRACE
2281         if (ICL_SETACTIVE(afs_iclSetp)) {
2282             AFS_GLOCK();
2283             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2284                        "after osi_NetSend()");
2285             if (!waslocked)
2286                 AFS_GUNLOCK();
2287         }
2288 #else
2289         if (waslocked)
2290             AFS_GLOCK();
2291 #endif
2292 #endif
2293 #ifdef RXDEBUG
2294     }
2295     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2296           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2297           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2298           p->header.seq, p->header.flags, p, p->length));
2299 #endif
2300     if (rx_stats_active) {
2301         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2302         MUTEX_ENTER(&peer->peer_lock);
2303         peer->bytesSent += p->length;
2304         MUTEX_EXIT(&peer->peer_lock);
2305     }
2306 }
2307
2308 /* Send a list of packets to appropriate destination for the specified
2309  * connection.  The headers are first encoded and placed in the packets.
2310  */
2311 void
2312 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2313                    struct rx_packet **list, int len, int istack)
2314 {
2315 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2316     int waslocked;
2317 #endif
2318     struct sockaddr_in addr;
2319     struct rx_peer *peer = conn->peer;
2320     osi_socket socket;
2321     struct rx_packet *p = NULL;
2322     struct iovec wirevec[RX_MAXIOVECS];
2323     int i, length, code;
2324     afs_uint32 serial;
2325     afs_uint32 temp;
2326     struct rx_jumboHeader *jp;
2327 #ifdef RXDEBUG
2328     char deliveryType = 'S';
2329 #endif
2330     /* The address we're sending the packet to */
2331     addr.sin_family = AF_INET;
2332     addr.sin_port = peer->port;
2333     addr.sin_addr.s_addr = peer->host;
2334
2335     if (len + 1 > RX_MAXIOVECS) {
2336         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2337     }
2338
2339     /*
2340      * Stamp the packets in this jumbogram with consecutive serial numbers
2341      */
2342     MUTEX_ENTER(&conn->conn_data_lock);
2343     serial = conn->serial;
2344     conn->serial += len;
2345     for (i = 0; i < len; i++) {
2346         p = list[i];
2347         if (p->length > conn->peer->maxPacketSize) {
2348             /* a ping *or* a sequenced packet can count */
2349             if ((p->length > conn->peer->maxPacketSize)) {
2350                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2351                      (p->header.flags & RX_REQUEST_ACK)) &&
2352                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2353                     conn->lastPingSize = p->length;
2354                     conn->lastPingSizeSer = serial + i;
2355                 } else if ((p->header.seq != 0) &&
2356                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2357                     conn->lastPacketSize = p->length;
2358                     conn->lastPacketSizeSeq = p->header.seq;
2359                 }
2360             }
2361         }
2362     }
2363     MUTEX_EXIT(&conn->conn_data_lock);
2364
2365
2366     /* This stuff should be revamped, I think, so that most, if not
2367      * all, of the header stuff is always added here.  We could
2368      * probably do away with the encode/decode routines. XXXXX */
2369
2370     jp = NULL;
2371     length = RX_HEADER_SIZE;
2372     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2373     wirevec[0].iov_len = RX_HEADER_SIZE;
2374     for (i = 0; i < len; i++) {
2375         p = list[i];
2376
2377         /* The whole 3.5 jumbogram scheme relies on packets fitting
2378          * in a single packet buffer. */
2379         if (p->niovecs > 2) {
2380             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2381         }
2382
2383         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2384          * in this chunk.  */
2385         if (i < len - 1) {
2386             if (p->length != RX_JUMBOBUFFERSIZE) {
2387                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2388             }
2389             p->header.flags |= RX_JUMBO_PACKET;
2390             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2391             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2392         } else {
2393             wirevec[i + 1].iov_len = p->length;
2394             length += p->length;
2395         }
2396         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2397         if (jp != NULL) {
2398             /* Convert jumbo packet header to network byte order */
2399             temp = (afs_uint32) (p->header.flags) << 24;
2400             temp |= (afs_uint32) (p->header.spare);
2401             *(afs_uint32 *) jp = htonl(temp);
2402         }
2403         jp = (struct rx_jumboHeader *)
2404             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2405
2406         /* Stamp each packet with a unique serial number.  The serial
2407          * number is maintained on a connection basis because some types
2408          * of security may be based on the serial number of the packet,
2409          * and security is handled on a per authenticated-connection
2410          * basis. */
2411         /* Pre-increment, to guarantee no zero serial number; a zero
2412          * serial number means the packet was never sent. */
2413         p->header.serial = ++serial;
2414         /* This is so we can adjust retransmit time-outs better in the face of
2415          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2416          */
2417         if (p->firstSerial == 0) {
2418             p->firstSerial = p->header.serial;
2419         }
2420 #ifdef RXDEBUG
2421         /* If an output tracer function is defined, call it with the packet and
2422          * network address.  Note this function may modify its arguments. */
2423         if (rx_almostSent) {
2424             int drop = (*rx_almostSent) (p, &addr);
2425             /* drop packet if return value is non-zero? */
2426             if (drop)
2427                 deliveryType = 'D';     /* Drop the packet */
2428         }
2429 #endif
2430
2431         /* Get network byte order header */
2432         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2433                                          * touch ALL the fields */
2434     }
2435
2436     /* Send the packet out on the same socket that related packets are being
2437      * received on */
2438     socket =
2439         (conn->type ==
2440          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2441
2442 #ifdef RXDEBUG
2443     /* Possibly drop this packet,  for testing purposes */
2444     if ((deliveryType == 'D')
2445         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2446             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2447         deliveryType = 'D';     /* Drop the packet */
2448     } else {
2449         deliveryType = 'S';     /* Send the packet */
2450 #endif /* RXDEBUG */
2451
2452         /* Loop until the packet is sent.  We'd prefer just to use a
2453          * blocking socket, but unfortunately the interface doesn't
2454          * allow us to have the socket block in send mode, and not
2455          * block in receive mode */
2456 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2457         waslocked = ISAFS_GLOCK();
2458         if (!istack && waslocked)
2459             AFS_GUNLOCK();
2460 #endif
2461         if ((code =
2462              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2463                          istack)) != 0) {
2464             /* send failed, so let's hurry up the resend, eh? */
2465             if (rx_stats_active)
2466                 rx_atomic_inc(&rx_stats.netSendFailures);
2467             for (i = 0; i < len; i++) {
2468                 p = list[i];
2469                 p->flags &= ~RX_PKTFLAG_SENT;  /* resend it very soon */
2470             }
2471             /* Some systems are nice and tell us right away that we cannot
2472              * reach this recipient by returning an error code.
2473              * So, when this happens let's "down" the host NOW so
2474              * we don't sit around waiting for this host to timeout later.
2475              */
2476             if (call) {
2477                 rxi_NetSendError(call, code);
2478             }
2479         }
2480 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2481         if (!istack && waslocked)
2482             AFS_GLOCK();
2483 #endif
2484 #ifdef RXDEBUG
2485     }
2486
2487     osi_Assert(p != NULL);
2488
2489     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" len %d\n",
2490           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2491           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2492           p->header.seq, p->header.flags, p, p->length));
2493
2494 #endif
2495     if (rx_stats_active) {
2496         rx_atomic_inc(&rx_stats.packetsSent[p->header.type - 1]);
2497         MUTEX_ENTER(&peer->peer_lock);
2498         peer->bytesSent += p->length;
2499         MUTEX_EXIT(&peer->peer_lock);
2500     }
2501 }
2502
2503 /* Send a raw abort packet, without any call or connection structures */
2504 void
2505 rxi_SendRawAbort(osi_socket socket, afs_uint32 host, u_short port,
2506                  afs_int32 error, struct rx_packet *source, int istack)
2507 {
2508     struct rx_header theader;
2509     struct sockaddr_in addr;
2510     struct iovec iov[2];
2511
2512     memset(&theader, 0, sizeof(theader));
2513     theader.epoch = htonl(source->header.epoch);
2514     theader.callNumber = htonl(source->header.callNumber);
2515     theader.serial = htonl(1);
2516     theader.type = RX_PACKET_TYPE_ABORT;
2517     theader.serviceId = htons(source->header.serviceId);
2518     theader.securityIndex = source->header.securityIndex;
2519     theader.cid = htonl(source->header.cid);
2520
2521     /*
2522      * If the abort is being sent in response to a server initiated packet,
2523      * set client_initiated in the abort to ensure it is not associated by
2524      * the receiver with a connection in the opposite direction.
2525      */
2526     if ((source->header.flags & RX_CLIENT_INITIATED) != RX_CLIENT_INITIATED)
2527         theader.flags |= RX_CLIENT_INITIATED;
2528
2529     error = htonl(error);
2530
2531     iov[0].iov_base = &theader;
2532     iov[0].iov_len = sizeof(struct rx_header);
2533     iov[1].iov_base = &error;
2534     iov[1].iov_len = sizeof(error);
2535
2536     addr.sin_family = AF_INET;
2537     addr.sin_addr.s_addr = host;
2538     addr.sin_port = port;
2539 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2540     addr.sin_len = sizeof(struct sockaddr_in);
2541 #endif
2542
2543     osi_NetSend(socket, &addr, iov, 2,
2544                 sizeof(struct rx_header) + sizeof(error), istack);
2545 }
2546
2547 /* Send a "special" packet to the peer connection.  If call is
2548  * specified, then the packet is directed to a specific call channel
2549  * associated with the connection, otherwise it is directed to the
2550  * connection only. Uses optionalPacket if it is supplied, rather than
2551  * allocating a new packet buffer.  Nbytes is the length of the data
2552  * portion of the packet.  If data is non-null, nbytes of data are
2553  * copied into the packet.  Type is the type of the packet, as defined
2554  * in rx.h.  Bug: there's a lot of duplication between this and other
2555  * routines.  This needs to be cleaned up. */
2556 struct rx_packet *
2557 rxi_SendSpecial(struct rx_call *call,
2558                 struct rx_connection *conn,
2559                 struct rx_packet *optionalPacket, int type, char *data,
2560                 int nbytes, int istack)
2561 {
2562     /* Some of the following stuff should be common code for all
2563      * packet sends (it's repeated elsewhere) */
2564     struct rx_packet *p;
2565     unsigned int i = 0;
2566     int savelen = 0, saven = 0;
2567     int channel, callNumber;
2568     if (call) {
2569         channel = call->channel;
2570         callNumber = *call->callNumber;
2571         /* BUSY packets refer to the next call on this connection */
2572         if (type == RX_PACKET_TYPE_BUSY) {
2573             callNumber++;
2574         }
2575     } else {
2576         channel = 0;
2577         callNumber = 0;
2578     }
2579     p = optionalPacket;
2580     if (!p) {
2581         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2582         if (!p)
2583             osi_Panic("rxi_SendSpecial failure");
2584     }
2585
2586     if (nbytes != -1)
2587         p->length = nbytes;
2588     else
2589         nbytes = p->length;
2590     p->header.serviceId = conn->serviceId;
2591     p->header.securityIndex = conn->securityIndex;
2592     p->header.cid = (conn->cid | channel);
2593     p->header.callNumber = callNumber;
2594     p->header.seq = 0;
2595     p->header.epoch = conn->epoch;
2596     p->header.type = type;
2597     p->header.flags = 0;
2598     if (conn->type == RX_CLIENT_CONNECTION)
2599         p->header.flags |= RX_CLIENT_INITIATED;
2600     if (data)
2601         rx_packetwrite(p, 0, nbytes, data);
2602
2603     for (i = 1; i < p->niovecs; i++) {
2604         if (nbytes <= p->wirevec[i].iov_len) {
2605             savelen = p->wirevec[i].iov_len;
2606             saven = p->niovecs;
2607             p->wirevec[i].iov_len = nbytes;
2608             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2609         } else
2610             nbytes -= p->wirevec[i].iov_len;
2611     }
2612
2613     if (call)
2614         rxi_Send(call, p, istack);
2615     else
2616         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2617     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2618         /* really need to do this, but it seems safer this way, given that  */
2619         /* sneaky optionalPacket... */
2620         p->wirevec[i - 1].iov_len = savelen;
2621         p->niovecs = saven;
2622     }
2623     if (!optionalPacket)
2624         rxi_FreePacket(p);
2625     return optionalPacket;
2626 }
2627
2628
2629 /* Encode the packet's header (from the struct header in the packet to
2630  * the net byte order representation in the wire representation of the
2631  * packet, which is what is actually sent out on the wire) */
2632 void
2633 rxi_EncodePacketHeader(struct rx_packet *p)
2634 {
2635     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2636
2637     memset(buf, 0, RX_HEADER_SIZE);
2638     *buf++ = htonl(p->header.epoch);
2639     *buf++ = htonl(p->header.cid);
2640     *buf++ = htonl(p->header.callNumber);
2641     *buf++ = htonl(p->header.seq);
2642     *buf++ = htonl(p->header.serial);
2643     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2644                    | (((afs_uint32) p->header.flags) << 16)
2645                    | (p->header.userStatus << 8) | p->header.securityIndex);
2646     /* Note: top 16 bits of this next word were reserved */
2647     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2648 }
2649
2650 /* Decode the packet's header (from net byte order to a struct header) */
2651 void
2652 rxi_DecodePacketHeader(struct rx_packet *p)
2653 {
2654     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2655     afs_uint32 temp;
2656
2657     p->header.epoch = ntohl(*buf);
2658     buf++;
2659     p->header.cid = ntohl(*buf);
2660     buf++;
2661     p->header.callNumber = ntohl(*buf);
2662     buf++;
2663     p->header.seq = ntohl(*buf);
2664     buf++;
2665     p->header.serial = ntohl(*buf);
2666     buf++;
2667
2668     temp = ntohl(*buf);
2669     buf++;
2670
2671     /* C will truncate byte fields to bytes for me */
2672     p->header.type = temp >> 24;
2673     p->header.flags = temp >> 16;
2674     p->header.userStatus = temp >> 8;
2675     p->header.securityIndex = temp >> 0;
2676
2677     temp = ntohl(*buf);
2678     buf++;
2679
2680     p->header.serviceId = (temp & 0xffff);
2681     p->header.spare = temp >> 16;
2682     /* Note: top 16 bits of this last word are the security checksum */
2683 }
2684
2685 /*
2686  * LOCKS HELD: called with call->lock held.
2687  *
2688  * PrepareSendPacket is the only place in the code that
2689  * can increment call->tnext.  This could become an atomic
2690  * in the future.  Beyond that there is nothing in this
2691  * function that requires the call being locked.  This
2692  * function can only be called by the application thread.
2693  */
2694 void
2695 rxi_PrepareSendPacket(struct rx_call *call,
2696                       struct rx_packet *p, int last)
2697 {
2698     struct rx_connection *conn = call->conn;
2699     afs_uint32 seq = call->tnext++;
2700     unsigned int i;
2701     afs_int32 len;              /* len must be a signed type; it can go negative */
2702     int code;
2703
2704     /* No data packets on call 0. Where do these come from? */
2705     if (*call->callNumber == 0)
2706         *call->callNumber = 1;
2707
2708     MUTEX_EXIT(&call->lock);
2709     p->flags &= ~(RX_PKTFLAG_ACKED | RX_PKTFLAG_SENT);
2710
2711     p->header.cid = (conn->cid | call->channel);
2712     p->header.serviceId = conn->serviceId;
2713     p->header.securityIndex = conn->securityIndex;
2714
2715     p->header.callNumber = *call->callNumber;
2716     p->header.seq = seq;
2717     p->header.epoch = conn->epoch;
2718     p->header.type = RX_PACKET_TYPE_DATA;
2719     p->header.flags = 0;
2720     p->header.spare = 0;
2721     if (conn->type == RX_CLIENT_CONNECTION)
2722         p->header.flags |= RX_CLIENT_INITIATED;
2723
2724     if (last)
2725         p->header.flags |= RX_LAST_PACKET;
2726
2727     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2728     p->header.serial = 0;       /* Another way of saying never transmitted... */
2729
2730     /* Now that we're sure this is the last data on the call, make sure
2731      * that the "length" and the sum of the iov_lens matches. */
2732     len = p->length + call->conn->securityHeaderSize;
2733
2734     for (i = 1; i < p->niovecs && len > 0; i++) {
2735         len -= p->wirevec[i].iov_len;
2736     }
2737     if (len > 0) {
2738         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2739     } else if (i < p->niovecs) {
2740         /* Free any extra elements in the wirevec */
2741 #if defined(RX_ENABLE_TSFPQ)
2742         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2743 #else /* !RX_ENABLE_TSFPQ */
2744         MUTEX_ENTER(&rx_freePktQ_lock);
2745         rxi_FreeDataBufsNoLock(p, i);
2746         MUTEX_EXIT(&rx_freePktQ_lock);
2747 #endif /* !RX_ENABLE_TSFPQ */
2748
2749         p->niovecs = i;
2750     }
2751     if (len)
2752         p->wirevec[i - 1].iov_len += len;
2753     MUTEX_ENTER(&call->lock);
2754     code = RXS_PreparePacket(conn->securityObject, call, p);
2755     if (code) {
2756         MUTEX_EXIT(&call->lock);
2757         rxi_ConnectionError(conn, code);
2758         MUTEX_ENTER(&conn->conn_data_lock);
2759         p = rxi_SendConnectionAbort(conn, p, 0, 0);
2760         MUTEX_EXIT(&conn->conn_data_lock);
2761         MUTEX_ENTER(&call->lock);
2762         /* setting a connection error means all calls for that conn are also
2763          * error'd. if this call does not have an error by now, something is
2764          * very wrong, and we risk sending data in the clear that is supposed
2765          * to be encrypted. */
2766         osi_Assert(call->error);
2767     }
2768 }
2769
2770 /* Given an interface MTU size, calculate an adjusted MTU size that
2771  * will make efficient use of the RX buffers when the peer is sending
2772  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2773 int
2774 rxi_AdjustIfMTU(int mtu)
2775 {
2776     int adjMTU;
2777     int frags;
2778
2779     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2780         return mtu;
2781     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2782     if (mtu <= adjMTU) {
2783         return mtu;
2784     }
2785     mtu -= adjMTU;
2786     if (mtu <= 0) {
2787         return adjMTU;
2788     }
2789     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2790     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2791 }
2792
2793 /* Given an interface MTU size, and the peer's advertised max receive
2794  * size, calculate an adjisted maxMTU size that makes efficient use
2795  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2796 int
2797 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2798 {
2799     int maxMTU = mtu * rxi_nSendFrags;
2800     maxMTU = MIN(maxMTU, peerMaxMTU);
2801     return rxi_AdjustIfMTU(maxMTU);
2802 }
2803
2804 /* Given a packet size, figure out how many datagram packet will fit.
2805  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2806  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2807  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2808 int
2809 rxi_AdjustDgramPackets(int frags, int mtu)
2810 {
2811     int maxMTU;
2812     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2813         return 1;
2814     }
2815     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2816     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2817     /* subtract the size of the first and last packets */
2818     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2819     if (maxMTU < 0) {
2820         return 1;
2821     }
2822     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2823 }
2824
2825 #ifndef KERNEL
2826 /*
2827  * This function can be used by the Windows Cache Manager
2828  * to dump the list of all rx packets so that we can determine
2829  * where the packet leakage is.
2830  */
2831 int rx_DumpPackets(FILE *outputFile, char *cookie)
2832 {
2833 #ifdef RXDEBUG_PACKET
2834     struct rx_packet *p;
2835 #ifdef AFS_NT40_ENV
2836     int zilch;
2837     char output[2048];
2838 #define RXDPRINTF sprintf
2839 #define RXDPRINTOUT output
2840 #else
2841 #define RXDPRINTF fprintf
2842 #define RXDPRINTOUT outputFile
2843 #endif
2844
2845     NETPRI;
2846     MUTEX_ENTER(&rx_freePktQ_lock);
2847     RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
2848 #ifdef AFS_NT40_ENV
2849     WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2850 #endif
2851
2852     for (p = rx_mallocedP; p; p = p->allNextp) {
2853         RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
2854                 cookie, p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec,
2855                 p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->length,
2856                 p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
2857                 (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
2858                 (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
2859 #ifdef AFS_NT40_ENV
2860         WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2861 #endif
2862     }
2863
2864     RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
2865 #ifdef AFS_NT40_ENV
2866     WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
2867 #endif
2868
2869     MUTEX_EXIT(&rx_freePktQ_lock);
2870     USERPRI;
2871 #endif /* RXDEBUG_PACKET */
2872     return 0;
2873 }
2874 #endif