src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #ifdef KERNEL
  12 #include "afs/param.h"
  13 #else
  14 #include <afs/param.h>
  15 #endif
  16
  17
  18 #ifdef KERNEL
  19 #if defined(UKERNEL)
  20 #include "afs/sysincludes.h"
  21 #include "afsincludes.h"
  22 #include "rx/rx_kcommon.h"
  23 #include "rx/rx_clock.h"
  24 #include "rx/rx_queue.h"
  25 #include "rx/rx_packet.h"
  26 #else /* defined(UKERNEL) */
  27 #ifdef RX_KERNEL_TRACE
  28 #include "../rx/rx_kcommon.h"
  29 #endif
  30 #include "h/types.h"
  31 #ifndef AFS_LINUX20_ENV
  32 #include "h/systm.h"
  33 #endif
  34 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
  35 #include "afs/sysincludes.h"
  36 #endif
  37 #if defined(AFS_OBSD_ENV)
  38 #include "h/proc.h"
  39 #endif
  40 #include "h/socket.h"
  41 #if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  42 #if     !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  43 #include "sys/mount.h"          /* it gets pulled in by something later anyway */
  44 #endif
  45 #include "h/mbuf.h"
  46 #endif
  47 #include "netinet/in.h"
  48 #include "afs/afs_osi.h"
  49 #include "rx_kmutex.h"
  50 #include "rx/rx_clock.h"
  51 #include "rx/rx_queue.h"
  52 #ifdef  AFS_SUN5_ENV
  53 #include <sys/sysmacros.h>
  54 #endif
  55 #include "rx/rx_packet.h"
  56 #endif /* defined(UKERNEL) */
  57 #include "rx/rx_globals.h"
  58 #else /* KERNEL */
  59 #include "sys/types.h"
  60 #include <sys/stat.h>
  61 #include <errno.h>
  62 #if defined(AFS_NT40_ENV)
  63 #include <winsock2.h>
  64 #ifndef EWOULDBLOCK
  65 #define EWOULDBLOCK WSAEWOULDBLOCK
  66 #endif
  67 #include "rx_user.h"
  68 #include "rx_xmit_nt.h"
  69 #include <stdlib.h>
  70 #else
  71 #include <sys/socket.h>
  72 #include <netinet/in.h>
  73 #endif
  74 #include "rx_clock.h"
  75 #include "rx.h"
  76 #include "rx_queue.h"
  77 #ifdef  AFS_SUN5_ENV
  78 #include <sys/sysmacros.h>
  79 #endif
  80 #include "rx_packet.h"
  81 #include "rx_globals.h"
  82 #include <lwp.h>
  83 #include <assert.h>
  84 #include <string.h>
  85 #ifdef HAVE_UNISTD_H
  86 #include <unistd.h>
  87 #endif
  88 #endif /* KERNEL */
  89
#ifdef RX_LOCKS_DB
/* rxdb_fileID is used to identify the lock location, along with line#. */
static int rxdb_fileID = RXDB_FILE_RX_PACKET;
#endif /* RX_LOCKS_DB */
/* Updated by the rxi_MorePackets* allocators to the packet they most
 * recently set up; with RXDEBUG_PACKET each new packet's allNextp is
 * pointed at the previous value before it is overwritten. */
static struct rx_packet *rx_mallocedP = 0;
#ifdef RXDEBUG_PACKET
/* Next value stored into a new packet's packetId by the allocators. */
static afs_uint32       rx_packet_id = 0;
#endif

extern char cml_version_number[];

/* Prototypes for file-local helpers. */
static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);

static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
                                afs_uint32 ahost, short aport,
                                afs_int32 istack);

/* Only one of the two continuation-buffer free helpers is built,
 * depending on whether thread-specific free packet queues are enabled. */
#ifdef RX_ENABLE_TSFPQ
static int
rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
#else
static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
                                   afs_uint32 first,
                                   struct rx_queue * q);
#endif
 115
 116 /* some rules about packets:
 117  * 1.  When a packet is allocated, the final iov_buf contains room for
 118  * a security trailer, but iov_len masks that fact.  If the security
 119  * package wants to add the trailer, it may do so, and then extend
 120  * iov_len appropriately.  For this reason, packet's niovecs and
 121  * iov_len fields should be accurate before calling PreparePacket.
 122 */
 123
 124 /* Preconditions:
 125  *        all packet buffers (iov_base) are integral multiples of
 126  *        the word size.
 127  *        offset is an integral multiple of the word size.
 128  */
 129 afs_int32
 130 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 131 {
 132     unsigned int i;
 133     size_t l;
 134     for (l = 0, i = 1; i < packet->niovecs; i++) {
 135         if (l + packet->wirevec[i].iov_len > offset) {
 136             return
 137                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 138                                  (offset - l)));
 139         }
 140         l += packet->wirevec[i].iov_len;
 141     }
 142
 143     return 0;
 144 }
 145
 146 /* Preconditions:
 147  *        all packet buffers (iov_base) are integral multiples of the word size.
 148  *        offset is an integral multiple of the word size.
 149  */
 150 afs_int32
 151 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 152 {
 153     unsigned int i;
 154     size_t l;
 155     for (l = 0, i = 1; i < packet->niovecs; i++) {
 156         if (l + packet->wirevec[i].iov_len > offset) {
 157             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 158                              (offset - l))) = data;
 159             return 0;
 160         }
 161         l += packet->wirevec[i].iov_len;
 162     }
 163
 164     return 0;
 165 }
 166
 167 /* Preconditions:
 168  *        all packet buffers (iov_base) are integral multiples of the
 169  *        word size.
 170  *        offset is an integral multiple of the word size.
 171  * Packet Invariants:
 172  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 173  */
 174 afs_int32
 175 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 176                   char *out)
 177 {
 178     unsigned int i, j, l, r;
 179     for (l = 0, i = 1; i < packet->niovecs; i++) {
 180         if (l + packet->wirevec[i].iov_len > offset) {
 181             break;
 182         }
 183         l += packet->wirevec[i].iov_len;
 184     }
 185
 186     /* i is the iovec which contains the first little bit of data in which we
 187      * are interested.  l is the total length of everything prior to this iovec.
 188      * j is the number of bytes we can safely copy out of this iovec.
 189      * offset only applies to the first iovec.
 190      */
 191     r = resid;
 192     while ((r > 0) && (i < packet->niovecs)) {
 193         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 194         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 195         r -= j;
 196         out += j;
 197         l += packet->wirevec[i].iov_len;
 198         offset = l;
 199         i++;
 200     }
 201
 202     return (r ? (resid - r) : resid);
 203 }
 204
 205
 206 /* Preconditions:
 207  *        all packet buffers (iov_base) are integral multiples of the
 208  *        word size.
 209  *        offset is an integral multiple of the word size.
 210  */
 211 afs_int32
 212 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 213 {
 214     unsigned int i, j, l, o, r;
 215     char *b;
 216
 217     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 218         if (l + packet->wirevec[i].iov_len > o) {
 219             break;
 220         }
 221         l += packet->wirevec[i].iov_len;
 222     }
 223
 224     /* i is the iovec which contains the first little bit of data in which we
 225      * are interested.  l is the total length of everything prior to this iovec.
 226      * j is the number of bytes we can safely copy out of this iovec.
 227      * offset only applies to the first iovec.
 228      */
 229     r = resid;
 230     while ((r > 0) && (i <= RX_MAXWVECS)) {
 231         if (i >= packet->niovecs)
 232             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 233                 break;
 234
 235         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 236         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 237         memcpy(b, in, j);
 238         r -= j;
 239         in += j;
 240         l += packet->wirevec[i].iov_len;
 241         offset = l;
 242         i++;
 243     }
 244
 245     return (r ? (resid - r) : resid);
 246 }
 247
 248 int
 249 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 250 {
 251     struct rx_packet *p, *np;
 252
 253     num_pkts = AllocPacketBufs(class, num_pkts, q);
 254
 255     for (queue_Scan(q, p, np, rx_packet)) {
 256         RX_PACKET_IOV_FULLINIT(p);
 257     }
 258
 259     return num_pkts;
 260 }
 261
 262 #ifdef RX_ENABLE_TSFPQ
 263 static int
 264 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 265 {
 266     struct rx_ts_info_t * rx_ts_info;
 267     int transfer;
 268     SPLVAR;
 269
 270     RX_TS_INFO_GET(rx_ts_info);
 271
 272     transfer = num_pkts - rx_ts_info->_FPQ.len;
 273     if (transfer > 0) {
 274         NETPRI;
 275         MUTEX_ENTER(&rx_freePktQ_lock);
 276         transfer = MAX(transfer, rx_TSFPQGlobSize);
 277         if (transfer > rx_nFreePackets) {
 278             /* alloc enough for us, plus a few globs for other threads */
 279             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 280         }
 281
 282         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 283
 284         MUTEX_EXIT(&rx_freePktQ_lock);
 285         USERPRI;
 286     }
 287
 288     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 289
 290     return num_pkts;
 291 }
 292 #else /* RX_ENABLE_TSFPQ */
/*
 * Non-TSFPQ variant: take up to num_pkts packets from the front of the
 * global rx_freePacketQueue, mark each one used, and hand them to q.
 * The whole operation runs under rx_freePktQ_lock.
 *
 * In the kernel the request is trimmed to what quota and the free list
 * allow (possibly zero) and rxi_NeedMorePackets is raised; in user space
 * the free list is grown instead when it is too short.
 *
 * Returns the number of packets actually moved onto q.
 */
static int
AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
{
    struct rx_packet *c;
    int i;
#ifdef KERNEL
    int overq = 0;
#endif
    SPLVAR;

    NETPRI;

    MUTEX_ENTER(&rx_freePktQ_lock);

#ifdef KERNEL
    /* Shrink the request one packet at a time until rxi_OverQuota2 no
     * longer objects; overq counts how many packets were dropped. */
    for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
         num_pkts--, overq++);

    if (overq) {
        rxi_NeedMorePackets = TRUE;
        /* account the shortfall against the per-class failure counter */
        if (rx_stats_active) {
            switch (class) {
            case RX_PACKET_CLASS_RECEIVE:
                rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
                break;
            case RX_PACKET_CLASS_SEND:
                rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
                break;
            case RX_PACKET_CLASS_SPECIAL:
                rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
                break;
            case RX_PACKET_CLASS_RECV_CBUF:
                rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
                break;
            case RX_PACKET_CLASS_SEND_CBUF:
                rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
                break;
            }
        }
    }

    /* never hand out more than the free list currently holds */
    if (rx_nFreePackets < num_pkts)
        num_pkts = rx_nFreePackets;

    if (!num_pkts) {
        rxi_NeedMorePackets = TRUE;
        goto done;
    }
#else /* KERNEL */
    /* user space: grow the pool rather than trimming the request */
    if (rx_nFreePackets < num_pkts) {
        rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
    }
#endif /* KERNEL */

    /* Mark the first num_pkts free packets used; c ends up on the entry
     * just past the last packet being handed out. */
    for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
         i < num_pkts;
         i++, c=queue_Next(c, rx_packet)) {
        RX_FPQ_MARK_USED(c);
    }

    /* presumably moves everything ahead of c from the free queue onto the
     * tail of q -- TODO confirm against the queue macro definition */
    queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);

    rx_nFreePackets -= num_pkts;

#ifdef KERNEL
  done:
#endif
    MUTEX_EXIT(&rx_freePktQ_lock);

    USERPRI;
    return num_pkts;
}
 365 #endif /* RX_ENABLE_TSFPQ */
 366
/*
 * Free a queue of packets, along with any continuation buffers still
 * attached to them
 */
 370 #ifdef RX_ENABLE_TSFPQ
 371 /* num_pkts=0 means queue length is unknown */
 372 int
 373 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 374 {
 375     struct rx_ts_info_t * rx_ts_info;
 376     struct rx_packet *c, *nc;
 377     SPLVAR;
 378
 379     osi_Assert(num_pkts >= 0);
 380     RX_TS_INFO_GET(rx_ts_info);
 381
 382     if (!num_pkts) {
 383         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 384             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 385         }
 386     } else {
 387         for (queue_Scan(q, c, nc, rx_packet)) {
 388             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 389         }
 390     }
 391
 392     if (num_pkts) {
 393         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 394     }
 395
 396     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 397         NETPRI;
 398         MUTEX_ENTER(&rx_freePktQ_lock);
 399
 400         RX_TS_FPQ_LTOG(rx_ts_info);
 401
 402         /* Wakeup anyone waiting for packets */
 403         rxi_PacketsUnWait();
 404
 405         MUTEX_EXIT(&rx_freePktQ_lock);
 406         USERPRI;
 407     }
 408
 409     return num_pkts;
 410 }
 411 #else /* RX_ENABLE_TSFPQ */
 412 /* num_pkts=0 means queue length is unknown */
 413 int
 414 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 415 {
 416     struct rx_queue cbs;
 417     struct rx_packet *p, *np;
 418     int qlen = 0;
 419     SPLVAR;
 420
 421     osi_Assert(num_pkts >= 0);
 422     queue_Init(&cbs);
 423
 424     if (!num_pkts) {
 425         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 426             if (p->niovecs > 2) {
 427                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 428             }
 429             RX_FPQ_MARK_FREE(p);
 430         }
 431         if (!num_pkts)
 432             return 0;
 433     } else {
 434         for (queue_Scan(q, p, np, rx_packet)) {
 435             if (p->niovecs > 2) {
 436                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 437             }
 438             RX_FPQ_MARK_FREE(p);
 439         }
 440     }
 441
 442     if (qlen) {
 443         queue_SpliceAppend(q, &cbs);
 444         qlen += num_pkts;
 445     } else
 446         qlen = num_pkts;
 447
 448     NETPRI;
 449     MUTEX_ENTER(&rx_freePktQ_lock);
 450
 451     queue_SpliceAppend(&rx_freePacketQueue, q);
 452     rx_nFreePackets += qlen;
 453
 454     /* Wakeup anyone waiting for packets */
 455     rxi_PacketsUnWait();
 456
 457     MUTEX_EXIT(&rx_freePktQ_lock);
 458     USERPRI;
 459
 460     return num_pkts;
 461 }
 462 #endif /* RX_ENABLE_TSFPQ */
 463
 464 /* this one is kind of awful.
 465  * In rxkad, the packet has been all shortened, and everything, ready for
 466  * sending.  All of a sudden, we discover we need some of that space back.
 467  * This isn't terribly general, because it knows that the packets are only
 468  * rounded up to the EBS (userdata + security header).
 469  */
 470 int
 471 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 472 {
 473     int i;
 474     i = p->niovecs - 1;
 475     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 476         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 477             p->wirevec[i].iov_len += nb;
 478             return 0;
 479         }
 480     } else {
 481         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 482             p->wirevec[i].iov_len += nb;
 483             return 0;
 484         }
 485     }
 486
 487     return 0;
 488 }
 489
 490 /* get sufficient space to store nb bytes of data (or more), and hook
 491  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 492  * returns the number of bytes >0 which it failed to come up with.
 493  * Don't need to worry about locking on packet, since only
 494  * one thread can manipulate one at a time. Locking on continution
 495  * packets is handled by AllocPacketBufs */
 496 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 497 int
 498 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 499 {
 500     int i, nv;
 501     struct rx_queue q;
 502     struct rx_packet *cb, *ncb;
 503
 504     /* compute the number of cbuf's we need */
 505     nv = nb / RX_CBUFFERSIZE;
 506     if ((nv * RX_CBUFFERSIZE) < nb)
 507         nv++;
 508     if ((nv + p->niovecs) > RX_MAXWVECS)
 509         nv = RX_MAXWVECS - p->niovecs;
 510     if (nv < 1)
 511         return nb;
 512
 513     /* allocate buffers */
 514     queue_Init(&q);
 515     nv = AllocPacketBufs(class, nv, &q);
 516
 517     /* setup packet iovs */
 518     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 519         queue_Remove(cb);
 520         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 521         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 522     }
 523
 524     nb -= (nv * RX_CBUFFERSIZE);
 525     p->length += (nv * RX_CBUFFERSIZE);
 526     p->niovecs += nv;
 527
 528     return nb;
 529 }
 530
 531 /* Add more packet buffers */
 532 #ifdef RX_ENABLE_TSFPQ
 533 void
 534 rxi_MorePackets(int apackets)
 535 {
 536     struct rx_packet *p, *e;
 537     struct rx_ts_info_t * rx_ts_info;
 538     int getme;
 539     SPLVAR;
 540
 541     getme = apackets * sizeof(struct rx_packet);
 542     p = (struct rx_packet *)osi_Alloc(getme);
 543     osi_Assert(p);
 544
 545     PIN(p, getme);              /* XXXXX */
 546     memset(p, 0, getme);
 547     RX_TS_INFO_GET(rx_ts_info);
 548
 549     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 550     /* TSFPQ patch also needs to keep track of total packets */
 551
 552     MUTEX_ENTER(&rx_packets_mutex);
 553     rx_nPackets += apackets;
 554     RX_TS_FPQ_COMPUTE_LIMITS;
 555     MUTEX_EXIT(&rx_packets_mutex);
 556
 557     for (e = p + apackets; p < e; p++) {
 558         RX_PACKET_IOV_INIT(p);
 559         p->niovecs = 2;
 560
 561         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 562
 563         NETPRI;
 564         MUTEX_ENTER(&rx_freePktQ_lock);
 565 #ifdef RXDEBUG_PACKET
 566         p->packetId = rx_packet_id++;
 567         p->allNextp = rx_mallocedP;
 568 #endif /* RXDEBUG_PACKET */
 569         rx_mallocedP = p;
 570         MUTEX_EXIT(&rx_freePktQ_lock);
 571         USERPRI;
 572     }
 573     rx_ts_info->_FPQ.delta += apackets;
 574
 575     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 576         NETPRI;
 577         MUTEX_ENTER(&rx_freePktQ_lock);
 578
 579         RX_TS_FPQ_LTOG(rx_ts_info);
 580         rxi_NeedMorePackets = FALSE;
 581         rxi_PacketsUnWait();
 582
 583         MUTEX_EXIT(&rx_freePktQ_lock);
 584         USERPRI;
 585     }
 586 }
 587 #else /* RX_ENABLE_TSFPQ */
 588 void
 589 rxi_MorePackets(int apackets)
 590 {
 591     struct rx_packet *p, *e;
 592     int getme;
 593     SPLVAR;
 594
 595     getme = apackets * sizeof(struct rx_packet);
 596     p = (struct rx_packet *)osi_Alloc(getme);
 597     osi_Assert(p);
 598
 599     PIN(p, getme);              /* XXXXX */
 600     memset(p, 0, getme);
 601     NETPRI;
 602     MUTEX_ENTER(&rx_freePktQ_lock);
 603
 604     for (e = p + apackets; p < e; p++) {
 605         RX_PACKET_IOV_INIT(p);
 606         p->flags |= RX_PKTFLAG_FREE;
 607         p->niovecs = 2;
 608
 609         queue_Append(&rx_freePacketQueue, p);
 610 #ifdef RXDEBUG_PACKET
 611         p->packetId = rx_packet_id++;
 612         p->allNextp = rx_mallocedP;
 613 #endif /* RXDEBUG_PACKET */
 614         rx_mallocedP = p;
 615     }
 616
 617     rx_nPackets += apackets;
 618     rx_nFreePackets += apackets;
 619     rxi_NeedMorePackets = FALSE;
 620     rxi_PacketsUnWait();
 621
 622     MUTEX_EXIT(&rx_freePktQ_lock);
 623     USERPRI;
 624 }
 625 #endif /* RX_ENABLE_TSFPQ */
 626
 627 #ifdef RX_ENABLE_TSFPQ
 628 void
 629 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 630 {
 631     struct rx_packet *p, *e;
 632     struct rx_ts_info_t * rx_ts_info;
 633     int getme;
 634     SPLVAR;
 635
 636     getme = apackets * sizeof(struct rx_packet);
 637     p = (struct rx_packet *)osi_Alloc(getme);
 638
 639     PIN(p, getme);              /* XXXXX */
 640     memset(p, 0, getme);
 641     RX_TS_INFO_GET(rx_ts_info);
 642
 643     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 644     /* TSFPQ patch also needs to keep track of total packets */
 645     MUTEX_ENTER(&rx_packets_mutex);
 646     rx_nPackets += apackets;
 647     RX_TS_FPQ_COMPUTE_LIMITS;
 648     MUTEX_EXIT(&rx_packets_mutex);
 649
 650     for (e = p + apackets; p < e; p++) {
 651         RX_PACKET_IOV_INIT(p);
 652         p->niovecs = 2;
 653         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 654
 655         NETPRI;
 656         MUTEX_ENTER(&rx_freePktQ_lock);
 657 #ifdef RXDEBUG_PACKET
 658         p->packetId = rx_packet_id++;
 659         p->allNextp = rx_mallocedP;
 660 #endif /* RXDEBUG_PACKET */
 661         rx_mallocedP = p;
 662         MUTEX_EXIT(&rx_freePktQ_lock);
 663         USERPRI;
 664     }
 665     rx_ts_info->_FPQ.delta += apackets;
 666
 667     if (flush_global &&
 668         (num_keep_local < apackets)) {
 669         NETPRI;
 670         MUTEX_ENTER(&rx_freePktQ_lock);
 671
 672         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 673         rxi_NeedMorePackets = FALSE;
 674         rxi_PacketsUnWait();
 675
 676         MUTEX_EXIT(&rx_freePktQ_lock);
 677         USERPRI;
 678     }
 679 }
 680 #endif /* RX_ENABLE_TSFPQ */
 681
 682 #ifndef KERNEL
 683 /* Add more packet buffers */
 684 void
 685 rxi_MorePacketsNoLock(int apackets)
 686 {
 687 #ifdef RX_ENABLE_TSFPQ
 688     struct rx_ts_info_t * rx_ts_info;
 689 #endif /* RX_ENABLE_TSFPQ */
 690     struct rx_packet *p, *e;
 691     int getme;
 692
 693     /* allocate enough packets that 1/4 of the packets will be able
 694      * to hold maximal amounts of data */
 695     apackets += (apackets / 4)
 696         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 697     do {
 698         getme = apackets * sizeof(struct rx_packet);
 699         p = (struct rx_packet *)osi_Alloc(getme);
 700         if (p == NULL) {
 701             apackets -= apackets / 4;
 702             osi_Assert(apackets > 0);
 703         }
 704     } while(p == NULL);
 705     memset(p, 0, getme);
 706
 707 #ifdef RX_ENABLE_TSFPQ
 708     RX_TS_INFO_GET(rx_ts_info);
 709     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 710 #endif /* RX_ENABLE_TSFPQ */
 711
 712     for (e = p + apackets; p < e; p++) {
 713         RX_PACKET_IOV_INIT(p);
 714         p->flags |= RX_PKTFLAG_FREE;
 715         p->niovecs = 2;
 716
 717         queue_Append(&rx_freePacketQueue, p);
 718 #ifdef RXDEBUG_PACKET
 719         p->packetId = rx_packet_id++;
 720         p->allNextp = rx_mallocedP;
 721 #endif /* RXDEBUG_PACKET */
 722         rx_mallocedP = p;
 723     }
 724
 725     rx_nFreePackets += apackets;
 726     MUTEX_ENTER(&rx_packets_mutex);
 727     rx_nPackets += apackets;
 728 #ifdef RX_ENABLE_TSFPQ
 729     RX_TS_FPQ_COMPUTE_LIMITS;
 730 #endif /* RX_ENABLE_TSFPQ */
 731     MUTEX_EXIT(&rx_packets_mutex);
 732     rxi_NeedMorePackets = FALSE;
 733     rxi_PacketsUnWait();
 734 }
 735 #endif /* !KERNEL */
 736
 737 void
 738 rxi_FreeAllPackets(void)
 739 {
 740     /* must be called at proper interrupt level, etcetera */
 741     /* MTUXXX need to free all Packets */
 742     osi_Free(rx_mallocedP,
 743              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 744     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 745 }
 746
 747 #ifdef RX_ENABLE_TSFPQ
 748 void
 749 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 750 {
 751     struct rx_ts_info_t * rx_ts_info;
 752     int xfer;
 753     SPLVAR;
 754
 755     RX_TS_INFO_GET(rx_ts_info);
 756
 757     if (num_keep_local != rx_ts_info->_FPQ.len) {
 758         NETPRI;
 759         MUTEX_ENTER(&rx_freePktQ_lock);
 760         if (num_keep_local < rx_ts_info->_FPQ.len) {
 761             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 762             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 763             rxi_PacketsUnWait();
 764         } else {
 765             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 766             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 767                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 768             if (rx_nFreePackets < xfer) {
 769                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 770             }
 771             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 772         }
 773         MUTEX_EXIT(&rx_freePktQ_lock);
 774         USERPRI;
 775     }
 776 }
 777
 778 void
 779 rxi_FlushLocalPacketsTSFPQ(void)
 780 {
 781     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 782 }
 783 #endif /* RX_ENABLE_TSFPQ */
 784
 785 /* Allocate more packets iff we need more continuation buffers */
 786 /* In kernel, can't page in memory with interrupts disabled, so we
 787  * don't use the event mechanism. */
 788 void
 789 rx_CheckPackets(void)
 790 {
 791     if (rxi_NeedMorePackets) {
 792         rxi_MorePackets(rx_maxSendWindow);
 793     }
 794 }
 795
 796 /* In the packet freeing routine below, the assumption is that
 797    we want all of the packets to be used equally frequently, so that we
 798    don't get packet buffers paging out.  It would be just as valid to
 799    assume that we DO want them to page out if not many are being used.
 800    In any event, we assume the former, and append the packets to the end
 801    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */

/* Actually free the packet p. */
 808
 809 /* Actually free the packet p. */
 810 #ifdef RX_ENABLE_TSFPQ
 811 void
 812 rxi_FreePacketNoLock(struct rx_packet *p)
 813 {
 814     struct rx_ts_info_t * rx_ts_info;
 815     dpf(("Free %"AFS_PTR_FMT"\n", p));
 816
 817     RX_TS_INFO_GET(rx_ts_info);
 818     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 819     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 820         RX_TS_FPQ_LTOG(rx_ts_info);
 821     }
 822 }
 823 #else /* RX_ENABLE_TSFPQ */
 824 void
 825 rxi_FreePacketNoLock(struct rx_packet *p)
 826 {
 827     dpf(("Free %"AFS_PTR_FMT"\n", p));
 828
 829     RX_FPQ_MARK_FREE(p);
 830     rx_nFreePackets++;
 831     queue_Append(&rx_freePacketQueue, p);
 832 }
 833 #endif /* RX_ENABLE_TSFPQ */
 834
 835 #ifdef RX_ENABLE_TSFPQ
 836 void
 837 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 838 {
 839     struct rx_ts_info_t * rx_ts_info;
 840     dpf(("Free %"AFS_PTR_FMT"\n", p));
 841
 842     RX_TS_INFO_GET(rx_ts_info);
 843     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 844
 845     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 846         NETPRI;
 847         MUTEX_ENTER(&rx_freePktQ_lock);
 848
 849         RX_TS_FPQ_LTOG(rx_ts_info);
 850
 851         /* Wakeup anyone waiting for packets */
 852         rxi_PacketsUnWait();
 853
 854         MUTEX_EXIT(&rx_freePktQ_lock);
 855         USERPRI;
 856     }
 857 }
 858 #endif /* RX_ENABLE_TSFPQ */
 859
 860 /*
 861  * free continuation buffers off a packet into a queue
 862  *
 863  * [IN] p      -- packet from which continuation buffers will be freed
 864  * [IN] first  -- iovec offset of first continuation buffer to free
 865  * [IN] q      -- queue into which continuation buffers will be chained
 866  *
 867  * returns:
 868  *   number of continuation buffers freed
 869  */
 870 #ifndef RX_ENABLE_TSFPQ
 871 static int
 872 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 873 {
 874     struct iovec *iov;
 875     struct rx_packet * cb;
 876     int count = 0;
 877
 878     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 879         iov = &p->wirevec[first];
 880         if (!iov->iov_base)
 881             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 882         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 883         RX_FPQ_MARK_FREE(cb);
 884         queue_Append(q, cb);
 885     }
 886     p->length = 0;
 887     p->niovecs = 0;
 888
 889     return count;
 890 }
 891 #endif
 892
 893 /*
 894  * free packet continuation buffers into the global free packet pool
 895  *
 896  * [IN] p      -- packet from which to free continuation buffers
 897  * [IN] first  -- iovec offset of first continuation buffer to free
 898  *
 899  * returns:
 900  *   zero always
 901  */
 902 int
 903 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 904 {
 905     struct iovec *iov;
 906
 907     for (first = MAX(2, first); first < p->niovecs; first++) {
 908         iov = &p->wirevec[first];
 909         if (!iov->iov_base)
 910             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 911         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 912     }
 913     p->length = 0;
 914     p->niovecs = 0;
 915
 916     return 0;
 917 }
 918
 919 #ifdef RX_ENABLE_TSFPQ
 920 /*
 921  * free packet continuation buffers into the thread-local free pool
 922  *
 923  * [IN] p             -- packet from which continuation buffers will be freed
 924  * [IN] first         -- iovec offset of first continuation buffer to free
 925  *                       any value less than 2, the min number of iovecs,
 926  *                       is treated as if it is 2.
 927  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 928  *                       global free pool before returning
 929  *
 930  * returns:
 931  *   zero always
 932  */
 933 static int
 934 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 935 {
 936     struct iovec *iov;
 937     struct rx_ts_info_t * rx_ts_info;
 938
 939     RX_TS_INFO_GET(rx_ts_info);
 940
 941     for (first = MAX(2, first); first < p->niovecs; first++) {
 942         iov = &p->wirevec[first];
 943         if (!iov->iov_base)
 944             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 945         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 946     }
 947     p->length = 0;
 948     p->niovecs = 0;
 949
 950     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 951         NETPRI;
 952         MUTEX_ENTER(&rx_freePktQ_lock);
 953
 954         RX_TS_FPQ_LTOG(rx_ts_info);
 955
 956         /* Wakeup anyone waiting for packets */
 957         rxi_PacketsUnWait();
 958
 959         MUTEX_EXIT(&rx_freePktQ_lock);
 960         USERPRI;
 961     }
 962     return 0;
 963 }
 964 #endif /* RX_ENABLE_TSFPQ */
 965
 966 int rxi_nBadIovecs = 0;
 967
 968 /* rxi_RestoreDataBufs
 969  *
 970  * Restore the correct sizes to the iovecs. Called when reusing a packet
 971  * for reading off the wire.
 972  */
 973 void
 974 rxi_RestoreDataBufs(struct rx_packet *p)
 975 {
 976     unsigned int i;
 977     struct iovec *iov = &p->wirevec[2];
 978
 979     RX_PACKET_IOV_INIT(p);
 980
 981     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 982         if (!iov->iov_base) {
 983             rxi_nBadIovecs++;
 984             p->niovecs = i;
 985             break;
 986         }
 987         iov->iov_len = RX_CBUFFERSIZE;
 988     }
 989 }
 990
 991 #ifdef RX_ENABLE_TSFPQ
 992 int
 993 rxi_TrimDataBufs(struct rx_packet *p, int first)
 994 {
 995     int length;
 996     struct iovec *iov, *end;
 997     struct rx_ts_info_t * rx_ts_info;
 998     SPLVAR;
 999
1000     if (first != 1)
1001         osi_Panic("TrimDataBufs 1: first must be 1");
1002
1003     /* Skip over continuation buffers containing message data */
1004     iov = &p->wirevec[2];
1005     end = iov + (p->niovecs - 2);
1006     length = p->length - p->wirevec[1].iov_len;
1007     for (; iov < end && length > 0; iov++) {
1008         if (!iov->iov_base)
1009             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1010         length -= iov->iov_len;
1011     }
1012
1013     /* iov now points to the first empty data buffer. */
1014     if (iov >= end)
1015         return 0;
1016
1017     RX_TS_INFO_GET(rx_ts_info);
1018     for (; iov < end; iov++) {
1019         if (!iov->iov_base)
1020             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1021         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1022         p->niovecs--;
1023     }
1024     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1025         NETPRI;
1026         MUTEX_ENTER(&rx_freePktQ_lock);
1027
1028         RX_TS_FPQ_LTOG(rx_ts_info);
1029         rxi_PacketsUnWait();
1030
1031         MUTEX_EXIT(&rx_freePktQ_lock);
1032         USERPRI;
1033     }
1034
1035     return 0;
1036 }
1037 #else /* RX_ENABLE_TSFPQ */
1038 int
1039 rxi_TrimDataBufs(struct rx_packet *p, int first)
1040 {
1041     int length;
1042     struct iovec *iov, *end;
1043     SPLVAR;
1044
1045     if (first != 1)
1046         osi_Panic("TrimDataBufs 1: first must be 1");
1047
1048     /* Skip over continuation buffers containing message data */
1049     iov = &p->wirevec[2];
1050     end = iov + (p->niovecs - 2);
1051     length = p->length - p->wirevec[1].iov_len;
1052     for (; iov < end && length > 0; iov++) {
1053         if (!iov->iov_base)
1054             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1055         length -= iov->iov_len;
1056     }
1057
1058     /* iov now points to the first empty data buffer. */
1059     if (iov >= end)
1060         return 0;
1061
1062     NETPRI;
1063     MUTEX_ENTER(&rx_freePktQ_lock);
1064
1065     for (; iov < end; iov++) {
1066         if (!iov->iov_base)
1067             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1068         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1069         p->niovecs--;
1070     }
1071     rxi_PacketsUnWait();
1072
1073     MUTEX_EXIT(&rx_freePktQ_lock);
1074     USERPRI;
1075
1076     return 0;
1077 }
1078 #endif /* RX_ENABLE_TSFPQ */
1079
1080 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1081  * remove it yourself first if you call this routine. */
1082 #ifdef RX_ENABLE_TSFPQ
void
rxi_FreePacket(struct rx_packet *p)
{
    /* Thread-specific free-queue variant: release the data buffers first,
     * then the packet itself.  The second argument (2) is presumably the
     * index of the first wirevec entry to release and the trailing 0 the
     * "flush to global queue" flag -- confirm against
     * rxi_FreeDataBufsTSFPQ.  The packet free is asked to flush
     * (RX_TS_FPQ_FLUSH_GLOBAL). */
    rxi_FreeDataBufsTSFPQ(p, 2, 0);
    rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
}
1089 #else /* RX_ENABLE_TSFPQ */
void
rxi_FreePacket(struct rx_packet *p)
{
    SPLVAR;

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);

    /* Both calls are the NoLock variants; they run under the
     * rx_freePktQ_lock taken just above.  The data buffers are released
     * before the packet itself. */
    rxi_FreeDataBufsNoLock(p, 2);
    rxi_FreePacketNoLock(p);
    /* Wakeup anyone waiting for packets */
    rxi_PacketsUnWait();

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
}
1106 #endif /* RX_ENABLE_TSFPQ */
1107
/* rxi_AllocPacket sets up p->length so that it reflects the number of
 * bytes in the packet at this point, **not including** the header.
 * The header is always present, so it is left out of the count; this
 * is also the way the length field is usually used. */
1112 #ifdef RX_ENABLE_TSFPQ
1113 struct rx_packet *
1114 rxi_AllocPacketNoLock(int class)
1115 {
1116     struct rx_packet *p;
1117     struct rx_ts_info_t * rx_ts_info;
1118
1119     RX_TS_INFO_GET(rx_ts_info);
1120
1121 #ifdef KERNEL
1122     if (rxi_OverQuota(class)) {
1123         rxi_NeedMorePackets = TRUE;
1124         if (rx_stats_active) {
1125             switch (class) {
1126             case RX_PACKET_CLASS_RECEIVE:
1127                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1128                 break;
1129             case RX_PACKET_CLASS_SEND:
1130                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1131                 break;
1132             case RX_PACKET_CLASS_SPECIAL:
1133                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1134                 break;
1135             case RX_PACKET_CLASS_RECV_CBUF:
1136                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1137                 break;
1138             case RX_PACKET_CLASS_SEND_CBUF:
1139                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1140                 break;
1141             }
1142         }
1143         return (struct rx_packet *)0;
1144     }
1145 #endif /* KERNEL */
1146
1147     if (rx_stats_active)
1148         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1149     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1150
1151 #ifdef KERNEL
1152         if (queue_IsEmpty(&rx_freePacketQueue))
1153             osi_Panic("rxi_AllocPacket error");
1154 #else /* KERNEL */
1155         if (queue_IsEmpty(&rx_freePacketQueue))
1156             rxi_MorePacketsNoLock(rx_maxSendWindow);
1157 #endif /* KERNEL */
1158
1159
1160         RX_TS_FPQ_GTOL(rx_ts_info);
1161     }
1162
1163     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1164
1165     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1166
1167
1168     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1169      * order to truncate outbound packets.  In the near future, may need
1170      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1171      */
1172     RX_PACKET_IOV_FULLINIT(p);
1173     return p;
1174 }
1175 #else /* RX_ENABLE_TSFPQ */
1176 struct rx_packet *
1177 rxi_AllocPacketNoLock(int class)
1178 {
1179     struct rx_packet *p;
1180
1181 #ifdef KERNEL
1182     if (rxi_OverQuota(class)) {
1183         rxi_NeedMorePackets = TRUE;
1184         if (rx_stats_active) {
1185             switch (class) {
1186             case RX_PACKET_CLASS_RECEIVE:
1187                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1188                 break;
1189             case RX_PACKET_CLASS_SEND:
1190                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1191                 break;
1192             case RX_PACKET_CLASS_SPECIAL:
1193                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1194                 break;
1195             case RX_PACKET_CLASS_RECV_CBUF:
1196                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1197                 break;
1198             case RX_PACKET_CLASS_SEND_CBUF:
1199                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1200                 break;
1201             }
1202         }
1203         return (struct rx_packet *)0;
1204     }
1205 #endif /* KERNEL */
1206
1207     if (rx_stats_active)
1208         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1209
1210 #ifdef KERNEL
1211     if (queue_IsEmpty(&rx_freePacketQueue))
1212         osi_Panic("rxi_AllocPacket error");
1213 #else /* KERNEL */
1214     if (queue_IsEmpty(&rx_freePacketQueue))
1215         rxi_MorePacketsNoLock(rx_maxSendWindow);
1216 #endif /* KERNEL */
1217
1218     rx_nFreePackets--;
1219     p = queue_First(&rx_freePacketQueue, rx_packet);
1220     queue_Remove(p);
1221     RX_FPQ_MARK_USED(p);
1222
1223     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1224
1225
1226     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1227      * order to truncate outbound packets.  In the near future, may need
1228      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1229      */
1230     RX_PACKET_IOV_FULLINIT(p);
1231     return p;
1232 }
1233 #endif /* RX_ENABLE_TSFPQ */
1234
1235 #ifdef RX_ENABLE_TSFPQ
1236 struct rx_packet *
1237 rxi_AllocPacketTSFPQ(int class, int pull_global)
1238 {
1239     struct rx_packet *p;
1240     struct rx_ts_info_t * rx_ts_info;
1241
1242     RX_TS_INFO_GET(rx_ts_info);
1243
1244     if (rx_stats_active)
1245         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1246     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1247         MUTEX_ENTER(&rx_freePktQ_lock);
1248
1249         if (queue_IsEmpty(&rx_freePacketQueue))
1250             rxi_MorePacketsNoLock(rx_maxSendWindow);
1251
1252         RX_TS_FPQ_GTOL(rx_ts_info);
1253
1254         MUTEX_EXIT(&rx_freePktQ_lock);
1255     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1256         return NULL;
1257     }
1258
1259     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1260
1261     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1262
1263     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1264      * order to truncate outbound packets.  In the near future, may need
1265      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1266      */
1267     RX_PACKET_IOV_FULLINIT(p);
1268     return p;
1269 }
1270 #endif /* RX_ENABLE_TSFPQ */
1271
1272 #ifdef RX_ENABLE_TSFPQ
1273 struct rx_packet *
1274 rxi_AllocPacket(int class)
1275 {
1276     struct rx_packet *p;
1277
1278     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1279     return p;
1280 }
1281 #else /* RX_ENABLE_TSFPQ */
1282 struct rx_packet *
1283 rxi_AllocPacket(int class)
1284 {
1285     struct rx_packet *p;
1286
1287     MUTEX_ENTER(&rx_freePktQ_lock);
1288     p = rxi_AllocPacketNoLock(class);
1289     MUTEX_EXIT(&rx_freePktQ_lock);
1290     return p;
1291 }
1292 #endif /* RX_ENABLE_TSFPQ */
1293
1294 /* This guy comes up with as many buffers as it {takes,can get} given
1295  * the MTU for this call. It also sets the packet length before
1296  * returning.  caution: this is often called at NETPRI
1297  * Called with call locked.
1298  */
1299 struct rx_packet *
1300 rxi_AllocSendPacket(struct rx_call *call, int want)
1301 {
1302     struct rx_packet *p = (struct rx_packet *)0;
1303     int mud;
1304     unsigned delta;
1305
1306     SPLVAR;
1307     mud = call->MTU - RX_HEADER_SIZE;
1308     delta =
1309         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1310         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1311
1312 #ifdef RX_ENABLE_TSFPQ
1313     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1314         want += delta;
1315         want = MIN(want, mud);
1316
1317         if ((unsigned)want > p->length)
1318             (void)rxi_AllocDataBuf(p, (want - p->length),
1319                                    RX_PACKET_CLASS_SEND_CBUF);
1320
1321         if (p->length > mud)
1322             p->length = mud;
1323
1324         if (delta >= p->length) {
1325             rxi_FreePacket(p);
1326             p = NULL;
1327         } else {
1328             p->length -= delta;
1329         }
1330         return p;
1331     }
1332 #endif /* RX_ENABLE_TSFPQ */
1333
1334     while (!(call->error)) {
1335         MUTEX_ENTER(&rx_freePktQ_lock);
1336         /* if an error occurred, or we get the packet we want, we're done */
1337         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1338             MUTEX_EXIT(&rx_freePktQ_lock);
1339
1340             want += delta;
1341             want = MIN(want, mud);
1342
1343             if ((unsigned)want > p->length)
1344                 (void)rxi_AllocDataBuf(p, (want - p->length),
1345                                        RX_PACKET_CLASS_SEND_CBUF);
1346
1347             if (p->length > mud)
1348                 p->length = mud;
1349
1350             if (delta >= p->length) {
1351                 rxi_FreePacket(p);
1352                 p = NULL;
1353             } else {
1354                 p->length -= delta;
1355             }
1356             break;
1357         }
1358
1359         /* no error occurred, and we didn't get a packet, so we sleep.
1360          * At this point, we assume that packets will be returned
1361          * sooner or later, as packets are acknowledged, and so we
1362          * just wait.  */
1363         NETPRI;
1364         call->flags |= RX_CALL_WAIT_PACKETS;
1365         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1366         MUTEX_EXIT(&call->lock);
1367         rx_waitingForPackets = 1;
1368
1369 #ifdef  RX_ENABLE_LOCKS
1370         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1371 #else
1372         osi_rxSleep(&rx_waitingForPackets);
1373 #endif
1374         MUTEX_EXIT(&rx_freePktQ_lock);
1375         MUTEX_ENTER(&call->lock);
1376         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1377         call->flags &= ~RX_CALL_WAIT_PACKETS;
1378         USERPRI;
1379     }
1380
1381     return p;
1382 }
1383
1384 #ifndef KERNEL
1385 #ifdef AFS_NT40_ENV
1386 /* Windows does not use file descriptors. */
1387 #define CountFDs(amax) 0
1388 #else
/* Count how many of the descriptors 0 .. amax-1 are in use, i.e. how
 * many of them fstat() accepts.  Returns 0 when amax <= 0. */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd;
    int nopen = 0;

    for (fd = 0; fd < amax; fd++) {
        if (fstat(fd, &sb) == 0) {
            nopen++;
        }
    }
    return nopen;
}
1405 #endif /* AFS_NT40_ENV */
1406 #else /* KERNEL */
1407
1408 #define CountFDs(amax) amax
1409
1410 #endif /* KERNEL */
1411
1412 #if !defined(KERNEL) || defined(UKERNEL)
1413
1414 /* This function reads a single packet from the interface into the
1415  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1416  * (host,port) of the sender are stored in the supplied variables, and
1417  * the data length of the packet is stored in the packet structure.
1418  * The header is decoded. */
1419 int
1420 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1421                u_short * port)
1422 {
1423     struct sockaddr_in from;
1424     unsigned int nbytes;
1425     afs_int32 rlen;
1426     afs_uint32 tlen, savelen;
1427     struct msghdr msg;
1428     rx_computelen(p, tlen);
1429     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1430
1431     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1432     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1433                                  * it once in order to avoid races.  */
1434     tlen = rlen - tlen;
1435     if (tlen > 0) {
1436         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1437         if (tlen > 0) {
1438             tlen = rlen - tlen;
1439         } else
1440             tlen = rlen;
1441     } else
1442         tlen = rlen;
1443
1444     /* Extend the last iovec for padding, it's just to make sure that the
1445      * read doesn't return more data than we expect, and is done to get around
1446      * our problems caused by the lack of a length field in the rx header.
1447      * Use the extra buffer that follows the localdata in each packet
1448      * structure. */
1449     savelen = p->wirevec[p->niovecs - 1].iov_len;
1450     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1451
1452     memset(&msg, 0, sizeof(msg));
1453     msg.msg_name = (char *)&from;
1454     msg.msg_namelen = sizeof(struct sockaddr_in);
1455     msg.msg_iov = p->wirevec;
1456     msg.msg_iovlen = p->niovecs;
1457     nbytes = rxi_Recvmsg(socket, &msg, 0);
1458
1459     /* restore the vec to its correct state */
1460     p->wirevec[p->niovecs - 1].iov_len = savelen;
1461
1462     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1463     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1464         if (nbytes < 0 && errno == EWOULDBLOCK) {
1465             if (rx_stats_active)
1466                 rx_MutexIncrement(rx_stats.noPacketOnRead, rx_stats_mutex);
1467         } else if (nbytes <= 0) {
1468             if (rx_stats_active) {
1469                 MUTEX_ENTER(&rx_stats_mutex);
1470                 rx_stats.bogusPacketOnRead++;
1471                 rx_stats.bogusHost = from.sin_addr.s_addr;
1472                 MUTEX_EXIT(&rx_stats_mutex);
1473             }
1474             dpf(("B: bogus packet from [%x,%d] nb=%d", ntohl(from.sin_addr.s_addr),
1475                  ntohs(from.sin_port), nbytes));
1476         }
1477         return 0;
1478     }
1479 #ifdef RXDEBUG
1480     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1481                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1482         rxi_DecodePacketHeader(p);
1483
1484         *host = from.sin_addr.s_addr;
1485         *port = from.sin_port;
1486
1487         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d",
1488               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1489               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1490               p->length));
1491 #ifdef RX_TRIMDATABUFS
1492         rxi_TrimDataBufs(p, 1);
1493 #endif
1494         return 0;
1495     }
1496 #endif
1497     else {
1498         /* Extract packet header. */
1499         rxi_DecodePacketHeader(p);
1500
1501         *host = from.sin_addr.s_addr;
1502         *port = from.sin_port;
1503         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1504             struct rx_peer *peer;
1505             if (rx_stats_active)
1506                 rx_MutexIncrement(rx_stats.packetsRead[p->header.type - 1], rx_stats_mutex);
1507             /*
1508              * Try to look up this peer structure.  If it doesn't exist,
1509              * don't create a new one -
1510              * we don't keep count of the bytes sent/received if a peer
1511              * structure doesn't already exist.
1512              *
1513              * The peer/connection cleanup code assumes that there is 1 peer
1514              * per connection.  If we actually created a peer structure here
1515              * and this packet was an rxdebug packet, the peer structure would
1516              * never be cleaned up.
1517              */
1518             peer = rxi_FindPeer(*host, *port, 0, 0);
1519             /* Since this may not be associated with a connection,
1520              * it may have no refCount, meaning we could race with
1521              * ReapConnections
1522              */
1523             if (peer && (peer->refCount > 0)) {
1524                 MUTEX_ENTER(&peer->peer_lock);
1525                 hadd32(peer->bytesReceived, p->length);
1526                 MUTEX_EXIT(&peer->peer_lock);
1527             }
1528         }
1529
1530 #ifdef RX_TRIMDATABUFS
1531         /* Free any empty packet buffers at the end of this packet */
1532         rxi_TrimDataBufs(p, 1);
1533 #endif
1534         return 1;
1535     }
1536 }
1537
1538 #endif /* !KERNEL || UKERNEL */
1539
1540 /* This function splits off the first packet in a jumbo packet.
1541  * As of AFS 3.5, jumbograms contain more than one fixed size
1542  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1543  * last packet header. All packets (except the last) are padded to
1544  * fall on RX_CBUFFERSIZE boundaries.
1545  * HACK: We store the length of the first n-1 packets in the
1546  * last two pad bytes. */
1547
1548 struct rx_packet *
1549 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1550                      int first)
1551 {
1552     struct rx_packet *np;
1553     struct rx_jumboHeader *jp;
1554     int niov, i;
1555     struct iovec *iov;
1556     int length;
1557     afs_uint32 temp;
1558
1559     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1560      * bytes in length. All but the first packet are preceded by
1561      * an abbreviated four byte header. The length of the last packet
1562      * is calculated from the size of the jumbogram. */
1563     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1564
1565     if ((int)p->length < length) {
1566         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1567         return NULL;
1568     }
1569     niov = p->niovecs - 2;
1570     if (niov < 1) {
1571         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1572         return NULL;
1573     }
1574     iov = &p->wirevec[2];
1575     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1576
1577     /* Get a pointer to the abbreviated packet header */
1578     jp = (struct rx_jumboHeader *)
1579         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1580
1581     /* Set up the iovecs for the next packet */
1582     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1583     np->wirevec[0].iov_len = sizeof(struct rx_header);
1584     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1585     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1586     np->niovecs = niov + 1;
1587     for (i = 2, iov++; i <= niov; i++, iov++) {
1588         np->wirevec[i] = *iov;
1589     }
1590     np->length = p->length - length;
1591     p->length = RX_JUMBOBUFFERSIZE;
1592     p->niovecs = 2;
1593
1594     /* Convert the jumbo packet header to host byte order */
1595     temp = ntohl(*(afs_uint32 *) jp);
1596     jp->flags = (u_char) (temp >> 24);
1597     jp->cksum = (u_short) (temp);
1598
1599     /* Fill in the packet header */
1600     np->header = p->header;
1601     np->header.serial = p->header.serial + 1;
1602     np->header.seq = p->header.seq + 1;
1603     np->header.flags = jp->flags;
1604     np->header.spare = jp->cksum;
1605
1606     return np;
1607 }
1608
1609 #ifndef KERNEL
1610 /* Send a udp datagram */
1611 int
1612 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1613             int length, int istack)
1614 {
1615     struct msghdr msg;
1616         int ret;
1617
1618     memset(&msg, 0, sizeof(msg));
1619     msg.msg_iov = dvec;
1620     msg.msg_iovlen = nvecs;
1621     msg.msg_name = addr;
1622     msg.msg_namelen = sizeof(struct sockaddr_in);
1623
1624     ret = rxi_Sendmsg(socket, &msg, 0);
1625
1626     return ret;
1627 }
1628 #elif !defined(UKERNEL)
1629 /*
1630  * message receipt is done in rxk_input or rx_put.
1631  */
1632
1633 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1634 /*
1635  * Copy an mblock to the contiguous area pointed to by cp.
1636  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1637  * but it doesn't really.
1638  * Returns the number of bytes not transferred.
1639  * The message is NOT changed.
1640  */
1641 static int
1642 cpytoc(mblk_t * mp, int off, int len, char *cp)
1643 {
1644     int n;
1645
1646     for (; mp && len > 0; mp = mp->b_cont) {
1647         if (mp->b_datap->db_type != M_DATA) {
1648             return -1;
1649         }
1650         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1651         memcpy(cp, (char *)mp->b_rptr, n);
1652         cp += n;
1653         len -= n;
1654         mp->b_rptr += n;
1655     }
1656     return (len);
1657 }
1658
1659 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1660  * but it doesn't really.
1661  * This sucks, anyway, do it like m_cpy.... below
1662  */
1663 static int
1664 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1665            int niovs)
1666 {
1667     int m, n, o, t, i;
1668
1669     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1670         if (mp->b_datap->db_type != M_DATA) {
1671             return -1;
1672         }
1673         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1674         len -= n;
1675         while (n) {
1676             if (!t) {
1677                 o = 0;
1678                 i++;
1679                 t = iovs[i].iov_len;
1680             }
1681             m = MIN(n, t);
1682             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1683             mp->b_rptr += m;
1684             o += m;
1685             t -= m;
1686             n -= m;
1687         }
1688     }
1689     return (len);
1690 }
1691
/* On this platform branch the m_cpy* names used below are plain aliases
 * for the mblk-based copy routines defined above. */
#define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
#define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1694 #else
1695 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1696 static int
1697 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1698 {
1699     caddr_t p1, p2;
1700     unsigned int l1, l2, i, t;
1701
1702     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1703         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1704
1705     while (off && m)
1706         if (m->m_len <= off) {
1707             off -= m->m_len;
1708             m = m->m_next;
1709             continue;
1710         } else
1711             break;
1712
1713     if (m == NULL)
1714         return len;
1715
1716     p1 = mtod(m, caddr_t) + off;
1717     l1 = m->m_len - off;
1718     i = 0;
1719     p2 = iovs[0].iov_base;
1720     l2 = iovs[0].iov_len;
1721
1722     while (len) {
1723         t = MIN(l1, MIN(l2, (unsigned int)len));
1724         memcpy(p2, p1, t);
1725         p1 += t;
1726         p2 += t;
1727         l1 -= t;
1728         l2 -= t;
1729         len -= t;
1730         if (!l1) {
1731             m = m->m_next;
1732             if (!m)
1733                 break;
1734             p1 = mtod(m, caddr_t);
1735             l1 = m->m_len;
1736         }
1737         if (!l2) {
1738             if (++i >= niovs)
1739                 break;
1740             p2 = iovs[i].iov_base;
1741             l2 = iovs[i].iov_len;
1742         }
1743
1744     }
1745
1746     return len;
1747 }
1748 #endif /* LINUX */
1749 #endif /* AFS_SUN5_ENV */
1750
1751 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1752 int
1753 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1754 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1755      mblk_t *amb;
1756 #else
1757      struct mbuf *amb;
1758 #endif
1759      void (*free) ();
1760      struct rx_packet *phandle;
1761      int hdr_len, data_len;
1762 {
1763     int code;
1764
1765     code =
1766         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1767                      phandle->niovecs);
1768     (*free) (amb);
1769
1770     return code;
1771 }
1772 #endif /* LINUX */
1773 #endif /*KERNEL && !UKERNEL */
1774
1775
1776 /* send a response to a debug packet */
1777
1778 struct rx_packet *
1779 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1780                        afs_uint32 ahost, short aport, int istack)
1781 {
1782     struct rx_debugIn tin;
1783     afs_int32 tl;
1784     struct rx_serverQueueEntry *np, *nqe;
1785
1786     /*
1787      * Only respond to client-initiated Rx debug packets,
1788      * and clear the client flag in the response.
1789      */
1790     if (ap->header.flags & RX_CLIENT_INITIATED) {
1791         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1792         rxi_EncodePacketHeader(ap);
1793     } else {
1794         return ap;
1795     }
1796
1797     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1798     /* all done with packet, now set length to the truth, so we can
1799      * reuse this packet */
1800     rx_computelen(ap, ap->length);
1801
1802     tin.type = ntohl(tin.type);
1803     tin.index = ntohl(tin.index);
1804     switch (tin.type) {
1805     case RX_DEBUGI_GETSTATS:{
1806             struct rx_debugStats tstat;
1807
1808             /* get basic stats */
1809             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1810             tstat.version = RX_DEBUGI_VERSION;
1811 #ifndef RX_ENABLE_LOCKS
1812             tstat.waitingForPackets = rx_waitingForPackets;
1813 #endif
1814             MUTEX_ENTER(&rx_serverPool_lock);
1815             tstat.nFreePackets = htonl(rx_nFreePackets);
1816             tstat.nPackets = htonl(rx_nPackets);
1817             tstat.callsExecuted = htonl(rxi_nCalls);
1818             tstat.packetReclaims = htonl(rx_packetReclaims);
1819             tstat.usedFDs = CountFDs(64);
1820             tstat.nWaiting = htonl(rx_nWaiting);
1821             tstat.nWaited = htonl(rx_nWaited);
1822             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1823                         tstat.idleThreads);
1824             MUTEX_EXIT(&rx_serverPool_lock);
1825             tstat.idleThreads = htonl(tstat.idleThreads);
1826             tl = sizeof(struct rx_debugStats) - ap->length;
1827             if (tl > 0)
1828                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1829
1830             if (tl <= 0) {
1831                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1832                                (char *)&tstat);
1833                 ap->length = sizeof(struct rx_debugStats);
1834                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1835                 rx_computelen(ap, ap->length);
1836             }
1837             break;
1838         }
1839
1840     case RX_DEBUGI_GETALLCONN:
1841     case RX_DEBUGI_GETCONN:{
1842             unsigned int i, j;
1843             struct rx_connection *tc;
1844             struct rx_call *tcall;
1845             struct rx_debugConn tconn;
1846             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1847
1848
1849             tl = sizeof(struct rx_debugConn) - ap->length;
1850             if (tl > 0)
1851                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1852             if (tl > 0)
1853                 return ap;
1854
1855             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1856             /* get N'th (maybe) "interesting" connection info */
1857             for (i = 0; i < rx_hashTableSize; i++) {
1858 #if !defined(KERNEL)
1859                 /* the time complexity of the algorithm used here
                 * exponentially increases with the number of connections.
1861                  */
1862 #ifdef AFS_PTHREAD_ENV
1863                 pthread_yield();
1864 #else
1865                 (void)IOMGR_Poll();
1866 #endif
1867 #endif
1868                 MUTEX_ENTER(&rx_connHashTable_lock);
1869                 /* We might be slightly out of step since we are not
1870                  * locking each call, but this is only debugging output.
1871                  */
1872                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1873                     if ((all || rxi_IsConnInteresting(tc))
1874                         && tin.index-- <= 0) {
1875                         tconn.host = tc->peer->host;
1876                         tconn.port = tc->peer->port;
1877                         tconn.cid = htonl(tc->cid);
1878                         tconn.epoch = htonl(tc->epoch);
1879                         tconn.serial = htonl(tc->serial);
1880                         for (j = 0; j < RX_MAXCALLS; j++) {
1881                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1882                             if ((tcall = tc->call[j])) {
1883                                 tconn.callState[j] = tcall->state;
1884                                 tconn.callMode[j] = tcall->mode;
1885                                 tconn.callFlags[j] = tcall->flags;
1886                                 if (queue_IsNotEmpty(&tcall->rq))
1887                                     tconn.callOther[j] |= RX_OTHER_IN;
1888                                 if (queue_IsNotEmpty(&tcall->tq))
1889                                     tconn.callOther[j] |= RX_OTHER_OUT;
1890                             } else
1891                                 tconn.callState[j] = RX_STATE_NOTINIT;
1892                         }
1893
1894                         tconn.natMTU = htonl(tc->peer->natMTU);
1895                         tconn.error = htonl(tc->error);
1896                         tconn.flags = tc->flags;
1897                         tconn.type = tc->type;
1898                         tconn.securityIndex = tc->securityIndex;
1899                         if (tc->securityObject) {
1900                             RXS_GetStats(tc->securityObject, tc,
1901                                          &tconn.secStats);
1902 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1903 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1904                             DOHTONL(flags);
1905                             DOHTONL(expires);
1906                             DOHTONL(packetsReceived);
1907                             DOHTONL(packetsSent);
1908                             DOHTONL(bytesReceived);
1909                             DOHTONL(bytesSent);
1910                             for (i = 0;
1911                                  i <
1912                                  sizeof(tconn.secStats.spares) /
1913                                  sizeof(short); i++)
1914                                 DOHTONS(spares[i]);
1915                             for (i = 0;
1916                                  i <
1917                                  sizeof(tconn.secStats.sparel) /
1918                                  sizeof(afs_int32); i++)
1919                                 DOHTONL(sparel[i]);
1920                         }
1921
1922                         MUTEX_EXIT(&rx_connHashTable_lock);
1923                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1924                                        (char *)&tconn);
1925                         tl = ap->length;
1926                         ap->length = sizeof(struct rx_debugConn);
1927                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1928                                             istack);
1929                         ap->length = tl;
1930                         return ap;
1931                     }
1932                 }
1933                 MUTEX_EXIT(&rx_connHashTable_lock);
1934             }
1935             /* if we make it here, there are no interesting packets */
1936             tconn.cid = htonl(0xffffffff);      /* means end */
1937             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1938                            (char *)&tconn);
1939             tl = ap->length;
1940             ap->length = sizeof(struct rx_debugConn);
1941             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1942             ap->length = tl;
1943             break;
1944         }
1945
1946         /*
1947          * Pass back all the peer structures we have available
1948          */
1949
1950     case RX_DEBUGI_GETPEER:{
1951             unsigned int i;
1952             struct rx_peer *tp;
1953             struct rx_debugPeer tpeer;
1954
1955
1956             tl = sizeof(struct rx_debugPeer) - ap->length;
1957             if (tl > 0)
1958                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1959             if (tl > 0)
1960                 return ap;
1961
1962             memset(&tpeer, 0, sizeof(tpeer));
1963             for (i = 0; i < rx_hashTableSize; i++) {
1964 #if !defined(KERNEL)
1965                 /* the time complexity of the algorithm used here
1966                  * exponentially increses with the number of peers.
1967                  *
1968                  * Yielding after processing each hash table entry
1969                  * and dropping rx_peerHashTable_lock.
1970                  * also increases the risk that we will miss a new
1971                  * entry - but we are willing to live with this
1972                  * limitation since this is meant for debugging only
1973                  */
1974 #ifdef AFS_PTHREAD_ENV
1975                 pthread_yield();
1976 #else
1977                 (void)IOMGR_Poll();
1978 #endif
1979 #endif
1980                 MUTEX_ENTER(&rx_peerHashTable_lock);
1981                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1982                     if (tin.index-- <= 0) {
1983                         tp->refCount++;
1984                         MUTEX_EXIT(&rx_peerHashTable_lock);
1985
1986                         MUTEX_ENTER(&tp->peer_lock);
1987                         tpeer.host = tp->host;
1988                         tpeer.port = tp->port;
1989                         tpeer.ifMTU = htons(tp->ifMTU);
1990                         tpeer.idleWhen = htonl(tp->idleWhen);
1991                         tpeer.refCount = htons(tp->refCount);
1992                         tpeer.burstSize = tp->burstSize;
1993                         tpeer.burst = tp->burst;
1994                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1995                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
1996                         tpeer.rtt = htonl(tp->rtt);
1997                         tpeer.rtt_dev = htonl(tp->rtt_dev);
1998                         tpeer.timeout.sec = htonl(tp->timeout.sec);
1999                         tpeer.timeout.usec = htonl(tp->timeout.usec);
2000                         tpeer.nSent = htonl(tp->nSent);
2001                         tpeer.reSends = htonl(tp->reSends);
2002                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
2003                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
2004                         tpeer.rateFlag = htonl(tp->rateFlag);
2005                         tpeer.natMTU = htons(tp->natMTU);
2006                         tpeer.maxMTU = htons(tp->maxMTU);
2007                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2008                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2009                         tpeer.MTU = htons(tp->MTU);
2010                         tpeer.cwind = htons(tp->cwind);
2011                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2012                         tpeer.congestSeq = htons(tp->congestSeq);
2013                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2014                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2015                         tpeer.bytesReceived.high =
2016                             htonl(tp->bytesReceived.high);
2017                         tpeer.bytesReceived.low =
2018                             htonl(tp->bytesReceived.low);
2019                         MUTEX_EXIT(&tp->peer_lock);
2020
2021                         MUTEX_ENTER(&rx_peerHashTable_lock);
2022                         tp->refCount--;
2023                         MUTEX_EXIT(&rx_peerHashTable_lock);
2024
2025                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2026                                        (char *)&tpeer);
2027                         tl = ap->length;
2028                         ap->length = sizeof(struct rx_debugPeer);
2029                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2030                                             istack);
2031                         ap->length = tl;
2032                         return ap;
2033                     }
2034                 }
2035                 MUTEX_EXIT(&rx_peerHashTable_lock);
2036             }
2037             /* if we make it here, there are no interesting packets */
2038             tpeer.host = htonl(0xffffffff);     /* means end */
2039             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2040                            (char *)&tpeer);
2041             tl = ap->length;
2042             ap->length = sizeof(struct rx_debugPeer);
2043             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2044             ap->length = tl;
2045             break;
2046         }
2047
2048     case RX_DEBUGI_RXSTATS:{
2049             int i;
2050             afs_int32 *s;
2051
2052             tl = sizeof(rx_stats) - ap->length;
2053             if (tl > 0)
2054                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2055             if (tl > 0)
2056                 return ap;
2057
2058             /* Since its all int32s convert to network order with a loop. */
2059         if (rx_stats_active)
2060             MUTEX_ENTER(&rx_stats_mutex);
2061             s = (afs_int32 *) & rx_stats;
2062             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2063                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2064
2065             tl = ap->length;
2066             ap->length = sizeof(rx_stats);
2067         if (rx_stats_active)
2068             MUTEX_EXIT(&rx_stats_mutex);
2069             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2070             ap->length = tl;
2071             break;
2072         }
2073
2074     default:
2075         /* error response packet */
2076         tin.type = htonl(RX_DEBUGI_BADTYPE);
2077         tin.index = tin.type;
2078         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2079         tl = ap->length;
2080         ap->length = sizeof(struct rx_debugIn);
2081         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2082         ap->length = tl;
2083         break;
2084     }
2085     return ap;
2086 }
2087
2088 struct rx_packet *
2089 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2090                          afs_uint32 ahost, short aport, int istack)
2091 {
2092     afs_int32 tl;
2093
2094     /*
2095      * Only respond to client-initiated version requests, and
2096      * clear that flag in the response.
2097      */
2098     if (ap->header.flags & RX_CLIENT_INITIATED) {
2099         char buf[66];
2100
2101         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2102         rxi_EncodePacketHeader(ap);
2103         memset(buf, 0, sizeof(buf));
2104         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2105         rx_packetwrite(ap, 0, 65, buf);
2106         tl = ap->length;
2107         ap->length = 65;
2108         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2109         ap->length = tl;
2110     }
2111
2112     return ap;
2113 }
2114
2115
2116 /* send a debug packet back to the sender */
2117 static void
2118 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2119                     afs_uint32 ahost, short aport, afs_int32 istack)
2120 {
2121     struct sockaddr_in taddr;
2122     unsigned int i, nbytes, savelen = 0;
2123     int saven = 0;
2124 #ifdef KERNEL
2125     int waslocked = ISAFS_GLOCK();
2126 #endif
2127
2128     taddr.sin_family = AF_INET;
2129     taddr.sin_port = aport;
2130     taddr.sin_addr.s_addr = ahost;
2131 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2132     taddr.sin_len = sizeof(struct sockaddr_in);
2133 #endif
2134
2135     /* We need to trim the niovecs. */
2136     nbytes = apacket->length;
2137     for (i = 1; i < apacket->niovecs; i++) {
2138         if (nbytes <= apacket->wirevec[i].iov_len) {
2139             savelen = apacket->wirevec[i].iov_len;
2140             saven = apacket->niovecs;
2141             apacket->wirevec[i].iov_len = nbytes;
2142             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2143         } else
2144             nbytes -= apacket->wirevec[i].iov_len;
2145     }
2146 #ifdef KERNEL
2147 #ifdef RX_KERNEL_TRACE
2148     if (ICL_SETACTIVE(afs_iclSetp)) {
2149         if (!waslocked)
2150             AFS_GLOCK();
2151         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2152                    "before osi_NetSend()");
2153         AFS_GUNLOCK();
2154     } else
2155 #else
2156     if (waslocked)
2157         AFS_GUNLOCK();
2158 #endif
2159 #endif
2160     /* debug packets are not reliably delivered, hence the cast below. */
2161     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2162                       apacket->length + RX_HEADER_SIZE, istack);
2163 #ifdef KERNEL
2164 #ifdef RX_KERNEL_TRACE
2165     if (ICL_SETACTIVE(afs_iclSetp)) {
2166         AFS_GLOCK();
2167         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2168                    "after osi_NetSend()");
2169         if (!waslocked)
2170             AFS_GUNLOCK();
2171     } else
2172 #else
2173     if (waslocked)
2174         AFS_GLOCK();
2175 #endif
2176 #endif
2177     if (saven) {                /* means we truncated the packet above. */
2178         apacket->wirevec[i - 1].iov_len = savelen;
2179         apacket->niovecs = saven;
2180     }
2181
2182 }
2183
2184 /* Send the packet to appropriate destination for the specified
2185  * call.  The header is first encoded and placed in the packet.
2186  */
2187 void
2188 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2189                struct rx_packet *p, int istack)
2190 {
2191 #if defined(KERNEL)
2192     int waslocked;
2193 #endif
2194     int code;
2195     struct sockaddr_in addr;
2196     struct rx_peer *peer = conn->peer;
2197     osi_socket socket;
2198 #ifdef RXDEBUG
2199     char deliveryType = 'S';
2200 #endif
2201     /* The address we're sending the packet to */
2202     memset(&addr, 0, sizeof(addr));
2203     addr.sin_family = AF_INET;
2204     addr.sin_port = peer->port;
2205     addr.sin_addr.s_addr = peer->host;
2206
2207     /* This stuff should be revamped, I think, so that most, if not
2208      * all, of the header stuff is always added here.  We could
2209      * probably do away with the encode/decode routines. XXXXX */
2210
2211     /* Stamp each packet with a unique serial number.  The serial
2212      * number is maintained on a connection basis because some types
2213      * of security may be based on the serial number of the packet,
2214      * and security is handled on a per authenticated-connection
2215      * basis. */
2216     /* Pre-increment, to guarantee no zero serial number; a zero
2217      * serial number means the packet was never sent. */
2218     MUTEX_ENTER(&conn->conn_data_lock);
2219     p->header.serial = ++conn->serial;
2220     if (p->length > conn->peer->maxPacketSize) {
2221         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2222             (p->header.flags & RX_REQUEST_ACK)) {
2223             conn->lastPingSize = p->length;
2224             conn->lastPingSizeSer = p->header.serial;
2225         } else if (p->header.seq != 0) {
2226             conn->lastPacketSize = p->length;
2227             conn->lastPacketSizeSeq = p->header.seq;
2228         }
2229     }
2230     MUTEX_EXIT(&conn->conn_data_lock);
2231     /* This is so we can adjust retransmit time-outs better in the face of
2232      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2233      */
2234     if (p->firstSerial == 0) {
2235         p->firstSerial = p->header.serial;
2236     }
2237 #ifdef RXDEBUG
2238     /* If an output tracer function is defined, call it with the packet and
2239      * network address.  Note this function may modify its arguments. */
2240     if (rx_almostSent) {
2241         int drop = (*rx_almostSent) (p, &addr);
2242         /* drop packet if return value is non-zero? */
2243         if (drop)
2244             deliveryType = 'D'; /* Drop the packet */
2245     }
2246 #endif
2247
2248     /* Get network byte order header */
2249     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2250                                  * touch ALL the fields */
2251
2252     /* Send the packet out on the same socket that related packets are being
2253      * received on */
2254     socket =
2255         (conn->type ==
2256          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2257
2258 #ifdef RXDEBUG
2259     /* Possibly drop this packet,  for testing purposes */
2260     if ((deliveryType == 'D')
2261         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2262             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2263         deliveryType = 'D';     /* Drop the packet */
2264     } else {
2265         deliveryType = 'S';     /* Send the packet */
2266 #endif /* RXDEBUG */
2267
2268         /* Loop until the packet is sent.  We'd prefer just to use a
2269          * blocking socket, but unfortunately the interface doesn't
2270          * allow us to have the socket block in send mode, and not
2271          * block in receive mode */
2272 #ifdef KERNEL
2273         waslocked = ISAFS_GLOCK();
2274 #ifdef RX_KERNEL_TRACE
2275         if (ICL_SETACTIVE(afs_iclSetp)) {
2276             if (!waslocked)
2277                 AFS_GLOCK();
2278             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2279                        "before osi_NetSend()");
2280             AFS_GUNLOCK();
2281         } else
2282 #else
2283         if (waslocked)
2284             AFS_GUNLOCK();
2285 #endif
2286 #endif
2287         if ((code =
2288              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2289                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2290             /* send failed, so let's hurry up the resend, eh? */
2291             if (rx_stats_active)
2292                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2293             p->retryTime = p->timeSent; /* resend it very soon */
2294             clock_Addmsec(&(p->retryTime),
2295                           10 + (((afs_uint32) p->backoff) << 8));
2296             /* Some systems are nice and tell us right away that we cannot
2297              * reach this recipient by returning an error code.
2298              * So, when this happens let's "down" the host NOW so
2299              * we don't sit around waiting for this host to timeout later.
2300              */
2301             if (call &&
2302 #ifdef AFS_NT40_ENV
2303                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2304 #elif defined(AFS_LINUX20_ENV)
2305                 code == -ENETUNREACH
2306 #elif defined(AFS_DARWIN_ENV)
2307                 code == EHOSTUNREACH
2308 #else
2309                 0
2310 #endif
2311                 )
2312                 call->lastReceiveTime = 0;
2313         }
2314 #ifdef KERNEL
2315 #ifdef RX_KERNEL_TRACE
2316         if (ICL_SETACTIVE(afs_iclSetp)) {
2317             AFS_GLOCK();
2318             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2319                        "after osi_NetSend()");
2320             if (!waslocked)
2321                 AFS_GUNLOCK();
2322         } else
2323 #else
2324         if (waslocked)
2325             AFS_GLOCK();
2326 #endif
2327 #endif
2328 #ifdef RXDEBUG
2329     }
2330     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2331           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2332           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2333           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2334 #endif
2335     if (rx_stats_active)
2336         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2337     MUTEX_ENTER(&peer->peer_lock);
2338     hadd32(peer->bytesSent, p->length);
2339     MUTEX_EXIT(&peer->peer_lock);
2340 }
2341
2342 /* Send a list of packets to appropriate destination for the specified
2343  * connection.  The headers are first encoded and placed in the packets.
2344  */
2345 void
2346 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2347                    struct rx_packet **list, int len, int istack)
2348 {
2349 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2350     int waslocked;
2351 #endif
2352     struct sockaddr_in addr;
2353     struct rx_peer *peer = conn->peer;
2354     osi_socket socket;
2355     struct rx_packet *p = NULL;
2356     struct iovec wirevec[RX_MAXIOVECS];
2357     int i, length, code;
2358     afs_uint32 serial;
2359     afs_uint32 temp;
2360     struct rx_jumboHeader *jp;
2361 #ifdef RXDEBUG
2362     char deliveryType = 'S';
2363 #endif
2364     /* The address we're sending the packet to */
2365     addr.sin_family = AF_INET;
2366     addr.sin_port = peer->port;
2367     addr.sin_addr.s_addr = peer->host;
2368
2369     if (len + 1 > RX_MAXIOVECS) {
2370         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2371     }
2372
2373     /*
2374      * Stamp the packets in this jumbogram with consecutive serial numbers
2375      */
2376     MUTEX_ENTER(&conn->conn_data_lock);
2377     serial = conn->serial;
2378     conn->serial += len;
2379     for (i = 0; i < len; i++) {
2380         p = list[i];
2381         if (p->length > conn->peer->maxPacketSize) {
2382             /* a ping *or* a sequenced packet can count */
2383             if ((p->length > conn->peer->maxPacketSize)) {
2384                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2385                      (p->header.flags & RX_REQUEST_ACK)) &&
2386                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2387                     conn->lastPingSize = p->length;
2388                     conn->lastPingSizeSer = serial + i;
2389                 } else if ((p->header.seq != 0) &&
2390                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2391                     conn->lastPacketSize = p->length;
2392                     conn->lastPacketSizeSeq = p->header.seq;
2393                 }
2394             }
2395         }
2396     }
2397     MUTEX_EXIT(&conn->conn_data_lock);
2398
2399
2400     /* This stuff should be revamped, I think, so that most, if not
2401      * all, of the header stuff is always added here.  We could
2402      * probably do away with the encode/decode routines. XXXXX */
2403
2404     jp = NULL;
2405     length = RX_HEADER_SIZE;
2406     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2407     wirevec[0].iov_len = RX_HEADER_SIZE;
2408     for (i = 0; i < len; i++) {
2409         p = list[i];
2410
2411         /* The whole 3.5 jumbogram scheme relies on packets fitting
2412          * in a single packet buffer. */
2413         if (p->niovecs > 2) {
2414             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2415         }
2416
2417         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2418          * in this chunk.  */
2419         if (i < len - 1) {
2420             if (p->length != RX_JUMBOBUFFERSIZE) {
2421                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2422             }
2423             p->header.flags |= RX_JUMBO_PACKET;
2424             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2425             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2426         } else {
2427             wirevec[i + 1].iov_len = p->length;
2428             length += p->length;
2429         }
2430         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2431         if (jp != NULL) {
2432             /* Convert jumbo packet header to network byte order */
2433             temp = (afs_uint32) (p->header.flags) << 24;
2434             temp |= (afs_uint32) (p->header.spare);
2435             *(afs_uint32 *) jp = htonl(temp);
2436         }
2437         jp = (struct rx_jumboHeader *)
2438             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2439
2440         /* Stamp each packet with a unique serial number.  The serial
2441          * number is maintained on a connection basis because some types
2442          * of security may be based on the serial number of the packet,
2443          * and security is handled on a per authenticated-connection
2444          * basis. */
2445         /* Pre-increment, to guarantee no zero serial number; a zero
2446          * serial number means the packet was never sent. */
2447         p->header.serial = ++serial;
2448         /* This is so we can adjust retransmit time-outs better in the face of
2449          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2450          */
2451         if (p->firstSerial == 0) {
2452             p->firstSerial = p->header.serial;
2453         }
2454 #ifdef RXDEBUG
2455         /* If an output tracer function is defined, call it with the packet and
2456          * network address.  Note this function may modify its arguments. */
2457         if (rx_almostSent) {
2458             int drop = (*rx_almostSent) (p, &addr);
2459             /* drop packet if return value is non-zero? */
2460             if (drop)
2461                 deliveryType = 'D';     /* Drop the packet */
2462         }
2463 #endif
2464
2465         /* Get network byte order header */
2466         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2467                                          * touch ALL the fields */
2468     }
2469
2470     /* Send the packet out on the same socket that related packets are being
2471      * received on */
2472     socket =
2473         (conn->type ==
2474          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2475
2476 #ifdef RXDEBUG
2477     /* Possibly drop this packet,  for testing purposes */
2478     if ((deliveryType == 'D')
2479         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2480             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2481         deliveryType = 'D';     /* Drop the packet */
2482     } else {
2483         deliveryType = 'S';     /* Send the packet */
2484 #endif /* RXDEBUG */
2485
2486         /* Loop until the packet is sent.  We'd prefer just to use a
2487          * blocking socket, but unfortunately the interface doesn't
2488          * allow us to have the socket block in send mode, and not
2489          * block in receive mode */
2490 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2491         waslocked = ISAFS_GLOCK();
2492         if (!istack && waslocked)
2493             AFS_GUNLOCK();
2494 #endif
2495         if ((code =
2496              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2497                          istack)) != 0) {
2498             /* send failed, so let's hurry up the resend, eh? */
2499             if (rx_stats_active)
2500                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2501             for (i = 0; i < len; i++) {
2502                 p = list[i];
2503                 p->retryTime = p->timeSent;     /* resend it very soon */
2504                 clock_Addmsec(&(p->retryTime),
2505                               10 + (((afs_uint32) p->backoff) << 8));
2506             }
2507             /* Some systems are nice and tell us right away that we cannot
2508              * reach this recipient by returning an error code.
2509              * So, when this happens let's "down" the host NOW so
2510              * we don't sit around waiting for this host to timeout later.
2511              */
2512             if (call &&
2513 #ifdef AFS_NT40_ENV
2514                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2515 #elif defined(AFS_LINUX20_ENV)
2516                 code == -ENETUNREACH
2517 #elif defined(AFS_DARWIN_ENV)
2518                 code == EHOSTUNREACH
2519 #else
2520                 0
2521 #endif
2522                 )
2523                 call->lastReceiveTime = 0;
2524         }
2525 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2526         if (!istack && waslocked)
2527             AFS_GLOCK();
2528 #endif
2529 #ifdef RXDEBUG
2530     }
2531
2532     assert(p != NULL);
2533
2534     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2535           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2536           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2537           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2538
2539 #endif
2540     if (rx_stats_active)
2541         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2542     MUTEX_ENTER(&peer->peer_lock);
2543     hadd32(peer->bytesSent, p->length);
2544     MUTEX_EXIT(&peer->peer_lock);
2545 }
2546
2547
2548 /* Send a "special" packet to the peer connection.  If call is
2549  * specified, then the packet is directed to a specific call channel
2550  * associated with the connection, otherwise it is directed to the
2551  * connection only. Uses optionalPacket if it is supplied, rather than
2552  * allocating a new packet buffer.  Nbytes is the length of the data
2553  * portion of the packet.  If data is non-null, nbytes of data are
2554  * copied into the packet.  Type is the type of the packet, as defined
2555  * in rx.h.  Bug: there's a lot of duplication between this and other
2556  * routines.  This needs to be cleaned up. */
2557 struct rx_packet *
2558 rxi_SendSpecial(struct rx_call *call,
2559                 struct rx_connection *conn,
2560                 struct rx_packet *optionalPacket, int type, char *data,
2561                 int nbytes, int istack)
2562 {
2563     /* Some of the following stuff should be common code for all
2564      * packet sends (it's repeated elsewhere) */
2565     struct rx_packet *p;
2566     unsigned int i = 0;
2567     int savelen = 0, saven = 0;
2568     int channel, callNumber;
2569     if (call) {
2570         channel = call->channel;
2571         callNumber = *call->callNumber;
2572         /* BUSY packets refer to the next call on this connection */
2573         if (type == RX_PACKET_TYPE_BUSY) {
2574             callNumber++;
2575         }
2576     } else {
2577         channel = 0;
2578         callNumber = 0;
2579     }
2580     p = optionalPacket;
2581     if (!p) {
2582         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2583         if (!p)
2584             osi_Panic("rxi_SendSpecial failure");
2585     }
2586
2587     if (nbytes != -1)
2588         p->length = nbytes;
2589     else
2590         nbytes = p->length;
2591     p->header.serviceId = conn->serviceId;
2592     p->header.securityIndex = conn->securityIndex;
2593     p->header.cid = (conn->cid | channel);
2594     p->header.callNumber = callNumber;
2595     p->header.seq = 0;
2596     p->header.epoch = conn->epoch;
2597     p->header.type = type;
2598     p->header.flags = 0;
2599     if (conn->type == RX_CLIENT_CONNECTION)
2600         p->header.flags |= RX_CLIENT_INITIATED;
2601     if (data)
2602         rx_packetwrite(p, 0, nbytes, data);
2603
2604     for (i = 1; i < p->niovecs; i++) {
2605         if (nbytes <= p->wirevec[i].iov_len) {
2606             savelen = p->wirevec[i].iov_len;
2607             saven = p->niovecs;
2608             p->wirevec[i].iov_len = nbytes;
2609             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2610         } else
2611             nbytes -= p->wirevec[i].iov_len;
2612     }
2613
2614     if (call)
2615         rxi_Send(call, p, istack);
2616     else
2617         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2618     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2619         /* really need to do this, but it seems safer this way, given that  */
2620         /* sneaky optionalPacket... */
2621         p->wirevec[i - 1].iov_len = savelen;
2622         p->niovecs = saven;
2623     }
2624     if (!optionalPacket)
2625         rxi_FreePacket(p);
2626     return optionalPacket;
2627 }
2628
2629
2630 /* Encode the packet's header (from the struct header in the packet to
2631  * the net byte order representation in the wire representation of the
2632  * packet, which is what is actually sent out on the wire) */
2633 void
2634 rxi_EncodePacketHeader(struct rx_packet *p)
2635 {
2636     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2637
2638     memset(buf, 0, RX_HEADER_SIZE);
2639     *buf++ = htonl(p->header.epoch);
2640     *buf++ = htonl(p->header.cid);
2641     *buf++ = htonl(p->header.callNumber);
2642     *buf++ = htonl(p->header.seq);
2643     *buf++ = htonl(p->header.serial);
2644     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2645                    | (((afs_uint32) p->header.flags) << 16)
2646                    | (p->header.userStatus << 8) | p->header.securityIndex);
2647     /* Note: top 16 bits of this next word were reserved */
2648     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2649 }
2650
2651 /* Decode the packet's header (from net byte order to a struct header) */
2652 void
2653 rxi_DecodePacketHeader(struct rx_packet *p)
2654 {
2655     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2656     afs_uint32 temp;
2657
2658     p->header.epoch = ntohl(*buf);
2659     buf++;
2660     p->header.cid = ntohl(*buf);
2661     buf++;
2662     p->header.callNumber = ntohl(*buf);
2663     buf++;
2664     p->header.seq = ntohl(*buf);
2665     buf++;
2666     p->header.serial = ntohl(*buf);
2667     buf++;
2668
2669     temp = ntohl(*buf);
2670     buf++;
2671
2672     /* C will truncate byte fields to bytes for me */
2673     p->header.type = temp >> 24;
2674     p->header.flags = temp >> 16;
2675     p->header.userStatus = temp >> 8;
2676     p->header.securityIndex = temp >> 0;
2677
2678     temp = ntohl(*buf);
2679     buf++;
2680
2681     p->header.serviceId = (temp & 0xffff);
2682     p->header.spare = temp >> 16;
2683     /* Note: top 16 bits of this last word are the security checksum */
2684 }
2685
2686 void
2687 rxi_PrepareSendPacket(struct rx_call *call,
2688                       struct rx_packet *p, int last)
2689 {
2690     struct rx_connection *conn = call->conn;
2691     unsigned int i;
2692     afs_int32 len;              /* len must be a signed type; it can go negative */
2693
2694     p->flags &= ~RX_PKTFLAG_ACKED;
2695     p->header.cid = (conn->cid | call->channel);
2696     p->header.serviceId = conn->serviceId;
2697     p->header.securityIndex = conn->securityIndex;
2698
2699     /* No data packets on call 0. Where do these come from? */
2700     if (*call->callNumber == 0)
2701         *call->callNumber = 1;
2702
2703     p->header.callNumber = *call->callNumber;
2704     p->header.seq = call->tnext++;
2705     p->header.epoch = conn->epoch;
2706     p->header.type = RX_PACKET_TYPE_DATA;
2707     p->header.flags = 0;
2708     p->header.spare = 0;
2709     if (conn->type == RX_CLIENT_CONNECTION)
2710         p->header.flags |= RX_CLIENT_INITIATED;
2711
2712     if (last)
2713         p->header.flags |= RX_LAST_PACKET;
2714
2715     clock_Zero(&p->retryTime);  /* Never yet transmitted */
2716     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2717     p->header.serial = 0;       /* Another way of saying never transmitted... */
2718     p->backoff = 0;
2719
2720     /* Now that we're sure this is the last data on the call, make sure
2721      * that the "length" and the sum of the iov_lens matches. */
2722     len = p->length + call->conn->securityHeaderSize;
2723
2724     for (i = 1; i < p->niovecs && len > 0; i++) {
2725         len -= p->wirevec[i].iov_len;
2726     }
2727     if (len > 0) {
2728         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2729     } else if (i < p->niovecs) {
2730         /* Free any extra elements in the wirevec */
2731 #if defined(RX_ENABLE_TSFPQ)
2732         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2733 #else /* !RX_ENABLE_TSFPQ */
2734         MUTEX_ENTER(&rx_freePktQ_lock);
2735         rxi_FreeDataBufsNoLock(p, i);
2736         MUTEX_EXIT(&rx_freePktQ_lock);
2737 #endif /* !RX_ENABLE_TSFPQ */
2738
2739         p->niovecs = i;
2740     }
2741     if (len)
2742         p->wirevec[i - 1].iov_len += len;
2743     RXS_PreparePacket(conn->securityObject, call, p);
2744 }
2745
2746 /* Given an interface MTU size, calculate an adjusted MTU size that
2747  * will make efficient use of the RX buffers when the peer is sending
2748  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2749 int
2750 rxi_AdjustIfMTU(int mtu)
2751 {
2752     int adjMTU;
2753     int frags;
2754
2755     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2756         return mtu;
2757     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2758     if (mtu <= adjMTU) {
2759         return mtu;
2760     }
2761     mtu -= adjMTU;
2762     if (mtu <= 0) {
2763         return adjMTU;
2764     }
2765     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2766     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2767 }
2768
2769 /* Given an interface MTU size, and the peer's advertised max receive
2770  * size, calculate an adjisted maxMTU size that makes efficient use
2771  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2772 int
2773 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2774 {
2775     int maxMTU = mtu * rxi_nSendFrags;
2776     maxMTU = MIN(maxMTU, peerMaxMTU);
2777     return rxi_AdjustIfMTU(maxMTU);
2778 }
2779
2780 /* Given a packet size, figure out how many datagram packet will fit.
2781  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2782  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2783  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2784 int
2785 rxi_AdjustDgramPackets(int frags, int mtu)
2786 {
2787     int maxMTU;
2788     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2789         return 1;
2790     }
2791     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2792     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2793     /* subtract the size of the first and last packets */
2794     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2795     if (maxMTU < 0) {
2796         return 1;
2797     }
2798     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2799 }
2800
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * One line is written per packet on the rx_mallocedP list, bracketed by
 * a start line and an end line; every line is prefixed with cookie.
 * rx_freePktQ_lock is held for the duration of the dump.
 *
 * outputFile - destination of the dump.  When AFS_NT40_ENV is defined
 *              it is handed to WriteFile(); otherwise it is written
 *              with fprintf().
 * cookie     - caller-chosen tag placed at the start of every line.
 *
 * Always returns 0.  When RXDEBUG_PACKET is not defined nothing is
 * written.
 *
 * NOTE(review): in the AFS_NT40_ENV build each line is formatted with an
 * unbounded sprintf() into a 2048-byte buffer, so cookie has to be short
 * enough for the longest line to fit.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* byte count out-parameter of WriteFile; value unused */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void * argument, hence the cast on p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, backoff=%u, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec, p->retryTime.sec, p->retryTime.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->backoff, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif