src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #ifdef KERNEL
  12 #include "afs/param.h"
  13 #else
  14 #include <afs/param.h>
  15 #endif
  16
  17
  18 #ifdef KERNEL
  19 #if defined(UKERNEL)
  20 #include "afs/sysincludes.h"
  21 #include "afsincludes.h"
  22 #include "rx/rx_kcommon.h"
  23 #include "rx/rx_clock.h"
  24 #include "rx/rx_queue.h"
  25 #include "rx/rx_packet.h"
  26 #else /* defined(UKERNEL) */
  27 #ifdef RX_KERNEL_TRACE
  28 #include "../rx/rx_kcommon.h"
  29 #endif
  30 #include "h/types.h"
  31 #ifndef AFS_LINUX20_ENV
  32 #include "h/systm.h"
  33 #endif
  34 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
  35 #include "afs/sysincludes.h"
  36 #endif
  37 #if defined(AFS_OBSD_ENV)
  38 #include "h/proc.h"
  39 #endif
  40 #include "h/socket.h"
  41 #if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  42 #if     !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  43 #include "sys/mount.h"          /* it gets pulled in by something later anyway */
  44 #endif
  45 #include "h/mbuf.h"
  46 #endif
  47 #include "netinet/in.h"
  48 #include "afs/afs_osi.h"
  49 #include "rx_kmutex.h"
  50 #include "rx/rx_clock.h"
  51 #include "rx/rx_queue.h"
  52 #ifdef  AFS_SUN5_ENV
  53 #include <sys/sysmacros.h>
  54 #endif
  55 #include "rx/rx_packet.h"
  56 #endif /* defined(UKERNEL) */
  57 #include "rx/rx_globals.h"
  58 #else /* KERNEL */
  59 #include "sys/types.h"
  60 #include <sys/stat.h>
  61 #include <errno.h>
  62 #if defined(AFS_NT40_ENV)
  63 #include <winsock2.h>
  64 #ifndef EWOULDBLOCK
  65 #define EWOULDBLOCK WSAEWOULDBLOCK
  66 #endif
  67 #include "rx_user.h"
  68 #include "rx_xmit_nt.h"
  69 #include <stdlib.h>
  70 #else
  71 #include <sys/socket.h>
  72 #include <netinet/in.h>
  73 #endif
  74 #include "rx_clock.h"
  75 #include "rx.h"
  76 #include "rx_queue.h"
  77 #ifdef  AFS_SUN5_ENV
  78 #include <sys/sysmacros.h>
  79 #endif
  80 #include "rx_packet.h"
  81 #include "rx_globals.h"
  82 #include <lwp.h>
  83 #include <assert.h>
  84 #include <string.h>
  85 #ifdef HAVE_UNISTD_H
  86 #include <unistd.h>
  87 #endif
  88 #endif /* KERNEL */
  89
#ifdef RX_LOCKS_DB
/* rxdb_fileID is used to identify the lock location, along with line#. */
static int rxdb_fileID = RXDB_FILE_RX_PACKET;
#endif /* RX_LOCKS_DB */
/* Updated by the rxi_MorePackets* routines in this file to point at the
 * packet they initialized most recently.  With RXDEBUG_PACKET each packet's
 * allNextp field is set to the previous value, chaining the packets. */
static struct rx_packet *rx_mallocedP = 0;
#ifdef RXDEBUG_PACKET
/* Source of packetId values; post-incremented for each packet set up by the
 * rxi_MorePackets* routines. */
static afs_uint32       rx_packet_id = 0;
#endif

extern char cml_version_number[];

/* Takes num_pkts packets for the given class and appends them to q;
 * returns the number of packets actually obtained. */
static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);

static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
                                afs_uint32 ahost, short aport,
                                afs_int32 istack);

/* Exactly one of the two continuation-buffer helpers below is compiled,
 * depending on whether RX_ENABLE_TSFPQ is defined. */
#ifdef RX_ENABLE_TSFPQ
static int
rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
#else
static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
                                   afs_uint32 first,
                                   struct rx_queue * q);
#endif
 115
 116 /* some rules about packets:
 117  * 1.  When a packet is allocated, the final iov_buf contains room for
 118  * a security trailer, but iov_len masks that fact.  If the security
 119  * package wants to add the trailer, it may do so, and then extend
 120  * iov_len appropriately.  For this reason, packet's niovecs and
 121  * iov_len fields should be accurate before calling PreparePacket.
 122 */
 123
 124 /* Preconditions:
 125  *        all packet buffers (iov_base) are integral multiples of
 126  *        the word size.
 127  *        offset is an integral multiple of the word size.
 128  */
 129 afs_int32
 130 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 131 {
 132     unsigned int i;
 133     size_t l;
 134     for (l = 0, i = 1; i < packet->niovecs; i++) {
 135         if (l + packet->wirevec[i].iov_len > offset) {
 136             return
 137                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 138                                  (offset - l)));
 139         }
 140         l += packet->wirevec[i].iov_len;
 141     }
 142
 143     return 0;
 144 }
 145
 146 /* Preconditions:
 147  *        all packet buffers (iov_base) are integral multiples of the word size.
 148  *        offset is an integral multiple of the word size.
 149  */
 150 afs_int32
 151 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 152 {
 153     unsigned int i;
 154     size_t l;
 155     for (l = 0, i = 1; i < packet->niovecs; i++) {
 156         if (l + packet->wirevec[i].iov_len > offset) {
 157             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 158                              (offset - l))) = data;
 159             return 0;
 160         }
 161         l += packet->wirevec[i].iov_len;
 162     }
 163
 164     return 0;
 165 }
 166
 167 /* Preconditions:
 168  *        all packet buffers (iov_base) are integral multiples of the
 169  *        word size.
 170  *        offset is an integral multiple of the word size.
 171  * Packet Invariants:
 172  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 173  */
 174 afs_int32
 175 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 176                   char *out)
 177 {
 178     unsigned int i, j, l, r;
 179     for (l = 0, i = 1; i < packet->niovecs; i++) {
 180         if (l + packet->wirevec[i].iov_len > offset) {
 181             break;
 182         }
 183         l += packet->wirevec[i].iov_len;
 184     }
 185
 186     /* i is the iovec which contains the first little bit of data in which we
 187      * are interested.  l is the total length of everything prior to this iovec.
 188      * j is the number of bytes we can safely copy out of this iovec.
 189      * offset only applies to the first iovec.
 190      */
 191     r = resid;
 192     while ((r > 0) && (i < packet->niovecs)) {
 193         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 194         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 195         r -= j;
 196         out += j;
 197         l += packet->wirevec[i].iov_len;
 198         offset = l;
 199         i++;
 200     }
 201
 202     return (r ? (resid - r) : resid);
 203 }
 204
 205
 206 /* Preconditions:
 207  *        all packet buffers (iov_base) are integral multiples of the
 208  *        word size.
 209  *        offset is an integral multiple of the word size.
 210  */
 211 afs_int32
 212 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 213 {
 214     unsigned int i, j, l, o, r;
 215     char *b;
 216
 217     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 218         if (l + packet->wirevec[i].iov_len > o) {
 219             break;
 220         }
 221         l += packet->wirevec[i].iov_len;
 222     }
 223
 224     /* i is the iovec which contains the first little bit of data in which we
 225      * are interested.  l is the total length of everything prior to this iovec.
 226      * j is the number of bytes we can safely copy out of this iovec.
 227      * offset only applies to the first iovec.
 228      */
 229     r = resid;
 230     while ((r > 0) && (i <= RX_MAXWVECS)) {
 231         if (i >= packet->niovecs)
 232             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 233                 break;
 234
 235         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 236         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 237         memcpy(b, in, j);
 238         r -= j;
 239         in += j;
 240         l += packet->wirevec[i].iov_len;
 241         offset = l;
 242         i++;
 243     }
 244
 245     return (r ? (resid - r) : resid);
 246 }
 247
 248 int
 249 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 250 {
 251     struct rx_packet *p, *np;
 252
 253     num_pkts = AllocPacketBufs(class, num_pkts, q);
 254
 255     for (queue_Scan(q, p, np, rx_packet)) {
 256         RX_PACKET_IOV_FULLINIT(p);
 257     }
 258
 259     return num_pkts;
 260 }
 261
 262 #ifdef RX_ENABLE_TSFPQ
 263 static int
 264 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 265 {
 266     struct rx_ts_info_t * rx_ts_info;
 267     int transfer;
 268     SPLVAR;
 269
 270     RX_TS_INFO_GET(rx_ts_info);
 271
 272     transfer = num_pkts - rx_ts_info->_FPQ.len;
 273     if (transfer > 0) {
 274         NETPRI;
 275         MUTEX_ENTER(&rx_freePktQ_lock);
 276         transfer = MAX(transfer, rx_TSFPQGlobSize);
 277         if (transfer > rx_nFreePackets) {
 278             /* alloc enough for us, plus a few globs for other threads */
 279             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 280         }
 281
 282         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 283
 284         MUTEX_EXIT(&rx_freePktQ_lock);
 285         USERPRI;
 286     }
 287
 288     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 289
 290     return num_pkts;
 291 }
 292 #else /* RX_ENABLE_TSFPQ */
 293 static int
 294 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 295 {
 296     struct rx_packet *c;
 297     int i;
 298 #ifdef KERNEL
 299     int overq = 0;
 300 #endif
 301     SPLVAR;
 302
 303     NETPRI;
 304
 305     MUTEX_ENTER(&rx_freePktQ_lock);
 306
 307 #ifdef KERNEL
 308     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 309          num_pkts--, overq++);
 310
 311     if (overq) {
 312         rxi_NeedMorePackets = TRUE;
 313         if (rx_stats_active) {
 314             switch (class) {
 315             case RX_PACKET_CLASS_RECEIVE:
 316                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
 317                 break;
 318             case RX_PACKET_CLASS_SEND:
 319                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
 320                 break;
 321             case RX_PACKET_CLASS_SPECIAL:
 322                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
 323                 break;
 324             case RX_PACKET_CLASS_RECV_CBUF:
 325                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
 326                 break;
 327             case RX_PACKET_CLASS_SEND_CBUF:
 328                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
 329                 break;
 330             }
 331         }
 332     }
 333
 334     if (rx_nFreePackets < num_pkts)
 335         num_pkts = rx_nFreePackets;
 336
 337     if (!num_pkts) {
 338         rxi_NeedMorePackets = TRUE;
 339         goto done;
 340     }
 341 #else /* KERNEL */
 342     if (rx_nFreePackets < num_pkts) {
 343         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 344     }
 345 #endif /* KERNEL */
 346
 347     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 348          i < num_pkts;
 349          i++, c=queue_Next(c, rx_packet)) {
 350         RX_FPQ_MARK_USED(c);
 351     }
 352
 353     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 354
 355     rx_nFreePackets -= num_pkts;
 356
 357 #ifdef KERNEL
 358   done:
 359 #endif
 360     MUTEX_EXIT(&rx_freePktQ_lock);
 361
 362     USERPRI;
 363     return num_pkts;
 364 }
 365 #endif /* RX_ENABLE_TSFPQ */
 366
/*
 * Return a queue of packets, together with any continuation buffers
 * attached to them, to the free packet pool
 */
 370 #ifdef RX_ENABLE_TSFPQ
 371 /* num_pkts=0 means queue length is unknown */
 372 int
 373 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 374 {
 375     struct rx_ts_info_t * rx_ts_info;
 376     struct rx_packet *c, *nc;
 377     SPLVAR;
 378
 379     osi_Assert(num_pkts >= 0);
 380     RX_TS_INFO_GET(rx_ts_info);
 381
 382     if (!num_pkts) {
 383         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 384             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 385         }
 386     } else {
 387         for (queue_Scan(q, c, nc, rx_packet)) {
 388             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 389         }
 390     }
 391
 392     if (num_pkts) {
 393         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 394     }
 395
 396     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 397         NETPRI;
 398         MUTEX_ENTER(&rx_freePktQ_lock);
 399
 400         RX_TS_FPQ_LTOG(rx_ts_info);
 401
 402         /* Wakeup anyone waiting for packets */
 403         rxi_PacketsUnWait();
 404
 405         MUTEX_EXIT(&rx_freePktQ_lock);
 406         USERPRI;
 407     }
 408
 409     return num_pkts;
 410 }
 411 #else /* RX_ENABLE_TSFPQ */
 412 /* num_pkts=0 means queue length is unknown */
 413 int
 414 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 415 {
 416     struct rx_queue cbs;
 417     struct rx_packet *p, *np;
 418     int qlen = 0;
 419     SPLVAR;
 420
 421     osi_Assert(num_pkts >= 0);
 422     queue_Init(&cbs);
 423
 424     if (!num_pkts) {
 425         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 426             if (p->niovecs > 2) {
 427                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 428             }
 429             RX_FPQ_MARK_FREE(p);
 430         }
 431         if (!num_pkts)
 432             return 0;
 433     } else {
 434         for (queue_Scan(q, p, np, rx_packet)) {
 435             if (p->niovecs > 2) {
 436                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 437             }
 438             RX_FPQ_MARK_FREE(p);
 439         }
 440     }
 441
 442     if (qlen) {
 443         queue_SpliceAppend(q, &cbs);
 444         qlen += num_pkts;
 445     } else
 446         qlen = num_pkts;
 447
 448     NETPRI;
 449     MUTEX_ENTER(&rx_freePktQ_lock);
 450
 451     queue_SpliceAppend(&rx_freePacketQueue, q);
 452     rx_nFreePackets += qlen;
 453
 454     /* Wakeup anyone waiting for packets */
 455     rxi_PacketsUnWait();
 456
 457     MUTEX_EXIT(&rx_freePktQ_lock);
 458     USERPRI;
 459
 460     return num_pkts;
 461 }
 462 #endif /* RX_ENABLE_TSFPQ */
 463
 464 /* this one is kind of awful.
 465  * In rxkad, the packet has been all shortened, and everything, ready for
 466  * sending.  All of a sudden, we discover we need some of that space back.
 467  * This isn't terribly general, because it knows that the packets are only
 468  * rounded up to the EBS (userdata + security header).
 469  */
 470 int
 471 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 472 {
 473     int i;
 474     i = p->niovecs - 1;
 475     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 476         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 477             p->wirevec[i].iov_len += nb;
 478             return 0;
 479         }
 480     } else {
 481         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 482             p->wirevec[i].iov_len += nb;
 483             return 0;
 484         }
 485     }
 486
 487     return 0;
 488 }
 489
 490 /* get sufficient space to store nb bytes of data (or more), and hook
 491  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 492  * returns the number of bytes >0 which it failed to come up with.
 493  * Don't need to worry about locking on packet, since only
 494  * one thread can manipulate one at a time. Locking on continution
 495  * packets is handled by AllocPacketBufs */
 496 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 497 int
 498 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 499 {
 500     int i, nv;
 501     struct rx_queue q;
 502     struct rx_packet *cb, *ncb;
 503
 504     /* compute the number of cbuf's we need */
 505     nv = nb / RX_CBUFFERSIZE;
 506     if ((nv * RX_CBUFFERSIZE) < nb)
 507         nv++;
 508     if ((nv + p->niovecs) > RX_MAXWVECS)
 509         nv = RX_MAXWVECS - p->niovecs;
 510     if (nv < 1)
 511         return nb;
 512
 513     /* allocate buffers */
 514     queue_Init(&q);
 515     nv = AllocPacketBufs(class, nv, &q);
 516
 517     /* setup packet iovs */
 518     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 519         queue_Remove(cb);
 520         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 521         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 522     }
 523
 524     nb -= (nv * RX_CBUFFERSIZE);
 525     p->length += (nv * RX_CBUFFERSIZE);
 526     p->niovecs += nv;
 527
 528     return nb;
 529 }
 530
 531 /* Add more packet buffers */
 532 #ifdef RX_ENABLE_TSFPQ
 533 void
 534 rxi_MorePackets(int apackets)
 535 {
 536     struct rx_packet *p, *e;
 537     struct rx_ts_info_t * rx_ts_info;
 538     int getme;
 539     SPLVAR;
 540
 541     getme = apackets * sizeof(struct rx_packet);
 542     p = (struct rx_packet *)osi_Alloc(getme);
 543     osi_Assert(p);
 544
 545     PIN(p, getme);              /* XXXXX */
 546     memset(p, 0, getme);
 547     RX_TS_INFO_GET(rx_ts_info);
 548
 549     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 550     /* TSFPQ patch also needs to keep track of total packets */
 551
 552     MUTEX_ENTER(&rx_packets_mutex);
 553     rx_nPackets += apackets;
 554     RX_TS_FPQ_COMPUTE_LIMITS;
 555     MUTEX_EXIT(&rx_packets_mutex);
 556
 557     for (e = p + apackets; p < e; p++) {
 558         RX_PACKET_IOV_INIT(p);
 559         p->niovecs = 2;
 560
 561         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 562
 563         NETPRI;
 564         MUTEX_ENTER(&rx_freePktQ_lock);
 565 #ifdef RXDEBUG_PACKET
 566         p->packetId = rx_packet_id++;
 567         p->allNextp = rx_mallocedP;
 568 #endif /* RXDEBUG_PACKET */
 569         rx_mallocedP = p;
 570         MUTEX_EXIT(&rx_freePktQ_lock);
 571         USERPRI;
 572     }
 573     rx_ts_info->_FPQ.delta += apackets;
 574
 575     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 576         NETPRI;
 577         MUTEX_ENTER(&rx_freePktQ_lock);
 578
 579         RX_TS_FPQ_LTOG(rx_ts_info);
 580         rxi_NeedMorePackets = FALSE;
 581         rxi_PacketsUnWait();
 582
 583         MUTEX_EXIT(&rx_freePktQ_lock);
 584         USERPRI;
 585     }
 586 }
 587 #else /* RX_ENABLE_TSFPQ */
 588 void
 589 rxi_MorePackets(int apackets)
 590 {
 591     struct rx_packet *p, *e;
 592     int getme;
 593     SPLVAR;
 594
 595     getme = apackets * sizeof(struct rx_packet);
 596     p = (struct rx_packet *)osi_Alloc(getme);
 597     osi_Assert(p);
 598
 599     PIN(p, getme);              /* XXXXX */
 600     memset(p, 0, getme);
 601     NETPRI;
 602     MUTEX_ENTER(&rx_freePktQ_lock);
 603
 604     for (e = p + apackets; p < e; p++) {
 605         RX_PACKET_IOV_INIT(p);
 606 #ifdef RX_TRACK_PACKETS
 607         p->flags |= RX_PKTFLAG_FREE;
 608 #endif
 609         p->niovecs = 2;
 610
 611         queue_Append(&rx_freePacketQueue, p);
 612 #ifdef RXDEBUG_PACKET
 613         p->packetId = rx_packet_id++;
 614         p->allNextp = rx_mallocedP;
 615 #endif /* RXDEBUG_PACKET */
 616         rx_mallocedP = p;
 617     }
 618
 619     rx_nPackets += apackets;
 620     rx_nFreePackets += apackets;
 621     rxi_NeedMorePackets = FALSE;
 622     rxi_PacketsUnWait();
 623
 624     MUTEX_EXIT(&rx_freePktQ_lock);
 625     USERPRI;
 626 }
 627 #endif /* RX_ENABLE_TSFPQ */
 628
 629 #ifdef RX_ENABLE_TSFPQ
 630 void
 631 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 632 {
 633     struct rx_packet *p, *e;
 634     struct rx_ts_info_t * rx_ts_info;
 635     int getme;
 636     SPLVAR;
 637
 638     getme = apackets * sizeof(struct rx_packet);
 639     p = (struct rx_packet *)osi_Alloc(getme);
 640
 641     PIN(p, getme);              /* XXXXX */
 642     memset(p, 0, getme);
 643     RX_TS_INFO_GET(rx_ts_info);
 644
 645     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 646     /* TSFPQ patch also needs to keep track of total packets */
 647     MUTEX_ENTER(&rx_packets_mutex);
 648     rx_nPackets += apackets;
 649     RX_TS_FPQ_COMPUTE_LIMITS;
 650     MUTEX_EXIT(&rx_packets_mutex);
 651
 652     for (e = p + apackets; p < e; p++) {
 653         RX_PACKET_IOV_INIT(p);
 654         p->niovecs = 2;
 655         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 656
 657         NETPRI;
 658         MUTEX_ENTER(&rx_freePktQ_lock);
 659 #ifdef RXDEBUG_PACKET
 660         p->packetId = rx_packet_id++;
 661         p->allNextp = rx_mallocedP;
 662 #endif /* RXDEBUG_PACKET */
 663         rx_mallocedP = p;
 664         MUTEX_EXIT(&rx_freePktQ_lock);
 665         USERPRI;
 666     }
 667     rx_ts_info->_FPQ.delta += apackets;
 668
 669     if (flush_global &&
 670         (num_keep_local < apackets)) {
 671         NETPRI;
 672         MUTEX_ENTER(&rx_freePktQ_lock);
 673
 674         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 675         rxi_NeedMorePackets = FALSE;
 676         rxi_PacketsUnWait();
 677
 678         MUTEX_EXIT(&rx_freePktQ_lock);
 679         USERPRI;
 680     }
 681 }
 682 #endif /* RX_ENABLE_TSFPQ */
 683
 684 #ifndef KERNEL
 685 /* Add more packet buffers */
 686 void
 687 rxi_MorePacketsNoLock(int apackets)
 688 {
 689 #ifdef RX_ENABLE_TSFPQ
 690     struct rx_ts_info_t * rx_ts_info;
 691 #endif /* RX_ENABLE_TSFPQ */
 692     struct rx_packet *p, *e;
 693     int getme;
 694
 695     /* allocate enough packets that 1/4 of the packets will be able
 696      * to hold maximal amounts of data */
 697     apackets += (apackets / 4)
 698         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 699     do {
 700         getme = apackets * sizeof(struct rx_packet);
 701         p = (struct rx_packet *)osi_Alloc(getme);
 702         if (p == NULL) {
 703             apackets -= apackets / 4;
 704             osi_Assert(apackets > 0);
 705         }
 706     } while(p == NULL);
 707     memset(p, 0, getme);
 708
 709 #ifdef RX_ENABLE_TSFPQ
 710     RX_TS_INFO_GET(rx_ts_info);
 711     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 712 #endif /* RX_ENABLE_TSFPQ */
 713
 714     for (e = p + apackets; p < e; p++) {
 715         RX_PACKET_IOV_INIT(p);
 716 #ifdef RX_TRACK_PACKETS
 717         p->flags |= RX_PKTFLAG_FREE;
 718 #endif
 719         p->niovecs = 2;
 720
 721         queue_Append(&rx_freePacketQueue, p);
 722 #ifdef RXDEBUG_PACKET
 723         p->packetId = rx_packet_id++;
 724         p->allNextp = rx_mallocedP;
 725 #endif /* RXDEBUG_PACKET */
 726         rx_mallocedP = p;
 727     }
 728
 729     rx_nFreePackets += apackets;
 730     MUTEX_ENTER(&rx_packets_mutex);
 731     rx_nPackets += apackets;
 732 #ifdef RX_ENABLE_TSFPQ
 733     RX_TS_FPQ_COMPUTE_LIMITS;
 734 #endif /* RX_ENABLE_TSFPQ */
 735     MUTEX_EXIT(&rx_packets_mutex);
 736     rxi_NeedMorePackets = FALSE;
 737     rxi_PacketsUnWait();
 738 }
 739 #endif /* !KERNEL */
 740
 741 void
 742 rxi_FreeAllPackets(void)
 743 {
 744     /* must be called at proper interrupt level, etcetera */
 745     /* MTUXXX need to free all Packets */
 746     osi_Free(rx_mallocedP,
 747              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 748     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 749 }
 750
 751 #ifdef RX_ENABLE_TSFPQ
 752 void
 753 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 754 {
 755     struct rx_ts_info_t * rx_ts_info;
 756     int xfer;
 757     SPLVAR;
 758
 759     RX_TS_INFO_GET(rx_ts_info);
 760
 761     if (num_keep_local != rx_ts_info->_FPQ.len) {
 762         NETPRI;
 763         MUTEX_ENTER(&rx_freePktQ_lock);
 764         if (num_keep_local < rx_ts_info->_FPQ.len) {
 765             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 766             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 767             rxi_PacketsUnWait();
 768         } else {
 769             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 770             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 771                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 772             if (rx_nFreePackets < xfer) {
 773                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 774             }
 775             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 776         }
 777         MUTEX_EXIT(&rx_freePktQ_lock);
 778         USERPRI;
 779     }
 780 }
 781
/*
 * Ask rxi_AdjustLocalPacketsTSFPQ to keep zero packets in the calling
 * thread's free packet queue, with overcommit disallowed.
 */
void
rxi_FlushLocalPacketsTSFPQ(void)
{
    /* num_keep_local = 0, allow_overcommit = 0 */
    rxi_AdjustLocalPacketsTSFPQ(0, 0);
}
 787 #endif /* RX_ENABLE_TSFPQ */
 788
 789 /* Allocate more packets iff we need more continuation buffers */
 790 /* In kernel, can't page in memory with interrupts disabled, so we
 791  * don't use the event mechanism. */
 792 void
 793 rx_CheckPackets(void)
 794 {
 795     if (rxi_NeedMorePackets) {
 796         rxi_MorePackets(rx_maxSendWindow);
 797     }
 798 }
 799
 800 /* In the packet freeing routine below, the assumption is that
 801    we want all of the packets to be used equally frequently, so that we
 802    don't get packet buffers paging out.  It would be just as valid to
 803    assume that we DO want them to page out if not many are being used.
 804    In any event, we assume the former, and append the packets to the end
 805    of the free list.  */
/* This explanation is bogus.  The free list doesn't remain in any kind of
   useful order for long: the packets in use get pretty much randomly scattered
   across all the pages.  In order to permit unused {packets,bufs} to page out, they
   must be stored so that packets which are adjacent in memory are adjacent in the
   free list.  An array springs rapidly to mind.
   */
 812
 813 /* Actually free the packet p. */
 814 #ifdef RX_ENABLE_TSFPQ
 815 void
 816 rxi_FreePacketNoLock(struct rx_packet *p)
 817 {
 818     struct rx_ts_info_t * rx_ts_info;
 819     dpf(("Free %"AFS_PTR_FMT"\n", p));
 820
 821     RX_TS_INFO_GET(rx_ts_info);
 822     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 823     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 824         RX_TS_FPQ_LTOG(rx_ts_info);
 825     }
 826 }
 827 #else /* RX_ENABLE_TSFPQ */
 828 void
 829 rxi_FreePacketNoLock(struct rx_packet *p)
 830 {
 831     dpf(("Free %"AFS_PTR_FMT"\n", p));
 832
 833     RX_FPQ_MARK_FREE(p);
 834     rx_nFreePackets++;
 835     queue_Append(&rx_freePacketQueue, p);
 836 }
 837 #endif /* RX_ENABLE_TSFPQ */
 838
 839 #ifdef RX_ENABLE_TSFPQ
 840 void
 841 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 842 {
 843     struct rx_ts_info_t * rx_ts_info;
 844     dpf(("Free %"AFS_PTR_FMT"\n", p));
 845
 846     RX_TS_INFO_GET(rx_ts_info);
 847     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 848
 849     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 850         NETPRI;
 851         MUTEX_ENTER(&rx_freePktQ_lock);
 852
 853         RX_TS_FPQ_LTOG(rx_ts_info);
 854
 855         /* Wakeup anyone waiting for packets */
 856         rxi_PacketsUnWait();
 857
 858         MUTEX_EXIT(&rx_freePktQ_lock);
 859         USERPRI;
 860     }
 861 }
 862 #endif /* RX_ENABLE_TSFPQ */
 863
 864 /*
 865  * free continuation buffers off a packet into a queue
 866  *
 867  * [IN] p      -- packet from which continuation buffers will be freed
 868  * [IN] first  -- iovec offset of first continuation buffer to free
 869  * [IN] q      -- queue into which continuation buffers will be chained
 870  *
 871  * returns:
 872  *   number of continuation buffers freed
 873  */
 874 #ifndef RX_ENABLE_TSFPQ
 875 static int
 876 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 877 {
 878     struct iovec *iov;
 879     struct rx_packet * cb;
 880     int count = 0;
 881
 882     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 883         iov = &p->wirevec[first];
 884         if (!iov->iov_base)
 885             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 886         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 887         RX_FPQ_MARK_FREE(cb);
 888         queue_Append(q, cb);
 889     }
 890     p->length = 0;
 891     p->niovecs = 0;
 892
 893     return count;
 894 }
 895 #endif
 896
 897 /*
 898  * free packet continuation buffers into the global free packet pool
 899  *
 900  * [IN] p      -- packet from which to free continuation buffers
 901  * [IN] first  -- iovec offset of first continuation buffer to free
 902  *
 903  * returns:
 904  *   zero always
 905  */
 906 int
 907 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 908 {
 909     struct iovec *iov;
 910
 911     for (first = MAX(2, first); first < p->niovecs; first++) {
 912         iov = &p->wirevec[first];
 913         if (!iov->iov_base)
 914             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 915         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 916     }
 917     p->length = 0;
 918     p->niovecs = 0;
 919
 920     return 0;
 921 }
 922
 923 #ifdef RX_ENABLE_TSFPQ
 924 /*
 925  * free packet continuation buffers into the thread-local free pool
 926  *
 927  * [IN] p             -- packet from which continuation buffers will be freed
 928  * [IN] first         -- iovec offset of first continuation buffer to free
 929  *                       any value less than 2, the min number of iovecs,
 930  *                       is treated as if it is 2.
 931  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 932  *                       global free pool before returning
 933  *
 934  * returns:
 935  *   zero always
 936  */
 937 static int
 938 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 939 {
 940     struct iovec *iov;
 941     struct rx_ts_info_t * rx_ts_info;
 942
 943     RX_TS_INFO_GET(rx_ts_info);
 944
 945     for (first = MAX(2, first); first < p->niovecs; first++) {
 946         iov = &p->wirevec[first];
 947         if (!iov->iov_base)
 948             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 949         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 950     }
 951     p->length = 0;
 952     p->niovecs = 0;
 953
 954     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 955         NETPRI;
 956         MUTEX_ENTER(&rx_freePktQ_lock);
 957
 958         RX_TS_FPQ_LTOG(rx_ts_info);
 959
 960         /* Wakeup anyone waiting for packets */
 961         rxi_PacketsUnWait();
 962
 963         MUTEX_EXIT(&rx_freePktQ_lock);
 964         USERPRI;
 965     }
 966     return 0;
 967 }
 968 #endif /* RX_ENABLE_TSFPQ */
 969
 970 int rxi_nBadIovecs = 0;
 971
 972 /* rxi_RestoreDataBufs
 973  *
 974  * Restore the correct sizes to the iovecs. Called when reusing a packet
 975  * for reading off the wire.
 976  */
 977 void
 978 rxi_RestoreDataBufs(struct rx_packet *p)
 979 {
 980     unsigned int i;
 981     struct iovec *iov = &p->wirevec[2];
 982
 983     RX_PACKET_IOV_INIT(p);
 984
 985     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 986         if (!iov->iov_base) {
 987             rxi_nBadIovecs++;
 988             p->niovecs = i;
 989             break;
 990         }
 991         iov->iov_len = RX_CBUFFERSIZE;
 992     }
 993 }
 994
 995 #ifdef RX_ENABLE_TSFPQ
 996 int
 997 rxi_TrimDataBufs(struct rx_packet *p, int first)
 998 {
 999     int length;
1000     struct iovec *iov, *end;
1001     struct rx_ts_info_t * rx_ts_info;
1002     SPLVAR;
1003
1004     if (first != 1)
1005         osi_Panic("TrimDataBufs 1: first must be 1");
1006
1007     /* Skip over continuation buffers containing message data */
1008     iov = &p->wirevec[2];
1009     end = iov + (p->niovecs - 2);
1010     length = p->length - p->wirevec[1].iov_len;
1011     for (; iov < end && length > 0; iov++) {
1012         if (!iov->iov_base)
1013             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1014         length -= iov->iov_len;
1015     }
1016
1017     /* iov now points to the first empty data buffer. */
1018     if (iov >= end)
1019         return 0;
1020
1021     RX_TS_INFO_GET(rx_ts_info);
1022     for (; iov < end; iov++) {
1023         if (!iov->iov_base)
1024             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1025         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1026         p->niovecs--;
1027     }
1028     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1029         NETPRI;
1030         MUTEX_ENTER(&rx_freePktQ_lock);
1031
1032         RX_TS_FPQ_LTOG(rx_ts_info);
1033         rxi_PacketsUnWait();
1034
1035         MUTEX_EXIT(&rx_freePktQ_lock);
1036         USERPRI;
1037     }
1038
1039     return 0;
1040 }
1041 #else /* RX_ENABLE_TSFPQ */
1042 int
1043 rxi_TrimDataBufs(struct rx_packet *p, int first)
1044 {
1045     int length;
1046     struct iovec *iov, *end;
1047     SPLVAR;
1048
1049     if (first != 1)
1050         osi_Panic("TrimDataBufs 1: first must be 1");
1051
1052     /* Skip over continuation buffers containing message data */
1053     iov = &p->wirevec[2];
1054     end = iov + (p->niovecs - 2);
1055     length = p->length - p->wirevec[1].iov_len;
1056     for (; iov < end && length > 0; iov++) {
1057         if (!iov->iov_base)
1058             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1059         length -= iov->iov_len;
1060     }
1061
1062     /* iov now points to the first empty data buffer. */
1063     if (iov >= end)
1064         return 0;
1065
1066     NETPRI;
1067     MUTEX_ENTER(&rx_freePktQ_lock);
1068
1069     for (; iov < end; iov++) {
1070         if (!iov->iov_base)
1071             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1072         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1073         p->niovecs--;
1074     }
1075     rxi_PacketsUnWait();
1076
1077     MUTEX_EXIT(&rx_freePktQ_lock);
1078     USERPRI;
1079
1080     return 0;
1081 }
1082 #endif /* RX_ENABLE_TSFPQ */
1083
1084 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1085  * remove it yourself first if you call this routine. */
1086 #ifdef RX_ENABLE_TSFPQ
1087 void
1088 rxi_FreePacket(struct rx_packet *p)
1089 {
1090     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1091     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1092 }
1093 #else /* RX_ENABLE_TSFPQ */
1094 void
1095 rxi_FreePacket(struct rx_packet *p)
1096 {
1097     SPLVAR;
1098
1099     NETPRI;
1100     MUTEX_ENTER(&rx_freePktQ_lock);
1101
1102     rxi_FreeDataBufsNoLock(p, 2);
1103     rxi_FreePacketNoLock(p);
1104     /* Wakeup anyone waiting for packets */
1105     rxi_PacketsUnWait();
1106
1107     MUTEX_EXIT(&rx_freePktQ_lock);
1108     USERPRI;
1109 }
1110 #endif /* RX_ENABLE_TSFPQ */
1111
1112 /* rxi_AllocPacket sets up p->length so it reflects the number of
1113  * bytes in the packet at this point, **not including** the header.
1114  * The header is absolutely necessary, besides, this is the way the
1115  * length field is usually used */
1116 #ifdef RX_ENABLE_TSFPQ
1117 struct rx_packet *
1118 rxi_AllocPacketNoLock(int class)
1119 {
1120     struct rx_packet *p;
1121     struct rx_ts_info_t * rx_ts_info;
1122
1123     RX_TS_INFO_GET(rx_ts_info);
1124
1125 #ifdef KERNEL
1126     if (rxi_OverQuota(class)) {
1127         rxi_NeedMorePackets = TRUE;
1128         if (rx_stats_active) {
1129             switch (class) {
1130             case RX_PACKET_CLASS_RECEIVE:
1131                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1132                 break;
1133             case RX_PACKET_CLASS_SEND:
1134                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1135                 break;
1136             case RX_PACKET_CLASS_SPECIAL:
1137                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1138                 break;
1139             case RX_PACKET_CLASS_RECV_CBUF:
1140                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1141                 break;
1142             case RX_PACKET_CLASS_SEND_CBUF:
1143                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1144                 break;
1145             }
1146         }
1147         return (struct rx_packet *)0;
1148     }
1149 #endif /* KERNEL */
1150
1151     if (rx_stats_active)
1152         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1153     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1154
1155 #ifdef KERNEL
1156         if (queue_IsEmpty(&rx_freePacketQueue))
1157             osi_Panic("rxi_AllocPacket error");
1158 #else /* KERNEL */
1159         if (queue_IsEmpty(&rx_freePacketQueue))
1160             rxi_MorePacketsNoLock(rx_maxSendWindow);
1161 #endif /* KERNEL */
1162
1163
1164         RX_TS_FPQ_GTOL(rx_ts_info);
1165     }
1166
1167     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1168
1169     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1170
1171
1172     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1173      * order to truncate outbound packets.  In the near future, may need
1174      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1175      */
1176     RX_PACKET_IOV_FULLINIT(p);
1177     return p;
1178 }
1179 #else /* RX_ENABLE_TSFPQ */
1180 struct rx_packet *
1181 rxi_AllocPacketNoLock(int class)
1182 {
1183     struct rx_packet *p;
1184
1185 #ifdef KERNEL
1186     if (rxi_OverQuota(class)) {
1187         rxi_NeedMorePackets = TRUE;
1188         if (rx_stats_active) {
1189             switch (class) {
1190             case RX_PACKET_CLASS_RECEIVE:
1191                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1192                 break;
1193             case RX_PACKET_CLASS_SEND:
1194                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1195                 break;
1196             case RX_PACKET_CLASS_SPECIAL:
1197                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1198                 break;
1199             case RX_PACKET_CLASS_RECV_CBUF:
1200                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1201                 break;
1202             case RX_PACKET_CLASS_SEND_CBUF:
1203                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1204                 break;
1205             }
1206         }
1207         return (struct rx_packet *)0;
1208     }
1209 #endif /* KERNEL */
1210
1211     if (rx_stats_active)
1212         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1213
1214 #ifdef KERNEL
1215     if (queue_IsEmpty(&rx_freePacketQueue))
1216         osi_Panic("rxi_AllocPacket error");
1217 #else /* KERNEL */
1218     if (queue_IsEmpty(&rx_freePacketQueue))
1219         rxi_MorePacketsNoLock(rx_maxSendWindow);
1220 #endif /* KERNEL */
1221
1222     rx_nFreePackets--;
1223     p = queue_First(&rx_freePacketQueue, rx_packet);
1224     queue_Remove(p);
1225     RX_FPQ_MARK_USED(p);
1226
1227     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1228
1229
1230     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1231      * order to truncate outbound packets.  In the near future, may need
1232      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1233      */
1234     RX_PACKET_IOV_FULLINIT(p);
1235     return p;
1236 }
1237 #endif /* RX_ENABLE_TSFPQ */
1238
1239 #ifdef RX_ENABLE_TSFPQ
1240 struct rx_packet *
1241 rxi_AllocPacketTSFPQ(int class, int pull_global)
1242 {
1243     struct rx_packet *p;
1244     struct rx_ts_info_t * rx_ts_info;
1245
1246     RX_TS_INFO_GET(rx_ts_info);
1247
1248     if (rx_stats_active)
1249         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1250     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1251         MUTEX_ENTER(&rx_freePktQ_lock);
1252
1253         if (queue_IsEmpty(&rx_freePacketQueue))
1254             rxi_MorePacketsNoLock(rx_maxSendWindow);
1255
1256         RX_TS_FPQ_GTOL(rx_ts_info);
1257
1258         MUTEX_EXIT(&rx_freePktQ_lock);
1259     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1260         return NULL;
1261     }
1262
1263     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1264
1265     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1266
1267     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1268      * order to truncate outbound packets.  In the near future, may need
1269      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1270      */
1271     RX_PACKET_IOV_FULLINIT(p);
1272     return p;
1273 }
1274 #endif /* RX_ENABLE_TSFPQ */
1275
1276 #ifdef RX_ENABLE_TSFPQ
1277 struct rx_packet *
1278 rxi_AllocPacket(int class)
1279 {
1280     struct rx_packet *p;
1281
1282     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1283     return p;
1284 }
1285 #else /* RX_ENABLE_TSFPQ */
1286 struct rx_packet *
1287 rxi_AllocPacket(int class)
1288 {
1289     struct rx_packet *p;
1290
1291     MUTEX_ENTER(&rx_freePktQ_lock);
1292     p = rxi_AllocPacketNoLock(class);
1293     MUTEX_EXIT(&rx_freePktQ_lock);
1294     return p;
1295 }
1296 #endif /* RX_ENABLE_TSFPQ */
1297
1298 /* This guy comes up with as many buffers as it {takes,can get} given
1299  * the MTU for this call. It also sets the packet length before
1300  * returning.  caution: this is often called at NETPRI
1301  * Called with call locked.
1302  */
1303 struct rx_packet *
1304 rxi_AllocSendPacket(struct rx_call *call, int want)
1305 {
1306     struct rx_packet *p = (struct rx_packet *)0;
1307     int mud;
1308     unsigned delta;
1309
1310     SPLVAR;
1311     mud = call->MTU - RX_HEADER_SIZE;
1312     delta =
1313         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1314         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1315
1316 #ifdef RX_ENABLE_TSFPQ
1317     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1318         want += delta;
1319         want = MIN(want, mud);
1320
1321         if ((unsigned)want > p->length)
1322             (void)rxi_AllocDataBuf(p, (want - p->length),
1323                                    RX_PACKET_CLASS_SEND_CBUF);
1324
1325         if (p->length > mud)
1326             p->length = mud;
1327
1328         if (delta >= p->length) {
1329             rxi_FreePacket(p);
1330             p = NULL;
1331         } else {
1332             p->length -= delta;
1333         }
1334         return p;
1335     }
1336 #endif /* RX_ENABLE_TSFPQ */
1337
1338     while (!(call->error)) {
1339         MUTEX_ENTER(&rx_freePktQ_lock);
1340         /* if an error occurred, or we get the packet we want, we're done */
1341         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1342             MUTEX_EXIT(&rx_freePktQ_lock);
1343
1344             want += delta;
1345             want = MIN(want, mud);
1346
1347             if ((unsigned)want > p->length)
1348                 (void)rxi_AllocDataBuf(p, (want - p->length),
1349                                        RX_PACKET_CLASS_SEND_CBUF);
1350
1351             if (p->length > mud)
1352                 p->length = mud;
1353
1354             if (delta >= p->length) {
1355                 rxi_FreePacket(p);
1356                 p = NULL;
1357             } else {
1358                 p->length -= delta;
1359             }
1360             break;
1361         }
1362
1363         /* no error occurred, and we didn't get a packet, so we sleep.
1364          * At this point, we assume that packets will be returned
1365          * sooner or later, as packets are acknowledged, and so we
1366          * just wait.  */
1367         NETPRI;
1368         call->flags |= RX_CALL_WAIT_PACKETS;
1369         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1370         MUTEX_EXIT(&call->lock);
1371         rx_waitingForPackets = 1;
1372
1373 #ifdef  RX_ENABLE_LOCKS
1374         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1375 #else
1376         osi_rxSleep(&rx_waitingForPackets);
1377 #endif
1378         MUTEX_EXIT(&rx_freePktQ_lock);
1379         MUTEX_ENTER(&call->lock);
1380         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1381         call->flags &= ~RX_CALL_WAIT_PACKETS;
1382         USERPRI;
1383     }
1384
1385     return p;
1386 }
1387
1388 #ifndef KERNEL
1389 #ifdef AFS_NT40_ENV
1390 /* Windows does not use file descriptors. */
1391 #define CountFDs(amax) 0
1392 #else
/* Count how many descriptors in the range [0, amax) are currently in
 * use, i.e. how many of them fstat() succeeds on.  Returns 0 when amax
 * is zero or negative. */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd;
    int used = 0;

    for (fd = 0; fd < amax; fd++) {
        if (fstat(fd, &sb) == 0)
            used++;
    }
    return used;
}
1409 #endif /* AFS_NT40_ENV */
1410 #else /* KERNEL */
1411
1412 #define CountFDs(amax) amax
1413
1414 #endif /* KERNEL */
1415
1416 #if !defined(KERNEL) || defined(UKERNEL)
1417
1418 /* This function reads a single packet from the interface into the
1419  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1420  * (host,port) of the sender are stored in the supplied variables, and
1421  * the data length of the packet is stored in the packet structure.
1422  * The header is decoded. */
1423 int
1424 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1425                u_short * port)
1426 {
1427     struct sockaddr_in from;
1428     unsigned int nbytes;
1429     afs_int32 rlen;
1430     afs_uint32 tlen, savelen;
1431     struct msghdr msg;
1432     rx_computelen(p, tlen);
1433     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1434
1435     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1436     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1437                                  * it once in order to avoid races.  */
1438     tlen = rlen - tlen;
1439     if (tlen > 0) {
1440         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1441         if (tlen > 0) {
1442             tlen = rlen - tlen;
1443         } else
1444             tlen = rlen;
1445     } else
1446         tlen = rlen;
1447
1448     /* Extend the last iovec for padding, it's just to make sure that the
1449      * read doesn't return more data than we expect, and is done to get around
1450      * our problems caused by the lack of a length field in the rx header.
1451      * Use the extra buffer that follows the localdata in each packet
1452      * structure. */
1453     savelen = p->wirevec[p->niovecs - 1].iov_len;
1454     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1455
1456     memset(&msg, 0, sizeof(msg));
1457     msg.msg_name = (char *)&from;
1458     msg.msg_namelen = sizeof(struct sockaddr_in);
1459     msg.msg_iov = p->wirevec;
1460     msg.msg_iovlen = p->niovecs;
1461     nbytes = rxi_Recvmsg(socket, &msg, 0);
1462
1463     /* restore the vec to its correct state */
1464     p->wirevec[p->niovecs - 1].iov_len = savelen;
1465
1466     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1467     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1468         if (nbytes < 0 && errno == EWOULDBLOCK) {
1469             if (rx_stats_active)
1470                 rx_MutexIncrement(rx_stats.noPacketOnRead, rx_stats_mutex);
1471         } else if (nbytes <= 0) {
1472             if (rx_stats_active) {
1473                 MUTEX_ENTER(&rx_stats_mutex);
1474                 rx_stats.bogusPacketOnRead++;
1475                 rx_stats.bogusHost = from.sin_addr.s_addr;
1476                 MUTEX_EXIT(&rx_stats_mutex);
1477             }
1478             dpf(("B: bogus packet from [%x,%d] nb=%d", ntohl(from.sin_addr.s_addr),
1479                  ntohs(from.sin_port), nbytes));
1480         }
1481         return 0;
1482     }
1483 #ifdef RXDEBUG
1484     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1485                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1486         rxi_DecodePacketHeader(p);
1487
1488         *host = from.sin_addr.s_addr;
1489         *port = from.sin_port;
1490
1491         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d",
1492               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1493               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1494               p->length));
1495 #ifdef RX_TRIMDATABUFS
1496         rxi_TrimDataBufs(p, 1);
1497 #endif
1498         return 0;
1499     }
1500 #endif
1501     else {
1502         /* Extract packet header. */
1503         rxi_DecodePacketHeader(p);
1504
1505         *host = from.sin_addr.s_addr;
1506         *port = from.sin_port;
1507         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1508             struct rx_peer *peer;
1509             if (rx_stats_active)
1510                 rx_MutexIncrement(rx_stats.packetsRead[p->header.type - 1], rx_stats_mutex);
1511             /*
1512              * Try to look up this peer structure.  If it doesn't exist,
1513              * don't create a new one -
1514              * we don't keep count of the bytes sent/received if a peer
1515              * structure doesn't already exist.
1516              *
1517              * The peer/connection cleanup code assumes that there is 1 peer
1518              * per connection.  If we actually created a peer structure here
1519              * and this packet was an rxdebug packet, the peer structure would
1520              * never be cleaned up.
1521              */
1522             peer = rxi_FindPeer(*host, *port, 0, 0);
1523             /* Since this may not be associated with a connection,
1524              * it may have no refCount, meaning we could race with
1525              * ReapConnections
1526              */
1527             if (peer && (peer->refCount > 0)) {
1528                 MUTEX_ENTER(&peer->peer_lock);
1529                 hadd32(peer->bytesReceived, p->length);
1530                 MUTEX_EXIT(&peer->peer_lock);
1531             }
1532         }
1533
1534 #ifdef RX_TRIMDATABUFS
1535         /* Free any empty packet buffers at the end of this packet */
1536         rxi_TrimDataBufs(p, 1);
1537 #endif
1538         return 1;
1539     }
1540 }
1541
1542 #endif /* !KERNEL || UKERNEL */
1543
1544 /* This function splits off the first packet in a jumbo packet.
1545  * As of AFS 3.5, jumbograms contain more than one fixed size
1546  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1547  * last packet header. All packets (except the last) are padded to
1548  * fall on RX_CBUFFERSIZE boundaries.
1549  * HACK: We store the length of the first n-1 packets in the
1550  * last two pad bytes. */
1551
1552 struct rx_packet *
1553 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1554                      int first)
1555 {
1556     struct rx_packet *np;
1557     struct rx_jumboHeader *jp;
1558     int niov, i;
1559     struct iovec *iov;
1560     int length;
1561     afs_uint32 temp;
1562
1563     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1564      * bytes in length. All but the first packet are preceded by
1565      * an abbreviated four byte header. The length of the last packet
1566      * is calculated from the size of the jumbogram. */
1567     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1568
1569     if ((int)p->length < length) {
1570         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1571         return NULL;
1572     }
1573     niov = p->niovecs - 2;
1574     if (niov < 1) {
1575         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1576         return NULL;
1577     }
1578     iov = &p->wirevec[2];
1579     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1580
1581     /* Get a pointer to the abbreviated packet header */
1582     jp = (struct rx_jumboHeader *)
1583         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1584
1585     /* Set up the iovecs for the next packet */
1586     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1587     np->wirevec[0].iov_len = sizeof(struct rx_header);
1588     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1589     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1590     np->niovecs = niov + 1;
1591     for (i = 2, iov++; i <= niov; i++, iov++) {
1592         np->wirevec[i] = *iov;
1593     }
1594     np->length = p->length - length;
1595     p->length = RX_JUMBOBUFFERSIZE;
1596     p->niovecs = 2;
1597
1598     /* Convert the jumbo packet header to host byte order */
1599     temp = ntohl(*(afs_uint32 *) jp);
1600     jp->flags = (u_char) (temp >> 24);
1601     jp->cksum = (u_short) (temp);
1602
1603     /* Fill in the packet header */
1604     np->header = p->header;
1605     np->header.serial = p->header.serial + 1;
1606     np->header.seq = p->header.seq + 1;
1607     np->header.flags = jp->flags;
1608     np->header.spare = jp->cksum;
1609
1610     return np;
1611 }
1612
1613 #ifndef KERNEL
1614 /* Send a udp datagram */
1615 int
1616 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1617             int length, int istack)
1618 {
1619     struct msghdr msg;
1620         int ret;
1621
1622     memset(&msg, 0, sizeof(msg));
1623     msg.msg_iov = dvec;
1624     msg.msg_iovlen = nvecs;
1625     msg.msg_name = addr;
1626     msg.msg_namelen = sizeof(struct sockaddr_in);
1627
1628     ret = rxi_Sendmsg(socket, &msg, 0);
1629
1630     return ret;
1631 }
1632 #elif !defined(UKERNEL)
1633 /*
1634  * message receipt is done in rxk_input or rx_put.
1635  */
1636
1637 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1638 /*
1639  * Copy an mblock to the contiguous area pointed to by cp.
1640  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1641  * but it doesn't really.
1642  * Returns the number of bytes not transferred.
1643  * The message is NOT changed.
1644  */
1645 static int
1646 cpytoc(mblk_t * mp, int off, int len, char *cp)
1647 {
1648     int n;
1649
1650     for (; mp && len > 0; mp = mp->b_cont) {
1651         if (mp->b_datap->db_type != M_DATA) {
1652             return -1;
1653         }
1654         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1655         memcpy(cp, (char *)mp->b_rptr, n);
1656         cp += n;
1657         len -= n;
1658         mp->b_rptr += n;
1659     }
1660     return (len);
1661 }
1662
1663 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1664  * but it doesn't really.
1665  * This sucks, anyway, do it like m_cpy.... below
1666  */
1667 static int
1668 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1669            int niovs)
1670 {
1671     int m, n, o, t, i;
1672
1673     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1674         if (mp->b_datap->db_type != M_DATA) {
1675             return -1;
1676         }
1677         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1678         len -= n;
1679         while (n) {
1680             if (!t) {
1681                 o = 0;
1682                 i++;
1683                 t = iovs[i].iov_len;
1684             }
1685             m = MIN(n, t);
1686             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1687             mp->b_rptr += m;
1688             o += m;
1689             t -= m;
1690             n -= m;
1691         }
1692     }
1693     return (len);
1694 }
1695
1696 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1697 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1698 #else
1699 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1700 static int
1701 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1702 {
1703     caddr_t p1, p2;
1704     unsigned int l1, l2, i, t;
1705
1706     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1707         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1708
1709     while (off && m)
1710         if (m->m_len <= off) {
1711             off -= m->m_len;
1712             m = m->m_next;
1713             continue;
1714         } else
1715             break;
1716
1717     if (m == NULL)
1718         return len;
1719
1720     p1 = mtod(m, caddr_t) + off;
1721     l1 = m->m_len - off;
1722     i = 0;
1723     p2 = iovs[0].iov_base;
1724     l2 = iovs[0].iov_len;
1725
1726     while (len) {
1727         t = MIN(l1, MIN(l2, (unsigned int)len));
1728         memcpy(p2, p1, t);
1729         p1 += t;
1730         p2 += t;
1731         l1 -= t;
1732         l2 -= t;
1733         len -= t;
1734         if (!l1) {
1735             m = m->m_next;
1736             if (!m)
1737                 break;
1738             p1 = mtod(m, caddr_t);
1739             l1 = m->m_len;
1740         }
1741         if (!l2) {
1742             if (++i >= niovs)
1743                 break;
1744             p2 = iovs[i].iov_base;
1745             l2 = iovs[i].iov_len;
1746         }
1747
1748     }
1749
1750     return len;
1751 }
1752 #endif /* LINUX */
1753 #endif /* AFS_SUN5_ENV */
1754
1755 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1756 int
1757 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1758 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1759      mblk_t *amb;
1760 #else
1761      struct mbuf *amb;
1762 #endif
1763      void (*free) ();
1764      struct rx_packet *phandle;
1765      int hdr_len, data_len;
1766 {
1767     int code;
1768
1769     code =
1770         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1771                      phandle->niovecs);
1772     (*free) (amb);
1773
1774     return code;
1775 }
1776 #endif /* LINUX */
1777 #endif /*KERNEL && !UKERNEL */
1778
1779
1780 /* send a response to a debug packet */
1781
1782 struct rx_packet *
1783 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1784                        afs_uint32 ahost, short aport, int istack)
1785 {
1786     struct rx_debugIn tin;
1787     afs_int32 tl;
1788     struct rx_serverQueueEntry *np, *nqe;
1789
1790     /*
1791      * Only respond to client-initiated Rx debug packets,
1792      * and clear the client flag in the response.
1793      */
1794     if (ap->header.flags & RX_CLIENT_INITIATED) {
1795         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1796         rxi_EncodePacketHeader(ap);
1797     } else {
1798         return ap;
1799     }
1800
1801     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1802     /* all done with packet, now set length to the truth, so we can
1803      * reuse this packet */
1804     rx_computelen(ap, ap->length);
1805
1806     tin.type = ntohl(tin.type);
1807     tin.index = ntohl(tin.index);
1808     switch (tin.type) {
1809     case RX_DEBUGI_GETSTATS:{
1810             struct rx_debugStats tstat;
1811
1812             /* get basic stats */
1813             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1814             tstat.version = RX_DEBUGI_VERSION;
1815 #ifndef RX_ENABLE_LOCKS
1816             tstat.waitingForPackets = rx_waitingForPackets;
1817 #endif
1818             MUTEX_ENTER(&rx_serverPool_lock);
1819             tstat.nFreePackets = htonl(rx_nFreePackets);
1820             tstat.nPackets = htonl(rx_nPackets);
1821             tstat.callsExecuted = htonl(rxi_nCalls);
1822             tstat.packetReclaims = htonl(rx_packetReclaims);
1823             tstat.usedFDs = CountFDs(64);
1824             tstat.nWaiting = htonl(rx_nWaiting);
1825             tstat.nWaited = htonl(rx_nWaited);
1826             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1827                         tstat.idleThreads);
1828             MUTEX_EXIT(&rx_serverPool_lock);
1829             tstat.idleThreads = htonl(tstat.idleThreads);
1830             tl = sizeof(struct rx_debugStats) - ap->length;
1831             if (tl > 0)
1832                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1833
1834             if (tl <= 0) {
1835                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1836                                (char *)&tstat);
1837                 ap->length = sizeof(struct rx_debugStats);
1838                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1839                 rx_computelen(ap, ap->length);
1840             }
1841             break;
1842         }
1843
1844     case RX_DEBUGI_GETALLCONN:
1845     case RX_DEBUGI_GETCONN:{
1846             unsigned int i, j;
1847             struct rx_connection *tc;
1848             struct rx_call *tcall;
1849             struct rx_debugConn tconn;
1850             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1851
1852
1853             tl = sizeof(struct rx_debugConn) - ap->length;
1854             if (tl > 0)
1855                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1856             if (tl > 0)
1857                 return ap;
1858
1859             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1860             /* get N'th (maybe) "interesting" connection info */
1861             for (i = 0; i < rx_hashTableSize; i++) {
1862 #if !defined(KERNEL)
1863                 /* the time complexity of the algorithm used here
                 * exponentially increases with the number of connections.
1865                  */
1866 #ifdef AFS_PTHREAD_ENV
1867                 pthread_yield();
1868 #else
1869                 (void)IOMGR_Poll();
1870 #endif
1871 #endif
1872                 MUTEX_ENTER(&rx_connHashTable_lock);
1873                 /* We might be slightly out of step since we are not
1874                  * locking each call, but this is only debugging output.
1875                  */
1876                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1877                     if ((all || rxi_IsConnInteresting(tc))
1878                         && tin.index-- <= 0) {
1879                         tconn.host = tc->peer->host;
1880                         tconn.port = tc->peer->port;
1881                         tconn.cid = htonl(tc->cid);
1882                         tconn.epoch = htonl(tc->epoch);
1883                         tconn.serial = htonl(tc->serial);
1884                         for (j = 0; j < RX_MAXCALLS; j++) {
1885                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1886                             if ((tcall = tc->call[j])) {
1887                                 tconn.callState[j] = tcall->state;
1888                                 tconn.callMode[j] = tcall->mode;
1889                                 tconn.callFlags[j] = tcall->flags;
1890                                 if (queue_IsNotEmpty(&tcall->rq))
1891                                     tconn.callOther[j] |= RX_OTHER_IN;
1892                                 if (queue_IsNotEmpty(&tcall->tq))
1893                                     tconn.callOther[j] |= RX_OTHER_OUT;
1894                             } else
1895                                 tconn.callState[j] = RX_STATE_NOTINIT;
1896                         }
1897
1898                         tconn.natMTU = htonl(tc->peer->natMTU);
1899                         tconn.error = htonl(tc->error);
1900                         tconn.flags = tc->flags;
1901                         tconn.type = tc->type;
1902                         tconn.securityIndex = tc->securityIndex;
1903                         if (tc->securityObject) {
1904                             RXS_GetStats(tc->securityObject, tc,
1905                                          &tconn.secStats);
1906 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1907 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1908                             DOHTONL(flags);
1909                             DOHTONL(expires);
1910                             DOHTONL(packetsReceived);
1911                             DOHTONL(packetsSent);
1912                             DOHTONL(bytesReceived);
1913                             DOHTONL(bytesSent);
1914                             for (i = 0;
1915                                  i <
1916                                  sizeof(tconn.secStats.spares) /
1917                                  sizeof(short); i++)
1918                                 DOHTONS(spares[i]);
1919                             for (i = 0;
1920                                  i <
1921                                  sizeof(tconn.secStats.sparel) /
1922                                  sizeof(afs_int32); i++)
1923                                 DOHTONL(sparel[i]);
1924                         }
1925
1926                         MUTEX_EXIT(&rx_connHashTable_lock);
1927                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1928                                        (char *)&tconn);
1929                         tl = ap->length;
1930                         ap->length = sizeof(struct rx_debugConn);
1931                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1932                                             istack);
1933                         ap->length = tl;
1934                         return ap;
1935                     }
1936                 }
1937                 MUTEX_EXIT(&rx_connHashTable_lock);
1938             }
1939             /* if we make it here, there are no interesting packets */
1940             tconn.cid = htonl(0xffffffff);      /* means end */
1941             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1942                            (char *)&tconn);
1943             tl = ap->length;
1944             ap->length = sizeof(struct rx_debugConn);
1945             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1946             ap->length = tl;
1947             break;
1948         }
1949
1950         /*
1951          * Pass back all the peer structures we have available
1952          */
1953
1954     case RX_DEBUGI_GETPEER:{
1955             unsigned int i;
1956             struct rx_peer *tp;
1957             struct rx_debugPeer tpeer;
1958
1959
1960             tl = sizeof(struct rx_debugPeer) - ap->length;
1961             if (tl > 0)
1962                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1963             if (tl > 0)
1964                 return ap;
1965
1966             memset(&tpeer, 0, sizeof(tpeer));
1967             for (i = 0; i < rx_hashTableSize; i++) {
1968 #if !defined(KERNEL)
1969                 /* the time complexity of the algorithm used here
1970                  * exponentially increses with the number of peers.
1971                  *
1972                  * Yielding after processing each hash table entry
1973                  * and dropping rx_peerHashTable_lock.
1974                  * also increases the risk that we will miss a new
1975                  * entry - but we are willing to live with this
1976                  * limitation since this is meant for debugging only
1977                  */
1978 #ifdef AFS_PTHREAD_ENV
1979                 pthread_yield();
1980 #else
1981                 (void)IOMGR_Poll();
1982 #endif
1983 #endif
1984                 MUTEX_ENTER(&rx_peerHashTable_lock);
1985                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1986                     if (tin.index-- <= 0) {
1987                         tp->refCount++;
1988                         MUTEX_EXIT(&rx_peerHashTable_lock);
1989
1990                         MUTEX_ENTER(&tp->peer_lock);
1991                         tpeer.host = tp->host;
1992                         tpeer.port = tp->port;
1993                         tpeer.ifMTU = htons(tp->ifMTU);
1994                         tpeer.idleWhen = htonl(tp->idleWhen);
1995                         tpeer.refCount = htons(tp->refCount);
1996                         tpeer.burstSize = tp->burstSize;
1997                         tpeer.burst = tp->burst;
1998                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1999                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
2000                         tpeer.rtt = htonl(tp->rtt);
2001                         tpeer.rtt_dev = htonl(tp->rtt_dev);
2002                         tpeer.timeout.sec = htonl(tp->timeout.sec);
2003                         tpeer.timeout.usec = htonl(tp->timeout.usec);
2004                         tpeer.nSent = htonl(tp->nSent);
2005                         tpeer.reSends = htonl(tp->reSends);
2006                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
2007                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
2008                         tpeer.rateFlag = htonl(tp->rateFlag);
2009                         tpeer.natMTU = htons(tp->natMTU);
2010                         tpeer.maxMTU = htons(tp->maxMTU);
2011                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2012                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2013                         tpeer.MTU = htons(tp->MTU);
2014                         tpeer.cwind = htons(tp->cwind);
2015                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2016                         tpeer.congestSeq = htons(tp->congestSeq);
2017                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2018                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2019                         tpeer.bytesReceived.high =
2020                             htonl(tp->bytesReceived.high);
2021                         tpeer.bytesReceived.low =
2022                             htonl(tp->bytesReceived.low);
2023                         MUTEX_EXIT(&tp->peer_lock);
2024
2025                         MUTEX_ENTER(&rx_peerHashTable_lock);
2026                         tp->refCount--;
2027                         MUTEX_EXIT(&rx_peerHashTable_lock);
2028
2029                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2030                                        (char *)&tpeer);
2031                         tl = ap->length;
2032                         ap->length = sizeof(struct rx_debugPeer);
2033                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2034                                             istack);
2035                         ap->length = tl;
2036                         return ap;
2037                     }
2038                 }
2039                 MUTEX_EXIT(&rx_peerHashTable_lock);
2040             }
2041             /* if we make it here, there are no interesting packets */
2042             tpeer.host = htonl(0xffffffff);     /* means end */
2043             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2044                            (char *)&tpeer);
2045             tl = ap->length;
2046             ap->length = sizeof(struct rx_debugPeer);
2047             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2048             ap->length = tl;
2049             break;
2050         }
2051
2052     case RX_DEBUGI_RXSTATS:{
2053             int i;
2054             afs_int32 *s;
2055
2056             tl = sizeof(rx_stats) - ap->length;
2057             if (tl > 0)
2058                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2059             if (tl > 0)
2060                 return ap;
2061
2062             /* Since its all int32s convert to network order with a loop. */
2063         if (rx_stats_active)
2064             MUTEX_ENTER(&rx_stats_mutex);
2065             s = (afs_int32 *) & rx_stats;
2066             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2067                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2068
2069             tl = ap->length;
2070             ap->length = sizeof(rx_stats);
2071         if (rx_stats_active)
2072             MUTEX_EXIT(&rx_stats_mutex);
2073             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2074             ap->length = tl;
2075             break;
2076         }
2077
2078     default:
2079         /* error response packet */
2080         tin.type = htonl(RX_DEBUGI_BADTYPE);
2081         tin.index = tin.type;
2082         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2083         tl = ap->length;
2084         ap->length = sizeof(struct rx_debugIn);
2085         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2086         ap->length = tl;
2087         break;
2088     }
2089     return ap;
2090 }
2091
2092 struct rx_packet *
2093 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2094                          afs_uint32 ahost, short aport, int istack)
2095 {
2096     afs_int32 tl;
2097
2098     /*
2099      * Only respond to client-initiated version requests, and
2100      * clear that flag in the response.
2101      */
2102     if (ap->header.flags & RX_CLIENT_INITIATED) {
2103         char buf[66];
2104
2105         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2106         rxi_EncodePacketHeader(ap);
2107         memset(buf, 0, sizeof(buf));
2108         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2109         rx_packetwrite(ap, 0, 65, buf);
2110         tl = ap->length;
2111         ap->length = 65;
2112         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2113         ap->length = tl;
2114     }
2115
2116     return ap;
2117 }
2118
2119
2120 /* send a debug packet back to the sender */
2121 static void
2122 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2123                     afs_uint32 ahost, short aport, afs_int32 istack)
2124 {
2125     struct sockaddr_in taddr;
2126     unsigned int i, nbytes, savelen = 0;
2127     int saven = 0;
2128 #ifdef KERNEL
2129     int waslocked = ISAFS_GLOCK();
2130 #endif
2131
2132     taddr.sin_family = AF_INET;
2133     taddr.sin_port = aport;
2134     taddr.sin_addr.s_addr = ahost;
2135 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2136     taddr.sin_len = sizeof(struct sockaddr_in);
2137 #endif
2138
2139     /* We need to trim the niovecs. */
2140     nbytes = apacket->length;
2141     for (i = 1; i < apacket->niovecs; i++) {
2142         if (nbytes <= apacket->wirevec[i].iov_len) {
2143             savelen = apacket->wirevec[i].iov_len;
2144             saven = apacket->niovecs;
2145             apacket->wirevec[i].iov_len = nbytes;
2146             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2147         } else
2148             nbytes -= apacket->wirevec[i].iov_len;
2149     }
2150 #ifdef KERNEL
2151 #ifdef RX_KERNEL_TRACE
2152     if (ICL_SETACTIVE(afs_iclSetp)) {
2153         if (!waslocked)
2154             AFS_GLOCK();
2155         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2156                    "before osi_NetSend()");
2157         AFS_GUNLOCK();
2158     } else
2159 #else
2160     if (waslocked)
2161         AFS_GUNLOCK();
2162 #endif
2163 #endif
2164     /* debug packets are not reliably delivered, hence the cast below. */
2165     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2166                       apacket->length + RX_HEADER_SIZE, istack);
2167 #ifdef KERNEL
2168 #ifdef RX_KERNEL_TRACE
2169     if (ICL_SETACTIVE(afs_iclSetp)) {
2170         AFS_GLOCK();
2171         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2172                    "after osi_NetSend()");
2173         if (!waslocked)
2174             AFS_GUNLOCK();
2175     } else
2176 #else
2177     if (waslocked)
2178         AFS_GLOCK();
2179 #endif
2180 #endif
2181     if (saven) {                /* means we truncated the packet above. */
2182         apacket->wirevec[i - 1].iov_len = savelen;
2183         apacket->niovecs = saven;
2184     }
2185
2186 }
2187
2188 /* Send the packet to appropriate destination for the specified
2189  * call.  The header is first encoded and placed in the packet.
2190  */
2191 void
2192 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2193                struct rx_packet *p, int istack)
2194 {
2195 #if defined(KERNEL)
2196     int waslocked;
2197 #endif
2198     int code;
2199     struct sockaddr_in addr;
2200     struct rx_peer *peer = conn->peer;
2201     osi_socket socket;
2202 #ifdef RXDEBUG
2203     char deliveryType = 'S';
2204 #endif
2205     /* The address we're sending the packet to */
2206     memset(&addr, 0, sizeof(addr));
2207     addr.sin_family = AF_INET;
2208     addr.sin_port = peer->port;
2209     addr.sin_addr.s_addr = peer->host;
2210
2211     /* This stuff should be revamped, I think, so that most, if not
2212      * all, of the header stuff is always added here.  We could
2213      * probably do away with the encode/decode routines. XXXXX */
2214
2215     /* Stamp each packet with a unique serial number.  The serial
2216      * number is maintained on a connection basis because some types
2217      * of security may be based on the serial number of the packet,
2218      * and security is handled on a per authenticated-connection
2219      * basis. */
2220     /* Pre-increment, to guarantee no zero serial number; a zero
2221      * serial number means the packet was never sent. */
2222     MUTEX_ENTER(&conn->conn_data_lock);
2223     p->header.serial = ++conn->serial;
2224     if (p->length > conn->peer->maxPacketSize) {
2225         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2226             (p->header.flags & RX_REQUEST_ACK)) {
2227             conn->lastPingSize = p->length;
2228             conn->lastPingSizeSer = p->header.serial;
2229         } else if (p->header.seq != 0) {
2230             conn->lastPacketSize = p->length;
2231             conn->lastPacketSizeSeq = p->header.seq;
2232         }
2233     }
2234     MUTEX_EXIT(&conn->conn_data_lock);
2235     /* This is so we can adjust retransmit time-outs better in the face of
2236      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2237      */
2238     if (p->firstSerial == 0) {
2239         p->firstSerial = p->header.serial;
2240     }
2241 #ifdef RXDEBUG
2242     /* If an output tracer function is defined, call it with the packet and
2243      * network address.  Note this function may modify its arguments. */
2244     if (rx_almostSent) {
2245         int drop = (*rx_almostSent) (p, &addr);
2246         /* drop packet if return value is non-zero? */
2247         if (drop)
2248             deliveryType = 'D'; /* Drop the packet */
2249     }
2250 #endif
2251
2252     /* Get network byte order header */
2253     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2254                                  * touch ALL the fields */
2255
2256     /* Send the packet out on the same socket that related packets are being
2257      * received on */
2258     socket =
2259         (conn->type ==
2260          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2261
2262 #ifdef RXDEBUG
2263     /* Possibly drop this packet,  for testing purposes */
2264     if ((deliveryType == 'D')
2265         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2266             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2267         deliveryType = 'D';     /* Drop the packet */
2268     } else {
2269         deliveryType = 'S';     /* Send the packet */
2270 #endif /* RXDEBUG */
2271
2272         /* Loop until the packet is sent.  We'd prefer just to use a
2273          * blocking socket, but unfortunately the interface doesn't
2274          * allow us to have the socket block in send mode, and not
2275          * block in receive mode */
2276 #ifdef KERNEL
2277         waslocked = ISAFS_GLOCK();
2278 #ifdef RX_KERNEL_TRACE
2279         if (ICL_SETACTIVE(afs_iclSetp)) {
2280             if (!waslocked)
2281                 AFS_GLOCK();
2282             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2283                        "before osi_NetSend()");
2284             AFS_GUNLOCK();
2285         } else
2286 #else
2287         if (waslocked)
2288             AFS_GUNLOCK();
2289 #endif
2290 #endif
2291         if ((code =
2292              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2293                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2294             /* send failed, so let's hurry up the resend, eh? */
2295             if (rx_stats_active)
2296                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2297             p->retryTime = p->timeSent; /* resend it very soon */
2298             clock_Addmsec(&(p->retryTime),
2299                           10 + (((afs_uint32) p->backoff) << 8));
2300             /* Some systems are nice and tell us right away that we cannot
2301              * reach this recipient by returning an error code.
2302              * So, when this happens let's "down" the host NOW so
2303              * we don't sit around waiting for this host to timeout later.
2304              */
2305             if (call &&
2306 #ifdef AFS_NT40_ENV
2307                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2308 #elif defined(AFS_LINUX20_ENV)
2309                 code == -ENETUNREACH
2310 #elif defined(AFS_DARWIN_ENV)
2311                 code == EHOSTUNREACH
2312 #else
2313                 0
2314 #endif
2315                 )
2316                 call->lastReceiveTime = 0;
2317         }
2318 #ifdef KERNEL
2319 #ifdef RX_KERNEL_TRACE
2320         if (ICL_SETACTIVE(afs_iclSetp)) {
2321             AFS_GLOCK();
2322             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2323                        "after osi_NetSend()");
2324             if (!waslocked)
2325                 AFS_GUNLOCK();
2326         } else
2327 #else
2328         if (waslocked)
2329             AFS_GLOCK();
2330 #endif
2331 #endif
2332 #ifdef RXDEBUG
2333     }
2334     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2335           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2336           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2337           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2338 #endif
2339     if (rx_stats_active)
2340         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2341     MUTEX_ENTER(&peer->peer_lock);
2342     hadd32(peer->bytesSent, p->length);
2343     MUTEX_EXIT(&peer->peer_lock);
2344 }
2345
2346 /* Send a list of packets to appropriate destination for the specified
2347  * connection.  The headers are first encoded and placed in the packets.
2348  */
2349 void
2350 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2351                    struct rx_packet **list, int len, int istack)
2352 {
2353 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2354     int waslocked;
2355 #endif
2356     struct sockaddr_in addr;
2357     struct rx_peer *peer = conn->peer;
2358     osi_socket socket;
2359     struct rx_packet *p = NULL;
2360     struct iovec wirevec[RX_MAXIOVECS];
2361     int i, length, code;
2362     afs_uint32 serial;
2363     afs_uint32 temp;
2364     struct rx_jumboHeader *jp;
2365 #ifdef RXDEBUG
2366     char deliveryType = 'S';
2367 #endif
2368     /* The address we're sending the packet to */
2369     addr.sin_family = AF_INET;
2370     addr.sin_port = peer->port;
2371     addr.sin_addr.s_addr = peer->host;
2372
2373     if (len + 1 > RX_MAXIOVECS) {
2374         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2375     }
2376
2377     /*
2378      * Stamp the packets in this jumbogram with consecutive serial numbers
2379      */
2380     MUTEX_ENTER(&conn->conn_data_lock);
2381     serial = conn->serial;
2382     conn->serial += len;
2383     for (i = 0; i < len; i++) {
2384         p = list[i];
2385         if (p->length > conn->peer->maxPacketSize) {
2386             /* a ping *or* a sequenced packet can count */
2387             if ((p->length > conn->peer->maxPacketSize)) {
2388                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2389                      (p->header.flags & RX_REQUEST_ACK)) &&
2390                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2391                     conn->lastPingSize = p->length;
2392                     conn->lastPingSizeSer = serial + i;
2393                 } else if ((p->header.seq != 0) &&
2394                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2395                     conn->lastPacketSize = p->length;
2396                     conn->lastPacketSizeSeq = p->header.seq;
2397                 }
2398             }
2399         }
2400     }
2401     MUTEX_EXIT(&conn->conn_data_lock);
2402
2403
2404     /* This stuff should be revamped, I think, so that most, if not
2405      * all, of the header stuff is always added here.  We could
2406      * probably do away with the encode/decode routines. XXXXX */
2407
2408     jp = NULL;
2409     length = RX_HEADER_SIZE;
2410     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2411     wirevec[0].iov_len = RX_HEADER_SIZE;
2412     for (i = 0; i < len; i++) {
2413         p = list[i];
2414
2415         /* The whole 3.5 jumbogram scheme relies on packets fitting
2416          * in a single packet buffer. */
2417         if (p->niovecs > 2) {
2418             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2419         }
2420
2421         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2422          * in this chunk.  */
2423         if (i < len - 1) {
2424             if (p->length != RX_JUMBOBUFFERSIZE) {
2425                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2426             }
2427             p->header.flags |= RX_JUMBO_PACKET;
2428             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2429             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2430         } else {
2431             wirevec[i + 1].iov_len = p->length;
2432             length += p->length;
2433         }
2434         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2435         if (jp != NULL) {
2436             /* Convert jumbo packet header to network byte order */
2437             temp = (afs_uint32) (p->header.flags) << 24;
2438             temp |= (afs_uint32) (p->header.spare);
2439             *(afs_uint32 *) jp = htonl(temp);
2440         }
2441         jp = (struct rx_jumboHeader *)
2442             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2443
2444         /* Stamp each packet with a unique serial number.  The serial
2445          * number is maintained on a connection basis because some types
2446          * of security may be based on the serial number of the packet,
2447          * and security is handled on a per authenticated-connection
2448          * basis. */
2449         /* Pre-increment, to guarantee no zero serial number; a zero
2450          * serial number means the packet was never sent. */
2451         p->header.serial = ++serial;
2452         /* This is so we can adjust retransmit time-outs better in the face of
2453          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2454          */
2455         if (p->firstSerial == 0) {
2456             p->firstSerial = p->header.serial;
2457         }
2458 #ifdef RXDEBUG
2459         /* If an output tracer function is defined, call it with the packet and
2460          * network address.  Note this function may modify its arguments. */
2461         if (rx_almostSent) {
2462             int drop = (*rx_almostSent) (p, &addr);
2463             /* drop packet if return value is non-zero? */
2464             if (drop)
2465                 deliveryType = 'D';     /* Drop the packet */
2466         }
2467 #endif
2468
2469         /* Get network byte order header */
2470         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2471                                          * touch ALL the fields */
2472     }
2473
2474     /* Send the packet out on the same socket that related packets are being
2475      * received on */
2476     socket =
2477         (conn->type ==
2478          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2479
2480 #ifdef RXDEBUG
2481     /* Possibly drop this packet,  for testing purposes */
2482     if ((deliveryType == 'D')
2483         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2484             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2485         deliveryType = 'D';     /* Drop the packet */
2486     } else {
2487         deliveryType = 'S';     /* Send the packet */
2488 #endif /* RXDEBUG */
2489
2490         /* Loop until the packet is sent.  We'd prefer just to use a
2491          * blocking socket, but unfortunately the interface doesn't
2492          * allow us to have the socket block in send mode, and not
2493          * block in receive mode */
2494 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2495         waslocked = ISAFS_GLOCK();
2496         if (!istack && waslocked)
2497             AFS_GUNLOCK();
2498 #endif
2499         if ((code =
2500              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2501                          istack)) != 0) {
2502             /* send failed, so let's hurry up the resend, eh? */
2503             if (rx_stats_active)
2504                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2505             for (i = 0; i < len; i++) {
2506                 p = list[i];
2507                 p->retryTime = p->timeSent;     /* resend it very soon */
2508                 clock_Addmsec(&(p->retryTime),
2509                               10 + (((afs_uint32) p->backoff) << 8));
2510             }
2511             /* Some systems are nice and tell us right away that we cannot
2512              * reach this recipient by returning an error code.
2513              * So, when this happens let's "down" the host NOW so
2514              * we don't sit around waiting for this host to timeout later.
2515              */
2516             if (call &&
2517 #ifdef AFS_NT40_ENV
2518                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2519 #elif defined(AFS_LINUX20_ENV)
2520                 code == -ENETUNREACH
2521 #elif defined(AFS_DARWIN_ENV)
2522                 code == EHOSTUNREACH
2523 #else
2524                 0
2525 #endif
2526                 )
2527                 call->lastReceiveTime = 0;
2528         }
2529 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2530         if (!istack && waslocked)
2531             AFS_GLOCK();
2532 #endif
2533 #ifdef RXDEBUG
2534     }
2535
2536     assert(p != NULL);
2537
2538     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2539           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2540           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2541           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2542
2543 #endif
2544     if (rx_stats_active)
2545         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2546     MUTEX_ENTER(&peer->peer_lock);
2547     hadd32(peer->bytesSent, p->length);
2548     MUTEX_EXIT(&peer->peer_lock);
2549 }
2550
2551
2552 /* Send a "special" packet to the peer connection.  If call is
2553  * specified, then the packet is directed to a specific call channel
2554  * associated with the connection, otherwise it is directed to the
2555  * connection only. Uses optionalPacket if it is supplied, rather than
2556  * allocating a new packet buffer.  Nbytes is the length of the data
2557  * portion of the packet.  If data is non-null, nbytes of data are
2558  * copied into the packet.  Type is the type of the packet, as defined
2559  * in rx.h.  Bug: there's a lot of duplication between this and other
2560  * routines.  This needs to be cleaned up. */
2561 struct rx_packet *
2562 rxi_SendSpecial(struct rx_call *call,
2563                 struct rx_connection *conn,
2564                 struct rx_packet *optionalPacket, int type, char *data,
2565                 int nbytes, int istack)
2566 {
2567     /* Some of the following stuff should be common code for all
2568      * packet sends (it's repeated elsewhere) */
2569     struct rx_packet *p;
2570     unsigned int i = 0;
2571     int savelen = 0, saven = 0;
2572     int channel, callNumber;
2573     if (call) {
2574         channel = call->channel;
2575         callNumber = *call->callNumber;
2576         /* BUSY packets refer to the next call on this connection */
2577         if (type == RX_PACKET_TYPE_BUSY) {
2578             callNumber++;
2579         }
2580     } else {
2581         channel = 0;
2582         callNumber = 0;
2583     }
2584     p = optionalPacket;
2585     if (!p) {
2586         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2587         if (!p)
2588             osi_Panic("rxi_SendSpecial failure");
2589     }
2590
2591     if (nbytes != -1)
2592         p->length = nbytes;
2593     else
2594         nbytes = p->length;
2595     p->header.serviceId = conn->serviceId;
2596     p->header.securityIndex = conn->securityIndex;
2597     p->header.cid = (conn->cid | channel);
2598     p->header.callNumber = callNumber;
2599     p->header.seq = 0;
2600     p->header.epoch = conn->epoch;
2601     p->header.type = type;
2602     p->header.flags = 0;
2603     if (conn->type == RX_CLIENT_CONNECTION)
2604         p->header.flags |= RX_CLIENT_INITIATED;
2605     if (data)
2606         rx_packetwrite(p, 0, nbytes, data);
2607
2608     for (i = 1; i < p->niovecs; i++) {
2609         if (nbytes <= p->wirevec[i].iov_len) {
2610             savelen = p->wirevec[i].iov_len;
2611             saven = p->niovecs;
2612             p->wirevec[i].iov_len = nbytes;
2613             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2614         } else
2615             nbytes -= p->wirevec[i].iov_len;
2616     }
2617
2618     if (call)
2619         rxi_Send(call, p, istack);
2620     else
2621         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2622     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2623         /* really need to do this, but it seems safer this way, given that  */
2624         /* sneaky optionalPacket... */
2625         p->wirevec[i - 1].iov_len = savelen;
2626         p->niovecs = saven;
2627     }
2628     if (!optionalPacket)
2629         rxi_FreePacket(p);
2630     return optionalPacket;
2631 }
2632
2633
2634 /* Encode the packet's header (from the struct header in the packet to
2635  * the net byte order representation in the wire representation of the
2636  * packet, which is what is actually sent out on the wire) */
2637 void
2638 rxi_EncodePacketHeader(struct rx_packet *p)
2639 {
2640     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2641
2642     memset(buf, 0, RX_HEADER_SIZE);
2643     *buf++ = htonl(p->header.epoch);
2644     *buf++ = htonl(p->header.cid);
2645     *buf++ = htonl(p->header.callNumber);
2646     *buf++ = htonl(p->header.seq);
2647     *buf++ = htonl(p->header.serial);
2648     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2649                    | (((afs_uint32) p->header.flags) << 16)
2650                    | (p->header.userStatus << 8) | p->header.securityIndex);
2651     /* Note: top 16 bits of this next word were reserved */
2652     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2653 }
2654
2655 /* Decode the packet's header (from net byte order to a struct header) */
2656 void
2657 rxi_DecodePacketHeader(struct rx_packet *p)
2658 {
2659     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2660     afs_uint32 temp;
2661
2662     p->header.epoch = ntohl(*buf);
2663     buf++;
2664     p->header.cid = ntohl(*buf);
2665     buf++;
2666     p->header.callNumber = ntohl(*buf);
2667     buf++;
2668     p->header.seq = ntohl(*buf);
2669     buf++;
2670     p->header.serial = ntohl(*buf);
2671     buf++;
2672
2673     temp = ntohl(*buf);
2674     buf++;
2675
2676     /* C will truncate byte fields to bytes for me */
2677     p->header.type = temp >> 24;
2678     p->header.flags = temp >> 16;
2679     p->header.userStatus = temp >> 8;
2680     p->header.securityIndex = temp >> 0;
2681
2682     temp = ntohl(*buf);
2683     buf++;
2684
2685     p->header.serviceId = (temp & 0xffff);
2686     p->header.spare = temp >> 16;
2687     /* Note: top 16 bits of this last word are the security checksum */
2688 }
2689
2690 void
2691 rxi_PrepareSendPacket(struct rx_call *call,
2692                       struct rx_packet *p, int last)
2693 {
2694     struct rx_connection *conn = call->conn;
2695     unsigned int i;
2696     afs_int32 len;              /* len must be a signed type; it can go negative */
2697
2698     p->flags &= ~RX_PKTFLAG_ACKED;
2699     p->header.cid = (conn->cid | call->channel);
2700     p->header.serviceId = conn->serviceId;
2701     p->header.securityIndex = conn->securityIndex;
2702
2703     /* No data packets on call 0. Where do these come from? */
2704     if (*call->callNumber == 0)
2705         *call->callNumber = 1;
2706
2707     p->header.callNumber = *call->callNumber;
2708     p->header.seq = call->tnext++;
2709     p->header.epoch = conn->epoch;
2710     p->header.type = RX_PACKET_TYPE_DATA;
2711     p->header.flags = 0;
2712     p->header.spare = 0;
2713     if (conn->type == RX_CLIENT_CONNECTION)
2714         p->header.flags |= RX_CLIENT_INITIATED;
2715
2716     if (last)
2717         p->header.flags |= RX_LAST_PACKET;
2718
2719     clock_Zero(&p->retryTime);  /* Never yet transmitted */
2720     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2721     p->header.serial = 0;       /* Another way of saying never transmitted... */
2722     p->backoff = 0;
2723
2724     /* Now that we're sure this is the last data on the call, make sure
2725      * that the "length" and the sum of the iov_lens matches. */
2726     len = p->length + call->conn->securityHeaderSize;
2727
2728     for (i = 1; i < p->niovecs && len > 0; i++) {
2729         len -= p->wirevec[i].iov_len;
2730     }
2731     if (len > 0) {
2732         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2733     } else if (i < p->niovecs) {
2734         /* Free any extra elements in the wirevec */
2735 #if defined(RX_ENABLE_TSFPQ)
2736         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2737 #else /* !RX_ENABLE_TSFPQ */
2738         MUTEX_ENTER(&rx_freePktQ_lock);
2739         rxi_FreeDataBufsNoLock(p, i);
2740         MUTEX_EXIT(&rx_freePktQ_lock);
2741 #endif /* !RX_ENABLE_TSFPQ */
2742
2743         p->niovecs = i;
2744     }
2745     if (len)
2746         p->wirevec[i - 1].iov_len += len;
2747     RXS_PreparePacket(conn->securityObject, call, p);
2748 }
2749
2750 /* Given an interface MTU size, calculate an adjusted MTU size that
2751  * will make efficient use of the RX buffers when the peer is sending
2752  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2753 int
2754 rxi_AdjustIfMTU(int mtu)
2755 {
2756     int adjMTU;
2757     int frags;
2758
2759     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2760         return mtu;
2761     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2762     if (mtu <= adjMTU) {
2763         return mtu;
2764     }
2765     mtu -= adjMTU;
2766     if (mtu <= 0) {
2767         return adjMTU;
2768     }
2769     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2770     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2771 }
2772
2773 /* Given an interface MTU size, and the peer's advertised max receive
2774  * size, calculate an adjisted maxMTU size that makes efficient use
2775  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2776 int
2777 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2778 {
2779     int maxMTU = mtu * rxi_nSendFrags;
2780     maxMTU = MIN(maxMTU, peerMaxMTU);
2781     return rxi_AdjustIfMTU(maxMTU);
2782 }
2783
2784 /* Given a packet size, figure out how many datagram packet will fit.
2785  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2786  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2787  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2788 int
2789 rxi_AdjustDgramPackets(int frags, int mtu)
2790 {
2791     int maxMTU;
2792     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2793         return 1;
2794     }
2795     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2796     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2797     /* subtract the size of the first and last packets */
2798     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2799     if (maxMTU < 0) {
2800         return 1;
2801     }
2802     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2803 }
2804
#ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * outputFile: destination of the dump.  With AFS_NT40_ENV defined it is
 *             handed to WriteFile(); otherwise it is written with fprintf().
 * cookie:     string printed at the start of every line of the dump.
 *
 * The body is compiled only when RXDEBUG_PACKET is defined.  It walks the
 * list starting at rx_mallocedP (linked through allNextp) while holding
 * rx_freePktQ_lock, between NETPRI and USERPRI.
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* receives WriteFile's byte count; value unused */
    /* NOTE(review): lines are formatted with an unbounded sprintf into this
     * fixed buffer; a very long cookie could overrun it -- confirm callers. */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    /* One line per packet on the rx_mallocedP list. */
    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a void pointer argument, hence the cast on p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, backoff=%u, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec, p->retryTime.sec, p->retryTime.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->backoff, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
#endif