src/rx/rx_packet.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 #include <afsconfig.h>
  11 #ifdef KERNEL
  12 #include "afs/param.h"
  13 #else
  14 #include <afs/param.h>
  15 #endif
  16
  17 #ifdef KERNEL
  18 #if defined(UKERNEL)
  19 #include "afs/sysincludes.h"
  20 #include "afsincludes.h"
  21 #include "rx/rx_kcommon.h"
  22 #include "rx/rx_clock.h"
  23 #include "rx/rx_queue.h"
  24 #include "rx/rx_packet.h"
  25 #else /* defined(UKERNEL) */
  26 #ifdef RX_KERNEL_TRACE
  27 #include "../rx/rx_kcommon.h"
  28 #endif
  29 #include "h/types.h"
  30 #ifndef AFS_LINUX20_ENV
  31 #include "h/systm.h"
  32 #endif
  33 #if defined(AFS_SGI_ENV) || defined(AFS_HPUX110_ENV)
  34 #include "afs/sysincludes.h"
  35 #endif
  36 #if defined(AFS_OBSD_ENV)
  37 #include "h/proc.h"
  38 #endif
  39 #include "h/socket.h"
  40 #if !defined(AFS_SUN5_ENV) &&  !defined(AFS_LINUX20_ENV) && !defined(AFS_HPUX110_ENV)
  41 #if     !defined(AFS_OSF_ENV) && !defined(AFS_AIX41_ENV)
  42 #include "sys/mount.h"          /* it gets pulled in by something later anyway */
  43 #endif
  44 #include "h/mbuf.h"
  45 #endif
  46 #include "netinet/in.h"
  47 #include "afs/afs_osi.h"
  48 #include "rx_kmutex.h"
  49 #include "rx/rx_clock.h"
  50 #include "rx/rx_queue.h"
  51 #ifdef  AFS_SUN5_ENV
  52 #include <sys/sysmacros.h>
  53 #endif
  54 #include "rx/rx_packet.h"
  55 #endif /* defined(UKERNEL) */
  56 #include "rx/rx_globals.h"
  57 #else /* KERNEL */
  58 #include "sys/types.h"
  59 #include <sys/stat.h>
  60 #include <errno.h>
  61 #if defined(AFS_NT40_ENV)
  62 #include <winsock2.h>
  63 #ifndef EWOULDBLOCK
  64 #define EWOULDBLOCK WSAEWOULDBLOCK
  65 #endif
  66 #include "rx_user.h"
  67 #include "rx_xmit_nt.h"
  68 #include <stdlib.h>
  69 #else
  70 #include <sys/socket.h>
  71 #include <netinet/in.h>
  72 #endif
  73 #include "rx_clock.h"
  74 #include "rx.h"
  75 #include "rx_queue.h"
  76 #ifdef  AFS_SUN5_ENV
  77 #include <sys/sysmacros.h>
  78 #endif
  79 #include "rx_packet.h"
  80 #include "rx_globals.h"
  81 #include <lwp.h>
  82 #include <assert.h>
  83 #include <string.h>
  84 #ifdef HAVE_UNISTD_H
  85 #include <unistd.h>
  86 #endif
  87 #endif /* KERNEL */
  88
   89 #ifdef RX_LOCKS_DB
   90 /* rxdb_fileID is used to identify the lock location, along with line#. */
   91 static int rxdb_fileID = RXDB_FILE_RX_PACKET;
   92 #endif /* RX_LOCKS_DB */
      /* Most recently initialized packet.  Each allocator in this file stores
       * every packet it sets up here; under RXDEBUG_PACKET the previous value is
       * first saved in the new packet's allNextp field. */
   93 static struct rx_packet *rx_mallocedP = 0;
   94 #ifdef RXDEBUG_PACKET
      /* Next value assigned to a packet's packetId by the allocators. */
   95 static afs_uint32       rx_packet_id = 0;
   96 #endif
   97
   98 extern char cml_version_number[];
   99
      /* Forward declarations of file-local (static) helpers. */
  100 static int AllocPacketBufs(int class, int num_pkts, struct rx_queue *q);
  101
  102 static void rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
  103                                 afs_uint32 ahost, short aport,
  104                                 afs_int32 istack);
  105
      /* Only one continuation-buffer release helper exists per build, selected
       * by RX_ENABLE_TSFPQ. */
  106 #ifdef RX_ENABLE_TSFPQ
  107 static int
  108 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global);
  109 #else
  110 static int rxi_FreeDataBufsToQueue(struct rx_packet *p,
  111                                    afs_uint32 first,
  112                                    struct rx_queue * q);
  113 #endif
 114
 115 /* some rules about packets:
 116  * 1.  When a packet is allocated, the final iov_buf contains room for
 117  * a security trailer, but iov_len masks that fact.  If the security
 118  * package wants to add the trailer, it may do so, and then extend
 119  * iov_len appropriately.  For this reason, packet's niovecs and
 120  * iov_len fields should be accurate before calling PreparePacket.
 121 */
 122
 123 /* Preconditions:
 124  *        all packet buffers (iov_base) are integral multiples of
 125  *        the word size.
 126  *        offset is an integral multiple of the word size.
 127  */
 128 afs_int32
 129 rx_SlowGetInt32(struct rx_packet *packet, size_t offset)
 130 {
 131     unsigned int i;
 132     size_t l;
 133     for (l = 0, i = 1; i < packet->niovecs; i++) {
 134         if (l + packet->wirevec[i].iov_len > offset) {
 135             return
 136                 *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 137                                  (offset - l)));
 138         }
 139         l += packet->wirevec[i].iov_len;
 140     }
 141
 142     return 0;
 143 }
 144
 145 /* Preconditions:
 146  *        all packet buffers (iov_base) are integral multiples of the word size.
 147  *        offset is an integral multiple of the word size.
 148  */
 149 afs_int32
 150 rx_SlowPutInt32(struct rx_packet * packet, size_t offset, afs_int32 data)
 151 {
 152     unsigned int i;
 153     size_t l;
 154     for (l = 0, i = 1; i < packet->niovecs; i++) {
 155         if (l + packet->wirevec[i].iov_len > offset) {
 156             *((afs_int32 *) ((char *)(packet->wirevec[i].iov_base) +
 157                              (offset - l))) = data;
 158             return 0;
 159         }
 160         l += packet->wirevec[i].iov_len;
 161     }
 162
 163     return 0;
 164 }
 165
 166 /* Preconditions:
 167  *        all packet buffers (iov_base) are integral multiples of the
 168  *        word size.
 169  *        offset is an integral multiple of the word size.
 170  * Packet Invariants:
 171  *         all buffers are contiguously arrayed in the iovec from 0..niovecs-1
 172  */
 173 afs_int32
 174 rx_SlowReadPacket(struct rx_packet * packet, unsigned int offset, int resid,
 175                   char *out)
 176 {
 177     unsigned int i, j, l, r;
 178     for (l = 0, i = 1; i < packet->niovecs; i++) {
 179         if (l + packet->wirevec[i].iov_len > offset) {
 180             break;
 181         }
 182         l += packet->wirevec[i].iov_len;
 183     }
 184
 185     /* i is the iovec which contains the first little bit of data in which we
 186      * are interested.  l is the total length of everything prior to this iovec.
 187      * j is the number of bytes we can safely copy out of this iovec.
 188      * offset only applies to the first iovec.
 189      */
 190     r = resid;
 191     while ((r > 0) && (i < packet->niovecs)) {
 192         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 193         memcpy(out, (char *)(packet->wirevec[i].iov_base) + (offset - l), j);
 194         r -= j;
 195         out += j;
 196         l += packet->wirevec[i].iov_len;
 197         offset = l;
 198         i++;
 199     }
 200
 201     return (r ? (resid - r) : resid);
 202 }
 203
 204
 205 /* Preconditions:
 206  *        all packet buffers (iov_base) are integral multiples of the
 207  *        word size.
 208  *        offset is an integral multiple of the word size.
 209  */
 210 afs_int32
 211 rx_SlowWritePacket(struct rx_packet * packet, int offset, int resid, char *in)
 212 {
 213     unsigned int i, j, l, o, r;
 214     char *b;
 215
 216     for (l = 0, i = 1, o = offset; i < packet->niovecs; i++) {
 217         if (l + packet->wirevec[i].iov_len > o) {
 218             break;
 219         }
 220         l += packet->wirevec[i].iov_len;
 221     }
 222
 223     /* i is the iovec which contains the first little bit of data in which we
 224      * are interested.  l is the total length of everything prior to this iovec.
 225      * j is the number of bytes we can safely copy out of this iovec.
 226      * offset only applies to the first iovec.
 227      */
 228     r = resid;
 229     while ((r > 0) && (i <= RX_MAXWVECS)) {
 230         if (i >= packet->niovecs)
 231             if (rxi_AllocDataBuf(packet, r, RX_PACKET_CLASS_SEND_CBUF) > 0)     /* ++niovecs as a side-effect */
 232                 break;
 233
 234         b = (char *)(packet->wirevec[i].iov_base) + (offset - l);
 235         j = MIN(r, packet->wirevec[i].iov_len - (offset - l));
 236         memcpy(b, in, j);
 237         r -= j;
 238         in += j;
 239         l += packet->wirevec[i].iov_len;
 240         offset = l;
 241         i++;
 242     }
 243
 244     return (r ? (resid - r) : resid);
 245 }
 246
 247 int
 248 rxi_AllocPackets(int class, int num_pkts, struct rx_queue * q)
 249 {
 250     struct rx_packet *p, *np;
 251
 252     num_pkts = AllocPacketBufs(class, num_pkts, q);
 253
 254     for (queue_Scan(q, p, np, rx_packet)) {
 255         RX_PACKET_IOV_FULLINIT(p);
 256     }
 257
 258     return num_pkts;
 259 }
 260
 261 #ifdef RX_ENABLE_TSFPQ
 262 static int
 263 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 264 {
 265     struct rx_ts_info_t * rx_ts_info;
 266     int transfer;
 267     SPLVAR;
 268
 269     RX_TS_INFO_GET(rx_ts_info);
 270
 271     transfer = num_pkts - rx_ts_info->_FPQ.len;
 272     if (transfer > 0) {
 273         NETPRI;
 274         MUTEX_ENTER(&rx_freePktQ_lock);
 275         transfer = MAX(transfer, rx_TSFPQGlobSize);
 276         if (transfer > rx_nFreePackets) {
 277             /* alloc enough for us, plus a few globs for other threads */
 278             rxi_MorePacketsNoLock(transfer + 4 * rx_initSendWindow);
 279         }
 280
 281         RX_TS_FPQ_GTOL2(rx_ts_info, transfer);
 282
 283         MUTEX_EXIT(&rx_freePktQ_lock);
 284         USERPRI;
 285     }
 286
 287     RX_TS_FPQ_QCHECKOUT(rx_ts_info, num_pkts, q);
 288
 289     return num_pkts;
 290 }
 291 #else /* RX_ENABLE_TSFPQ */
 292 static int
 293 AllocPacketBufs(int class, int num_pkts, struct rx_queue * q)
 294 {
 295     struct rx_packet *c;
 296     int i;
 297 #ifdef KERNEL
 298     int overq = 0;
 299 #endif
 300     SPLVAR;
 301
 302     NETPRI;
 303
 304     MUTEX_ENTER(&rx_freePktQ_lock);
 305
 306 #ifdef KERNEL
 307     for (; (num_pkts > 0) && (rxi_OverQuota2(class,num_pkts));
 308          num_pkts--, overq++);
 309
 310     if (overq) {
 311         rxi_NeedMorePackets = TRUE;
 312         if (rx_stats_active) {
 313             switch (class) {
 314             case RX_PACKET_CLASS_RECEIVE:
 315                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
 316                 break;
 317             case RX_PACKET_CLASS_SEND:
 318                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
 319                 break;
 320             case RX_PACKET_CLASS_SPECIAL:
 321                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
 322                 break;
 323             case RX_PACKET_CLASS_RECV_CBUF:
 324                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
 325                 break;
 326             case RX_PACKET_CLASS_SEND_CBUF:
 327                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
 328                 break;
 329             }
 330         }
 331     }
 332
 333     if (rx_nFreePackets < num_pkts)
 334         num_pkts = rx_nFreePackets;
 335
 336     if (!num_pkts) {
 337         rxi_NeedMorePackets = TRUE;
 338         goto done;
 339     }
 340 #else /* KERNEL */
 341     if (rx_nFreePackets < num_pkts) {
 342         rxi_MorePacketsNoLock(MAX((num_pkts-rx_nFreePackets), 4 * rx_initSendWindow));
 343     }
 344 #endif /* KERNEL */
 345
 346     for (i=0, c=queue_First(&rx_freePacketQueue, rx_packet);
 347          i < num_pkts;
 348          i++, c=queue_Next(c, rx_packet)) {
 349         RX_FPQ_MARK_USED(c);
 350     }
 351
 352     queue_SplitBeforeAppend(&rx_freePacketQueue,q,c);
 353
 354     rx_nFreePackets -= num_pkts;
 355
 356 #ifdef KERNEL
 357   done:
 358 #endif
 359     MUTEX_EXIT(&rx_freePktQ_lock);
 360
 361     USERPRI;
 362     return num_pkts;
 363 }
 364 #endif /* RX_ENABLE_TSFPQ */
 365
  366 /*
  367  * Free every packet on a queue, along with its continuation buffers
  368  */
 369 #ifdef RX_ENABLE_TSFPQ
 370 /* num_pkts=0 means queue length is unknown */
 371 int
 372 rxi_FreePackets(int num_pkts, struct rx_queue * q)
 373 {
 374     struct rx_ts_info_t * rx_ts_info;
 375     struct rx_packet *c, *nc;
 376     SPLVAR;
 377
 378     osi_Assert(num_pkts >= 0);
 379     RX_TS_INFO_GET(rx_ts_info);
 380
 381     if (!num_pkts) {
 382         for (queue_Scan(q, c, nc, rx_packet), num_pkts++) {
 383             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 384         }
 385     } else {
 386         for (queue_Scan(q, c, nc, rx_packet)) {
 387             rxi_FreeDataBufsTSFPQ(c, 2, 0);
 388         }
 389     }
 390
 391     if (num_pkts) {
 392         RX_TS_FPQ_QCHECKIN(rx_ts_info, num_pkts, q);
 393     }
 394
 395     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 396         NETPRI;
 397         MUTEX_ENTER(&rx_freePktQ_lock);
 398
 399         RX_TS_FPQ_LTOG(rx_ts_info);
 400
 401         /* Wakeup anyone waiting for packets */
 402         rxi_PacketsUnWait();
 403
 404         MUTEX_EXIT(&rx_freePktQ_lock);
 405         USERPRI;
 406     }
 407
 408     return num_pkts;
 409 }
 410 #else /* RX_ENABLE_TSFPQ */
 411 /* num_pkts=0 means queue length is unknown */
 412 int
 413 rxi_FreePackets(int num_pkts, struct rx_queue *q)
 414 {
 415     struct rx_queue cbs;
 416     struct rx_packet *p, *np;
 417     int qlen = 0;
 418     SPLVAR;
 419
 420     osi_Assert(num_pkts >= 0);
 421     queue_Init(&cbs);
 422
 423     if (!num_pkts) {
 424         for (queue_Scan(q, p, np, rx_packet), num_pkts++) {
 425             if (p->niovecs > 2) {
 426                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 427             }
 428             RX_FPQ_MARK_FREE(p);
 429         }
 430         if (!num_pkts)
 431             return 0;
 432     } else {
 433         for (queue_Scan(q, p, np, rx_packet)) {
 434             if (p->niovecs > 2) {
 435                 qlen += rxi_FreeDataBufsToQueue(p, 2, &cbs);
 436             }
 437             RX_FPQ_MARK_FREE(p);
 438         }
 439     }
 440
 441     if (qlen) {
 442         queue_SpliceAppend(q, &cbs);
 443         qlen += num_pkts;
 444     } else
 445         qlen = num_pkts;
 446
 447     NETPRI;
 448     MUTEX_ENTER(&rx_freePktQ_lock);
 449
 450     queue_SpliceAppend(&rx_freePacketQueue, q);
 451     rx_nFreePackets += qlen;
 452
 453     /* Wakeup anyone waiting for packets */
 454     rxi_PacketsUnWait();
 455
 456     MUTEX_EXIT(&rx_freePktQ_lock);
 457     USERPRI;
 458
 459     return num_pkts;
 460 }
 461 #endif /* RX_ENABLE_TSFPQ */
 462
 463 /* this one is kind of awful.
 464  * In rxkad, the packet has been all shortened, and everything, ready for
 465  * sending.  All of a sudden, we discover we need some of that space back.
 466  * This isn't terribly general, because it knows that the packets are only
 467  * rounded up to the EBS (userdata + security header).
 468  */
 469 int
 470 rxi_RoundUpPacket(struct rx_packet *p, unsigned int nb)
 471 {
 472     int i;
 473     i = p->niovecs - 1;
 474     if (p->wirevec[i].iov_base == (caddr_t) p->localdata) {
 475         if (p->wirevec[i].iov_len <= RX_FIRSTBUFFERSIZE - nb) {
 476             p->wirevec[i].iov_len += nb;
 477             return 0;
 478         }
 479     } else {
 480         if (p->wirevec[i].iov_len <= RX_CBUFFERSIZE - nb) {
 481             p->wirevec[i].iov_len += nb;
 482             return 0;
 483         }
 484     }
 485
 486     return 0;
 487 }
 488
 489 /* get sufficient space to store nb bytes of data (or more), and hook
 490  * it into the supplied packet.  Return nbytes<=0 if successful, otherwise
 491  * returns the number of bytes >0 which it failed to come up with.
 492  * Don't need to worry about locking on packet, since only
 493  * one thread can manipulate one at a time. Locking on continution
 494  * packets is handled by AllocPacketBufs */
 495 /* MTUXXX don't need to go throught the for loop if we can trust niovecs */
 496 int
 497 rxi_AllocDataBuf(struct rx_packet *p, int nb, int class)
 498 {
 499     int i, nv;
 500     struct rx_queue q;
 501     struct rx_packet *cb, *ncb;
 502
 503     /* compute the number of cbuf's we need */
 504     nv = nb / RX_CBUFFERSIZE;
 505     if ((nv * RX_CBUFFERSIZE) < nb)
 506         nv++;
 507     if ((nv + p->niovecs) > RX_MAXWVECS)
 508         nv = RX_MAXWVECS - p->niovecs;
 509     if (nv < 1)
 510         return nb;
 511
 512     /* allocate buffers */
 513     queue_Init(&q);
 514     nv = AllocPacketBufs(class, nv, &q);
 515
 516     /* setup packet iovs */
 517     for (i = p->niovecs, queue_Scan(&q, cb, ncb, rx_packet), i++) {
 518         queue_Remove(cb);
 519         p->wirevec[i].iov_base = (caddr_t) cb->localdata;
 520         p->wirevec[i].iov_len = RX_CBUFFERSIZE;
 521     }
 522
 523     nb -= (nv * RX_CBUFFERSIZE);
 524     p->length += (nv * RX_CBUFFERSIZE);
 525     p->niovecs += nv;
 526
 527     return nb;
 528 }
 529
 530 /* Add more packet buffers */
 531 #ifdef RX_ENABLE_TSFPQ
 532 void
 533 rxi_MorePackets(int apackets)
 534 {
 535     struct rx_packet *p, *e;
 536     struct rx_ts_info_t * rx_ts_info;
 537     int getme;
 538     SPLVAR;
 539
 540     getme = apackets * sizeof(struct rx_packet);
 541     p = (struct rx_packet *)osi_Alloc(getme);
 542     osi_Assert(p);
 543
 544     PIN(p, getme);              /* XXXXX */
 545     memset(p, 0, getme);
 546     RX_TS_INFO_GET(rx_ts_info);
 547
 548     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 549     /* TSFPQ patch also needs to keep track of total packets */
 550
 551     MUTEX_ENTER(&rx_packets_mutex);
 552     rx_nPackets += apackets;
 553     RX_TS_FPQ_COMPUTE_LIMITS;
 554     MUTEX_EXIT(&rx_packets_mutex);
 555
 556     for (e = p + apackets; p < e; p++) {
 557         RX_PACKET_IOV_INIT(p);
 558         p->niovecs = 2;
 559
 560         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 561
 562         NETPRI;
 563         MUTEX_ENTER(&rx_freePktQ_lock);
 564 #ifdef RXDEBUG_PACKET
 565         p->packetId = rx_packet_id++;
 566         p->allNextp = rx_mallocedP;
 567 #endif /* RXDEBUG_PACKET */
 568         rx_mallocedP = p;
 569         MUTEX_EXIT(&rx_freePktQ_lock);
 570         USERPRI;
 571     }
 572     rx_ts_info->_FPQ.delta += apackets;
 573
 574     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 575         NETPRI;
 576         MUTEX_ENTER(&rx_freePktQ_lock);
 577
 578         RX_TS_FPQ_LTOG(rx_ts_info);
 579         rxi_NeedMorePackets = FALSE;
 580         rxi_PacketsUnWait();
 581
 582         MUTEX_EXIT(&rx_freePktQ_lock);
 583         USERPRI;
 584     }
 585 }
 586 #else /* RX_ENABLE_TSFPQ */
 587 void
 588 rxi_MorePackets(int apackets)
 589 {
 590     struct rx_packet *p, *e;
 591     int getme;
 592     SPLVAR;
 593
 594     getme = apackets * sizeof(struct rx_packet);
 595     p = (struct rx_packet *)osi_Alloc(getme);
 596     osi_Assert(p);
 597
 598     PIN(p, getme);              /* XXXXX */
 599     memset(p, 0, getme);
 600     NETPRI;
 601     MUTEX_ENTER(&rx_freePktQ_lock);
 602
 603     for (e = p + apackets; p < e; p++) {
 604         RX_PACKET_IOV_INIT(p);
 605 #ifdef RX_TRACK_PACKETS
 606         p->flags |= RX_PKTFLAG_FREE;
 607 #endif
 608         p->niovecs = 2;
 609
 610         queue_Append(&rx_freePacketQueue, p);
 611 #ifdef RXDEBUG_PACKET
 612         p->packetId = rx_packet_id++;
 613         p->allNextp = rx_mallocedP;
 614 #endif /* RXDEBUG_PACKET */
 615         rx_mallocedP = p;
 616     }
 617
 618     rx_nPackets += apackets;
 619     rx_nFreePackets += apackets;
 620     rxi_NeedMorePackets = FALSE;
 621     rxi_PacketsUnWait();
 622
 623     MUTEX_EXIT(&rx_freePktQ_lock);
 624     USERPRI;
 625 }
 626 #endif /* RX_ENABLE_TSFPQ */
 627
 628 #ifdef RX_ENABLE_TSFPQ
 629 void
 630 rxi_MorePacketsTSFPQ(int apackets, int flush_global, int num_keep_local)
 631 {
 632     struct rx_packet *p, *e;
 633     struct rx_ts_info_t * rx_ts_info;
 634     int getme;
 635     SPLVAR;
 636
 637     getme = apackets * sizeof(struct rx_packet);
 638     p = (struct rx_packet *)osi_Alloc(getme);
 639
 640     PIN(p, getme);              /* XXXXX */
 641     memset(p, 0, getme);
 642     RX_TS_INFO_GET(rx_ts_info);
 643
 644     RX_TS_FPQ_LOCAL_ALLOC(rx_ts_info,apackets);
 645     /* TSFPQ patch also needs to keep track of total packets */
 646     MUTEX_ENTER(&rx_packets_mutex);
 647     rx_nPackets += apackets;
 648     RX_TS_FPQ_COMPUTE_LIMITS;
 649     MUTEX_EXIT(&rx_packets_mutex);
 650
 651     for (e = p + apackets; p < e; p++) {
 652         RX_PACKET_IOV_INIT(p);
 653         p->niovecs = 2;
 654         RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 655
 656         NETPRI;
 657         MUTEX_ENTER(&rx_freePktQ_lock);
 658 #ifdef RXDEBUG_PACKET
 659         p->packetId = rx_packet_id++;
 660         p->allNextp = rx_mallocedP;
 661 #endif /* RXDEBUG_PACKET */
 662         rx_mallocedP = p;
 663         MUTEX_EXIT(&rx_freePktQ_lock);
 664         USERPRI;
 665     }
 666     rx_ts_info->_FPQ.delta += apackets;
 667
 668     if (flush_global &&
 669         (num_keep_local < apackets)) {
 670         NETPRI;
 671         MUTEX_ENTER(&rx_freePktQ_lock);
 672
 673         RX_TS_FPQ_LTOG2(rx_ts_info, (apackets - num_keep_local));
 674         rxi_NeedMorePackets = FALSE;
 675         rxi_PacketsUnWait();
 676
 677         MUTEX_EXIT(&rx_freePktQ_lock);
 678         USERPRI;
 679     }
 680 }
 681 #endif /* RX_ENABLE_TSFPQ */
 682
 683 #ifndef KERNEL
 684 /* Add more packet buffers */
 685 void
 686 rxi_MorePacketsNoLock(int apackets)
 687 {
 688 #ifdef RX_ENABLE_TSFPQ
 689     struct rx_ts_info_t * rx_ts_info;
 690 #endif /* RX_ENABLE_TSFPQ */
 691     struct rx_packet *p, *e;
 692     int getme;
 693
 694     /* allocate enough packets that 1/4 of the packets will be able
 695      * to hold maximal amounts of data */
 696     apackets += (apackets / 4)
 697         * ((rx_maxJumboRecvSize - RX_FIRSTBUFFERSIZE) / RX_CBUFFERSIZE);
 698     do {
 699         getme = apackets * sizeof(struct rx_packet);
 700         p = (struct rx_packet *)osi_Alloc(getme);
 701         if (p == NULL) {
 702             apackets -= apackets / 4;
 703             osi_Assert(apackets > 0);
 704         }
 705     } while(p == NULL);
 706     memset(p, 0, getme);
 707
 708 #ifdef RX_ENABLE_TSFPQ
 709     RX_TS_INFO_GET(rx_ts_info);
 710     RX_TS_FPQ_GLOBAL_ALLOC(rx_ts_info,apackets);
 711 #endif /* RX_ENABLE_TSFPQ */
 712
 713     for (e = p + apackets; p < e; p++) {
 714         RX_PACKET_IOV_INIT(p);
 715 #ifdef RX_TRACK_PACKETS
 716         p->flags |= RX_PKTFLAG_FREE;
 717 #endif
 718         p->niovecs = 2;
 719
 720         queue_Append(&rx_freePacketQueue, p);
 721 #ifdef RXDEBUG_PACKET
 722         p->packetId = rx_packet_id++;
 723         p->allNextp = rx_mallocedP;
 724 #endif /* RXDEBUG_PACKET */
 725         rx_mallocedP = p;
 726     }
 727
 728     rx_nFreePackets += apackets;
 729     MUTEX_ENTER(&rx_packets_mutex);
 730     rx_nPackets += apackets;
 731 #ifdef RX_ENABLE_TSFPQ
 732     RX_TS_FPQ_COMPUTE_LIMITS;
 733 #endif /* RX_ENABLE_TSFPQ */
 734     MUTEX_EXIT(&rx_packets_mutex);
 735     rxi_NeedMorePackets = FALSE;
 736     rxi_PacketsUnWait();
 737 }
 738 #endif /* !KERNEL */
 739
 740 void
 741 rxi_FreeAllPackets(void)
 742 {
 743     /* must be called at proper interrupt level, etcetera */
 744     /* MTUXXX need to free all Packets */
 745     osi_Free(rx_mallocedP,
 746              (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 747     UNPIN(rx_mallocedP, (rx_maxReceiveWindow + 2) * sizeof(struct rx_packet));
 748 }
 749
 750 #ifdef RX_ENABLE_TSFPQ
 751 void
 752 rxi_AdjustLocalPacketsTSFPQ(int num_keep_local, int allow_overcommit)
 753 {
 754     struct rx_ts_info_t * rx_ts_info;
 755     int xfer;
 756     SPLVAR;
 757
 758     RX_TS_INFO_GET(rx_ts_info);
 759
 760     if (num_keep_local != rx_ts_info->_FPQ.len) {
 761         NETPRI;
 762         MUTEX_ENTER(&rx_freePktQ_lock);
 763         if (num_keep_local < rx_ts_info->_FPQ.len) {
 764             xfer = rx_ts_info->_FPQ.len - num_keep_local;
 765             RX_TS_FPQ_LTOG2(rx_ts_info, xfer);
 766             rxi_PacketsUnWait();
 767         } else {
 768             xfer = num_keep_local - rx_ts_info->_FPQ.len;
 769             if ((num_keep_local > rx_TSFPQLocalMax) && !allow_overcommit)
 770                 xfer = rx_TSFPQLocalMax - rx_ts_info->_FPQ.len;
 771             if (rx_nFreePackets < xfer) {
 772                 rxi_MorePacketsNoLock(MAX(xfer - rx_nFreePackets, 4 * rx_initSendWindow));
 773             }
 774             RX_TS_FPQ_GTOL2(rx_ts_info, xfer);
 775         }
 776         MUTEX_EXIT(&rx_freePktQ_lock);
 777         USERPRI;
 778     }
 779 }
 780
 781 void
 782 rxi_FlushLocalPacketsTSFPQ(void)
 783 {
 784     rxi_AdjustLocalPacketsTSFPQ(0, 0);
 785 }
 786 #endif /* RX_ENABLE_TSFPQ */
 787
 788 /* Allocate more packets iff we need more continuation buffers */
 789 /* In kernel, can't page in memory with interrupts disabled, so we
 790  * don't use the event mechanism. */
 791 void
 792 rx_CheckPackets(void)
 793 {
 794     if (rxi_NeedMorePackets) {
 795         rxi_MorePackets(rx_maxSendWindow);
 796     }
 797 }
 798
 799 /* In the packet freeing routine below, the assumption is that
 800    we want all of the packets to be used equally frequently, so that we
 801    don't get packet buffers paging out.  It would be just as valid to
 802    assume that we DO want them to page out if not many are being used.
 803    In any event, we assume the former, and append the packets to the end
 804    of the free list.  */
 805 /* This explanation is bogus.  The free list doesn't remain in any kind of
  806    useful order for long: the packets in use get pretty much randomly scattered
 807    across all the pages.  In order to permit unused {packets,bufs} to page out, they
 808    must be stored so that packets which are adjacent in memory are adjacent in the
 809    free list.  An array springs rapidly to mind.
 810    */
 811
 812 /* Actually free the packet p. */
 813 #ifdef RX_ENABLE_TSFPQ
 814 void
 815 rxi_FreePacketNoLock(struct rx_packet *p)
 816 {
 817     struct rx_ts_info_t * rx_ts_info;
 818     dpf(("Free %"AFS_PTR_FMT"\n", p));
 819
 820     RX_TS_INFO_GET(rx_ts_info);
 821     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 822     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
 823         RX_TS_FPQ_LTOG(rx_ts_info);
 824     }
 825 }
 826 #else /* RX_ENABLE_TSFPQ */
 827 void
 828 rxi_FreePacketNoLock(struct rx_packet *p)
 829 {
 830     dpf(("Free %"AFS_PTR_FMT"\n", p));
 831
 832     RX_FPQ_MARK_FREE(p);
 833     rx_nFreePackets++;
 834     queue_Append(&rx_freePacketQueue, p);
 835 }
 836 #endif /* RX_ENABLE_TSFPQ */
 837
 838 #ifdef RX_ENABLE_TSFPQ
 839 void
 840 rxi_FreePacketTSFPQ(struct rx_packet *p, int flush_global)
 841 {
 842     struct rx_ts_info_t * rx_ts_info;
 843     dpf(("Free %"AFS_PTR_FMT"\n", p));
 844
 845     RX_TS_INFO_GET(rx_ts_info);
 846     RX_TS_FPQ_CHECKIN(rx_ts_info,p);
 847
 848     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 849         NETPRI;
 850         MUTEX_ENTER(&rx_freePktQ_lock);
 851
 852         RX_TS_FPQ_LTOG(rx_ts_info);
 853
 854         /* Wakeup anyone waiting for packets */
 855         rxi_PacketsUnWait();
 856
 857         MUTEX_EXIT(&rx_freePktQ_lock);
 858         USERPRI;
 859     }
 860 }
 861 #endif /* RX_ENABLE_TSFPQ */
 862
 863 /*
 864  * free continuation buffers off a packet into a queue
 865  *
 866  * [IN] p      -- packet from which continuation buffers will be freed
 867  * [IN] first  -- iovec offset of first continuation buffer to free
 868  * [IN] q      -- queue into which continuation buffers will be chained
 869  *
 870  * returns:
 871  *   number of continuation buffers freed
 872  */
 873 #ifndef RX_ENABLE_TSFPQ
 874 static int
 875 rxi_FreeDataBufsToQueue(struct rx_packet *p, afs_uint32 first, struct rx_queue * q)
 876 {
 877     struct iovec *iov;
 878     struct rx_packet * cb;
 879     int count = 0;
 880
 881     for (first = MAX(2, first); first < p->niovecs; first++, count++) {
 882         iov = &p->wirevec[first];
 883         if (!iov->iov_base)
 884             osi_Panic("rxi_FreeDataBufsToQueue: unexpected NULL iov");
 885         cb = RX_CBUF_TO_PACKET(iov->iov_base, p);
 886         RX_FPQ_MARK_FREE(cb);
 887         queue_Append(q, cb);
 888     }
 889     p->length = 0;
 890     p->niovecs = 0;
 891
 892     return count;
 893 }
 894 #endif
 895
 896 /*
 897  * free packet continuation buffers into the global free packet pool
 898  *
 899  * [IN] p      -- packet from which to free continuation buffers
 900  * [IN] first  -- iovec offset of first continuation buffer to free
 901  *
 902  * returns:
 903  *   zero always
 904  */
 905 int
 906 rxi_FreeDataBufsNoLock(struct rx_packet *p, afs_uint32 first)
 907 {
 908     struct iovec *iov;
 909
 910     for (first = MAX(2, first); first < p->niovecs; first++) {
 911         iov = &p->wirevec[first];
 912         if (!iov->iov_base)
 913             osi_Panic("rxi_FreeDataBufsNoLock: unexpected NULL iov");
 914         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
 915     }
 916     p->length = 0;
 917     p->niovecs = 0;
 918
 919     return 0;
 920 }
 921
 922 #ifdef RX_ENABLE_TSFPQ
 923 /*
 924  * free packet continuation buffers into the thread-local free pool
 925  *
 926  * [IN] p             -- packet from which continuation buffers will be freed
 927  * [IN] first         -- iovec offset of first continuation buffer to free
 928  *                       any value less than 2, the min number of iovecs,
 929  *                       is treated as if it is 2.
 930  * [IN] flush_global  -- if nonzero, we will flush overquota packets to the
 931  *                       global free pool before returning
 932  *
 933  * returns:
 934  *   zero always
 935  */
 936 static int
 937 rxi_FreeDataBufsTSFPQ(struct rx_packet *p, afs_uint32 first, int flush_global)
 938 {
 939     struct iovec *iov;
 940     struct rx_ts_info_t * rx_ts_info;
 941
 942     RX_TS_INFO_GET(rx_ts_info);
 943
 944     for (first = MAX(2, first); first < p->niovecs; first++) {
 945         iov = &p->wirevec[first];
 946         if (!iov->iov_base)
 947             osi_Panic("rxi_FreeDataBufsTSFPQ: unexpected NULL iov");
 948         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
 949     }
 950     p->length = 0;
 951     p->niovecs = 0;
 952
 953     if (flush_global && (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax)) {
 954         NETPRI;
 955         MUTEX_ENTER(&rx_freePktQ_lock);
 956
 957         RX_TS_FPQ_LTOG(rx_ts_info);
 958
 959         /* Wakeup anyone waiting for packets */
 960         rxi_PacketsUnWait();
 961
 962         MUTEX_EXIT(&rx_freePktQ_lock);
 963         USERPRI;
 964     }
 965     return 0;
 966 }
 967 #endif /* RX_ENABLE_TSFPQ */
 968
 969 int rxi_nBadIovecs = 0;
 970
 971 /* rxi_RestoreDataBufs
 972  *
 973  * Restore the correct sizes to the iovecs. Called when reusing a packet
 974  * for reading off the wire.
 975  */
 976 void
 977 rxi_RestoreDataBufs(struct rx_packet *p)
 978 {
 979     unsigned int i;
 980     struct iovec *iov = &p->wirevec[2];
 981
 982     RX_PACKET_IOV_INIT(p);
 983
 984     for (i = 2, iov = &p->wirevec[2]; i < p->niovecs; i++, iov++) {
 985         if (!iov->iov_base) {
 986             rxi_nBadIovecs++;
 987             p->niovecs = i;
 988             break;
 989         }
 990         iov->iov_len = RX_CBUFFERSIZE;
 991     }
 992 }
 993
 994 #ifdef RX_ENABLE_TSFPQ
 995 int
 996 rxi_TrimDataBufs(struct rx_packet *p, int first)
 997 {
 998     int length;
 999     struct iovec *iov, *end;
1000     struct rx_ts_info_t * rx_ts_info;
1001     SPLVAR;
1002
1003     if (first != 1)
1004         osi_Panic("TrimDataBufs 1: first must be 1");
1005
1006     /* Skip over continuation buffers containing message data */
1007     iov = &p->wirevec[2];
1008     end = iov + (p->niovecs - 2);
1009     length = p->length - p->wirevec[1].iov_len;
1010     for (; iov < end && length > 0; iov++) {
1011         if (!iov->iov_base)
1012             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1013         length -= iov->iov_len;
1014     }
1015
1016     /* iov now points to the first empty data buffer. */
1017     if (iov >= end)
1018         return 0;
1019
1020     RX_TS_INFO_GET(rx_ts_info);
1021     for (; iov < end; iov++) {
1022         if (!iov->iov_base)
1023             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1024         RX_TS_FPQ_CHECKIN(rx_ts_info,RX_CBUF_TO_PACKET(iov->iov_base, p));
1025         p->niovecs--;
1026     }
1027     if (rx_ts_info->_FPQ.len > rx_TSFPQLocalMax) {
1028         NETPRI;
1029         MUTEX_ENTER(&rx_freePktQ_lock);
1030
1031         RX_TS_FPQ_LTOG(rx_ts_info);
1032         rxi_PacketsUnWait();
1033
1034         MUTEX_EXIT(&rx_freePktQ_lock);
1035         USERPRI;
1036     }
1037
1038     return 0;
1039 }
1040 #else /* RX_ENABLE_TSFPQ */
1041 int
1042 rxi_TrimDataBufs(struct rx_packet *p, int first)
1043 {
1044     int length;
1045     struct iovec *iov, *end;
1046     SPLVAR;
1047
1048     if (first != 1)
1049         osi_Panic("TrimDataBufs 1: first must be 1");
1050
1051     /* Skip over continuation buffers containing message data */
1052     iov = &p->wirevec[2];
1053     end = iov + (p->niovecs - 2);
1054     length = p->length - p->wirevec[1].iov_len;
1055     for (; iov < end && length > 0; iov++) {
1056         if (!iov->iov_base)
1057             osi_Panic("TrimDataBufs 3: vecs 1-niovecs must not be NULL");
1058         length -= iov->iov_len;
1059     }
1060
1061     /* iov now points to the first empty data buffer. */
1062     if (iov >= end)
1063         return 0;
1064
1065     NETPRI;
1066     MUTEX_ENTER(&rx_freePktQ_lock);
1067
1068     for (; iov < end; iov++) {
1069         if (!iov->iov_base)
1070             osi_Panic("TrimDataBufs 4: vecs 2-niovecs must not be NULL");
1071         rxi_FreePacketNoLock(RX_CBUF_TO_PACKET(iov->iov_base, p));
1072         p->niovecs--;
1073     }
1074     rxi_PacketsUnWait();
1075
1076     MUTEX_EXIT(&rx_freePktQ_lock);
1077     USERPRI;
1078
1079     return 0;
1080 }
1081 #endif /* RX_ENABLE_TSFPQ */
1082
1083 /* Free the packet p.  P is assumed not to be on any queue, i.e.
1084  * remove it yourself first if you call this routine. */
1085 #ifdef RX_ENABLE_TSFPQ
1086 void
1087 rxi_FreePacket(struct rx_packet *p)
1088 {
1089     rxi_FreeDataBufsTSFPQ(p, 2, 0);
1090     rxi_FreePacketTSFPQ(p, RX_TS_FPQ_FLUSH_GLOBAL);
1091 }
1092 #else /* RX_ENABLE_TSFPQ */
1093 void
1094 rxi_FreePacket(struct rx_packet *p)
1095 {
1096     SPLVAR;
1097
1098     NETPRI;
1099     MUTEX_ENTER(&rx_freePktQ_lock);
1100
1101     rxi_FreeDataBufsNoLock(p, 2);
1102     rxi_FreePacketNoLock(p);
1103     /* Wakeup anyone waiting for packets */
1104     rxi_PacketsUnWait();
1105
1106     MUTEX_EXIT(&rx_freePktQ_lock);
1107     USERPRI;
1108 }
1109 #endif /* RX_ENABLE_TSFPQ */
1110
/* rxi_AllocPacket sets up p->length so that it reflects the number of
 * bytes in the packet at this point, **not including** the header.
 * The header is always required; besides, this is the way the
 * length field is usually used. */
1115 #ifdef RX_ENABLE_TSFPQ
1116 struct rx_packet *
1117 rxi_AllocPacketNoLock(int class)
1118 {
1119     struct rx_packet *p;
1120     struct rx_ts_info_t * rx_ts_info;
1121
1122     RX_TS_INFO_GET(rx_ts_info);
1123
1124 #ifdef KERNEL
1125     if (rxi_OverQuota(class)) {
1126         rxi_NeedMorePackets = TRUE;
1127         if (rx_stats_active) {
1128             switch (class) {
1129             case RX_PACKET_CLASS_RECEIVE:
1130                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1131                 break;
1132             case RX_PACKET_CLASS_SEND:
1133                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1134                 break;
1135             case RX_PACKET_CLASS_SPECIAL:
1136                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1137                 break;
1138             case RX_PACKET_CLASS_RECV_CBUF:
1139                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1140                 break;
1141             case RX_PACKET_CLASS_SEND_CBUF:
1142                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1143                 break;
1144             }
1145         }
1146         return (struct rx_packet *)0;
1147     }
1148 #endif /* KERNEL */
1149
1150     if (rx_stats_active)
1151         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1152     if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1153
1154 #ifdef KERNEL
1155         if (queue_IsEmpty(&rx_freePacketQueue))
1156             osi_Panic("rxi_AllocPacket error");
1157 #else /* KERNEL */
1158         if (queue_IsEmpty(&rx_freePacketQueue))
1159             rxi_MorePacketsNoLock(rx_maxSendWindow);
1160 #endif /* KERNEL */
1161
1162
1163         RX_TS_FPQ_GTOL(rx_ts_info);
1164     }
1165
1166     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1167
1168     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1169
1170
1171     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1172      * order to truncate outbound packets.  In the near future, may need
1173      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1174      */
1175     RX_PACKET_IOV_FULLINIT(p);
1176     return p;
1177 }
1178 #else /* RX_ENABLE_TSFPQ */
1179 struct rx_packet *
1180 rxi_AllocPacketNoLock(int class)
1181 {
1182     struct rx_packet *p;
1183
1184 #ifdef KERNEL
1185     if (rxi_OverQuota(class)) {
1186         rxi_NeedMorePackets = TRUE;
1187         if (rx_stats_active) {
1188             switch (class) {
1189             case RX_PACKET_CLASS_RECEIVE:
1190                 rx_MutexIncrement(rx_stats.receivePktAllocFailures, rx_stats_mutex);
1191                 break;
1192             case RX_PACKET_CLASS_SEND:
1193                 rx_MutexIncrement(rx_stats.sendPktAllocFailures, rx_stats_mutex);
1194                 break;
1195             case RX_PACKET_CLASS_SPECIAL:
1196                 rx_MutexIncrement(rx_stats.specialPktAllocFailures, rx_stats_mutex);
1197                 break;
1198             case RX_PACKET_CLASS_RECV_CBUF:
1199                 rx_MutexIncrement(rx_stats.receiveCbufPktAllocFailures, rx_stats_mutex);
1200                 break;
1201             case RX_PACKET_CLASS_SEND_CBUF:
1202                 rx_MutexIncrement(rx_stats.sendCbufPktAllocFailures, rx_stats_mutex);
1203                 break;
1204             }
1205         }
1206         return (struct rx_packet *)0;
1207     }
1208 #endif /* KERNEL */
1209
1210     if (rx_stats_active)
1211         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1212
1213 #ifdef KERNEL
1214     if (queue_IsEmpty(&rx_freePacketQueue))
1215         osi_Panic("rxi_AllocPacket error");
1216 #else /* KERNEL */
1217     if (queue_IsEmpty(&rx_freePacketQueue))
1218         rxi_MorePacketsNoLock(rx_maxSendWindow);
1219 #endif /* KERNEL */
1220
1221     rx_nFreePackets--;
1222     p = queue_First(&rx_freePacketQueue, rx_packet);
1223     queue_Remove(p);
1224     RX_FPQ_MARK_USED(p);
1225
1226     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1227
1228
1229     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1230      * order to truncate outbound packets.  In the near future, may need
1231      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1232      */
1233     RX_PACKET_IOV_FULLINIT(p);
1234     return p;
1235 }
1236 #endif /* RX_ENABLE_TSFPQ */
1237
1238 #ifdef RX_ENABLE_TSFPQ
1239 struct rx_packet *
1240 rxi_AllocPacketTSFPQ(int class, int pull_global)
1241 {
1242     struct rx_packet *p;
1243     struct rx_ts_info_t * rx_ts_info;
1244
1245     RX_TS_INFO_GET(rx_ts_info);
1246
1247     if (rx_stats_active)
1248         rx_MutexIncrement(rx_stats.packetRequests, rx_stats_mutex);
1249     if (pull_global && queue_IsEmpty(&rx_ts_info->_FPQ)) {
1250         MUTEX_ENTER(&rx_freePktQ_lock);
1251
1252         if (queue_IsEmpty(&rx_freePacketQueue))
1253             rxi_MorePacketsNoLock(rx_maxSendWindow);
1254
1255         RX_TS_FPQ_GTOL(rx_ts_info);
1256
1257         MUTEX_EXIT(&rx_freePktQ_lock);
1258     } else if (queue_IsEmpty(&rx_ts_info->_FPQ)) {
1259         return NULL;
1260     }
1261
1262     RX_TS_FPQ_CHECKOUT(rx_ts_info,p);
1263
1264     dpf(("Alloc %"AFS_PTR_FMT", class %d\n", p, class));
1265
1266     /* have to do this here because rx_FlushWrite fiddles with the iovs in
1267      * order to truncate outbound packets.  In the near future, may need
1268      * to allocate bufs from a static pool here, and/or in AllocSendPacket
1269      */
1270     RX_PACKET_IOV_FULLINIT(p);
1271     return p;
1272 }
1273 #endif /* RX_ENABLE_TSFPQ */
1274
1275 #ifdef RX_ENABLE_TSFPQ
1276 struct rx_packet *
1277 rxi_AllocPacket(int class)
1278 {
1279     struct rx_packet *p;
1280
1281     p = rxi_AllocPacketTSFPQ(class, RX_TS_FPQ_PULL_GLOBAL);
1282     return p;
1283 }
1284 #else /* RX_ENABLE_TSFPQ */
1285 struct rx_packet *
1286 rxi_AllocPacket(int class)
1287 {
1288     struct rx_packet *p;
1289
1290     MUTEX_ENTER(&rx_freePktQ_lock);
1291     p = rxi_AllocPacketNoLock(class);
1292     MUTEX_EXIT(&rx_freePktQ_lock);
1293     return p;
1294 }
1295 #endif /* RX_ENABLE_TSFPQ */
1296
1297 /* This guy comes up with as many buffers as it {takes,can get} given
1298  * the MTU for this call. It also sets the packet length before
1299  * returning.  caution: this is often called at NETPRI
1300  * Called with call locked.
1301  */
1302 struct rx_packet *
1303 rxi_AllocSendPacket(struct rx_call *call, int want)
1304 {
1305     struct rx_packet *p = (struct rx_packet *)0;
1306     int mud;
1307     unsigned delta;
1308
1309     SPLVAR;
1310     mud = call->MTU - RX_HEADER_SIZE;
1311     delta =
1312         rx_GetSecurityHeaderSize(rx_ConnectionOf(call)) +
1313         rx_GetSecurityMaxTrailerSize(rx_ConnectionOf(call));
1314
1315 #ifdef RX_ENABLE_TSFPQ
1316     if ((p = rxi_AllocPacketTSFPQ(RX_PACKET_CLASS_SEND, 0))) {
1317         want += delta;
1318         want = MIN(want, mud);
1319
1320         if ((unsigned)want > p->length)
1321             (void)rxi_AllocDataBuf(p, (want - p->length),
1322                                    RX_PACKET_CLASS_SEND_CBUF);
1323
1324         if (p->length > mud)
1325             p->length = mud;
1326
1327         if (delta >= p->length) {
1328             rxi_FreePacket(p);
1329             p = NULL;
1330         } else {
1331             p->length -= delta;
1332         }
1333         return p;
1334     }
1335 #endif /* RX_ENABLE_TSFPQ */
1336
1337     while (!(call->error)) {
1338         MUTEX_ENTER(&rx_freePktQ_lock);
1339         /* if an error occurred, or we get the packet we want, we're done */
1340         if ((p = rxi_AllocPacketNoLock(RX_PACKET_CLASS_SEND))) {
1341             MUTEX_EXIT(&rx_freePktQ_lock);
1342
1343             want += delta;
1344             want = MIN(want, mud);
1345
1346             if ((unsigned)want > p->length)
1347                 (void)rxi_AllocDataBuf(p, (want - p->length),
1348                                        RX_PACKET_CLASS_SEND_CBUF);
1349
1350             if (p->length > mud)
1351                 p->length = mud;
1352
1353             if (delta >= p->length) {
1354                 rxi_FreePacket(p);
1355                 p = NULL;
1356             } else {
1357                 p->length -= delta;
1358             }
1359             break;
1360         }
1361
1362         /* no error occurred, and we didn't get a packet, so we sleep.
1363          * At this point, we assume that packets will be returned
1364          * sooner or later, as packets are acknowledged, and so we
1365          * just wait.  */
1366         NETPRI;
1367         call->flags |= RX_CALL_WAIT_PACKETS;
1368         CALL_HOLD(call, RX_CALL_REFCOUNT_PACKET);
1369         MUTEX_EXIT(&call->lock);
1370         rx_waitingForPackets = 1;
1371
1372 #ifdef  RX_ENABLE_LOCKS
1373         CV_WAIT(&rx_waitingForPackets_cv, &rx_freePktQ_lock);
1374 #else
1375         osi_rxSleep(&rx_waitingForPackets);
1376 #endif
1377         MUTEX_EXIT(&rx_freePktQ_lock);
1378         MUTEX_ENTER(&call->lock);
1379         CALL_RELE(call, RX_CALL_REFCOUNT_PACKET);
1380         call->flags &= ~RX_CALL_WAIT_PACKETS;
1381         USERPRI;
1382     }
1383
1384     return p;
1385 }
1386
#ifndef KERNEL
#ifdef AFS_NT40_ENV
/* Windows does not use file descriptors. */
#define CountFDs(amax) 0
#else
/*
 * Count the file descriptors in the range [0, amax) for which fstat()
 * succeeds, i.e. the descriptors in that range that are currently in
 * use.  Returns 0 when amax is zero or negative.
 */
static int
CountFDs(int amax)
{
    struct stat sb;
    int fd = 0;
    int nopen = 0;

    while (fd < amax) {
        if (fstat(fd, &sb) == 0)
            nopen++;
        fd++;
    }
    return nopen;
}
#endif /* AFS_NT40_ENV */
#else /* KERNEL */

#define CountFDs(amax) amax

#endif /* KERNEL */
1414
1415 #if !defined(KERNEL) || defined(UKERNEL)
1416
1417 /* This function reads a single packet from the interface into the
1418  * supplied packet buffer (*p).  Return 0 if the packet is bogus.  The
1419  * (host,port) of the sender are stored in the supplied variables, and
1420  * the data length of the packet is stored in the packet structure.
1421  * The header is decoded. */
1422 int
1423 rxi_ReadPacket(osi_socket socket, struct rx_packet *p, afs_uint32 * host,
1424                u_short * port)
1425 {
1426     struct sockaddr_in from;
1427     unsigned int nbytes;
1428     afs_int32 rlen;
1429     afs_uint32 tlen, savelen;
1430     struct msghdr msg;
1431     rx_computelen(p, tlen);
1432     rx_SetDataSize(p, tlen);    /* this is the size of the user data area */
1433
1434     tlen += RX_HEADER_SIZE;     /* now this is the size of the entire packet */
1435     rlen = rx_maxJumboRecvSize; /* this is what I am advertising.  Only check
1436                                  * it once in order to avoid races.  */
1437     tlen = rlen - tlen;
1438     if (tlen > 0) {
1439         tlen = rxi_AllocDataBuf(p, tlen, RX_PACKET_CLASS_SEND_CBUF);
1440         if (tlen > 0) {
1441             tlen = rlen - tlen;
1442         } else
1443             tlen = rlen;
1444     } else
1445         tlen = rlen;
1446
1447     /* Extend the last iovec for padding, it's just to make sure that the
1448      * read doesn't return more data than we expect, and is done to get around
1449      * our problems caused by the lack of a length field in the rx header.
1450      * Use the extra buffer that follows the localdata in each packet
1451      * structure. */
1452     savelen = p->wirevec[p->niovecs - 1].iov_len;
1453     p->wirevec[p->niovecs - 1].iov_len += RX_EXTRABUFFERSIZE;
1454
1455     memset(&msg, 0, sizeof(msg));
1456     msg.msg_name = (char *)&from;
1457     msg.msg_namelen = sizeof(struct sockaddr_in);
1458     msg.msg_iov = p->wirevec;
1459     msg.msg_iovlen = p->niovecs;
1460     nbytes = rxi_Recvmsg(socket, &msg, 0);
1461
1462     /* restore the vec to its correct state */
1463     p->wirevec[p->niovecs - 1].iov_len = savelen;
1464
1465     p->length = (u_short)(nbytes - RX_HEADER_SIZE);
1466     if ((nbytes > tlen) || (p->length & 0x8000)) {      /* Bogus packet */
1467         if (nbytes < 0 && errno == EWOULDBLOCK) {
1468             if (rx_stats_active)
1469                 rx_MutexIncrement(rx_stats.noPacketOnRead, rx_stats_mutex);
1470         } else if (nbytes <= 0) {
1471             if (rx_stats_active) {
1472                 MUTEX_ENTER(&rx_stats_mutex);
1473                 rx_stats.bogusPacketOnRead++;
1474                 rx_stats.bogusHost = from.sin_addr.s_addr;
1475                 MUTEX_EXIT(&rx_stats_mutex);
1476             }
1477             dpf(("B: bogus packet from [%x,%d] nb=%d", ntohl(from.sin_addr.s_addr),
1478                  ntohs(from.sin_port), nbytes));
1479         }
1480         return 0;
1481     }
1482 #ifdef RXDEBUG
1483     else if ((rx_intentionallyDroppedOnReadPer100 > 0)
1484                 && (random() % 100 < rx_intentionallyDroppedOnReadPer100)) {
1485         rxi_DecodePacketHeader(p);
1486
1487         *host = from.sin_addr.s_addr;
1488         *port = from.sin_port;
1489
1490         dpf(("Dropped %d %s: %x.%u.%u.%u.%u.%u.%u flags %d len %d",
1491               p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(*host), ntohs(*port), p->header.serial,
1492               p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.flags,
1493               p->length));
1494 #ifdef RX_TRIMDATABUFS
1495         rxi_TrimDataBufs(p, 1);
1496 #endif
1497         return 0;
1498     }
1499 #endif
1500     else {
1501         /* Extract packet header. */
1502         rxi_DecodePacketHeader(p);
1503
1504         *host = from.sin_addr.s_addr;
1505         *port = from.sin_port;
1506         if (p->header.type > 0 && p->header.type < RX_N_PACKET_TYPES) {
1507             if (rx_stats_active) {
1508                 struct rx_peer *peer;
1509                 rx_MutexIncrement(rx_stats.packetsRead[p->header.type - 1], rx_stats_mutex);
1510                 /*
1511                  * Try to look up this peer structure.  If it doesn't exist,
1512                  * don't create a new one -
1513                  * we don't keep count of the bytes sent/received if a peer
1514                  * structure doesn't already exist.
1515                  *
1516                  * The peer/connection cleanup code assumes that there is 1 peer
1517                  * per connection.  If we actually created a peer structure here
1518                  * and this packet was an rxdebug packet, the peer structure would
1519                  * never be cleaned up.
1520                  */
1521                 peer = rxi_FindPeer(*host, *port, 0, 0);
1522                 /* Since this may not be associated with a connection,
1523                  * it may have no refCount, meaning we could race with
1524                  * ReapConnections
1525                  */
1526                 if (peer && (peer->refCount > 0)) {
1527                     MUTEX_ENTER(&peer->peer_lock);
1528                     hadd32(peer->bytesReceived, p->length);
1529                     MUTEX_EXIT(&peer->peer_lock);
1530                 }
1531             }
1532         }
1533
1534 #ifdef RX_TRIMDATABUFS
1535         /* Free any empty packet buffers at the end of this packet */
1536         rxi_TrimDataBufs(p, 1);
1537 #endif
1538         return 1;
1539     }
1540 }
1541
1542 #endif /* !KERNEL || UKERNEL */
1543
1544 /* This function splits off the first packet in a jumbo packet.
1545  * As of AFS 3.5, jumbograms contain more than one fixed size
1546  * packet, and the RX_JUMBO_PACKET flag is set in all but the
1547  * last packet header. All packets (except the last) are padded to
1548  * fall on RX_CBUFFERSIZE boundaries.
1549  * HACK: We store the length of the first n-1 packets in the
1550  * last two pad bytes. */
1551
1552 struct rx_packet *
1553 rxi_SplitJumboPacket(struct rx_packet *p, afs_uint32 host, short port,
1554                      int first)
1555 {
1556     struct rx_packet *np;
1557     struct rx_jumboHeader *jp;
1558     int niov, i;
1559     struct iovec *iov;
1560     int length;
1561     afs_uint32 temp;
1562
1563     /* All but the last packet in each jumbogram are RX_JUMBOBUFFERSIZE
1564      * bytes in length. All but the first packet are preceded by
1565      * an abbreviated four byte header. The length of the last packet
1566      * is calculated from the size of the jumbogram. */
1567     length = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
1568
1569     if ((int)p->length < length) {
1570         dpf(("rxi_SplitJumboPacket: bogus length %d\n", p->length));
1571         return NULL;
1572     }
1573     niov = p->niovecs - 2;
1574     if (niov < 1) {
1575         dpf(("rxi_SplitJumboPacket: bogus niovecs %d\n", p->niovecs));
1576         return NULL;
1577     }
1578     iov = &p->wirevec[2];
1579     np = RX_CBUF_TO_PACKET(iov->iov_base, p);
1580
1581     /* Get a pointer to the abbreviated packet header */
1582     jp = (struct rx_jumboHeader *)
1583         ((char *)(p->wirevec[1].iov_base) + RX_JUMBOBUFFERSIZE);
1584
1585     /* Set up the iovecs for the next packet */
1586     np->wirevec[0].iov_base = (char *)(&np->wirehead[0]);
1587     np->wirevec[0].iov_len = sizeof(struct rx_header);
1588     np->wirevec[1].iov_base = (char *)(&np->localdata[0]);
1589     np->wirevec[1].iov_len = length - RX_JUMBOHEADERSIZE;
1590     np->niovecs = niov + 1;
1591     for (i = 2, iov++; i <= niov; i++, iov++) {
1592         np->wirevec[i] = *iov;
1593     }
1594     np->length = p->length - length;
1595     p->length = RX_JUMBOBUFFERSIZE;
1596     p->niovecs = 2;
1597
1598     /* Convert the jumbo packet header to host byte order */
1599     temp = ntohl(*(afs_uint32 *) jp);
1600     jp->flags = (u_char) (temp >> 24);
1601     jp->cksum = (u_short) (temp);
1602
1603     /* Fill in the packet header */
1604     np->header = p->header;
1605     np->header.serial = p->header.serial + 1;
1606     np->header.seq = p->header.seq + 1;
1607     np->header.flags = jp->flags;
1608     np->header.spare = jp->cksum;
1609
1610     return np;
1611 }
1612
1613 #ifndef KERNEL
1614 /* Send a udp datagram */
1615 int
1616 osi_NetSend(osi_socket socket, void *addr, struct iovec *dvec, int nvecs,
1617             int length, int istack)
1618 {
1619     struct msghdr msg;
1620         int ret;
1621
1622     memset(&msg, 0, sizeof(msg));
1623     msg.msg_iov = dvec;
1624     msg.msg_iovlen = nvecs;
1625     msg.msg_name = addr;
1626     msg.msg_namelen = sizeof(struct sockaddr_in);
1627
1628     ret = rxi_Sendmsg(socket, &msg, 0);
1629
1630     return ret;
1631 }
1632 #elif !defined(UKERNEL)
1633 /*
1634  * message receipt is done in rxk_input or rx_put.
1635  */
1636
1637 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1638 /*
1639  * Copy an mblock to the contiguous area pointed to by cp.
1640  * MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1641  * but it doesn't really.
1642  * Returns the number of bytes not transferred.
1643  * The message is NOT changed.
1644  */
1645 static int
1646 cpytoc(mblk_t * mp, int off, int len, char *cp)
1647 {
1648     int n;
1649
1650     for (; mp && len > 0; mp = mp->b_cont) {
1651         if (mp->b_datap->db_type != M_DATA) {
1652             return -1;
1653         }
1654         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1655         memcpy(cp, (char *)mp->b_rptr, n);
1656         cp += n;
1657         len -= n;
1658         mp->b_rptr += n;
1659     }
1660     return (len);
1661 }
1662
1663 /* MTUXXX Supposed to skip <off> bytes and copy <len> bytes,
1664  * but it doesn't really.
1665  * This sucks, anyway, do it like m_cpy.... below
1666  */
1667 static int
1668 cpytoiovec(mblk_t * mp, int off, int len, struct iovec *iovs,
1669            int niovs)
1670 {
1671     int m, n, o, t, i;
1672
1673     for (i = -1, t = 0; i < niovs && mp && len > 0; mp = mp->b_cont) {
1674         if (mp->b_datap->db_type != M_DATA) {
1675             return -1;
1676         }
1677         n = MIN(len, (mp->b_wptr - mp->b_rptr));
1678         len -= n;
1679         while (n) {
1680             if (!t) {
1681                 o = 0;
1682                 i++;
1683                 t = iovs[i].iov_len;
1684             }
1685             m = MIN(n, t);
1686             memcpy(iovs[i].iov_base + o, (char *)mp->b_rptr, m);
1687             mp->b_rptr += m;
1688             o += m;
1689             t -= m;
1690             n -= m;
1691         }
1692     }
1693     return (len);
1694 }
1695
1696 #define m_cpytoc(a, b, c, d)  cpytoc(a, b, c, d)
1697 #define m_cpytoiovec(a, b, c, d, e) cpytoiovec(a, b, c, d, e)
1698 #else
1699 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1700 static int
1701 m_cpytoiovec(struct mbuf *m, int off, int len, struct iovec iovs[], int niovs)
1702 {
1703     caddr_t p1, p2;
1704     unsigned int l1, l2, i, t;
1705
1706     if (m == NULL || off < 0 || len < 0 || iovs == NULL)
1707         osi_Panic("m_cpytoiovec");      /* MTUXXX probably don't need this check */
1708
1709     while (off && m)
1710         if (m->m_len <= off) {
1711             off -= m->m_len;
1712             m = m->m_next;
1713             continue;
1714         } else
1715             break;
1716
1717     if (m == NULL)
1718         return len;
1719
1720     p1 = mtod(m, caddr_t) + off;
1721     l1 = m->m_len - off;
1722     i = 0;
1723     p2 = iovs[0].iov_base;
1724     l2 = iovs[0].iov_len;
1725
1726     while (len) {
1727         t = MIN(l1, MIN(l2, (unsigned int)len));
1728         memcpy(p2, p1, t);
1729         p1 += t;
1730         p2 += t;
1731         l1 -= t;
1732         l2 -= t;
1733         len -= t;
1734         if (!l1) {
1735             m = m->m_next;
1736             if (!m)
1737                 break;
1738             p1 = mtod(m, caddr_t);
1739             l1 = m->m_len;
1740         }
1741         if (!l2) {
1742             if (++i >= niovs)
1743                 break;
1744             p2 = iovs[i].iov_base;
1745             l2 = iovs[i].iov_len;
1746         }
1747
1748     }
1749
1750     return len;
1751 }
1752 #endif /* LINUX */
1753 #endif /* AFS_SUN5_ENV */
1754
1755 #if !defined(AFS_LINUX20_ENV) && !defined(AFS_DARWIN80_ENV)
1756 int
1757 rx_mb_to_packet(amb, free, hdr_len, data_len, phandle)
1758 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX110_ENV)
1759      mblk_t *amb;
1760 #else
1761      struct mbuf *amb;
1762 #endif
1763      void (*free) ();
1764      struct rx_packet *phandle;
1765      int hdr_len, data_len;
1766 {
1767     int code;
1768
1769     code =
1770         m_cpytoiovec(amb, hdr_len, data_len, phandle->wirevec,
1771                      phandle->niovecs);
1772     (*free) (amb);
1773
1774     return code;
1775 }
1776 #endif /* LINUX */
1777 #endif /*KERNEL && !UKERNEL */
1778
1779
1780 /* send a response to a debug packet */
1781
1782 struct rx_packet *
1783 rxi_ReceiveDebugPacket(struct rx_packet *ap, osi_socket asocket,
1784                        afs_uint32 ahost, short aport, int istack)
1785 {
1786     struct rx_debugIn tin;
1787     afs_int32 tl;
1788     struct rx_serverQueueEntry *np, *nqe;
1789
1790     /*
1791      * Only respond to client-initiated Rx debug packets,
1792      * and clear the client flag in the response.
1793      */
1794     if (ap->header.flags & RX_CLIENT_INITIATED) {
1795         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
1796         rxi_EncodePacketHeader(ap);
1797     } else {
1798         return ap;
1799     }
1800
1801     rx_packetread(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
1802     /* all done with packet, now set length to the truth, so we can
1803      * reuse this packet */
1804     rx_computelen(ap, ap->length);
1805
1806     tin.type = ntohl(tin.type);
1807     tin.index = ntohl(tin.index);
1808     switch (tin.type) {
1809     case RX_DEBUGI_GETSTATS:{
1810             struct rx_debugStats tstat;
1811
1812             /* get basic stats */
1813             memset(&tstat, 0, sizeof(tstat));   /* make sure spares are zero */
1814             tstat.version = RX_DEBUGI_VERSION;
1815 #ifndef RX_ENABLE_LOCKS
1816             tstat.waitingForPackets = rx_waitingForPackets;
1817 #endif
1818             MUTEX_ENTER(&rx_serverPool_lock);
1819             tstat.nFreePackets = htonl(rx_nFreePackets);
1820             tstat.nPackets = htonl(rx_nPackets);
1821             tstat.callsExecuted = htonl(rxi_nCalls);
1822             tstat.packetReclaims = htonl(rx_packetReclaims);
1823             tstat.usedFDs = CountFDs(64);
1824             tstat.nWaiting = htonl(rx_nWaiting);
1825             tstat.nWaited = htonl(rx_nWaited);
1826             queue_Count(&rx_idleServerQueue, np, nqe, rx_serverQueueEntry,
1827                         tstat.idleThreads);
1828             MUTEX_EXIT(&rx_serverPool_lock);
1829             tstat.idleThreads = htonl(tstat.idleThreads);
1830             tl = sizeof(struct rx_debugStats) - ap->length;
1831             if (tl > 0)
1832                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1833
1834             if (tl <= 0) {
1835                 rx_packetwrite(ap, 0, sizeof(struct rx_debugStats),
1836                                (char *)&tstat);
1837                 ap->length = sizeof(struct rx_debugStats);
1838                 rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1839                 rx_computelen(ap, ap->length);
1840             }
1841             break;
1842         }
1843
1844     case RX_DEBUGI_GETALLCONN:
1845     case RX_DEBUGI_GETCONN:{
1846             unsigned int i, j;
1847             struct rx_connection *tc;
1848             struct rx_call *tcall;
1849             struct rx_debugConn tconn;
1850             int all = (tin.type == RX_DEBUGI_GETALLCONN);
1851
1852
1853             tl = sizeof(struct rx_debugConn) - ap->length;
1854             if (tl > 0)
1855                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1856             if (tl > 0)
1857                 return ap;
1858
1859             memset(&tconn, 0, sizeof(tconn));   /* make sure spares are zero */
1860             /* get N'th (maybe) "interesting" connection info */
1861             for (i = 0; i < rx_hashTableSize; i++) {
1862 #if !defined(KERNEL)
                /* the time complexity of the algorithm used here
                 * exponentially increases with the number of connections.
                 */
1866 #ifdef AFS_PTHREAD_ENV
1867                 pthread_yield();
1868 #else
1869                 (void)IOMGR_Poll();
1870 #endif
1871 #endif
1872                 MUTEX_ENTER(&rx_connHashTable_lock);
1873                 /* We might be slightly out of step since we are not
1874                  * locking each call, but this is only debugging output.
1875                  */
1876                 for (tc = rx_connHashTable[i]; tc; tc = tc->next) {
1877                     if ((all || rxi_IsConnInteresting(tc))
1878                         && tin.index-- <= 0) {
1879                         tconn.host = tc->peer->host;
1880                         tconn.port = tc->peer->port;
1881                         tconn.cid = htonl(tc->cid);
1882                         tconn.epoch = htonl(tc->epoch);
1883                         tconn.serial = htonl(tc->serial);
1884                         for (j = 0; j < RX_MAXCALLS; j++) {
1885                             tconn.callNumber[j] = htonl(tc->callNumber[j]);
1886                             if ((tcall = tc->call[j])) {
1887                                 tconn.callState[j] = tcall->state;
1888                                 tconn.callMode[j] = tcall->mode;
1889                                 tconn.callFlags[j] = tcall->flags;
1890                                 if (queue_IsNotEmpty(&tcall->rq))
1891                                     tconn.callOther[j] |= RX_OTHER_IN;
1892                                 if (queue_IsNotEmpty(&tcall->tq))
1893                                     tconn.callOther[j] |= RX_OTHER_OUT;
1894                             } else
1895                                 tconn.callState[j] = RX_STATE_NOTINIT;
1896                         }
1897
1898                         tconn.natMTU = htonl(tc->peer->natMTU);
1899                         tconn.error = htonl(tc->error);
1900                         tconn.flags = tc->flags;
1901                         tconn.type = tc->type;
1902                         tconn.securityIndex = tc->securityIndex;
1903                         if (tc->securityObject) {
1904                             RXS_GetStats(tc->securityObject, tc,
1905                                          &tconn.secStats);
1906 #define DOHTONL(a) (tconn.secStats.a = htonl(tconn.secStats.a))
1907 #define DOHTONS(a) (tconn.secStats.a = htons(tconn.secStats.a))
1908                             DOHTONL(flags);
1909                             DOHTONL(expires);
1910                             DOHTONL(packetsReceived);
1911                             DOHTONL(packetsSent);
1912                             DOHTONL(bytesReceived);
1913                             DOHTONL(bytesSent);
1914                             for (i = 0;
1915                                  i <
1916                                  sizeof(tconn.secStats.spares) /
1917                                  sizeof(short); i++)
1918                                 DOHTONS(spares[i]);
1919                             for (i = 0;
1920                                  i <
1921                                  sizeof(tconn.secStats.sparel) /
1922                                  sizeof(afs_int32); i++)
1923                                 DOHTONL(sparel[i]);
1924                         }
1925
1926                         MUTEX_EXIT(&rx_connHashTable_lock);
1927                         rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1928                                        (char *)&tconn);
1929                         tl = ap->length;
1930                         ap->length = sizeof(struct rx_debugConn);
1931                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
1932                                             istack);
1933                         ap->length = tl;
1934                         return ap;
1935                     }
1936                 }
1937                 MUTEX_EXIT(&rx_connHashTable_lock);
1938             }
1939             /* if we make it here, there are no interesting packets */
1940             tconn.cid = htonl(0xffffffff);      /* means end */
1941             rx_packetwrite(ap, 0, sizeof(struct rx_debugConn),
1942                            (char *)&tconn);
1943             tl = ap->length;
1944             ap->length = sizeof(struct rx_debugConn);
1945             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
1946             ap->length = tl;
1947             break;
1948         }
1949
1950         /*
1951          * Pass back all the peer structures we have available
1952          */
1953
1954     case RX_DEBUGI_GETPEER:{
1955             unsigned int i;
1956             struct rx_peer *tp;
1957             struct rx_debugPeer tpeer;
1958
1959
1960             tl = sizeof(struct rx_debugPeer) - ap->length;
1961             if (tl > 0)
1962                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
1963             if (tl > 0)
1964                 return ap;
1965
1966             memset(&tpeer, 0, sizeof(tpeer));
1967             for (i = 0; i < rx_hashTableSize; i++) {
1968 #if !defined(KERNEL)
1969                 /* the time complexity of the algorithm used here
1970                  * exponentially increases with the number of peers.
1971                  *
1972                  * Yielding after processing each hash table entry
1973                  * and dropping rx_peerHashTable_lock
1974                  * also increases the risk that we will miss a new
1975                  * entry - but we are willing to live with this
1976                  * limitation since this is meant for debugging only.
1977                  */
1978 #ifdef AFS_PTHREAD_ENV
1979                 pthread_yield();
1980 #else
1981                 (void)IOMGR_Poll();
1982 #endif
1983 #endif
1984                 MUTEX_ENTER(&rx_peerHashTable_lock);
1985                 for (tp = rx_peerHashTable[i]; tp; tp = tp->next) {
1986                     if (tin.index-- <= 0) {
1987                         tp->refCount++;
1988                         MUTEX_EXIT(&rx_peerHashTable_lock);
1989
1990                         MUTEX_ENTER(&tp->peer_lock);
1991                         tpeer.host = tp->host;
1992                         tpeer.port = tp->port;
1993                         tpeer.ifMTU = htons(tp->ifMTU);
1994                         tpeer.idleWhen = htonl(tp->idleWhen);
1995                         tpeer.refCount = htons(tp->refCount);
1996                         tpeer.burstSize = tp->burstSize;
1997                         tpeer.burst = tp->burst;
1998                         tpeer.burstWait.sec = htonl(tp->burstWait.sec);
1999                         tpeer.burstWait.usec = htonl(tp->burstWait.usec);
2000                         tpeer.rtt = htonl(tp->rtt);
2001                         tpeer.rtt_dev = htonl(tp->rtt_dev);
2002                         tpeer.timeout.sec = htonl(tp->timeout.sec);
2003                         tpeer.timeout.usec = htonl(tp->timeout.usec);
2004                         tpeer.nSent = htonl(tp->nSent);
2005                         tpeer.reSends = htonl(tp->reSends);
2006                         tpeer.inPacketSkew = htonl(tp->inPacketSkew);
2007                         tpeer.outPacketSkew = htonl(tp->outPacketSkew);
2008                         tpeer.rateFlag = htonl(tp->rateFlag);
2009                         tpeer.natMTU = htons(tp->natMTU);
2010                         tpeer.maxMTU = htons(tp->maxMTU);
2011                         tpeer.maxDgramPackets = htons(tp->maxDgramPackets);
2012                         tpeer.ifDgramPackets = htons(tp->ifDgramPackets);
2013                         tpeer.MTU = htons(tp->MTU);
2014                         tpeer.cwind = htons(tp->cwind);
2015                         tpeer.nDgramPackets = htons(tp->nDgramPackets);
2016                         tpeer.congestSeq = htons(tp->congestSeq);
2017                         tpeer.bytesSent.high = htonl(tp->bytesSent.high);
2018                         tpeer.bytesSent.low = htonl(tp->bytesSent.low);
2019                         tpeer.bytesReceived.high =
2020                             htonl(tp->bytesReceived.high);
2021                         tpeer.bytesReceived.low =
2022                             htonl(tp->bytesReceived.low);
2023                         MUTEX_EXIT(&tp->peer_lock);
2024
2025                         MUTEX_ENTER(&rx_peerHashTable_lock);
2026                         tp->refCount--;
2027                         MUTEX_EXIT(&rx_peerHashTable_lock);
2028
2029                         rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2030                                        (char *)&tpeer);
2031                         tl = ap->length;
2032                         ap->length = sizeof(struct rx_debugPeer);
2033                         rxi_SendDebugPacket(ap, asocket, ahost, aport,
2034                                             istack);
2035                         ap->length = tl;
2036                         return ap;
2037                     }
2038                 }
2039                 MUTEX_EXIT(&rx_peerHashTable_lock);
2040             }
2041             /* if we make it here, there are no interesting packets */
2042             tpeer.host = htonl(0xffffffff);     /* means end */
2043             rx_packetwrite(ap, 0, sizeof(struct rx_debugPeer),
2044                            (char *)&tpeer);
2045             tl = ap->length;
2046             ap->length = sizeof(struct rx_debugPeer);
2047             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2048             ap->length = tl;
2049             break;
2050         }
2051
2052     case RX_DEBUGI_RXSTATS:{
2053             int i;
2054             afs_int32 *s;
2055
2056             tl = sizeof(rx_stats) - ap->length;
2057             if (tl > 0)
2058                 tl = rxi_AllocDataBuf(ap, tl, RX_PACKET_CLASS_SEND_CBUF);
2059             if (tl > 0)
2060                 return ap;
2061
2062             /* Since its all int32s convert to network order with a loop. */
2063         if (rx_stats_active)
2064             MUTEX_ENTER(&rx_stats_mutex);
2065             s = (afs_int32 *) & rx_stats;
2066             for (i = 0; i < sizeof(rx_stats) / sizeof(afs_int32); i++, s++)
2067                 rx_PutInt32(ap, i * sizeof(afs_int32), htonl(*s));
2068
2069             tl = ap->length;
2070             ap->length = sizeof(rx_stats);
2071         if (rx_stats_active)
2072             MUTEX_EXIT(&rx_stats_mutex);
2073             rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2074             ap->length = tl;
2075             break;
2076         }
2077
2078     default:
2079         /* error response packet */
2080         tin.type = htonl(RX_DEBUGI_BADTYPE);
2081         tin.index = tin.type;
2082         rx_packetwrite(ap, 0, sizeof(struct rx_debugIn), (char *)&tin);
2083         tl = ap->length;
2084         ap->length = sizeof(struct rx_debugIn);
2085         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2086         ap->length = tl;
2087         break;
2088     }
2089     return ap;
2090 }
2091
2092 struct rx_packet *
2093 rxi_ReceiveVersionPacket(struct rx_packet *ap, osi_socket asocket,
2094                          afs_uint32 ahost, short aport, int istack)
2095 {
2096     afs_int32 tl;
2097
2098     /*
2099      * Only respond to client-initiated version requests, and
2100      * clear that flag in the response.
2101      */
2102     if (ap->header.flags & RX_CLIENT_INITIATED) {
2103         char buf[66];
2104
2105         ap->header.flags = ap->header.flags & ~RX_CLIENT_INITIATED;
2106         rxi_EncodePacketHeader(ap);
2107         memset(buf, 0, sizeof(buf));
2108         strncpy(buf, cml_version_number + 4, sizeof(buf) - 1);
2109         rx_packetwrite(ap, 0, 65, buf);
2110         tl = ap->length;
2111         ap->length = 65;
2112         rxi_SendDebugPacket(ap, asocket, ahost, aport, istack);
2113         ap->length = tl;
2114     }
2115
2116     return ap;
2117 }
2118
2119
2120 /* send a debug packet back to the sender */
2121 static void
2122 rxi_SendDebugPacket(struct rx_packet *apacket, osi_socket asocket,
2123                     afs_uint32 ahost, short aport, afs_int32 istack)
2124 {
2125     struct sockaddr_in taddr;
2126     unsigned int i, nbytes, savelen = 0;
2127     int saven = 0;
2128 #ifdef KERNEL
2129     int waslocked = ISAFS_GLOCK();
2130 #endif
2131
2132     taddr.sin_family = AF_INET;
2133     taddr.sin_port = aport;
2134     taddr.sin_addr.s_addr = ahost;
2135 #ifdef STRUCT_SOCKADDR_HAS_SA_LEN
2136     taddr.sin_len = sizeof(struct sockaddr_in);
2137 #endif
2138
2139     /* We need to trim the niovecs. */
2140     nbytes = apacket->length;
2141     for (i = 1; i < apacket->niovecs; i++) {
2142         if (nbytes <= apacket->wirevec[i].iov_len) {
2143             savelen = apacket->wirevec[i].iov_len;
2144             saven = apacket->niovecs;
2145             apacket->wirevec[i].iov_len = nbytes;
2146             apacket->niovecs = i + 1;   /* so condition fails because i == niovecs */
2147         } else
2148             nbytes -= apacket->wirevec[i].iov_len;
2149     }
2150 #ifdef KERNEL
2151 #ifdef RX_KERNEL_TRACE
2152     if (ICL_SETACTIVE(afs_iclSetp)) {
2153         if (!waslocked)
2154             AFS_GLOCK();
2155         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2156                    "before osi_NetSend()");
2157         AFS_GUNLOCK();
2158     } else
2159 #else
2160     if (waslocked)
2161         AFS_GUNLOCK();
2162 #endif
2163 #endif
2164     /* debug packets are not reliably delivered, hence the cast below. */
2165     (void)osi_NetSend(asocket, &taddr, apacket->wirevec, apacket->niovecs,
2166                       apacket->length + RX_HEADER_SIZE, istack);
2167 #ifdef KERNEL
2168 #ifdef RX_KERNEL_TRACE
2169     if (ICL_SETACTIVE(afs_iclSetp)) {
2170         AFS_GLOCK();
2171         afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2172                    "after osi_NetSend()");
2173         if (!waslocked)
2174             AFS_GUNLOCK();
2175     } else
2176 #else
2177     if (waslocked)
2178         AFS_GLOCK();
2179 #endif
2180 #endif
2181     if (saven) {                /* means we truncated the packet above. */
2182         apacket->wirevec[i - 1].iov_len = savelen;
2183         apacket->niovecs = saven;
2184     }
2185
2186 }
2187
2188 /* Send the packet to appropriate destination for the specified
2189  * call.  The header is first encoded and placed in the packet.
2190  */
2191 void
2192 rxi_SendPacket(struct rx_call *call, struct rx_connection *conn,
2193                struct rx_packet *p, int istack)
2194 {
2195 #if defined(KERNEL)
2196     int waslocked;
2197 #endif
2198     int code;
2199     struct sockaddr_in addr;
2200     struct rx_peer *peer = conn->peer;
2201     osi_socket socket;
2202 #ifdef RXDEBUG
2203     char deliveryType = 'S';
2204 #endif
2205     /* The address we're sending the packet to */
2206     memset(&addr, 0, sizeof(addr));
2207     addr.sin_family = AF_INET;
2208     addr.sin_port = peer->port;
2209     addr.sin_addr.s_addr = peer->host;
2210
2211     /* This stuff should be revamped, I think, so that most, if not
2212      * all, of the header stuff is always added here.  We could
2213      * probably do away with the encode/decode routines. XXXXX */
2214
2215     /* Stamp each packet with a unique serial number.  The serial
2216      * number is maintained on a connection basis because some types
2217      * of security may be based on the serial number of the packet,
2218      * and security is handled on a per authenticated-connection
2219      * basis. */
2220     /* Pre-increment, to guarantee no zero serial number; a zero
2221      * serial number means the packet was never sent. */
2222     MUTEX_ENTER(&conn->conn_data_lock);
2223     p->header.serial = ++conn->serial;
2224     if (p->length > conn->peer->maxPacketSize) {
2225         if ((p->header.type == RX_PACKET_TYPE_ACK) &&
2226             (p->header.flags & RX_REQUEST_ACK)) {
2227             conn->lastPingSize = p->length;
2228             conn->lastPingSizeSer = p->header.serial;
2229         } else if (p->header.seq != 0) {
2230             conn->lastPacketSize = p->length;
2231             conn->lastPacketSizeSeq = p->header.seq;
2232         }
2233     }
2234     MUTEX_EXIT(&conn->conn_data_lock);
2235     /* This is so we can adjust retransmit time-outs better in the face of
2236      * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2237      */
2238     if (p->firstSerial == 0) {
2239         p->firstSerial = p->header.serial;
2240     }
2241 #ifdef RXDEBUG
2242     /* If an output tracer function is defined, call it with the packet and
2243      * network address.  Note this function may modify its arguments. */
2244     if (rx_almostSent) {
2245         int drop = (*rx_almostSent) (p, &addr);
2246         /* drop packet if return value is non-zero? */
2247         if (drop)
2248             deliveryType = 'D'; /* Drop the packet */
2249     }
2250 #endif
2251
2252     /* Get network byte order header */
2253     rxi_EncodePacketHeader(p);  /* XXX in the event of rexmit, etc, don't need to
2254                                  * touch ALL the fields */
2255
2256     /* Send the packet out on the same socket that related packets are being
2257      * received on */
2258     socket =
2259         (conn->type ==
2260          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2261
2262 #ifdef RXDEBUG
2263     /* Possibly drop this packet,  for testing purposes */
2264     if ((deliveryType == 'D')
2265         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2266             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2267         deliveryType = 'D';     /* Drop the packet */
2268     } else {
2269         deliveryType = 'S';     /* Send the packet */
2270 #endif /* RXDEBUG */
2271
2272         /* Loop until the packet is sent.  We'd prefer just to use a
2273          * blocking socket, but unfortunately the interface doesn't
2274          * allow us to have the socket block in send mode, and not
2275          * block in receive mode */
2276 #ifdef KERNEL
2277         waslocked = ISAFS_GLOCK();
2278 #ifdef RX_KERNEL_TRACE
2279         if (ICL_SETACTIVE(afs_iclSetp)) {
2280             if (!waslocked)
2281                 AFS_GLOCK();
2282             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2283                        "before osi_NetSend()");
2284             AFS_GUNLOCK();
2285         } else
2286 #else
2287         if (waslocked)
2288             AFS_GUNLOCK();
2289 #endif
2290 #endif
2291         if ((code =
2292              osi_NetSend(socket, &addr, p->wirevec, p->niovecs,
2293                          p->length + RX_HEADER_SIZE, istack)) != 0) {
2294             /* send failed, so let's hurry up the resend, eh? */
2295             if (rx_stats_active)
2296                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2297             p->retryTime = p->timeSent; /* resend it very soon */
2298             clock_Addmsec(&(p->retryTime),
2299                           10 + (((afs_uint32) p->backoff) << 8));
2300             /* Some systems are nice and tell us right away that we cannot
2301              * reach this recipient by returning an error code.
2302              * So, when this happens let's "down" the host NOW so
2303              * we don't sit around waiting for this host to timeout later.
2304              */
2305             if (call &&
2306 #ifdef AFS_NT40_ENV
2307                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2308 #elif defined(AFS_LINUX20_ENV)
2309                 code == -ENETUNREACH
2310 #elif defined(AFS_DARWIN_ENV)
2311                 code == EHOSTUNREACH
2312 #else
2313                 0
2314 #endif
2315                 )
2316                 call->lastReceiveTime = 0;
2317         }
2318 #ifdef KERNEL
2319 #ifdef RX_KERNEL_TRACE
2320         if (ICL_SETACTIVE(afs_iclSetp)) {
2321             AFS_GLOCK();
2322             afs_Trace1(afs_iclSetp, CM_TRACE_TIMESTAMP, ICL_TYPE_STRING,
2323                        "after osi_NetSend()");
2324             if (!waslocked)
2325                 AFS_GUNLOCK();
2326         } else
2327 #else
2328         if (waslocked)
2329             AFS_GLOCK();
2330 #endif
2331 #endif
2332 #ifdef RXDEBUG
2333     }
2334     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2335           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2336           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2337           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2338 #endif
2339     if (rx_stats_active) {
2340         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2341         MUTEX_ENTER(&peer->peer_lock);
2342         hadd32(peer->bytesSent, p->length);
2343         MUTEX_EXIT(&peer->peer_lock);
2344     }
2345 }
2346
2347 /* Send a list of packets to appropriate destination for the specified
2348  * connection.  The headers are first encoded and placed in the packets.
2349  */
2350 void
2351 rxi_SendPacketList(struct rx_call *call, struct rx_connection *conn,
2352                    struct rx_packet **list, int len, int istack)
2353 {
2354 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2355     int waslocked;
2356 #endif
2357     struct sockaddr_in addr;
2358     struct rx_peer *peer = conn->peer;
2359     osi_socket socket;
2360     struct rx_packet *p = NULL;
2361     struct iovec wirevec[RX_MAXIOVECS];
2362     int i, length, code;
2363     afs_uint32 serial;
2364     afs_uint32 temp;
2365     struct rx_jumboHeader *jp;
2366 #ifdef RXDEBUG
2367     char deliveryType = 'S';
2368 #endif
2369     /* The address we're sending the packet to */
2370     addr.sin_family = AF_INET;
2371     addr.sin_port = peer->port;
2372     addr.sin_addr.s_addr = peer->host;
2373
2374     if (len + 1 > RX_MAXIOVECS) {
2375         osi_Panic("rxi_SendPacketList, len > RX_MAXIOVECS\n");
2376     }
2377
2378     /*
2379      * Stamp the packets in this jumbogram with consecutive serial numbers
2380      */
2381     MUTEX_ENTER(&conn->conn_data_lock);
2382     serial = conn->serial;
2383     conn->serial += len;
2384     for (i = 0; i < len; i++) {
2385         p = list[i];
2386         if (p->length > conn->peer->maxPacketSize) {
2387             /* a ping *or* a sequenced packet can count */
2388             if ((p->length > conn->peer->maxPacketSize)) {
2389                 if (((p->header.type == RX_PACKET_TYPE_ACK) &&
2390                      (p->header.flags & RX_REQUEST_ACK)) &&
2391                     ((i == 0) || (p->length >= conn->lastPingSize))) {
2392                     conn->lastPingSize = p->length;
2393                     conn->lastPingSizeSer = serial + i;
2394                 } else if ((p->header.seq != 0) &&
2395                            ((i == 0) || (p->length >= conn->lastPacketSize))) {
2396                     conn->lastPacketSize = p->length;
2397                     conn->lastPacketSizeSeq = p->header.seq;
2398                 }
2399             }
2400         }
2401     }
2402     MUTEX_EXIT(&conn->conn_data_lock);
2403
2404
2405     /* This stuff should be revamped, I think, so that most, if not
2406      * all, of the header stuff is always added here.  We could
2407      * probably do away with the encode/decode routines. XXXXX */
2408
2409     jp = NULL;
2410     length = RX_HEADER_SIZE;
2411     wirevec[0].iov_base = (char *)(&list[0]->wirehead[0]);
2412     wirevec[0].iov_len = RX_HEADER_SIZE;
2413     for (i = 0; i < len; i++) {
2414         p = list[i];
2415
2416         /* The whole 3.5 jumbogram scheme relies on packets fitting
2417          * in a single packet buffer. */
2418         if (p->niovecs > 2) {
2419             osi_Panic("rxi_SendPacketList, niovecs > 2\n");
2420         }
2421
2422         /* Set the RX_JUMBO_PACKET flags in all but the last packets
2423          * in this chunk.  */
2424         if (i < len - 1) {
2425             if (p->length != RX_JUMBOBUFFERSIZE) {
2426                 osi_Panic("rxi_SendPacketList, length != jumbo size\n");
2427             }
2428             p->header.flags |= RX_JUMBO_PACKET;
2429             length += RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2430             wirevec[i + 1].iov_len = RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2431         } else {
2432             wirevec[i + 1].iov_len = p->length;
2433             length += p->length;
2434         }
2435         wirevec[i + 1].iov_base = (char *)(&p->localdata[0]);
2436         if (jp != NULL) {
2437             /* Convert jumbo packet header to network byte order */
2438             temp = (afs_uint32) (p->header.flags) << 24;
2439             temp |= (afs_uint32) (p->header.spare);
2440             *(afs_uint32 *) jp = htonl(temp);
2441         }
2442         jp = (struct rx_jumboHeader *)
2443             ((char *)(&p->localdata[0]) + RX_JUMBOBUFFERSIZE);
2444
2445         /* Stamp each packet with a unique serial number.  The serial
2446          * number is maintained on a connection basis because some types
2447          * of security may be based on the serial number of the packet,
2448          * and security is handled on a per authenticated-connection
2449          * basis. */
2450         /* Pre-increment, to guarantee no zero serial number; a zero
2451          * serial number means the packet was never sent. */
2452         p->header.serial = ++serial;
2453         /* This is so we can adjust retransmit time-outs better in the face of
2454          * rapidly changing round-trip times.  RTO estimation is not a la Karn.
2455          */
2456         if (p->firstSerial == 0) {
2457             p->firstSerial = p->header.serial;
2458         }
2459 #ifdef RXDEBUG
2460         /* If an output tracer function is defined, call it with the packet and
2461          * network address.  Note this function may modify its arguments. */
2462         if (rx_almostSent) {
2463             int drop = (*rx_almostSent) (p, &addr);
2464             /* drop packet if return value is non-zero? */
2465             if (drop)
2466                 deliveryType = 'D';     /* Drop the packet */
2467         }
2468 #endif
2469
2470         /* Get network byte order header */
2471         rxi_EncodePacketHeader(p);      /* XXX in the event of rexmit, etc, don't need to
2472                                          * touch ALL the fields */
2473     }
2474
2475     /* Send the packet out on the same socket that related packets are being
2476      * received on */
2477     socket =
2478         (conn->type ==
2479          RX_CLIENT_CONNECTION ? rx_socket : conn->service->socket);
2480
2481 #ifdef RXDEBUG
2482     /* Possibly drop this packet,  for testing purposes */
2483     if ((deliveryType == 'D')
2484         || ((rx_intentionallyDroppedPacketsPer100 > 0)
2485             && (random() % 100 < rx_intentionallyDroppedPacketsPer100))) {
2486         deliveryType = 'D';     /* Drop the packet */
2487     } else {
2488         deliveryType = 'S';     /* Send the packet */
2489 #endif /* RXDEBUG */
2490
2491         /* Loop until the packet is sent.  We'd prefer just to use a
2492          * blocking socket, but unfortunately the interface doesn't
2493          * allow us to have the socket block in send mode, and not
2494          * block in receive mode */
2495 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2496         waslocked = ISAFS_GLOCK();
2497         if (!istack && waslocked)
2498             AFS_GUNLOCK();
2499 #endif
2500         if ((code =
2501              osi_NetSend(socket, &addr, &wirevec[0], len + 1, length,
2502                          istack)) != 0) {
2503             /* send failed, so let's hurry up the resend, eh? */
2504             if (rx_stats_active)
2505                 rx_MutexIncrement(rx_stats.netSendFailures, rx_stats_mutex);
2506             for (i = 0; i < len; i++) {
2507                 p = list[i];
2508                 p->retryTime = p->timeSent;     /* resend it very soon */
2509                 clock_Addmsec(&(p->retryTime),
2510                               10 + (((afs_uint32) p->backoff) << 8));
2511             }
2512             /* Some systems are nice and tell us right away that we cannot
2513              * reach this recipient by returning an error code.
2514              * So, when this happens let's "down" the host NOW so
2515              * we don't sit around waiting for this host to timeout later.
2516              */
2517             if (call &&
2518 #ifdef AFS_NT40_ENV
2519                 (code == -1 && WSAGetLastError() == WSAEHOSTUNREACH) || (code == -WSAEHOSTUNREACH)
2520 #elif defined(AFS_LINUX20_ENV)
2521                 code == -ENETUNREACH
2522 #elif defined(AFS_DARWIN_ENV)
2523                 code == EHOSTUNREACH
2524 #else
2525                 0
2526 #endif
2527                 )
2528                 call->lastReceiveTime = 0;
2529         }
2530 #if     defined(AFS_SUN5_ENV) && defined(KERNEL)
2531         if (!istack && waslocked)
2532             AFS_GLOCK();
2533 #endif
2534 #ifdef RXDEBUG
2535     }
2536
2537     assert(p != NULL);
2538
2539     dpf(("%c %d %s: %x.%u.%u.%u.%u.%u.%u flags %d, packet %"AFS_PTR_FMT" resend %d.%.3d len %d",
2540           deliveryType, p->header.serial, rx_packetTypes[p->header.type - 1], ntohl(peer->host),
2541           ntohs(peer->port), p->header.serial, p->header.epoch, p->header.cid, p->header.callNumber,
2542           p->header.seq, p->header.flags, p, p->retryTime.sec, p->retryTime.usec / 1000, p->length));
2543
2544 #endif
2545     if (rx_stats_active) {
2546         rx_MutexIncrement(rx_stats.packetsSent[p->header.type - 1], rx_stats_mutex);
2547         MUTEX_ENTER(&peer->peer_lock);
2548         hadd32(peer->bytesSent, p->length);
2549         MUTEX_EXIT(&peer->peer_lock);
2550     }
2551 }
2552
2553
2554 /* Send a "special" packet to the peer connection.  If call is
2555  * specified, then the packet is directed to a specific call channel
2556  * associated with the connection, otherwise it is directed to the
2557  * connection only. Uses optionalPacket if it is supplied, rather than
2558  * allocating a new packet buffer.  Nbytes is the length of the data
2559  * portion of the packet.  If data is non-null, nbytes of data are
2560  * copied into the packet.  Type is the type of the packet, as defined
2561  * in rx.h.  Bug: there's a lot of duplication between this and other
2562  * routines.  This needs to be cleaned up. */
2563 struct rx_packet *
2564 rxi_SendSpecial(struct rx_call *call,
2565                 struct rx_connection *conn,
2566                 struct rx_packet *optionalPacket, int type, char *data,
2567                 int nbytes, int istack)
2568 {
2569     /* Some of the following stuff should be common code for all
2570      * packet sends (it's repeated elsewhere) */
2571     struct rx_packet *p;
2572     unsigned int i = 0;
2573     int savelen = 0, saven = 0;
2574     int channel, callNumber;
2575     if (call) {
2576         channel = call->channel;
2577         callNumber = *call->callNumber;
2578         /* BUSY packets refer to the next call on this connection */
2579         if (type == RX_PACKET_TYPE_BUSY) {
2580             callNumber++;
2581         }
2582     } else {
2583         channel = 0;
2584         callNumber = 0;
2585     }
2586     p = optionalPacket;
2587     if (!p) {
2588         p = rxi_AllocPacket(RX_PACKET_CLASS_SPECIAL);
2589         if (!p)
2590             osi_Panic("rxi_SendSpecial failure");
2591     }
2592
2593     if (nbytes != -1)
2594         p->length = nbytes;
2595     else
2596         nbytes = p->length;
2597     p->header.serviceId = conn->serviceId;
2598     p->header.securityIndex = conn->securityIndex;
2599     p->header.cid = (conn->cid | channel);
2600     p->header.callNumber = callNumber;
2601     p->header.seq = 0;
2602     p->header.epoch = conn->epoch;
2603     p->header.type = type;
2604     p->header.flags = 0;
2605     if (conn->type == RX_CLIENT_CONNECTION)
2606         p->header.flags |= RX_CLIENT_INITIATED;
2607     if (data)
2608         rx_packetwrite(p, 0, nbytes, data);
2609
2610     for (i = 1; i < p->niovecs; i++) {
2611         if (nbytes <= p->wirevec[i].iov_len) {
2612             savelen = p->wirevec[i].iov_len;
2613             saven = p->niovecs;
2614             p->wirevec[i].iov_len = nbytes;
2615             p->niovecs = i + 1; /* so condition fails because i == niovecs */
2616         } else
2617             nbytes -= p->wirevec[i].iov_len;
2618     }
2619
2620     if (call)
2621         rxi_Send(call, p, istack);
2622     else
2623         rxi_SendPacket((struct rx_call *)0, conn, p, istack);
2624     if (saven) {                /* means we truncated the packet above.  We probably don't  */
2625         /* really need to do this, but it seems safer this way, given that  */
2626         /* sneaky optionalPacket... */
2627         p->wirevec[i - 1].iov_len = savelen;
2628         p->niovecs = saven;
2629     }
2630     if (!optionalPacket)
2631         rxi_FreePacket(p);
2632     return optionalPacket;
2633 }
2634
2635
2636 /* Encode the packet's header (from the struct header in the packet to
2637  * the net byte order representation in the wire representation of the
2638  * packet, which is what is actually sent out on the wire) */
2639 void
2640 rxi_EncodePacketHeader(struct rx_packet *p)
2641 {
2642     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2643
2644     memset(buf, 0, RX_HEADER_SIZE);
2645     *buf++ = htonl(p->header.epoch);
2646     *buf++ = htonl(p->header.cid);
2647     *buf++ = htonl(p->header.callNumber);
2648     *buf++ = htonl(p->header.seq);
2649     *buf++ = htonl(p->header.serial);
2650     *buf++ = htonl((((afs_uint32) p->header.type) << 24)
2651                    | (((afs_uint32) p->header.flags) << 16)
2652                    | (p->header.userStatus << 8) | p->header.securityIndex);
2653     /* Note: top 16 bits of this next word were reserved */
2654     *buf++ = htonl((p->header.spare << 16) | (p->header.serviceId & 0xffff));
2655 }
2656
2657 /* Decode the packet's header (from net byte order to a struct header) */
2658 void
2659 rxi_DecodePacketHeader(struct rx_packet *p)
2660 {
2661     afs_uint32 *buf = (afs_uint32 *) (p->wirevec[0].iov_base);  /* MTUXXX */
2662     afs_uint32 temp;
2663
2664     p->header.epoch = ntohl(*buf);
2665     buf++;
2666     p->header.cid = ntohl(*buf);
2667     buf++;
2668     p->header.callNumber = ntohl(*buf);
2669     buf++;
2670     p->header.seq = ntohl(*buf);
2671     buf++;
2672     p->header.serial = ntohl(*buf);
2673     buf++;
2674
2675     temp = ntohl(*buf);
2676     buf++;
2677
2678     /* C will truncate byte fields to bytes for me */
2679     p->header.type = temp >> 24;
2680     p->header.flags = temp >> 16;
2681     p->header.userStatus = temp >> 8;
2682     p->header.securityIndex = temp >> 0;
2683
2684     temp = ntohl(*buf);
2685     buf++;
2686
2687     p->header.serviceId = (temp & 0xffff);
2688     p->header.spare = temp >> 16;
2689     /* Note: top 16 bits of this last word are the security checksum */
2690 }
2691
2692 void
2693 rxi_PrepareSendPacket(struct rx_call *call,
2694                       struct rx_packet *p, int last)
2695 {
2696     struct rx_connection *conn = call->conn;
2697     unsigned int i;
2698     afs_int32 len;              /* len must be a signed type; it can go negative */
2699
2700     p->flags &= ~RX_PKTFLAG_ACKED;
2701     p->header.cid = (conn->cid | call->channel);
2702     p->header.serviceId = conn->serviceId;
2703     p->header.securityIndex = conn->securityIndex;
2704
2705     /* No data packets on call 0. Where do these come from? */
2706     if (*call->callNumber == 0)
2707         *call->callNumber = 1;
2708
2709     p->header.callNumber = *call->callNumber;
2710     p->header.seq = call->tnext++;
2711     p->header.epoch = conn->epoch;
2712     p->header.type = RX_PACKET_TYPE_DATA;
2713     p->header.flags = 0;
2714     p->header.spare = 0;
2715     if (conn->type == RX_CLIENT_CONNECTION)
2716         p->header.flags |= RX_CLIENT_INITIATED;
2717
2718     if (last)
2719         p->header.flags |= RX_LAST_PACKET;
2720
2721     clock_Zero(&p->retryTime);  /* Never yet transmitted */
2722     clock_Zero(&p->firstSent);  /* Never yet transmitted */
2723     p->header.serial = 0;       /* Another way of saying never transmitted... */
2724     p->backoff = 0;
2725
2726     /* Now that we're sure this is the last data on the call, make sure
2727      * that the "length" and the sum of the iov_lens matches. */
2728     len = p->length + call->conn->securityHeaderSize;
2729
2730     for (i = 1; i < p->niovecs && len > 0; i++) {
2731         len -= p->wirevec[i].iov_len;
2732     }
2733     if (len > 0) {
2734         osi_Panic("PrepareSendPacket 1\n");     /* MTUXXX */
2735     } else if (i < p->niovecs) {
2736         /* Free any extra elements in the wirevec */
2737 #if defined(RX_ENABLE_TSFPQ)
2738         rxi_FreeDataBufsTSFPQ(p, i, 1 /* allow global pool flush if overquota */);
2739 #else /* !RX_ENABLE_TSFPQ */
2740         MUTEX_ENTER(&rx_freePktQ_lock);
2741         rxi_FreeDataBufsNoLock(p, i);
2742         MUTEX_EXIT(&rx_freePktQ_lock);
2743 #endif /* !RX_ENABLE_TSFPQ */
2744
2745         p->niovecs = i;
2746     }
2747     if (len)
2748         p->wirevec[i - 1].iov_len += len;
2749     RXS_PreparePacket(conn->securityObject, call, p);
2750 }
2751
2752 /* Given an interface MTU size, calculate an adjusted MTU size that
2753  * will make efficient use of the RX buffers when the peer is sending
2754  * either AFS 3.4a jumbograms or AFS 3.5 jumbograms.  */
2755 int
2756 rxi_AdjustIfMTU(int mtu)
2757 {
2758     int adjMTU;
2759     int frags;
2760
2761     if (rxi_nRecvFrags == 1 && rxi_nSendFrags == 1)
2762         return mtu;
2763     adjMTU = RX_HEADER_SIZE + RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE;
2764     if (mtu <= adjMTU) {
2765         return mtu;
2766     }
2767     mtu -= adjMTU;
2768     if (mtu <= 0) {
2769         return adjMTU;
2770     }
2771     frags = mtu / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE);
2772     return (adjMTU + (frags * (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2773 }
2774
2775 /* Given an interface MTU size, and the peer's advertised max receive
2776  * size, calculate an adjisted maxMTU size that makes efficient use
2777  * of our packet buffers when we are sending AFS 3.4a jumbograms. */
2778 int
2779 rxi_AdjustMaxMTU(int mtu, int peerMaxMTU)
2780 {
2781     int maxMTU = mtu * rxi_nSendFrags;
2782     maxMTU = MIN(maxMTU, peerMaxMTU);
2783     return rxi_AdjustIfMTU(maxMTU);
2784 }
2785
2786 /* Given a packet size, figure out how many datagram packet will fit.
2787  * The first buffer always contains RX_HEADER_SIZE+RX_JUMBOBUFFERSIZE+
2788  * RX_JUMBOHEADERSIZE, the middle buffers contain RX_JUMBOBUFFERSIZE+
2789  * RX_JUMBOHEADERSIZE, and the last buffer contains RX_JUMBOBUFFERSIZE */
2790 int
2791 rxi_AdjustDgramPackets(int frags, int mtu)
2792 {
2793     int maxMTU;
2794     if (mtu + IPv6_FRAG_HDR_SIZE < RX_JUMBOBUFFERSIZE + RX_HEADER_SIZE) {
2795         return 1;
2796     }
2797     maxMTU = (frags * (mtu + UDP_HDR_SIZE)) - UDP_HDR_SIZE;
2798     maxMTU = MIN(maxMTU, RX_MAX_PACKET_SIZE);
2799     /* subtract the size of the first and last packets */
2800     maxMTU -= RX_HEADER_SIZE + (2 * RX_JUMBOBUFFERSIZE) + RX_JUMBOHEADERSIZE;
2801     if (maxMTU < 0) {
2802         return 1;
2803     }
2804     return (2 + (maxMTU / (RX_JUMBOBUFFERSIZE + RX_JUMBOHEADERSIZE)));
2805 }
2806
2807 #ifndef KERNEL
/*
 * This function can be used by the Windows Cache Manager
 * to dump the list of all rx packets so that we can determine
 * where the packet leakage is.
 *
 * When RXDEBUG_PACKET is defined, the list starting at rx_mallocedP
 * (linked through allNextp) is walked while rx_freePktQ_lock is held, and
 * one line per packet is written, bracketed by a start and an end line.
 * Every line is prefixed with cookie.  On AFS_NT40_ENV each line is
 * formatted into a local buffer and handed to WriteFile() with outputFile
 * as the handle argument; elsewhere it is written with fprintf() to
 * outputFile.  Without RXDEBUG_PACKET nothing is written.
 *
 * Always returns 0.
 */
int rx_DumpPackets(FILE *outputFile, char *cookie)
{
#ifdef RXDEBUG_PACKET
    struct rx_packet *p;
#ifdef AFS_NT40_ENV
    DWORD zilch;                /* WriteFile() reports bytes written through a DWORD pointer */
    /* NOTE(review): the sprintf() calls below are not bounded by
     * sizeof(output); a sufficiently long cookie would overflow it. */
    char output[2048];
#define RXDPRINTF sprintf
#define RXDPRINTOUT output
#else
#define RXDPRINTF fprintf
#define RXDPRINTOUT outputFile
#endif

    NETPRI;
    MUTEX_ENTER(&rx_freePktQ_lock);
    RXDPRINTF(RXDPRINTOUT, "%s - Start dumping all Rx Packets - count=%u\r\n", cookie, rx_packet_id);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    for (p = rx_mallocedP; p; p = p->allNextp) {
        /* %p requires a pointer to void, hence the explicit cast of p. */
        RXDPRINTF(RXDPRINTOUT, "%s - packet=0x%p, id=%u, firstSent=%u.%08u, timeSent=%u.%08u, retryTime=%u.%08u, firstSerial=%u, niovecs=%u, flags=0x%x, backoff=%u, length=%u  header: epoch=%u, cid=%u, callNum=%u, seq=%u, serial=%u, type=%u, flags=0x%x, userStatus=%u, securityIndex=%u, serviceId=%u\r\n",
                cookie, (void *)p, p->packetId, p->firstSent.sec, p->firstSent.usec, p->timeSent.sec, p->timeSent.usec, p->retryTime.sec, p->retryTime.usec,
                p->firstSerial, p->niovecs, (afs_uint32)p->flags, (afs_uint32)p->backoff, (afs_uint32)p->length,
                p->header.epoch, p->header.cid, p->header.callNumber, p->header.seq, p->header.serial,
                (afs_uint32)p->header.type, (afs_uint32)p->header.flags, (afs_uint32)p->header.userStatus,
                (afs_uint32)p->header.securityIndex, (afs_uint32)p->header.serviceId);
#ifdef AFS_NT40_ENV
        WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif
    }

    RXDPRINTF(RXDPRINTOUT, "%s - End dumping all Rx Packets\r\n", cookie);
#ifdef AFS_NT40_ENV
    WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);
#endif

    MUTEX_EXIT(&rx_freePktQ_lock);
    USERPRI;
#endif /* RXDEBUG_PACKET */
    return 0;
}
2856 #endif