src/afs/afs_cbqueue.c

   1 /*
   2  * Copyright 2000, International Business Machines Corporation and others.
   3  * All Rights Reserved.
   4  *
   5  * This software has been released under the terms of the IBM Public
   6  * License.  For details, see the LICENSE file in the top-level source
   7  * directory or online at http://www.openafs.org/dl/license10.html
   8  */
   9
  10 /*
  11  * This package is used to actively manage the expiration of callbacks,
  12  * so that the rest of the cache manager doesn't need to compute
  13  * whether a callback has expired or not, but can tell with one simple
  14  * check, that is, whether the CStatd bit is on or off.
  15  *
   16  * The base of the hash table moves periodically (every 128 seconds).
  17  * QueueCallback rarely touches the first 3 slots in the hash table
  18  * (only when called from CheckCallbacks) since MinTimeOut in
  19  * viced/callback.c is currently 7 minutes.
  20  * Therefore, CheckCallbacks should be able to run concurrently with
  21  * QueueCallback, given the proper locking, of course.
  22  *
  23  * Note:
  24  * 1. CheckCallbacks and BumpBase never run simultaneously.  This is because
  25  * they are only called from afs_Daemon.  Therefore, base and basetime will
  26  * always be consistent during CheckCallbacks.
  27  * 2. cbHashT [base] rarely (if ever) gets stuff queued in it.  The only way
  28  * that could happen is CheckCallbacks might fencepost and move something in
  29  * place, or BumpBase might push some stuff up.
  30  * 3. Hash chains aren't particularly sorted.
  31  * 4. The file server keeps its callback state around for 3 minutes
  32  * longer than it promises the cache manager in order to account for
  33  * clock skew, network delay, and other bogeymen.
  34  *
  35  * For now I just use one large lock, which is fine on a uniprocessor,
  36  * since it's not held during any RPCs or low-priority I/O operations.
  37  * To make this code MP-fast, you need no more locks than processors,
  38  * but probably more than one.  In measurements on MP-safe implementations,
  39  * I have never seen any contention over the xcbhash lock.
  40  *
  41  * Incompatible operations:
  42  * Enqueue and "dequeue of first vcache" in same slot
  43  * dequeue and "dequeue of preceding vcache" in same slot
  44  * dequeue and "dequeue of successive vcache" in same slot
  45  * BumpBase pushing a list and enqueue in the new base slot
  46  * Two enqueues in same slot
  47  * more...
  48  *
  49  * Certain invariants exist:
  50  *    1  Callback expiration times granted by a file server will never
  51  *       decrease for a particular vnode UNLESS a CallBack RPC is invoked
  52  *       by the server in the interim.
  53  *    2  A vcache will always expire no sooner than the slot in which it is
  54  *       currently enqueued.  Callback times granted by the server may
  55  *       increase, in which case the vcache will be updated in-place.  As a
  56  *       result, it may expire later than the slot in which it is enqueued.
   57  *       Not to worry, the CheckCallbacks code will move it if necessary.
  58  *       This approach means that busy vnodes won't be continually moved
  59  *       around within the expiry queue: they are only moved when they
  60  *       finally advance to the lead bucket.
  61  *    3  Anything which has a callback on it must be in the expiry
  62  *       queue.  In AFS 3.3, that means everything but symlinks (which
  63  *       are immutable), including contents of Read-Only volumes
  64  *       (which have callbacks by virtue of the whole-volume callback)
  65  *
  66  * QueueCallback only checks that its vcache is in the list
  67  * somewhere, counting on invariant #1 to guarantee that the vcache
  68  * won't be in a slot later than QueueCallback would otherwise place
  69  * it. Therefore, whenever we turn off the CStatd bit on the vcache, we
  70  * *must* remove the vcache from the expiry queue.  Otherwise, we
  71  * might have missed a CallBack RPC, and a subsequent callback might be
  72  * granted with a shorter expiration time.
  73  */
  74 #include <afsconfig.h>
  75 #include "afs/param.h"
  76
  77 RCSID
  78     ("$Header$");
  79
  80 #include "afs/sysincludes.h"    /*Standard vendor system headers */
  81 #include "afsincludes.h"        /*AFS-based standard headers */
  82 #include "afs/afs_cbqueue.h"
  83 #include "afs/afs.h"
  84 #include "afs/lock.h"
  85 #include "afs/afs_stats.h"
  86
  87 static unsigned int base = 0;
  88 static unsigned int basetime = 0;
  89 static struct vcache *debugvc;  /* used only for post-mortem debugging */
  90 struct bucket {
  91     struct afs_q head;
  92     /*  struct afs_lock lock;  only if you want lots of locks... */
  93 };
  94 static struct bucket cbHashT[CBHTSIZE];
  95 struct afs_lock afs_xcbhash;
  96
  97 /* afs_QueueCallback
  98  * Takes a write-locked vcache pointer and a callback expiration time
  99  * as returned by the file server (ie, in units of 128 seconds from "now").
 100  *
 101  * Uses the time as an index into a hash table, and inserts the vcache
 102  * structure into the overflow chain.
 103  *
 104  * If the vcache is already on some hash chain, leave it there.
 105  * CheckCallbacks will get to it eventually.  In the meantime, it
 106  * might get flushed, or it might already be on the right hash chain,
 107  * so why bother messing with it now?
 108  *
 109  * NOTE: The caller must hold a write lock on afs_xcbhash
 110  */
 111
 112 void
 113 afs_QueueCallback(struct vcache *avc, unsigned int atime, struct volume *avp)
 114 {
 115     if (avp && (avp->expireTime < avc->cbExpires))
 116         avp->expireTime = avc->cbExpires;
 117     if (!(avc->callsort.next)) {
 118         atime = (atime + base) % CBHTSIZE;
 119         QAdd(&(cbHashT[atime].head), &(avc->callsort));
 120     }
 121
 122     return;
 123 }                               /* afs_QueueCallback */
 124
 125 /* afs_DequeueCallback
 126  * Takes a write-locked vcache pointer and removes it from the callback
 127  * hash table, without knowing beforehand which slot it was in.
 128  *
 129  * for now, just get a lock on everything when doing the dequeue, don't
 130  * worry about getting a lock on the individual slot.
 131  *
 132  * the only other places that do anything like dequeues are CheckCallbacks
 133  * and BumpBase.
 134  *
 135  * NOTE: The caller must hold a write lock on afs_xcbhash
 136  */
 137 void
 138 afs_DequeueCallback(struct vcache *avc)
 139 {
 140
 141     debugvc = avc;
 142     if (avc->callsort.prev) {
 143         QRemove(&(avc->callsort));
 144         avc->callsort.prev = avc->callsort.next = NULL;
 145     } else;                     /* must have got dequeued in a race */
 146     afs_symhint_inval(avc);
 147
 148     return;
 149 }                               /* afs_DequeueCallback */
 150
 151 /* afs_CheckCallbacks
 152  * called periodically to determine which callbacks are likely to
 153  * expire in the next n second interval.  Preemptively marks them as
 154  * expired.  Rehashes items which are now in the wrong hash bucket.
 155  * Preemptively renew recently-accessed items.  Only removes things
 156  * from the first and second bucket (as long as secs < 128), and
 157  * inserts things into other, later buckets.  either need to advance
 158  * to the second bucket if secs spans two intervals, or else be
 159  * certain to call afs_CheckCallbacks immediately after calling
 160  * BumpBase (allows a little more slop but it's ok because file server
 161  * keeps 3 minutes of slop time)
 162  *
 163  * There is a little race between CheckCallbacks and any code which
 164  * updates cbExpires, always just prior to calling QueueCallback. We
 165  * don't lock the vcache struct here (can't, or we'd risk deadlock),
 166  * so GetVCache (for example) may update cbExpires before or after #1
 167  * below.  If before, CheckCallbacks moves this entry to its proper
  168  * slot.  If after, GetVCache blocks in the call to QueueCallback,
  169  * this code dequeues the vcache, and then QueueCallback re-enqueues it.
 170  *
 171  * XXX to avoid the race, make QueueCallback take the "real" time
 172  * and update cbExpires under the xcbhash lock.
 173  *
 174  * NB #1: There's a little optimization here: if I go to invalidate a
 175  * RO vcache or volume, first check to see if the server is down.  If
 176  * it _is_, don't invalidate it, cuz we might just as well keep using
 177  * it.  Possibly, we could do the same thing for items in RW volumes,
 178  * but that bears some drinking about.
 179  *
 180  * Don't really need to invalidate the hints, we could just wait to see if
 181  * the dv has changed after a subsequent FetchStatus, but this is safer.
 182  */
 183
 184 /* Sanity check on the callback queue. Allow for slop in the computation. */
 185 #ifdef AFS_OSF_ENV
 186 #define CBQ_LIMIT (afs_maxvcount + 10)
 187 #else
 188 #define CBQ_LIMIT (afs_cacheStats + afs_stats_cmperf.vcacheXAllocs + 10)
 189 #endif
 190
 191 void
 192 afs_CheckCallbacks(unsigned int secs)
 193 {
 194     struct vcache *tvc;
 195     register struct afs_q *tq;
 196     struct afs_q *uq;
 197     afs_uint32 now;
 198     struct volume *tvp;
 199     register int safety;
 200
 201     ObtainWriteLock(&afs_xcbhash, 85);  /* pretty likely I'm going to remove something */
 202     now = osi_Time();
 203     for (safety = 0, tq = cbHashT[base].head.prev;
 204          (safety <= CBQ_LIMIT) && (tq != &(cbHashT[base].head));
 205          tq = uq, safety++) {
 206
 207         uq = QPrev(tq);
 208         tvc = CBQTOV(tq);
 209         if (tvc->cbExpires < now + secs) {      /* race #1 here */
 210             /* Get the volume, and if its callback expiration time is more than secs
 211              * seconds into the future, update this vcache entry and requeue it below
 212              */
 213             if ((tvc->states & CRO)
 214                 && (tvp = afs_FindVolume(&(tvc->fid), READ_LOCK))) {
 215                 if (tvp->expireTime > now + secs) {
 216                     tvc->cbExpires = tvp->expireTime;   /* XXX race here */
 217                 } else {
 218                     int i;
 219                     for (i = 0; i < MAXHOSTS && tvp->serverHost[i]; i++) {
 220                         if (!(tvp->serverHost[i]->flags & SRVR_ISDOWN)) {
 221                             /* What about locking xvcache or vrefcount++ or
 222                              * write locking tvc? */
 223                             QRemove(tq);
 224                             tq->prev = tq->next = NULL;
 225                             tvc->states &= ~(CStatd | CMValid | CUnique);
 226                             if ((tvc->fid.Fid.Vnode & 1)
 227                                 || (vType(tvc) == VDIR))
 228                                 osi_dnlc_purgedp(tvc);
 229                             tvc->quick.stamp = 0;
 230                             tvc->h1.dchint = NULL;      /*invalidate em */
 231                             afs_ResetVolumeInfo(tvp);
 232                             break;
 233                         }
 234                     }
 235                 }
 236                 afs_PutVolume(tvp, READ_LOCK);
 237             } else {
 238                 /* Do I need to worry about things like execsorwriters?
 239                  * What about locking xvcache or vrefcount++ or write locking tvc?
 240                  */
 241                 QRemove(tq);
 242                 tq->prev = tq->next = NULL;
 243                 tvc->states &= ~(CStatd | CMValid | CUnique);
 244                 if ((tvc->fid.Fid.Vnode & 1) || (vType(tvc) == VDIR))
 245                     osi_dnlc_purgedp(tvc);
 246             }
 247         }
 248
 249         if ((tvc->cbExpires > basetime) && CBHash(tvc->cbExpires - basetime)) {
 250             /* it's been renewed on us.  Have to be careful not to put it back
 251              * into this slot, or we may never get out of here.
 252              */
 253             int slot;
 254             slot = (CBHash(tvc->cbExpires - basetime) + base) % CBHTSIZE;
 255             if (slot != base) {
 256                 if (QPrev(tq))
 257                     QRemove(&(tvc->callsort));
 258                 QAdd(&(cbHashT[slot].head), &(tvc->callsort));
 259                 /* XXX remember to update volume expiration time */
 260                 /* -- not needed for correctness, though */
 261             }
 262         }
 263     }
 264
 265     if (safety > CBQ_LIMIT) {
 266         afs_stats_cmperf.cbloops++;
 267         if (afs_paniconwarn)
 268             osi_Panic("CheckCallbacks");
 269
 270         afs_warn
 271             ("AFS Internal Error (minor): please contact AFS Product Support.\n");
 272         ReleaseWriteLock(&afs_xcbhash);
 273         afs_FlushCBs();
 274         return;
 275     } else
 276         ReleaseWriteLock(&afs_xcbhash);
 277
 278
 279 /* XXX future optimization:
 280    if this item has been recently accessed, queue up a stat for it.
 281    {
 282    struct dcache * adc;
 283
 284    ObtainReadLock(&afs_xdcache);
 285    if ((adc = tvc->quick.dc) && (adc->stamp == tvc->quick.stamp)
 286    && (afs_indexTimes[adc->index] > afs_indexCounter - 20)) {
 287    queue up the stat request
 288    }
 289    ReleaseReadLock(&afs_xdcache);
 290    }
 291    */
 292
 293     return;
 294 }                               /* afs_CheckCallback */
 295
 296 /* afs_FlushCBs
 297  * to be used only in dire circumstances, this drops all callbacks on
 298  * the floor, without giving them back to the server.  It's ok, the server can
 299  * deal with it, but it is a little bit rude.
 300  */
 301 void
 302 afs_FlushCBs(void)
 303 {
 304     register int i;
 305     register struct vcache *tvc;
 306
 307     ObtainWriteLock(&afs_xcbhash, 86);  /* pretty likely I'm going to remove something */
 308
 309     for (i = 0; i < VCSIZE; i++)        /* reset all the vnodes */
 310         for (tvc = afs_vhashT[i]; tvc; tvc = tvc->hnext) {
 311             tvc->callback = 0;
 312             tvc->quick.stamp = 0;
 313             tvc->h1.dchint = NULL;      /* invalidate hints */
 314             tvc->states &= ~(CStatd);
 315             if ((tvc->fid.Fid.Vnode & 1) || (vType(tvc) == VDIR))
 316                 osi_dnlc_purgedp(tvc);
 317             tvc->callsort.prev = tvc->callsort.next = NULL;
 318         }
 319
 320     afs_InitCBQueue(0);
 321
 322     ReleaseWriteLock(&afs_xcbhash);
 323 }
 324
 325 /* afs_FlushServerCBs
 326  * to be used only in dire circumstances, this drops all callbacks on
 327  * the floor for a specific server, without giving them back to the server.
 328  * It's ok, the server can deal with it, but it is a little bit rude.
 329  */
 330 void
 331 afs_FlushServerCBs(struct server *srvp)
 332 {
 333     register int i;
 334     register struct vcache *tvc;
 335
 336     ObtainWriteLock(&afs_xcbhash, 86);  /* pretty likely I'm going to remove something */
 337
 338     for (i = 0; i < VCSIZE; i++) {      /* reset all the vnodes */
 339         for (tvc = afs_vhashT[i]; tvc; tvc = tvc->hnext) {
 340             if (tvc->callback == srvp) {
 341                 tvc->callback = 0;
 342                 tvc->quick.stamp = 0;
 343                 tvc->h1.dchint = NULL;  /* invalidate hints */
 344                 tvc->states &= ~(CStatd);
 345                 if ((tvc->fid.Fid.Vnode & 1) || (vType(tvc) == VDIR)) {
 346                     osi_dnlc_purgedp(tvc);
 347                 }
 348                 afs_DequeueCallback(tvc);
 349             }
 350         }
 351     }
 352
 353     ReleaseWriteLock(&afs_xcbhash);
 354 }
 355
 356 /* afs_InitCBQueue
 357  *  called to initialize static and global variables associated with
 358  *  the Callback expiration management mechanism.
 359  */
 360 void
 361 afs_InitCBQueue(int doLockInit)
 362 {
 363     register int i;
 364
 365     memset((char *)cbHashT, 0, CBHTSIZE * sizeof(struct bucket));
 366     for (i = 0; i < CBHTSIZE; i++) {
 367         QInit(&(cbHashT[i].head));
 368         /* Lock_Init(&(cbHashT[i].lock)); only if you want lots of locks, which
 369          * don't seem too useful at present.  */
 370     }
 371     base = 0;
 372     basetime = osi_Time();
 373     if (doLockInit)
 374         Lock_Init(&afs_xcbhash);
 375 }
 376
 377 /* Because there are no real-time guarantees, and especially because a
 378  * thread may wait on a lock indefinitely, this routine has to be
 379  * careful that it doesn't get permanently out-of-date.  Important
 380  * assumption: this routine is only called from afs_Daemon, so there
 381  * can't be more than one instance of this running at any one time.
 382  * Presumes that basetime is never 0, and is always sane.
 383  *
 384  * Before calling this routine, be sure that the first slot is pretty
 385  * empty.  This -20 is because the granularity of the checks in
 386  * afs_Daemon is pretty large, so I'd rather err on the side of safety
 387  * sometimes.  The fact that I only bump basetime by CBHTSLOTLEN-1
 388  * instead of the whole CBHTSLOTLEN is also for "safety".
 389  * Conceptually, it makes this clock run just a little faster than the
 390  * clock governing which slot a callback gets hashed into.  Both of these
 391  * things make CheckCallbacks work a little harder than it would have to
 392  * if I wanted to cut things finer.
 393  * Everything from the old first slot is carried over into the new first
 394  * slot.  Thus, if there were some things that ought to have been invalidated,
 395  * but weren't (say, if the server was down), they will be examined at every
 396  * opportunity thereafter.
 397  */
 398 int
 399 afs_BumpBase(void)
 400 {
 401     afs_uint32 now;
 402     int didbump;
 403     u_int oldbase;
 404
 405     ObtainWriteLock(&afs_xcbhash, 87);
 406     didbump = 0;
 407     now = osi_Time();
 408     while (basetime + (CBHTSLOTLEN - 20) <= now) {
 409         oldbase = base;
 410         basetime += CBHTSLOTLEN - 1;
 411         base = (base + 1) % CBHTSIZE;
 412         didbump++;
 413         if (!QEmpty(&(cbHashT[oldbase].head))) {
 414             QCat(&(cbHashT[oldbase].head), &(cbHashT[base].head));
 415         }
 416     }
 417     ReleaseWriteLock(&afs_xcbhash);
 418
 419     return didbump;
 420 }