2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
15 Institution: The Information Technology Center, Carnegie-Mellon University
18 #include <afsconfig.h>
19 #include <afs/param.h>
25 #ifdef HAVE_SYS_FILE_H
30 #include "rx/rx_queue.h"
31 #include <afs/afsint.h>
33 #include <afs/errors.h>
36 #include <afs/afssyscalls.h>
40 #include "volume_inline.h"
41 #include "vnode_inline.h"
42 #include "partition.h"
/* Per-class vnode cache bookkeeping; other code in this file indexes it by
 * vnode class (e.g. &VnodeClassInfo[class]). */
49 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
/* Forward declaration of the variadic debug-log helper defined below. */
51 void VNLog(afs_int32 aop, afs_int32 anparms, ... );
/* Sentinel byte count: VnLoad and VnStore compare the result of their index
 * read/write against this value when deciding how to report a failure. */
58 #define BAD_IGET -1000
60 /* There are two separate vnode queue types defined here:
61 * Each hash conflict chain -- is singly linked, with a single head
62 * pointer. New entries are added at the beginning. Old
63 * entries are removed by linear search (which generally
64 * only occurs after a disk read).
65 * LRU chain -- is doubly linked, single head pointer.
66 * Entries are added at the head, reclaimed from the tail,
67 * or removed from anywhere in the queue.
71 /* Vnode hash table. Find hash chain by taking lower bits of
72 * (volume_hash_offset + vnode).
73 * This distributes the root inodes of the volumes over the
74 * hash table entries and also distributes the vnodes of
75 * volumes reasonably fairly. The volume_hash_offset field
76 * for each volume is established as the volume comes on line
77 * by using the VOLUME_HASH_OFFSET macro. This distributes the
78 * volumes fairly among the cache entries, both when servicing
79 * a small number of volumes and when servicing a large number.
82 /* logging stuff for finding bugs */
/* Fixed-size in-memory trace buffer filled by VNLog(); vnLogPtr is the index
 * of the next slot to be written. */
83 #define THELOGSIZE 5120
84 static afs_int32 theLog[THELOGSIZE];
85 static afs_int32 vnLogPtr = 0;
/*
 * VNLog -- append one trace record to theLog.
 *
 * aop      operation code; stored in the bits above bit 15 of the first word
 * anparms  number of variadic arguments that follow; OR-ed into the low bits
 *          of the same word
 * ...      the values to record, each fetched with va_arg(ap, afs_int32)
 *
 * vnLogPtr is compared against THELOGSIZE after every store.
 * NOTE(review): the statements guarded by the visible `if` lines are not part
 * of this excerpt; presumably anparms is clamped to 4 and vnLogPtr wraps back
 * to 0 -- confirm against the full source.
 * NOTE(review): callers in this file pass (intptr_t) pointer values, yet every
 * argument is read back as afs_int32; where intptr_t is wider than afs_int32
 * the fetched type does not match the passed type.
 */
87 VNLog(afs_int32 aop, afs_int32 anparms, ... )
92 va_start(ap, anparms);
95 anparms = 4; /* do bounds checking */
97 temp = (aop << 16) | anparms;
98 theLog[vnLogPtr++] = temp;
99 if (vnLogPtr >= THELOGSIZE)
101 for (temp = 0; temp < anparms; temp++) {
102 theLog[vnLogPtr++] = va_arg(ap, afs_int32);
103 if (vnLogPtr >= THELOGSIZE)
109 /* VolumeHashOffset -- returns a new value to be stored in the
110 * volumeHashOffset of a Volume structure. Called when a
111 * volume is initialized. Sets the volumeHashOffset so that
112 * vnode cache entries are distributed reasonably between
113 * volumes (the root vnodes of the volumes will hash to
114 * different values, and spacing is maintained between volumes
115 * when there are not many volumes represented), and spread
116 * equally amongst vnodes within a single volume.
/*
 * VolumeHashOffset_r -- derive the next volume hash offset from a static
 * counter.  The low hashShift bits of the counter pick an entry of
 * hashindex[]; the remaining high bits are added to that entry.  The counter
 * is then incremented, so successive calls yield different offsets.
 * NOTE(review): hashShift's definition and the return statement are not in
 * this excerpt; hashindex has eight initializers, so hashShift is presumably
 * 3 -- confirm.
 */
119 VolumeHashOffset_r(void)
121 static int nextVolumeHashOffset = 0;
122 /* hashindex Must be power of two in size */
124 # define hashMask ((1<<hashShift)-1)
125 static byte hashindex[1 << hashShift] =
126 { 0, 128, 64, 192, 32, 160, 96, 224 };
128 offset = hashindex[nextVolumeHashOffset & hashMask]
129 + (nextVolumeHashOffset >> hashShift);
130 nextVolumeHashOffset++;
134 /* Change hashindex (above) if you change this constant */
/* Number of buckets in the vnode hash table.  VNODE_HASH masks with
 * (size - 1), which only selects a valid bucket when the size is a power of
 * two. */
135 #define VNODE_HASH_TABLE_SIZE 256
/* Bucket heads; entries in a bucket are chained through Vnode.hashNext. */
136 private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
/* Bucket index for (volume, vnode number): the volume's vnodeHashOffset plus
 * the vnode number, masked to the table size.
 * NOTE(review): the macro arguments are not parenthesized in the expansion,
 * so callers should pass simple expressions only. */
137 #define VNODE_HASH(volumeptr,vnodenumber)\
138 ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
142 * add a vnode to the volume's vnode list.
144 * @param[in] vp volume object pointer
145 * @param[in] vnp vnode object pointer
147 * @note for DAFS, it may seem like we should be acquiring a lightweight ref
148 * on vp, but this would actually break things. Right now, this is ok
149 * because we destroy all vnode cache contents during volume
154 * @internal volume package internal use only
/*
 * AddToVVnList -- attach vnp to volume vp's vnode list.
 * Copies vp->cacheCheck into the vnode, appends the vnode to vp->vnode_list
 * and sets the VN_ON_VVN state flag.
 * NOTE(review): the statement guarded by the queue_IsOnQueue() test is not in
 * this excerpt; presumably it returns early when the vnode is already queued.
 */
157 AddToVVnList(Volume * vp, Vnode * vnp)
159 if (queue_IsOnQueue(vnp))
163 Vn_cacheCheck(vnp) = vp->cacheCheck;
164 queue_Append(&vp->vnode_list, vnp);
165 Vn_stateFlags(vnp) |= VN_ON_VVN;
169 * delete a vnode from the volume's vnode list.
173 * @internal volume package internal use only
/*
 * DeleteFromVVnList -- detach vnp from its volume's vnode list.
 * Clears the vnode's volume pointer first, then (after the queue_IsOnQueue()
 * test) clears the VN_ON_VVN state flag.
 * NOTE(review): the statement guarded by the test and the actual queue
 * removal are not in this excerpt.
 */
176 DeleteFromVVnList(Vnode * vnp)
178 Vn_volume(vnp) = NULL;
180 if (!queue_IsOnQueue(vnp))
184 Vn_stateFlags(vnp) &= ~(VN_ON_VVN);
188 * add a vnode to the end of the lru.
190 * @param[in] vcp vnode class info object pointer
191 * @param[in] vnp vnode object pointer
193 * @internal vnode package internal use only
/*
 * AddToVnLRU -- link vnp into the circular, doubly linked LRU list of vcp.
 * The vnode is spliced in between vcp->lruHead->lruPrev and vcp->lruHead,
 * and VN_ON_LRU is set afterwards.  Aborts when vcp->lruHead is NULL, i.e.
 * this routine cannot insert into an empty list.
 * NOTE(review): the body of the VN_ON_LRU test and the condition guarding the
 * final lruHead assignment are not in this excerpt; per the comment in the
 * body that assignment applies to a vnode that was just deleted.
 */
196 AddToVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
198 if (Vn_stateFlags(vnp) & VN_ON_LRU) {
202 /* Add it to the circular LRU list */
203 if (vcp->lruHead == NULL)
204 Abort("VPutVnode: vcp->lruHead==NULL");
206 vnp->lruNext = vcp->lruHead;
207 vnp->lruPrev = vcp->lruHead->lruPrev;
208 vcp->lruHead->lruPrev = vnp;
209 vnp->lruPrev->lruNext = vnp;
213 /* If the vnode was just deleted, put it at the end of the chain so it
214 * will be reused immediately */
216 vcp->lruHead = vnp->lruNext;
218 Vn_stateFlags(vnp) |= VN_ON_LRU;
222 * delete a vnode from the lru.
224 * @param[in] vcp vnode class info object pointer
225 * @param[in] vnp vnode object pointer
227 * @internal vnode package internal use only
/*
 * DeleteFromVnLRU -- unlink vnp from the circular LRU list of vcp.
 * When vnp is the current head, the head is advanced to the next node first.
 * If vnp is still the head after that (it was the only node) or the head is
 * NULL, the list is considered corrupt and the process aborts.  Otherwise the
 * neighbours are linked to each other and VN_ON_LRU is cleared.
 * NOTE(review): the body of the initial VN_ON_LRU test is not in this
 * excerpt; presumably it returns when the vnode is not on the list.
 */
230 DeleteFromVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
232 if (!(Vn_stateFlags(vnp) & VN_ON_LRU)) {
236 if (vnp == vcp->lruHead)
237 vcp->lruHead = vcp->lruHead->lruNext;
239 if ((vnp == vcp->lruHead) ||
240 (vcp->lruHead == NULL))
241 Abort("DeleteFromVnLRU: lru chain addled!\n");
243 vnp->lruPrev->lruNext = vnp->lruNext;
244 vnp->lruNext->lruPrev = vnp->lruPrev;
246 Vn_stateFlags(vnp) &= ~(VN_ON_LRU);
250 * add a vnode to the vnode hash table.
252 * @param[in] vnp vnode object pointer
256 * @post vnode on hash
258 * @internal vnode package internal use only
/*
 * AddToVnHash -- insert vnp into VnodeHashTable unless VN_ON_HASH is already
 * set.  The bucket is computed with VNODE_HASH from the vnode's volume and
 * id; the vnode is pushed at the front of that bucket's chain, the bucket
 * index is remembered in vnp->hashIndex (DeleteFromVnHash relies on it), and
 * VN_ON_HASH is set.
 */
261 AddToVnHash(Vnode * vnp)
263 unsigned int newHash;
265 if (!(Vn_stateFlags(vnp) & VN_ON_HASH)) {
266 newHash = VNODE_HASH(Vn_volume(vnp), Vn_id(vnp));
267 vnp->hashNext = VnodeHashTable[newHash];
268 VnodeHashTable[newHash] = vnp;
269 vnp->hashIndex = newHash;
271 Vn_stateFlags(vnp) |= VN_ON_HASH;
276 * delete a vnode from the vnode hash table.
283 * @post vnode removed from hash
285 * @internal vnode package internal use only
/*
 * DeleteFromVnHash -- remove vnp from VnodeHashTable when VN_ON_HASH is set.
 * The bucket is taken from vnp->hashIndex.  Two unlink forms are visible:
 * replacing the bucket head with vnp->hashNext, and walking the chain to the
 * node whose hashNext is vnp and splicing vnp out.  Afterwards hashNext is
 * reset to NULL and VN_ON_HASH is cleared.
 * NOTE(review): the conditions choosing between the two unlink forms, and any
 * NULL check after the walk, are not in this excerpt.
 */
288 DeleteFromVnHash(Vnode * vnp)
292 if (Vn_stateFlags(vnp) & VN_ON_HASH) {
293 tvnp = VnodeHashTable[vnp->hashIndex];
295 VnodeHashTable[vnp->hashIndex] = vnp->hashNext;
297 while (tvnp && tvnp->hashNext != vnp)
298 tvnp = tvnp->hashNext;
300 tvnp->hashNext = vnp->hashNext;
303 vnp->hashNext = NULL;
305 Vn_stateFlags(vnp) &= ~(VN_ON_HASH);
311 * invalidate a vnode cache entry.
313 * @param[in] avnode vnode object pointer
317 * @post vnode metadata invalidated.
318 * vnode removed from hash table.
319 * DAFS: vnode state set to VN_STATE_INVALID.
321 * @internal vnode package internal use only
/*
 * VInvalidateVnode_r -- make a cached vnode unusable for future lookups.
 * Clears both change flags and the delete flag so nothing is written back,
 * zeroes cacheCheck (VLookupVnode compares it with the volume's cacheCheck),
 * removes the vnode from the hash table and, when built with
 * AFS_DEMAND_ATTACH_FS, moves it to VN_STATE_INVALID.
 */
324 VInvalidateVnode_r(struct Vnode *avnode)
326 avnode->changed_newTime = 0; /* don't let it get flushed out again */
327 avnode->changed_oldTime = 0;
328 avnode->delete = 0; /* it isn't deleted, really */
329 avnode->cacheCheck = 0; /* invalid: prevents future vnode searches from working */
330 DeleteFromVnHash(avnode);
331 #ifdef AFS_DEMAND_ATTACH_FS
332 VnChangeState_r(avnode, VN_STATE_INVALID);
338 * initialize vnode cache for a given vnode class.
340 * @param[in] class vnode class
341 * @param[in] nVnodes size of cache
343 * @post vnode cache allocated and initialized
345 * @internal volume package internal use only
347 * @note generally called by VInitVolumePackage_r
349 * @see VInitVolumePackage_r
/*
 * VInitVnodes -- set up the vnode cache for one vnode class.
 * Resets the class counters, records nVnodes as the cache size and fills in
 * the resident size, disk size and magic number from either the SMALL or the
 * LARGE constants.  It then allocates nVnodes zeroed entries of residentSize
 * bytes in one calloc() call (asserting on failure) and initializes each
 * entry: refcount 0, VN_ON_LRU set, state/lock initialized according to the
 * build flavour, no volume, no writer, and linked into the class's circular
 * LRU list.  `va` is advanced by residentSize to reach the next entry.
 * NOTE(review): the branch selecting small vs. large, the loop header, and
 * the code using `s` are not in this excerpt.
 */
352 VInitVnodes(VnodeClass class, int nVnodes)
355 struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
357 vcp->allocs = vcp->gets = vcp->reads = vcp->writes = 0;
358 vcp->cacheSize = nVnodes;
361 osi_Assert(CHECKSIZE_SMALLVNODE);
363 vcp->residentSize = SIZEOF_SMALLVNODE;
364 vcp->diskSize = SIZEOF_SMALLDISKVNODE;
365 vcp->magic = SMALLVNODEMAGIC;
369 vcp->residentSize = SIZEOF_LARGEVNODE;
370 vcp->diskSize = SIZEOF_LARGEDISKVNODE;
371 vcp->magic = LARGEVNODEMAGIC;
375 int s = vcp->diskSize - 1;
385 va = (byte *) calloc(nVnodes, vcp->residentSize);
386 osi_Assert(va != NULL);
388 Vnode *vnp = (Vnode *) va;
389 Vn_refcount(vnp) = 0; /* no context switches */
390 Vn_stateFlags(vnp) |= VN_ON_LRU;
391 #ifdef AFS_DEMAND_ATTACH_FS
392 CV_INIT(&Vn_stateCV(vnp), "vnode state", CV_DEFAULT, 0);
393 Vn_state(vnp) = VN_STATE_INVALID;
395 #else /* !AFS_DEMAND_ATTACH_FS */
396 Lock_Init(&vnp->lock);
397 #endif /* !AFS_DEMAND_ATTACH_FS */
398 vnp->changed_oldTime = 0;
399 vnp->changed_newTime = 0;
400 Vn_volume(vnp) = NULL;
401 Vn_cacheCheck(vnp) = 0;
402 vnp->delete = Vn_id(vnp) = 0;
403 #ifdef AFS_PTHREAD_ENV
404 vnp->writer = (pthread_t) 0;
405 #else /* AFS_PTHREAD_ENV */
406 vnp->writer = (PROCESS) 0;
407 #endif /* AFS_PTHREAD_ENV */
411 if (vcp->lruHead == NULL)
412 vcp->lruHead = vnp->lruNext = vnp->lruPrev = vnp;
414 vnp->lruNext = vcp->lruHead;
415 vnp->lruPrev = vcp->lruHead->lruPrev;
416 vcp->lruHead->lruPrev = vnp;
417 vnp->lruPrev->lruNext = vnp;
420 va += vcp->residentSize;
427 * allocate an unused vnode from the lru chain.
429 * @param[in] vcp vnode class info object pointer
430 * @param[in] vp volume pointer
431 * @param[in] vnodeNumber new vnode number that the vnode will be used for
433 * @pre VOL_LOCK is held
435 * @post vnode object is removed from lru
436 * vnode is disassociated with its old volume, and associated with its
438 * vnode is removed from its old vnode hash table, and for DAFS, it is
439 * added to its new hash table
440 * state is set to VN_STATE_INVALID.
441 * inode handle is released.
442 * a reservation is held on the vnode object
444 * @note we traverse backwards along the lru circlist. It shouldn't
445 * be necessary to specify that nUsers == 0 since if it is in the list,
446 * nUsers should be 0. Things shouldn't be in lruq unless no one is
449 * @warning DAFS: VOL_LOCK is dropped while doing inode handle release
451 * @warning for non-DAFS, the vnode is _not_ hashed on the vnode hash table;
452 * non-DAFS must hash the vnode itself after loading data
454 * @return vnode object pointer
/*
 * VGetFreeVnode_r -- recycle the vnode at the tail of the class LRU list
 * (vcp->lruHead->lruPrev) for use as vnodeNumber in volume vp.
 * Visible sequence: abort if the candidate is referenced, locked or in use;
 * log it; remove it from the LRU, the hash table and (if it has one) its old
 * volume's vnode list; assign the new id, take a reservation and attach it to
 * vp; release its inode handle; finally, on DAFS builds, set
 * VN_STATE_INVALID.
 * NOTE(review): the statements inside several AFS_DEMAND_ATTACH_FS sections
 * (around the re-hash and around IH_RELEASE) and the return statement are not
 * in this excerpt.
 */
457 VGetFreeVnode_r(struct VnodeClassInfo * vcp, struct Volume *vp,
462 vnp = vcp->lruHead->lruPrev;
463 #ifdef AFS_DEMAND_ATTACH_FS
464 if (Vn_refcount(vnp) != 0 || VnIsExclusiveState(Vn_state(vnp)) ||
465 Vn_readers(vnp) != 0)
466 Abort("VGetFreeVnode_r: in-use vnode in lruq");
468 if (Vn_refcount(vnp) != 0 || CheckLock(&vnp->lock))
469 Abort("VGetFreeVnode_r: locked vnode in lruq");
471 VNLog(1, 2, Vn_id(vnp), (intptr_t)vnp, 0, 0);
474 * it's going to be overwritten soon enough.
475 * remove from LRU, delete hash entry, and
476 * disassociate from old parent volume before
477 * we have a chance to drop the vol glock
479 DeleteFromVnLRU(vcp, vnp);
480 DeleteFromVnHash(vnp);
481 if (Vn_volume(vnp)) {
482 DeleteFromVVnList(vnp);
485 /* we must re-hash the vnp _before_ we drop the glock again; otherwise,
486 * someone else might try to grab the same vnode id, and we'll both alloc
487 * a vnode object for the same vn id, bypassing vnode locking */
488 Vn_id(vnp) = vnodeNumber;
489 VnCreateReservation_r(vnp);
490 AddToVVnList(vp, vnp);
491 #ifdef AFS_DEMAND_ATTACH_FS
495 /* drop the file descriptor */
497 #ifdef AFS_DEMAND_ATTACH_FS
498 VnChangeState_r(vnp, VN_STATE_RELEASING);
501 /* release is, potentially, a highly latent operation due to a couple
503 * - ihandle package lock contention
504 * - closing file descriptor(s) associated with ih
506 * Hance, we perform outside of the volume package lock in order to
507 * reduce the probability of contention.
509 IH_RELEASE(vnp->handle);
510 #ifdef AFS_DEMAND_ATTACH_FS
515 #ifdef AFS_DEMAND_ATTACH_FS
516 VnChangeState_r(vnp, VN_STATE_INVALID);
524 * lookup a vnode in the vnode cache hash table.
526 * @param[in] vp pointer to volume object
527 * @param[in] vnodeId vnode id
531 * @post matching vnode object or NULL is returned
533 * @return vnode object pointer
534 * @retval NULL no matching vnode object was found in the cache
536 * @internal vnode package internal use only
538 * @note this symbol is exported strictly for fssync debug protocol use
/*
 * VLookupVnode -- search the vnode hash table for (vp, vnodeId).
 * Walks the chain of the bucket chosen by VNODE_HASH and keeps advancing
 * while any of these differ: the vnode id, the owning volume pointer, or the
 * cacheCheck value shared between volume and vnode.  The loop has an empty
 * body; vnp is left on the first matching entry.
 * NOTE(review): the loop's leading termination test and the return statement
 * are not in this excerpt.
 */
541 VLookupVnode(Volume * vp, VnodeId vnodeId)
544 unsigned int newHash;
546 newHash = VNODE_HASH(vp, vnodeId);
547 for (vnp = VnodeHashTable[newHash];
549 ((Vn_id(vnp) != vnodeId) ||
550 (Vn_volume(vnp) != vp) ||
551 (vp->cacheCheck != Vn_cacheCheck(vnp))));
552 vnp = vnp->hashNext);
/*
 * VAllocVnode -- wrapper that forwards its arguments unchanged to
 * VAllocVnode_r and keeps the result in retVal.
 * NOTE(review): VAllocVnode_r documents "VOL_LOCK held" as a precondition;
 * presumably this wrapper acquires and releases that lock around the call,
 * but those lines are not in this excerpt.
 */
559 VAllocVnode(Error * ec, Volume * vp, VnodeType type)
563 retVal = VAllocVnode_r(ec, vp, type);
569 * allocate a new vnode.
571 * @param[out] ec error code return
572 * @param[in] vp volume object pointer
573 * @param[in] type desired vnode type
575 * @return vnode object pointer
577 * @pre VOL_LOCK held;
578 * heavyweight ref held on vp
580 * @post vnode allocated and returned
/*
 * VAllocVnode_r -- allocate a new vnode of the given type in volume vp.
 *
 * Visible steps, in order:
 *  1. DAFS: wait for the volume to leave any exclusive state and stop if it
 *     is in an error state.
 *  2. Refuse when running as the file server on a volume not in use
 *     (reporting vp->specialStatus when set) or when the volume is not
 *     writeable (VREADONLY).
 *  3. Take the next uniquifier from vp->nextVnodeUnique; when the counter has
 *     passed V_uniquifier(vp) the volume is updated via VUpdateVolume_r.
 *     On the file server the volume is also put on the update list.
 *  4. Reserve a bit in the class bitmap and convert it to a vnode number.
 *  5. If VLookupVnode finds a cached Vnode for that number, reserve and
 *     write-lock it, re-check the volume/cacheCheck state, and verify that
 *     its disk.type is vNull; a non-null type is logged as an addled bitmap,
 *     the bitmap entry is freed, the vnode invalidated and the volume sent
 *     for salvage (DAFS) or forced offline.
 *  6. Otherwise take a Vnode from the LRU with VGetFreeVnode_r, write-lock
 *     it, and inspect the on-disk index: when the slot lies inside the file
 *     it is read and must be vNull; when it lies beyond the end, the file is
 *     grown by writing a 16 KiB zero-filled buffer at the slot offset.
 *  7. On success zero vnp->disk, clear the change flags, set magic, type and
 *     uniquifier, bump the volume's filecount and (DAFS) enter
 *     VN_STATE_EXCLUSIVE.
 *
 * NOTE(review): the two consecutive `unique = vp->nextVnodeUnique++` lines
 * are separated by a guard that is not in this excerpt (presumably a retry
 * when the value is 0).  Many return statements, labels and braces are
 * likewise missing here.
 * NOTE(review): the index-error path frees the bitmap entry with flags 0,
 * while the cached-vnode sanity-check path passes VOL_FREE_BITMAP_WAIT;
 * confirm the difference is intentional.
 */
583 VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
588 struct VnodeClassInfo *vcp;
591 #ifdef AFS_DEMAND_ATTACH_FS
592 VolState vol_state_save;
597 #ifdef AFS_DEMAND_ATTACH_FS
599 * once a volume has entered an error state, don't permit
600 * further operations to proceed
601 * -- tkeiser 11/21/2007
603 VWaitExclusiveState_r(vp);
604 if (VIsErrorState(V_attachState(vp))) {
605 /* XXX is VSALVAGING acceptable here? */
611 if (programType == fileServer && !V_inUse(vp)) {
612 if (vp->specialStatus) {
613 *ec = vp->specialStatus;
619 class = vnodeTypeToClass(type);
620 vcp = &VnodeClassInfo[class];
622 if (!VolumeWriteable(vp)) {
623 *ec = (bit32) VREADONLY;
627 unique = vp->nextVnodeUnique++;
629 unique = vp->nextVnodeUnique++;
631 if (vp->nextVnodeUnique > V_uniquifier(vp)) {
632 VUpdateVolume_r(ec, vp, 0);
637 if (programType == fileServer) {
638 VAddToVolumeUpdateList_r(ec, vp);
643 /* Find a slot in the bit map */
644 bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
645 VOL_ALLOC_BITMAP_WAIT);
648 vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
652 * at this point we should be assured that V_attachState(vp) is non-exclusive
656 VNLog(2, 1, vnodeNumber, 0, 0, 0);
657 /* Prepare to move it to the new hash chain */
658 vnp = VLookupVnode(vp, vnodeNumber);
660 /* slot already exists. May even not be in lruq (consider store file locking a file being deleted)
661 * so we may have to wait for it below */
662 VNLog(3, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
664 VnCreateReservation_r(vnp);
665 if (Vn_refcount(vnp) == 1) {
666 /* we're the only user */
667 /* This won't block */
668 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
670 #ifdef AFS_DEMAND_ATTACH_FS
673 * vnode was cached, wait for any existing exclusive ops to finish.
674 * once we have reacquired the lock, re-verify volume state.
676 * note: any vnode error state is related to the old vnode; disregard.
678 VnWaitQuiescent_r(vnp);
679 if (VIsErrorState(V_attachState(vp))) {
680 VnUnlock(vnp, WRITE_LOCK);
681 VnCancelReservation_r(vnp);
687 /* other users present; follow locking hierarchy */
688 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, MIGHT_DEADLOCK);
691 * verify state of the world hasn't changed
693 * (technically, this should never happen because cachecheck
694 * is only updated during a volume attach, which should not
695 * happen when refs are held)
697 if (Vn_volume(vnp)->cacheCheck != Vn_cacheCheck(vnp)) {
698 VnUnlock(vnp, WRITE_LOCK);
699 VnCancelReservation_r(vnp);
704 /* sanity check: vnode should be blank if it was deleted. If it's
705 * not blank, it is still in use somewhere; but the bitmap told us
706 * this vnode number was free, so something is wrong. */
707 if (vnp->disk.type != vNull) {
709 Log("VAllocVnode: addled bitmap or vnode object! (vol %ld, "
710 "vnode %p, number %ld, type %ld)\n", (long)vp->hashid, vnp,
711 (long)Vn_id(vnp), (long)vnp->disk.type);
713 VFreeBitMapEntry_r(&tmp, vp, &vp->vnodeIndex[class], bitNumber,
714 VOL_FREE_BITMAP_WAIT);
715 VInvalidateVnode_r(vnp);
716 VnUnlock(vnp, WRITE_LOCK);
717 VnCancelReservation_r(vnp);
718 #ifdef AFS_DEMAND_ATTACH_FS
719 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
721 VForceOffline_r(vp, 0);
727 /* no such vnode in the cache */
729 vnp = VGetFreeVnode_r(vcp, vp, vnodeNumber);
731 /* This will never block (guaranteed by check in VGetFreeVnode_r() */
732 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
734 #ifdef AFS_DEMAND_ATTACH_FS
735 VnChangeState_r(vnp, VN_STATE_ALLOC);
738 /* Sanity check: is this vnode really not in use? */
741 IHandle_t *ihP = vp->vnodeIndex[class].handle;
743 afs_foff_t off = vnodeIndexOffset(vcp, vnodeNumber);
746 /* XXX we have a potential race here if two threads
747 * allocate new vnodes at the same time, and they
748 * both decide it's time to extend the index
751 #ifdef AFS_DEMAND_ATTACH_FS
753 * this race has been eliminated for the DAFS case
754 * using exclusive state VOL_STATE_VNODE_ALLOC
756 * if this becomes a bottleneck, there are ways to
757 * improve parallelism for this code path
758 * -- tkeiser 11/28/2007
760 VCreateReservation_r(vp);
761 VWaitExclusiveState_r(vp);
762 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_ALLOC);
768 Log("VAllocVnode: can't open index file!\n");
770 goto error_encountered;
772 if ((size = FDH_SIZE(fdP)) < 0) {
773 Log("VAllocVnode: can't stat index file!\n");
775 goto error_encountered;
777 if (off + vcp->diskSize <= size) {
778 if (FDH_PREAD(fdP, &vnp->disk, vcp->diskSize, off) != vcp->diskSize) {
779 Log("VAllocVnode: can't read index file!\n");
781 goto error_encountered;
783 if (vnp->disk.type != vNull) {
784 Log("VAllocVnode: addled bitmap or index!\n");
786 goto error_encountered;
789 /* growing file - grow in a reasonable increment */
790 char *buf = (char *)malloc(16 * 1024);
792 Log("VAllocVnode: can't grow vnode index: out of memory\n");
794 goto error_encountered;
796 memset(buf, 0, 16 * 1024);
797 if ((FDH_PWRITE(fdP, buf, 16 * 1024, off)) != 16 * 1024) {
798 Log("VAllocVnode: can't grow vnode index: write failed\n");
801 goto error_encountered;
808 #ifdef AFS_DEMAND_ATTACH_FS
809 VChangeState_r(vp, vol_state_save);
810 VCancelReservation_r(vp);
817 * close the file handle
819 * invalidate the vnode
820 * free up the bitmap entry (although salvager should take care of it)
822 * drop vnode lock and refs
827 VFreeBitMapEntry_r(&tmp, vp, &vp->vnodeIndex[class], bitNumber, 0 /*flags*/);
828 VInvalidateVnode_r(vnp);
829 VnUnlock(vnp, WRITE_LOCK);
830 VnCancelReservation_r(vnp);
831 #ifdef AFS_DEMAND_ATTACH_FS
832 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
833 VCancelReservation_r(vp);
835 VForceOffline_r(vp, 0);
840 VNLog(4, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
841 #ifndef AFS_DEMAND_ATTACH_FS
846 VNLog(5, 1, (intptr_t)vnp, 0, 0, 0);
847 memset(&vnp->disk, 0, sizeof(vnp->disk));
848 vnp->changed_newTime = 0; /* set this bit when vnode is updated */
849 vnp->changed_oldTime = 0; /* set this on CopyOnWrite. */
851 vnp->disk.vnodeMagic = vcp->magic;
852 vnp->disk.type = type;
853 vnp->disk.uniquifier = unique;
856 vp->header->diskstuff.filecount++;
857 #ifdef AFS_DEMAND_ATTACH_FS
858 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
864 * load a vnode from disk.
866 * @param[out] ec client error code return
867 * @param[in] vp volume object pointer
868 * @param[in] vnp vnode object pointer
869 * @param[in] vcp vnode class info object pointer
870 * @param[in] class vnode class enumeration
872 * @pre vnode is registered in appropriate data structures;
873 * caller holds a ref on vnode; VOL_LOCK is held
875 * @post vnode data is loaded from disk.
876 * vnode state is set to VN_STATE_ONLINE.
877 * on failure, vnode is invalidated.
879 * @internal vnode package internal use only
/*
 * VnLoad -- read the on-disk image of vnp from the class index file of vp.
 *
 * Visible behaviour: on DAFS the vnode enters VN_STATE_LOAD; a write lock is
 * taken on the vnode; vcp->diskSize bytes are read into vnp->disk at the
 * offset returned by vnodeIndexOffset().  Read failures are classified by
 * the returned byte count and errno: BAD_IGET (bad inumber), -1 with EIO
 * (disk error, volume needs salvage), or anything else (logged with the byte
 * count and errno).  After a successful read the magic number and type are
 * checked; for a vNull entry the volume bitmap is consulted to tell a request
 * for an unallocated vnode apart from a bad magic number.  On success the
 * inode handle is initialized, the vnode lock is dropped and (DAFS) the state
 * becomes VN_STATE_ONLINE.  The error tail closes the file handle, requests a
 * salvage (DAFS) or forces the volume offline, invalidates the vnode and
 * unlocks it.
 * NOTE(review): the statements assigning *ec, the conditions selecting which
 * part of the error tail runs, and the `error_encountered` label itself are
 * not in this excerpt.
 */
882 VnLoad(Error * ec, Volume * vp, Vnode * vnp,
883 struct VnodeClassInfo * vcp, VnodeClass class)
885 /* vnode not cached */
889 IHandle_t *ihP = vp->vnodeIndex[class].handle;
896 #ifdef AFS_DEMAND_ATTACH_FS
897 VnChangeState_r(vnp, VN_STATE_LOAD);
900 /* This will never block */
901 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
906 Log("VnLoad: can't open index dev=%u, i=%s\n", vp->device,
907 PrintInode(stmp, vp->vnodeIndex[class].handle->ih_ino));
909 goto error_encountered_nolock;
910 } else if ((nBytes = FDH_PREAD(fdP, (char *)&vnp->disk, vcp->diskSize, vnodeIndexOffset(vcp, Vn_id(vnp))))
912 /* Don't take volume off line if the inumber is out of range
913 * or the inode table is full. */
914 if (nBytes == BAD_IGET) {
915 Log("VnLoad: bad inumber %s\n",
916 PrintInode(stmp, vp->vnodeIndex[class].handle->ih_ino));
919 } else if (nBytes == -1 && errno == EIO) {
920 /* disk error; salvage */
921 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
923 /* vnode is not allocated */
925 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); read %d bytes, errno %d\n",
926 Vn_id(vnp), V_id(vp), V_name(vp), (int)nBytes, errno);
930 goto error_encountered_nolock;
935 /* Quick check to see that the data is reasonable */
936 if (vnp->disk.vnodeMagic != vcp->magic || vnp->disk.type == vNull) {
937 if (vnp->disk.type == vNull) {
941 struct vnodeIndex *index = &vp->vnodeIndex[class];
942 unsigned int bitNumber = vnodeIdToBitNumber(Vn_id(vnp));
943 unsigned int offset = bitNumber >> 3;
945 #ifdef AFS_DEMAND_ATTACH_FS
946 /* Make sure the volume bitmap isn't getting updated while we are
948 VWaitExclusiveState_r(vp);
951 /* Test to see if vnode number is valid. */
952 if ((offset >= index->bitmapSize)
953 || ((*(index->bitmap + offset) & (1 << (bitNumber & 0x7)))
955 Log("VnLoad: Request for unallocated vnode %u, volume %u (%s) denied.\n", Vn_id(vnp), V_id(vp), V_name(vp));
959 Log("VnLoad: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
962 goto error_encountered;
965 IH_INIT(vnp->handle, V_device(vp), V_parentId(vp), VN_GET_INO(vnp));
966 VnUnlock(vnp, WRITE_LOCK);
967 #ifdef AFS_DEMAND_ATTACH_FS
968 VnChangeState_r(vnp, VN_STATE_ONLINE);
973 error_encountered_nolock:
975 FDH_REALLYCLOSE(fdP);
981 #ifdef AFS_DEMAND_ATTACH_FS
982 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
984 VForceOffline_r(vp, 0);
991 VInvalidateVnode_r(vnp);
992 VnUnlock(vnp, WRITE_LOCK);
996 * store a vnode to disk.
998 * @param[out] ec error code output
999 * @param[in] vp volume object pointer
1000 * @param[in] vnp vnode object pointer
1001 * @param[in] vcp vnode class info object pointer
1002 * @param[in] class vnode class enumeration
1004 * @pre VOL_LOCK held.
1005 * caller holds refs to volume and vnode.
1006 * DAFS: caller is responsible for performing state sanity checks.
1008 * @post vnode state is stored to disk.
1010 * @internal vnode package internal use only
/*
 * VnStore -- write vnp->disk back to the class index file of vp.
 *
 * Visible behaviour: on DAFS the previous vnode state is saved and the vnode
 * enters VN_STATE_STORE; vcp->diskSize bytes are written at the offset
 * returned by vnodeIndexOffset().  A short or failed write closes the file
 * handle and is then classified: BAD_IGET is logged as a bad inumber and
 * (DAFS) the vnode enters VN_STATE_ERROR; any other result is logged with the
 * byte count and leads to the DAFS error tail or to VForceOffline_r.  On
 * success the saved vnode state is restored (DAFS).  The DAFS error tail sets
 * VN_STATE_ERROR and requests a salvage instead of aborting.
 * NOTE(review): the statements assigning *ec, the `error_encountered` label
 * and the branch structure between the two failure classes are not in this
 * excerpt.
 */
1013 VnStore(Error * ec, Volume * vp, Vnode * vnp,
1014 struct VnodeClassInfo * vcp, VnodeClass class)
1018 IHandle_t *ihP = vp->vnodeIndex[class].handle;
1021 #ifdef AFS_DEMAND_ATTACH_FS
1022 VnState vn_state_save;
1027 #ifdef AFS_DEMAND_ATTACH_FS
1028 vn_state_save = VnChangeState_r(vnp, VN_STATE_STORE);
1031 offset = vnodeIndexOffset(vcp, Vn_id(vnp));
1035 Log("VnStore: can't open index file!\n");
1036 goto error_encountered;
1038 nBytes = FDH_PWRITE(fdP, &vnp->disk, vcp->diskSize, offset);
1039 if (nBytes != vcp->diskSize) {
1040 /* Don't force volume offline if the inumber is out of
1041 * range or the inode table is full.
1043 FDH_REALLYCLOSE(fdP);
1044 if (nBytes == BAD_IGET) {
1045 Log("VnStore: bad inumber %s\n",
1047 vp->vnodeIndex[class].handle->ih_ino));
1050 #ifdef AFS_DEMAND_ATTACH_FS
1051 VnChangeState_r(vnp, VN_STATE_ERROR);
1054 Log("VnStore: Couldn't write vnode %u, volume %u (%s) (error %d)\n", Vn_id(vnp), V_id(Vn_volume(vnp)), V_name(Vn_volume(vnp)), (int)nBytes);
1055 #ifdef AFS_DEMAND_ATTACH_FS
1056 goto error_encountered;
1059 VForceOffline_r(vp, 0);
1069 #ifdef AFS_DEMAND_ATTACH_FS
1070 VnChangeState_r(vnp, vn_state_save);
1075 #ifdef AFS_DEMAND_ATTACH_FS
1076 /* XXX instead of dumping core, let's try to request a salvage
1077 * and just fail the putvnode */
1081 VnChangeState_r(vnp, VN_STATE_ERROR);
1082 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1089 * get a handle to a vnode object.
1091 * @param[out] ec error code
1092 * @param[in] vp volume object
1093 * @param[in] vnodeNumber vnode id
1094 * @param[in] locktype type of lock to acquire
1096 * @return vnode object pointer
/*
 * VGetVnode -- wrapper that forwards its arguments unchanged to VGetVnode_r
 * and keeps the result in retVal.
 * NOTE(review): VGetVnode_r documents "VOL_LOCK held" as a precondition;
 * presumably this wrapper acquires and releases that lock around the call,
 * but those lines are not in this excerpt.
 */
1101 VGetVnode(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1102 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1105 retVal = VGetVnode_r(ec, vp, vnodeNumber, locktype);
1111 * get a handle to a vnode object.
1113 * @param[out] ec error code
1114 * @param[in] vp volume object
1115 * @param[in] vnodeNumber vnode id
1116 * @param[in] locktype type of lock to acquire
1118 * @return vnode object pointer
1120 * @internal vnode package internal use only
1122 * @pre VOL_LOCK held.
1123 * heavyweight ref held on volume object.
/*
 * VGetVnode_r -- obtain a locked, referenced Vnode for vnodeNumber in vp.
 *
 * Visible steps, in order:
 *  1. vnodeNumber 0 is handled specially before anything else.
 *  2. DAFS: wait for the volume to leave any exclusive state and stop if it
 *     is in an error state.
 *  3. On the file server, a volume not in use yields specialStatus or
 *     VOFFLINE; the VBUSY + READ_LOCK combination is exempted from failing.
 *  4. A WRITE_LOCK request on a non-writeable volume yields VREADONLY; a
 *     WRITE_LOCK request on the file server also puts the volume on the
 *     update list.
 *  5. Cache hit (VLookupVnode): take a reservation; on DAFS wait until the
 *     vnode is non-exclusive (read) or quiescent (write) and drop the
 *     reservation if the vnode is in an error state.
 *  6. Cache miss: take a Vnode from the LRU with VGetFreeVnode_r, clear its
 *     change flags and load it from disk with VnLoad; the reservation is
 *     cancelled on the failure path.
 *  7. Lock the vnode with the requested lock type, then re-check that it was
 *     not removed meanwhile (disk.type vNull or cacheCheck 0), undoing the
 *     lock and reservation if it was.
 *  8. DAFS: record read or exclusive state; on the file server bump the
 *     volume usage counter.
 * NOTE(review): the statements assigning *ec in several branches, the return
 * statements, and the lines selecting between steps 5 and 6 are not in this
 * excerpt.
 */
1126 VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1127 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1130 struct VnodeClassInfo *vcp;
1134 if (vnodeNumber == 0) {
1139 VNLog(100, 1, vnodeNumber, 0, 0, 0);
1141 #ifdef AFS_DEMAND_ATTACH_FS
1143 * once a volume has entered an error state, don't permit
1144 * further operations to proceed
1145 * -- tkeiser 11/21/2007
1147 VWaitExclusiveState_r(vp);
1148 if (VIsErrorState(V_attachState(vp))) {
1149 /* XXX is VSALVAGING acceptable here? */
1155 if (programType == fileServer && !V_inUse(vp)) {
1156 *ec = (vp->specialStatus ? vp->specialStatus : VOFFLINE);
1158 /* If the volume is VBUSY (being cloned or dumped) and this is
1159 * a READ operation, then don't fail.
1161 if ((*ec != VBUSY) || (locktype != READ_LOCK)) {
1166 class = vnodeIdToClass(vnodeNumber);
1167 vcp = &VnodeClassInfo[class];
1168 if (locktype == WRITE_LOCK && !VolumeWriteable(vp)) {
1169 *ec = (bit32) VREADONLY;
1173 if (locktype == WRITE_LOCK && programType == fileServer) {
1174 VAddToVolumeUpdateList_r(ec, vp);
1182 /* See whether the vnode is in the cache. */
1183 vnp = VLookupVnode(vp, vnodeNumber);
1185 /* vnode is in cache */
1187 VNLog(101, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
1188 VnCreateReservation_r(vnp);
1190 #ifdef AFS_DEMAND_ATTACH_FS
1192 * this is the one DAFS case where we may run into contention.
1193 * here's the basic control flow:
1195 * if locktype is READ_LOCK:
1196 * wait until vnode is not exclusive
1197 * set to VN_STATE_READ
1198 * increment read count
1201 * wait until vnode is quiescent
1202 * set to VN_STATE_EXCLUSIVE
1205 if (locktype == READ_LOCK) {
1206 VnWaitExclusiveState_r(vnp);
1208 VnWaitQuiescent_r(vnp);
1211 if (VnIsErrorState(Vn_state(vnp))) {
1212 VnCancelReservation_r(vnp);
1216 #endif /* AFS_DEMAND_ATTACH_FS */
1218 /* vnode not cached */
1220 /* Not in cache; tentatively grab most distantly used one from the LRU
1223 vnp = VGetFreeVnode_r(vcp, vp, vnodeNumber);
1226 vnp->changed_newTime = vnp->changed_oldTime = 0;
1230 * XXX for non-DAFS, there is a serious
1231 * race condition here:
1233 * two threads can race to load a vnode. the net
1234 * result is two struct Vnodes can be allocated
1235 * and hashed, which point to the same underlying
1236 * disk data store. conflicting vnode locks can
1237 * thus be held concurrently.
1239 * for non-DAFS to be safe, VOL_LOCK really shouldn't
1240 * be dropped in VnLoad. Of course, this would likely
1241 * lead to an unacceptable slow-down.
1244 VnLoad(ec, vp, vnp, vcp, class);
1246 VnCancelReservation_r(vnp);
1249 #ifndef AFS_DEMAND_ATTACH_FS
1254 * there is no possibility for contention. we "own" this vnode.
1260 * it is imperative that nothing drop vol lock between here
1261 * and the VnBeginRead/VnChangeState stanza below
1264 VnLock(vnp, locktype, VOL_LOCK_HELD, MIGHT_DEADLOCK);
1266 /* Check that the vnode hasn't been removed while we were obtaining
1268 VNLog(102, 2, vnodeNumber, (intptr_t) vnp, 0, 0);
1269 if ((vnp->disk.type == vNull) || (Vn_cacheCheck(vnp) == 0)) {
1270 VnUnlock(vnp, locktype);
1271 VnCancelReservation_r(vnp);
1273 /* vnode is labelled correctly by now, so we don't have to invalidate it */
1277 #ifdef AFS_DEMAND_ATTACH_FS
1278 if (locktype == READ_LOCK) {
1281 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
1285 if (programType == fileServer)
1286 VBumpVolumeUsage_r(Vn_volume(vnp)); /* Hack; don't know where it should be
1287 * called from. Maybe VGetVolume */
1292 int TrustVnodeCacheEntry = 1;
1293 /* This variable is bogus--when it's set to 0, the hash chains fill
1294 up with multiple versions of the same vnode. Should fix this!! */
/*
 * VPutVnode -- wrapper that forwards its arguments unchanged to VPutVnode_r.
 * NOTE(review): VPutVnode_r documents "VOL_LOCK held" as a precondition;
 * presumably this wrapper acquires and releases that lock around the call,
 * but those lines are not in this excerpt.
 */
1296 VPutVnode(Error * ec, Vnode * vnp)
1299 VPutVnode_r(ec, vnp);
1304 * put back a handle to a vnode object.
1306 * @param[out] ec client error code
1307 * @param[in] vnp vnode object pointer
1309 * @pre VOL_LOCK held.
1310 * ref held on vnode.
1312 * @post ref dropped on vnode.
1313 * if vnode was modified or deleted, it is written out to disk
1314 * (assuming a write lock was held).
1316 * @internal volume package internal use only
/*
 * VPutVnode_r -- release a vnode obtained from VGetVnode_r/VAllocVnode_r,
 * writing it to disk first when it was modified or deleted.
 *
 * Visible behaviour: asserts that the vnode is referenced and that its magic
 * matches its class.  "Write locked" means VN_STATE_EXCLUSIVE on DAFS builds
 * and WriteLocked(&vnp->lock) otherwise.  When write locked, the caller must
 * be the recorded writer (else Abort).  If any of changed_oldTime,
 * changed_newTime or delete is set: a deleted vnode has its disk image
 * zeroed; a vnode with changed_newTime gets serverModifyTime = now and the
 * volume's update date/time is refreshed (its update counter is incremented
 * below UINT_MAX); the vnode is written with VnStore; and for a deleted vnode
 * stored without error the volume filecount is decremented (clamped at 0)
 * and the bitmap entry freed.  The change flags are then cleared and (DAFS)
 * the state returns to VN_STATE_ONLINE.  When not write locked, set change or
 * delete flags are treated as fatal.  Finally the vnode lock and the
 * reservation are dropped.
 * NOTE(review): in the VNLog(201, ...) argument both changed_newTime and
 * changed_oldTime are shifted left by 1, so the two flags land in the same
 * bit of the trace word; one of them was presumably meant to use a different
 * shift.
 * NOTE(review): the guards around the VRequestSalvage_r/osi_Assert pair and
 * around the delete-specific statements are not in this excerpt.
 */
1319 VPutVnode_r(Error * ec, Vnode * vnp)
1323 struct VnodeClassInfo *vcp;
1326 osi_Assert(Vn_refcount(vnp) != 0);
1327 class = vnodeIdToClass(Vn_id(vnp));
1328 vcp = &VnodeClassInfo[class];
1329 osi_Assert(vnp->disk.vnodeMagic == vcp->magic);
1330 VNLog(200, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1332 #ifdef AFS_DEMAND_ATTACH_FS
1333 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1335 writeLocked = WriteLocked(&vnp->lock);
1340 #ifdef AFS_PTHREAD_ENV
1341 pthread_t thisProcess = pthread_self();
1342 #else /* AFS_PTHREAD_ENV */
1343 PROCESS thisProcess;
1344 LWP_CurrentProcess(&thisProcess);
1345 #endif /* AFS_PTHREAD_ENV */
1346 VNLog(201, 2, (intptr_t) vnp,
1347 ((vnp->changed_newTime) << 1) | ((vnp->
1348 changed_oldTime) << 1) | vnp->
1350 if (thisProcess != vnp->writer)
1351 Abort("VPutVnode: Vnode at %"AFS_PTR_FMT" locked by another process!\n",
1355 if (vnp->changed_oldTime || vnp->changed_newTime || vnp->delete) {
1356 Volume *vp = Vn_volume(vnp);
1357 afs_uint32 now = FT_ApproxTime();
1358 osi_Assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1361 /* No longer any directory entries for this vnode. Free the Vnode */
1362 memset(&vnp->disk, 0, sizeof(vnp->disk));
1363 /* delete flag turned off further down */
1364 VNLog(202, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1365 } else if (vnp->changed_newTime) {
1366 vnp->disk.serverModifyTime = now;
1368 if (vnp->changed_newTime)
1370 V_updateDate(vp) = vp->updateTime = now;
1371 if(V_volUpCounter(vp)< UINT_MAX)
1372 V_volUpCounter(vp)++;
1375 /* The vnode has been changed. Write it out to disk */
1377 #ifdef AFS_DEMAND_ATTACH_FS
1378 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1380 osi_Assert(V_needsSalvaged(vp));
1384 VnStore(ec, vp, vnp, vcp, class);
1386 /* If the vnode is to be deleted, and we wrote the vnode out,
1387 * free its bitmap entry. Do after the vnode is written so we
1388 * don't allocate from bitmap before the vnode is written
1389 * (doing so could cause a "addled bitmap" message).
1391 if (vnp->delete && !*ec) {
1392 if (Vn_volume(vnp)->header->diskstuff.filecount-- < 1)
1393 Vn_volume(vnp)->header->diskstuff.filecount = 0;
1394 VFreeBitMapEntry_r(ec, vp, &vp->vnodeIndex[class],
1395 vnodeIdToBitNumber(Vn_id(vnp)),
1396 VOL_FREE_BITMAP_WAIT);
1400 vnp->changed_newTime = vnp->changed_oldTime = 0;
1402 #ifdef AFS_DEMAND_ATTACH_FS
1403 VnChangeState_r(vnp, VN_STATE_ONLINE);
1405 } else { /* Not write locked */
1406 if (vnp->changed_newTime || vnp->changed_oldTime || vnp->delete)
1408 ("VPutVnode: Change or delete flag for vnode "
1409 "%"AFS_PTR_FMT" is set but vnode is not write locked!\n",
1411 #ifdef AFS_DEMAND_ATTACH_FS
1416 /* Do not look at disk portion of vnode after this point; it may
1417 * have been deleted above */
1419 VnUnlock(vnp, ((writeLocked) ? WRITE_LOCK : READ_LOCK));
1420 VnCancelReservation_r(vnp);
1424 * Make an attempt to convert a vnode lock from write to read.
1425 * Do nothing if the vnode isn't write locked or the vnode has
/*
 * VVnodeWriteToRead -- wrapper that forwards its arguments unchanged to
 * VVnodeWriteToRead_r and keeps the result in retVal.
 * NOTE(review): VVnodeWriteToRead_r documents "VOL_LOCK held" as a
 * precondition; presumably this wrapper acquires and releases that lock
 * around the call, but those lines are not in this excerpt.
 */
1429 VVnodeWriteToRead(Error * ec, Vnode * vnp)
1433 retVal = VVnodeWriteToRead_r(ec, vnp);
1439 * convert vnode handle from mutually exclusive to shared access.
1441 * @param[out] ec client error code
1442 * @param[in] vnp vnode object pointer
1444 * @return unspecified use (see out argument 'ec' for error code return)
1446 * @pre VOL_LOCK held.
1447 * ref held on vnode.
1448 * write lock held on vnode.
1450 * @post read lock held on vnode.
1451 * if vnode was modified, it has been written to disk.
1453 * @internal volume package internal use only
1456 VVnodeWriteToRead_r(Error * ec, Vnode * vnp)
1460 struct VnodeClassInfo *vcp;
/* thisProcess holds the identity of the calling thread; its type follows
 * the threading package selected at build time (pthread_t or PROCESS). */
1461 #ifdef AFS_PTHREAD_ENV
1462 pthread_t thisProcess;
1463 #else /* AFS_PTHREAD_ENV */
1464 PROCESS thisProcess;
1465 #endif /* AFS_PTHREAD_ENV */
/* Sanity checks: the caller must hold a reference on the vnode, and the
 * magic number in its disk image must match the magic of its vnode class. */
1468 osi_Assert(Vn_refcount(vnp) != 0);
1469 class = vnodeIdToClass(Vn_id(vnp));
1470 vcp = &VnodeClassInfo[class];
1471 osi_Assert(vnp->disk.vnodeMagic == vcp->magic);
1472 VNLog(300, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
/* Work out whether the vnode is currently held for writing.  Demand-attach
 * builds compare the vnode state against VN_STATE_EXCLUSIVE; the other
 * assignment queries vnp->lock directly. */
1474 #ifdef AFS_DEMAND_ATTACH_FS
1475 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1477 writeLocked = WriteLocked(&vnp->lock);
/* Log entry 301 packs the change and delete flags into one word.
 * NOTE(review): changed_newTime and changed_oldTime are both shifted left
 * by one, so the two flags land on the same bit of the logged value and
 * cannot be told apart; this looks unintended -- confirm before relying on
 * this log entry. */
1484 VNLog(301, 2, (intptr_t) vnp,
1485 ((vnp->changed_newTime) << 1) | ((vnp->
1486 changed_oldTime) << 1) | vnp->
/* Only the thread recorded in vnp->writer may convert the lock; any other
 * caller triggers Abort.
 * NOTE(review): the Abort text names VPutVnode although this function is
 * VVnodeWriteToRead_r -- confirm whether that is intentional. */
1490 #ifdef AFS_PTHREAD_ENV
1491 thisProcess = pthread_self();
1492 #else /* AFS_PTHREAD_ENV */
1493 LWP_CurrentProcess(&thisProcess);
1494 #endif /* AFS_PTHREAD_ENV */
1495 if (thisProcess != vnp->writer)
1496 Abort("VPutVnode: Vnode at %"AFS_PTR_FMT
1497 " locked by another process!\n", vnp);
/* Pending changes are handled before the lock is converted.  When
 * changed_newTime is set, both the vnode serverModifyTime and the volume
 * update date are stamped with the same FT_ApproxTime() value. */
1502 if (vnp->changed_oldTime || vnp->changed_newTime) {
1503 Volume *vp = Vn_volume(vnp);
1504 afs_uint32 now = FT_ApproxTime();
1505 osi_Assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1506 if (vnp->changed_newTime)
1507 vnp->disk.serverModifyTime = now;
1508 if (vnp->changed_newTime)
1509 V_updateDate(vp) = vp->updateTime = now;
1511 /* The inode has been changed. Write it out to disk */
1513 #ifdef AFS_DEMAND_ATTACH_FS
1514 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1516 osi_Assert(V_needsSalvaged(vp));
1520 VnStore(ec, vp, vnp, vcp, class);
/* Both change flags are cleared here. */
1523 vnp->changed_newTime = vnp->changed_oldTime = 0;
/* Give up exclusive access: demand-attach builds move the vnode to
 * VN_STATE_ONLINE, and ConvertWriteToReadLock converts vnp->lock from a
 * write lock to a read lock. */
1527 #ifdef AFS_DEMAND_ATTACH_FS
1528 VnChangeState_r(vnp, VN_STATE_ONLINE);
1531 ConvertWriteToReadLock(&vnp->lock);
1537 * initial size of ihandle pointer vector.
1539 * @see VInvalidateVnodesByVolume_r
/* Counted in IHandle_t pointers, not bytes: VInvalidateVnodesByVolume_r
 * multiplies it by sizeof(IHandle_t *) for its first allocation. */
1541 #define IH_VEC_BASE_SIZE 256
1544 * increment amount for growing ihandle pointer vector.
1546 * @see VInvalidateVnodesByVolume_r
/* Counted in IHandle_t pointers, not bytes: added to the current vector
 * length before each realloc in VInvalidateVnodesByVolume_r. */
1548 #define IH_VEC_INCREMENT 256
1551 * Compile list of ihandles to be released/reallyclosed at a later time.
1553 * @param[in] vp volume object pointer
1554 * @param[out] vec_out vector of ihandle pointers to be released/reallyclosed
1555 * @param[out] vec_len_out number of valid elements in ihandle vector
1557 * @pre - VOL_LOCK is held
1558 * - volume is in appropriate exclusive state (e.g. VOL_STATE_VNODE_CLOSE,
1559 * VOL_STATE_VNODE_RELEASE)
1561 * @post - all vnodes on VVn list are invalidated
1562 * - ih_vec is populated with all valid ihandles
1564 * @return operation status
1566 * @retval ENOMEM out of memory
1568 * @todo we should handle out of memory conditions more gracefully.
1570 * @internal vnode package internal use only
1573 VInvalidateVnodesByVolume_r(Volume * vp,
1574 IHandle_t *** vec_out,
1575 size_t * vec_len_out)
/* i is the number of handles stored so far (it indexes the next free slot);
 * vec_len is the current capacity of ih_vec, counted in pointers.
 * ih_vec_new receives the realloc result first, so ih_vec still refers to
 * the old vector if realloc returns NULL. */
1579 size_t i = 0, vec_len;
1580 IHandle_t **ih_vec, **ih_vec_new;
1582 #ifdef AFS_DEMAND_ATTACH_FS
1584 #endif /* AFS_DEMAND_ATTACH_FS */
/* Initial allocation: room for IH_VEC_BASE_SIZE handle pointers.  The
 * vector is later grown in steps of IH_VEC_INCREMENT pointers. */
1586 vec_len = IH_VEC_BASE_SIZE;
1587 ih_vec = malloc(sizeof(IHandle_t *) * vec_len);
1588 #ifdef AFS_DEMAND_ATTACH_FS
1595 * Traverse the volume's vnode list. Pull all the ihandles out into a
1596 * thread-private array for later asynchronous processing.
1598 #ifdef AFS_DEMAND_ATTACH_FS
1601 for (queue_Scan(&vp->vnode_list, vnp, nvnp, Vnode)) {
1602 if (vnp->handle != NULL) {
1604 #ifdef AFS_DEMAND_ATTACH_FS
1607 vec_len += IH_VEC_INCREMENT;
1608 ih_vec_new = realloc(ih_vec, sizeof(IHandle_t *) * vec_len);
1609 #ifdef AFS_DEMAND_ATTACH_FS
1612 if (ih_vec_new == NULL) {
1616 ih_vec = ih_vec_new;
1617 #ifdef AFS_DEMAND_ATTACH_FS
1619 * Theoretically, the volume's VVn list should not change
1620 * because the volume is in an exclusive state. For the
1621 * sake of safety, we will restart the traversal from the
1622 * the beginning (which is not expensive because we're
1623 * deleting the items from the list as we go).
1625 goto restart_traversal;
/* Store the handle pointer in the next free slot of the vector. */
1628 ih_vec[i++] = vnp->handle;
/* Take the vnode off the VVn list and invalidate it; removing entries as
 * the scan proceeds is what keeps a restarted traversal cheap. */
1631 DeleteFromVVnList(vnp);
1632 VInvalidateVnode_r(vnp);
1642 /* VCloseVnodeFiles - called when a volume is going off line. All open
1643 * files for vnodes in that volume are closed. This might be excessive,
1644 * since we may only be taking one volume of a volume group offline.
1647 VCloseVnodeFiles_r(Volume * vp)
1649 #ifdef AFS_DEMAND_ATTACH_FS
1650 VolState vol_state_save;
1652 IHandle_t ** ih_vec;
/* Demand-attach only: switch the volume to VOL_STATE_VNODE_CLOSE and keep
 * the value VChangeState_r returns in vol_state_save; that value is handed
 * back to VChangeState_r at the end of this function. */
1655 #ifdef AFS_DEMAND_ATTACH_FS
1656 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_CLOSE);
1657 #endif /* AFS_DEMAND_ATTACH_FS */
/* Collect the ihandles of the vnodes belonging to this volume.
 * NOTE(review): the call sits inside osi_Assert(), so a failure of
 * VInvalidateVnodesByVolume_r (its documentation lists ENOMEM) is
 * presumably fatal here rather than reported to the caller. */
1659 /* XXX need better error handling here */
1660 osi_Assert(VInvalidateVnodesByVolume_r(vp,
1666 * now we drop VOL_LOCK while we perform some potentially very
1667 * expensive operations in the background
1669 #ifdef AFS_DEMAND_ATTACH_FS
/* Each collected handle gets IH_REALLYCLOSE followed by IH_RELEASE. */
1673 for (i = 0; i < vec_len; i++) {
1674 IH_REALLYCLOSE(ih_vec[i]);
1675 IH_RELEASE(ih_vec[i]);
/* Demand-attach only: put back the volume state saved on entry. */
1680 #ifdef AFS_DEMAND_ATTACH_FS
1682 VChangeState_r(vp, vol_state_save);
1683 #endif /* AFS_DEMAND_ATTACH_FS */
1688 * shut down all vnode cache state for a given volume.
1690 * @param[in] vp volume object pointer
1692 * @pre VOL_LOCK is held
1694 * @post all file descriptors closed.
1695 * all inode handles released.
1696 * all vnode cache objects disassociated from volume.
1698 * @note for DAFS, these operations are performed outside the vol glock under
1699 * volume exclusive state VOL_STATE_VNODE_RELEASE. Please further note
1700 * that it would be a bug to acquire and release a volume reservation
1701 * during this exclusive operation. This is due to the fact that we are
1702 * generally called during the refcount 1->0 transition.
1704 * @todo we should handle failures in VInvalidateVnodesByVolume_r more
1707 * @see VInvalidateVnodesByVolume_r
1709 * @internal this routine is internal to the volume package
1712 VReleaseVnodeFiles_r(Volume * vp)
1714 #ifdef AFS_DEMAND_ATTACH_FS
1715 VolState vol_state_save;
1717 IHandle_t ** ih_vec;
/* Demand-attach only: switch the volume to VOL_STATE_VNODE_RELEASE and keep
 * the value VChangeState_r returns in vol_state_save; that value is handed
 * back to VChangeState_r at the end of this function. */
1720 #ifdef AFS_DEMAND_ATTACH_FS
1721 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_RELEASE);
1722 #endif /* AFS_DEMAND_ATTACH_FS */
/* Collect the ihandles of the vnodes belonging to this volume.
 * NOTE(review): the call sits inside osi_Assert(), so a failure of
 * VInvalidateVnodesByVolume_r (its documentation lists ENOMEM) is
 * presumably fatal here rather than reported to the caller. */
1724 /* XXX need better error handling here */
1725 osi_Assert(VInvalidateVnodesByVolume_r(vp,
1731 * now we drop VOL_LOCK while we perform some potentially very
1732 * expensive operations in the background
1734 #ifdef AFS_DEMAND_ATTACH_FS
/* Each collected handle gets IH_RELEASE only; unlike VCloseVnodeFiles_r,
 * IH_REALLYCLOSE is not applied here. */
1738 for (i = 0; i < vec_len; i++) {
1739 IH_RELEASE(ih_vec[i]);
/* Demand-attach only: put back the volume state saved on entry. */
1744 #ifdef AFS_DEMAND_ATTACH_FS
1746 VChangeState_r(vp, vol_state_save);
1747 #endif /* AFS_DEMAND_ATTACH_FS */