2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
15 Institution: The Information Technology Center, Carnegie-Mellon University
18 #include <afsconfig.h>
19 #include <afs/param.h>
/* MAXINT: bitwise NOT of a 1 shifted to the top bit position of an int
 * (sizeof(int)*8 - 1), i.e. every bit set except the highest one.
 * NOTE(review): the literal 1 is a signed int, so this shift moves a bit
 * into the sign position, which ISO C does not define; INT_MAX from
 * <limits.h> expresses the same intent without relying on that. */
20 #define MAXINT (~(1<<((sizeof(int)*8)-1)))
29 #ifdef AFS_PTHREAD_ENV
31 #else /* AFS_PTHREAD_ENV */
32 #include <afs/assert.h>
33 #endif /* AFS_PTHREAD_ENV */
36 #include "rx/rx_queue.h"
37 #include <afs/afsint.h>
39 #include <afs/errors.h>
42 #include <afs/afssyscalls.h>
46 #include "volume_inline.h"
47 #include "vnode_inline.h"
48 #include "partition.h"
50 #if defined(AFS_SGI_ENV)
51 #include "sys/types.h"
63 #include <sys/fcntl.h>
66 #endif /* AFS_NT40_ENV */
/* File-level declarations used throughout the vnode package:
 *  - Log / Abort: printf-style reporting hooks defined elsewhere.
 *  - VnodeClassInfo[]: one bookkeeping record per vnode class
 *    (nVNODECLASSES entries); indexed by class in the functions below.
 *  - VNLog: prototype for the trace logger defined later in this file.
 *  - BAD_IGET: sentinel that VnStore compares against the result of its
 *    index-file write. */
69 /*@printflike@*/ extern void Log(const char *format, ...);
71 /*@printflike@*/ extern void Abort(const char *format, ...);
74 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
76 void VNLog(afs_int32 aop, afs_int32 anparms, ... );
83 #define BAD_IGET -1000
85 /* There are two separate vnode queue types defined here:
86 * Each hash conflict chain -- is singly linked, with a single head
87 * pointer. New entries are added at the beginning. Old
88 * entries are removed by linear search, which generally
89 * only occurs after a disk read).
90 * LRU chain -- is doubly linked, single head pointer.
91 * Entries are added at the head, reclaimed from the tail,
92 * or removed from anywhere in the queue.
96 /* Vnode hash table. Find hash chain by taking lower bits of
97 * (volume_hash_offset + vnode).
98 * This distributes the root inodes of the volumes over the
99 * hash table entries and also distributes the vnodes of
100 * volumes reasonably fairly. The volume_hash_offset field
101 * for each volume is established as the volume comes on line
102 * by using the VOLUME_HASH_OFFSET macro. This distributes the
103 * volumes fairly among the cache entries, both when servicing
104 * a small number of volumes and when servicing a large number.
/* Trace buffer filled by VNLog: THELOGSIZE afs_int32 slots in theLog, with
 * vnLogPtr holding the index of the next slot to write (starts at 0). */
107 /* logging stuff for finding bugs */
108 #define THELOGSIZE 5120
109 static afs_int32 theLog[THELOGSIZE];
110 static afs_int32 vnLogPtr = 0;
/* VNLog -- append one trace record to theLog.
 *
 * aop      operation code; stored in the upper bits of the record's first
 *          word (aop << 16).
 * anparms  number of afs_int32 variadic arguments that follow; OR-ed into
 *          the low bits of the same word.  The bounds-checking assignment
 *          below limits it to 4.
 *
 * After the header word, each variadic argument is copied into the next
 * slot.  vnLogPtr is compared against THELOGSIZE after every store;
 * presumably it is reset so the buffer is reused circularly -- confirm. */
112 VNLog(afs_int32 aop, afs_int32 anparms, ... )
114 register afs_int32 temp;
117 va_start(ap, anparms);
120 anparms = 4; /* do bounds checking */
122 temp = (aop << 16) | anparms;
123 theLog[vnLogPtr++] = temp;
124 if (vnLogPtr >= THELOGSIZE)
126 for (temp = 0; temp < anparms; temp++) {
127 theLog[vnLogPtr++] = va_arg(ap, afs_int32);
128 if (vnLogPtr >= THELOGSIZE)
134 /* VolumeHashOffset -- returns a new value to be stored in the
135 * volumeHashOffset of a Volume structure. Called when a
136 * volume is initialized. Sets the volumeHashOffset so that
137 * vnode cache entries are distributed reasonably between
138 * volumes (the root vnodes of the volumes will hash to
139 * different values, and spacing is maintained between volumes
140 * when there are not many volumes represented), and spread
141 * equally amongst vnodes within a single volume.
/* Implementation notes:
 * A function-static counter (nextVolumeHashOffset) drives the result.  Its
 * low bits (masked with hashMask) select one of the eight entries of
 * hashindex[], and its remaining high bits (counter >> hashShift) are added
 * on top.  The counter is incremented on every call, so consecutive calls
 * walk through 0, 128, 64, 192, ... before the additive term changes.
 * The counter is plain static state with no locking here; presumably the
 * caller serializes calls (the _r suffix suggests VOL_LOCK is held) --
 * confirm. */
144 VolumeHashOffset_r(void)
146 static int nextVolumeHashOffset = 0;
147 /* hashindex Must be power of two in size */
149 # define hashMask ((1<<hashShift)-1)
150 static byte hashindex[1 << hashShift] =
151 { 0, 128, 64, 192, 32, 160, 96, 224 };
153 offset = hashindex[nextVolumeHashOffset & hashMask]
154 + (nextVolumeHashOffset >> hashShift);
155 nextVolumeHashOffset++;
/* Vnode hash table: VNODE_HASH_TABLE_SIZE bucket heads; entries in a bucket
 * are chained through Vnode.hashNext (see AddToVnHash / DeleteFromVnHash).
 * VNODE_HASH adds the volume's vnodeHashOffset to the vnode number and masks
 * the sum with (size - 1), which only works while the size is a power of two.
 * NOTE(review): the macro arguments are used unparenthesized; the callers
 * in this file pass simple expressions only. */
159 /* Change hashindex (above) if you change this constant */
160 #define VNODE_HASH_TABLE_SIZE 256
161 private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
162 #define VNODE_HASH(volumeptr,vnodenumber)\
163 ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
167 * add a vnode to the volume's vnode list.
169 * @param[in] vp volume object pointer
170 * @param[in] vnp vnode object pointer
172 * @note for DAFS, it may seem like we should be acquiring a lightweight ref
173 * on vp, but this would actually break things. Right now, this is ok
174 * because we destroy all vnode cache contents during volume
179 * @internal volume package internal use only
/* Implementation notes:
 * Tests queue_IsOnQueue(vnp) first, then stamps the vnode with the volume's
 * current cacheCheck value, appends it to vp->vnode_list and sets VN_ON_VVN
 * in the vnode's state flags. */
182 AddToVVnList(Volume * vp, Vnode * vnp)
184 if (queue_IsOnQueue(vnp))
188 Vn_cacheCheck(vnp) = vp->cacheCheck;
189 queue_Append(&vp->vnode_list, vnp);
190 Vn_stateFlags(vnp) |= VN_ON_VVN;
194 * delete a vnode from the volume's vnode list.
198 * @internal volume package internal use only
/* Implementation notes:
 * The vnode's volume back-pointer is cleared before the queue-membership
 * test, so it is reset regardless of that test's outcome.  VN_ON_VVN is
 * cleared from the state flags at the end. */
201 DeleteFromVVnList(register Vnode * vnp)
203 Vn_volume(vnp) = NULL;
205 if (!queue_IsOnQueue(vnp))
209 Vn_stateFlags(vnp) &= ~(VN_ON_VVN);
213 * add a vnode to the end of the lru.
215 * @param[in] vcp vnode class info object pointer
216 * @param[in] vnp vnode object pointer
218 * @internal vnode package internal use only
/* Implementation notes:
 * Begins with a test of VN_ON_LRU.  The vnode is then spliced into the
 * circular, doubly linked ring directly before vcp->lruHead (between the
 * old tail and the head), and VN_ON_LRU is set.  A NULL lruHead is treated
 * as fatal.  The lruHead assignment near the end goes with the
 * "just deleted" comment above it; its guarding condition should be checked
 * against the full source.
 * NOTE(review): the Abort text names VPutVnode although it is raised from
 * AddToVnLRU. */
221 AddToVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
223 if (Vn_stateFlags(vnp) & VN_ON_LRU) {
227 /* Add it to the circular LRU list */
228 if (vcp->lruHead == NULL)
229 Abort("VPutVnode: vcp->lruHead==NULL");
231 vnp->lruNext = vcp->lruHead;
232 vnp->lruPrev = vcp->lruHead->lruPrev;
233 vcp->lruHead->lruPrev = vnp;
234 vnp->lruPrev->lruNext = vnp;
238 /* If the vnode was just deleted, put it at the end of the chain so it
239 * will be reused immediately */
241 vcp->lruHead = vnp->lruNext;
243 Vn_stateFlags(vnp) |= VN_ON_LRU;
247 * delete a vnode from the lru.
249 * @param[in] vcp vnode class info object pointer
250 * @param[in] vnp vnode object pointer
252 * @internal vnode package internal use only
/* Implementation notes:
 * Begins with a test for VN_ON_LRU being clear.  If vnp is the ring head,
 * the head is first advanced to the next element.  If vnp is still the head
 * after that (its lruNext pointed back at itself) or the head is NULL, the
 * ring is considered corrupt and Abort is called.  Otherwise the two
 * neighbours are linked to each other and VN_ON_LRU is cleared. */
255 DeleteFromVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
257 if (!(Vn_stateFlags(vnp) & VN_ON_LRU)) {
261 if (vnp == vcp->lruHead)
262 vcp->lruHead = vcp->lruHead->lruNext;
264 if ((vnp == vcp->lruHead) ||
265 (vcp->lruHead == NULL))
266 Abort("DeleteFromVnLRU: lru chain addled!\n");
268 vnp->lruPrev->lruNext = vnp->lruNext;
269 vnp->lruNext->lruPrev = vnp->lruPrev;
271 Vn_stateFlags(vnp) &= ~(VN_ON_LRU);
275 * add a vnode to the vnode hash table.
277 * @param[in] vnp vnode object pointer
281 * @post vnode on hash
283 * @internal vnode package internal use only
/* Implementation notes:
 * Only acts when VN_ON_HASH is not already set.  The bucket is computed with
 * VNODE_HASH from the vnode's volume and id; the vnode is pushed onto the
 * front of that bucket's chain, the bucket index is remembered in
 * vnp->hashIndex (DeleteFromVnHash relies on it), and VN_ON_HASH is set. */
286 AddToVnHash(Vnode * vnp)
288 unsigned int newHash;
290 if (!(Vn_stateFlags(vnp) & VN_ON_HASH)) {
291 newHash = VNODE_HASH(Vn_volume(vnp), Vn_id(vnp));
292 vnp->hashNext = VnodeHashTable[newHash];
293 VnodeHashTable[newHash] = vnp;
294 vnp->hashIndex = newHash;
296 Vn_stateFlags(vnp) |= VN_ON_HASH;
301 * delete a vnode from the vnode hash table.
308 * @post vnode removed from hash
310 * @internal vnode package internal use only
/* Implementation notes:
 * Only acts when VN_ON_HASH is set.  The bucket is located through the
 * saved vnp->hashIndex rather than by rehashing.  Two unlink forms appear:
 * replacing the bucket head with vnp->hashNext (presumably when vnp is the
 * head -- confirm), and a linear walk that stops at the entry whose
 * hashNext is vnp so it can be linked past it.  Afterwards vnp->hashNext is
 * cleared and VN_ON_HASH is removed from the state flags. */
313 DeleteFromVnHash(Vnode * vnp)
317 if (Vn_stateFlags(vnp) & VN_ON_HASH) {
318 tvnp = VnodeHashTable[vnp->hashIndex];
320 VnodeHashTable[vnp->hashIndex] = vnp->hashNext;
322 while (tvnp && tvnp->hashNext != vnp)
323 tvnp = tvnp->hashNext;
325 tvnp->hashNext = vnp->hashNext;
328 vnp->hashNext = NULL;
330 Vn_stateFlags(vnp) &= ~(VN_ON_HASH);
336 * invalidate a vnode cache entry.
338 * @param[in] avnode vnode object pointer
342 * @post vnode metadata invalidated.
343 * vnode removed from hash table.
344 * DAFS: vnode state set to VN_STATE_INVALID.
346 * @internal vnode package internal use only
/* Implementation notes:
 * Clears both change flags and the delete flag so nothing is written back
 * for this entry, zeroes cacheCheck so later lookups cannot match it
 * (VLookupVnode compares cacheCheck against the volume's value), and
 * removes the entry from the hash table.  Under AFS_DEMAND_ATTACH_FS the
 * vnode state is additionally switched to VN_STATE_INVALID. */
349 VInvalidateVnode_r(register struct Vnode *avnode)
351 avnode->changed_newTime = 0; /* don't let it get flushed out again */
352 avnode->changed_oldTime = 0;
353 avnode->delete = 0; /* it isn't deleted, really */
354 avnode->cacheCheck = 0; /* invalid: prevents future vnode searches from working */
355 DeleteFromVnHash(avnode);
356 #ifdef AFS_DEMAND_ATTACH_FS
357 VnChangeState_r(avnode, VN_STATE_INVALID);
363 * initialize vnode cache for a given vnode class.
365 * @param[in] class vnode class
366 * @param[in] nVnodes size of cache
368 * @post vnode cache allocated and initialized
370 * @internal volume package internal use only
372 * @note generally called by VInitVolumePackage_r
374 * @see VInitVolumePackage_r
/* Implementation notes:
 *  1. Resets the class counters (allocs/gets/reads/writes) and records
 *     nVnodes as the cache size.
 *  2. Fills in residentSize, diskSize and magic from the small-vnode or the
 *     large-vnode constants, depending on the class.
 *  3. Allocates nVnodes * residentSize zeroed bytes with calloc and walks
 *     the buffer in residentSize steps, treating each slice as a Vnode.
 *  4. Per vnode: refcount 0, VN_ON_LRU set, state/lock initialised
 *     (condition variable + VN_STATE_INVALID for DAFS, Lock_Init otherwise),
 *     change flags, volume pointer, cacheCheck, delete flag, id and writer
 *     cleared, then the vnode is linked into the class's circular LRU ring
 *     (first vnode points at itself; later ones go in before lruHead).
 * NOTE(review): pthread_cond_init is called inside assert(); if the assert
 * macro in effect can be compiled out, the call disappears with it --
 * confirm which assert implementation this build uses. */
377 VInitVnodes(VnodeClass class, int nVnodes)
380 register struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
382 vcp->allocs = vcp->gets = vcp->reads = vcp->writes = 0;
383 vcp->cacheSize = nVnodes;
386 assert(CHECKSIZE_SMALLVNODE);
388 vcp->residentSize = SIZEOF_SMALLVNODE;
389 vcp->diskSize = SIZEOF_SMALLDISKVNODE;
390 vcp->magic = SMALLVNODEMAGIC;
394 vcp->residentSize = SIZEOF_LARGEVNODE;
395 vcp->diskSize = SIZEOF_LARGEDISKVNODE;
396 vcp->magic = LARGEVNODEMAGIC;
400 int s = vcp->diskSize - 1;
410 va = (byte *) calloc(nVnodes, vcp->residentSize);
413 Vnode *vnp = (Vnode *) va;
414 Vn_refcount(vnp) = 0; /* no context switches */
415 Vn_stateFlags(vnp) |= VN_ON_LRU;
416 #ifdef AFS_DEMAND_ATTACH_FS
417 assert(pthread_cond_init(&Vn_stateCV(vnp), NULL) == 0);
418 Vn_state(vnp) = VN_STATE_INVALID;
420 #else /* !AFS_DEMAND_ATTACH_FS */
421 Lock_Init(&vnp->lock);
422 #endif /* !AFS_DEMAND_ATTACH_FS */
423 vnp->changed_oldTime = 0;
424 vnp->changed_newTime = 0;
425 Vn_volume(vnp) = NULL;
426 Vn_cacheCheck(vnp) = 0;
427 vnp->delete = Vn_id(vnp) = 0;
428 #ifdef AFS_PTHREAD_ENV
429 vnp->writer = (pthread_t) 0;
430 #else /* AFS_PTHREAD_ENV */
431 vnp->writer = (PROCESS) 0;
432 #endif /* AFS_PTHREAD_ENV */
436 if (vcp->lruHead == NULL)
437 vcp->lruHead = vnp->lruNext = vnp->lruPrev = vnp;
439 vnp->lruNext = vcp->lruHead;
440 vnp->lruPrev = vcp->lruHead->lruPrev;
441 vcp->lruHead->lruPrev = vnp;
442 vnp->lruPrev->lruNext = vnp;
445 va += vcp->residentSize;
452 * allocate an unused vnode from the lru chain.
454 * @param[in] vcp vnode class info object pointer
456 * @pre VOL_LOCK is held
458 * @post vnode object is removed from lru, and vnode hash table.
459 * vnode is disassociated from volume object.
460 * state is set to VN_STATE_INVALID.
461 * inode handle is released.
463 * @note we traverse backwards along the lru circlist. It shouldn't
464 * be necessary to specify that nUsers == 0 since if it is in the list,
465 * nUsers should be 0. Things shouldn't be in lruq unless no one is
468 * @warning DAFS: VOL_LOCK is dropped while doing inode handle release
470 * @return vnode object pointer
/* Implementation notes:
 *  - The candidate is the ring tail, vcp->lruHead->lruPrev.
 *  - Sanity check before reuse: a non-zero refcount is fatal in both builds;
 *    DAFS additionally rejects an exclusive state or active readers, the
 *    non-DAFS build rejects a held lock (CheckLock).
 *  - The vnode is taken off the LRU ring and the hash table, and off its
 *    volume's vnode list when it still has a volume pointer.
 *  - The inode handle is released with IH_RELEASE; for DAFS the vnode is
 *    put in VN_STATE_RELEASING beforehand and VN_STATE_INVALID afterwards.
 * NOTE(review): the VNLog call casts the vnode pointer to afs_int32, which
 * cannot hold a full pointer where pointers are wider than 32 bits; the
 * value is only used for tracing. */
473 VGetFreeVnode_r(struct VnodeClassInfo * vcp)
477 vnp = vcp->lruHead->lruPrev;
478 #ifdef AFS_DEMAND_ATTACH_FS
479 if (Vn_refcount(vnp) != 0 || VnIsExclusiveState(Vn_state(vnp)) ||
480 Vn_readers(vnp) != 0)
481 Abort("VGetFreeVnode_r: in-use vnode in lruq");
483 if (Vn_refcount(vnp) != 0 || CheckLock(&vnp->lock))
484 Abort("VGetFreeVnode_r: locked vnode in lruq");
486 VNLog(1, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
489 * it's going to be overwritten soon enough.
490 * remove from LRU, delete hash entry, and
491 * disassociate from old parent volume before
492 * we have a chance to drop the vol glock
494 DeleteFromVnLRU(vcp, vnp);
495 DeleteFromVnHash(vnp);
496 if (Vn_volume(vnp)) {
497 DeleteFromVVnList(vnp);
500 /* drop the file descriptor */
502 #ifdef AFS_DEMAND_ATTACH_FS
503 VnChangeState_r(vnp, VN_STATE_RELEASING);
506 /* release is, potentially, a highly latent operation due to a couple
508 * - ihandle package lock contention
509 * - closing file descriptor(s) associated with ih
511 * Hance, we perform outside of the volume package lock in order to
512 * reduce the probability of contention.
514 IH_RELEASE(vnp->handle);
515 #ifdef AFS_DEMAND_ATTACH_FS
520 #ifdef AFS_DEMAND_ATTACH_FS
521 VnChangeState_r(vnp, VN_STATE_INVALID);
529 * lookup a vnode in the vnode cache hash table.
531 * @param[in] vp pointer to volume object
532 * @param[in] vnodeId vnode id
536 * @post matching vnode object or NULL is returned
538 * @return vnode object pointer
539 * @retval NULL no matching vnode object was found in the cache
541 * @internal vnode package internal use only
543 * @note this symbol is exported strictly for fssync debug protocol use
/* Implementation notes:
 * Hashes (vp, vnodeId) with VNODE_HASH and walks that bucket's chain via
 * hashNext.  The loop keeps advancing while any of three things differ:
 * the vnode id, the owning volume pointer, or the cacheCheck stamp (vnode
 * vs. volume).  An entry is therefore a hit only when all three match.
 * The loop has an empty body; all work happens in the for-header. */
546 VLookupVnode(Volume * vp, VnodeId vnodeId)
549 unsigned int newHash;
551 newHash = VNODE_HASH(vp, vnodeId);
552 for (vnp = VnodeHashTable[newHash];
554 ((Vn_id(vnp) != vnodeId) ||
555 (Vn_volume(vnp) != vp) ||
556 (vp->cacheCheck != Vn_cacheCheck(vnp))));
557 vnp = vnp->hashNext);
/* VAllocVnode -- entry point that forwards its three arguments unchanged to
 * VAllocVnode_r and keeps the result in retVal.
 * NOTE(review): VAllocVnode_r documents VOL_LOCK as a precondition, so this
 * wrapper presumably acquires and releases it around the call -- confirm. */
564 VAllocVnode(Error * ec, Volume * vp, VnodeType type)
568 retVal = VAllocVnode_r(ec, vp, type);
574 * allocate a new vnode.
576 * @param[out] ec error code return
577 * @param[in] vp volume object pointer
578 * @param[in] type desired vnode type
580 * @return vnode object pointer
582 * @pre VOL_LOCK held;
583 * heavyweight ref held on vp
585 * @post vnode allocated and returned
/* Implementation notes (in execution order):
 *  1. DAFS: wait for the volume to leave any exclusive state and check it
 *     for an error state.
 *  2. On a fileserver, a volume that is not in use is checked for a
 *     specialStatus, which is copied to *ec; a volume that is not
 *     writeable yields VREADONLY.
 *  3. A uniquifier is taken from vp->nextVnodeUnique; when the counter
 *     passes V_uniquifier(vp) the volume header is updated via
 *     VUpdateVolume_r.  On a fileserver the volume is put on the update list.
 *  4. A bitmap slot is allocated for the class (VAllocBitmapEntry_r) and
 *     converted to a vnode number.
 *  5. If that vnode number is already cached: take a reservation and a write
 *     lock (non-blocking form when we are the sole user), and for DAFS wait
 *     for quiescence and re-check the volume state; the cacheCheck stamp is
 *     re-verified.
 *  6. Otherwise: recycle a vnode from the LRU, label it with the new id,
 *     reserve and lock it, then sanity-check the on-disk index slot -- the
 *     slot must read back as vNull, or, if the offset lies beyond the file
 *     size, the index file is extended by writing a 16 KiB zero buffer.
 *     DAFS brackets this with VOL_STATE_VNODE_ALLOC to serialize extenders.
 *  7. On an index error (DAFS path): free the bitmap entry, invalidate and
 *     release the vnode, request a salvage.
 *  8. On success: zero the disk image, clear the change flags, set magic,
 *     type and uniquifier, bump the volume's filecount, and for DAFS put
 *     the vnode in VN_STATE_EXCLUSIVE.
 * NOTE(review): the result of the extending FDH_WRITE is explicitly
 * discarded with a (void) cast, so a failed extension goes unnoticed here. */
588 VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
593 register struct VnodeClassInfo *vcp;
596 #ifdef AFS_DEMAND_ATTACH_FS
597 VolState vol_state_save;
602 #ifdef AFS_DEMAND_ATTACH_FS
604 * once a volume has entered an error state, don't permit
605 * further operations to proceed
606 * -- tkeiser 11/21/2007
608 VWaitExclusiveState_r(vp);
609 if (VIsErrorState(V_attachState(vp))) {
610 /* XXX is VSALVAGING acceptable here? */
616 if (programType == fileServer && !V_inUse(vp)) {
617 if (vp->specialStatus) {
618 *ec = vp->specialStatus;
624 class = vnodeTypeToClass(type);
625 vcp = &VnodeClassInfo[class];
627 if (!VolumeWriteable(vp)) {
628 *ec = (bit32) VREADONLY;
632 unique = vp->nextVnodeUnique++;
634 unique = vp->nextVnodeUnique++;
636 if (vp->nextVnodeUnique > V_uniquifier(vp)) {
637 VUpdateVolume_r(ec, vp, 0);
642 if (programType == fileServer) {
643 VAddToVolumeUpdateList_r(ec, vp);
648 /* Find a slot in the bit map */
649 bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
650 VOL_ALLOC_BITMAP_WAIT);
653 vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
657 * at this point we should be assured that V_attachState(vp) is non-exclusive
661 VNLog(2, 1, vnodeNumber, 0, 0, 0);
662 /* Prepare to move it to the new hash chain */
663 vnp = VLookupVnode(vp, vnodeNumber);
665 /* slot already exists. May even not be in lruq (consider store file locking a file being deleted)
666 * so we may have to wait for it below */
667 VNLog(3, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
669 VnCreateReservation_r(vnp);
670 if (Vn_refcount(vnp) == 1) {
671 /* we're the only user */
672 /* This won't block */
673 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
675 /* other users present; follow locking hierarchy */
676 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, MIGHT_DEADLOCK);
678 #ifdef AFS_DEMAND_ATTACH_FS
681 * vnode was cached, wait for any existing exclusive ops to finish.
682 * once we have reacquired the lock, re-verify volume state.
684 * note: any vnode error state is related to the old vnode; disregard.
686 VnWaitQuiescent_r(vnp);
687 if (VIsErrorState(V_attachState(vp))) {
688 VnUnlock(vnp, WRITE_LOCK);
689 VnCancelReservation_r(vnp);
696 * verify state of the world hasn't changed
698 * (technically, this should never happen because cachecheck
699 * is only updated during a volume attach, which should not
700 * happen when refs are held)
702 if (Vn_volume(vnp)->cacheCheck != Vn_cacheCheck(vnp)) {
703 VnUnlock(vnp, WRITE_LOCK);
704 VnCancelReservation_r(vnp);
710 /* no such vnode in the cache */
712 vnp = VGetFreeVnode_r(vcp);
714 /* Initialize the header fields so noone allocates another
715 * vnode with the same number */
716 Vn_id(vnp) = vnodeNumber;
717 VnCreateReservation_r(vnp);
718 AddToVVnList(vp, vnp);
719 #ifdef AFS_DEMAND_ATTACH_FS
723 /* This will never block (guaranteed by check in VGetFreeVnode_r() */
724 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
726 #ifdef AFS_DEMAND_ATTACH_FS
727 VnChangeState_r(vnp, VN_STATE_ALLOC);
730 /* Sanity check: is this vnode really not in use? */
733 IHandle_t *ihP = vp->vnodeIndex[class].handle;
735 off_t off = vnodeIndexOffset(vcp, vnodeNumber);
737 /* XXX we have a potential race here if two threads
738 * allocate new vnodes at the same time, and they
739 * both decide it's time to extend the index
742 #ifdef AFS_DEMAND_ATTACH_FS
744 * this race has been eliminated for the DAFS case
745 * using exclusive state VOL_STATE_VNODE_ALLOC
747 * if this becomes a bottleneck, there are ways to
748 * improve parallelism for this code path
749 * -- tkeiser 11/28/2007
751 VCreateReservation_r(vp);
752 VWaitExclusiveState_r(vp);
753 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_ALLOC);
759 Log("VAllocVnode: can't open index file!\n");
760 goto error_encountered;
762 if ((size = FDH_SIZE(fdP)) < 0) {
763 Log("VAllocVnode: can't stat index file!\n");
764 goto error_encountered;
766 if (FDH_SEEK(fdP, off, SEEK_SET) < 0) {
767 Log("VAllocVnode: can't seek on index file!\n");
768 goto error_encountered;
770 if (off + vcp->diskSize <= size) {
771 if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) != vcp->diskSize) {
772 Log("VAllocVnode: can't read index file!\n");
773 goto error_encountered;
775 if (vnp->disk.type != vNull) {
776 Log("VAllocVnode: addled bitmap or index!\n");
777 goto error_encountered;
780 /* growing file - grow in a reasonable increment */
781 char *buf = (char *)malloc(16 * 1024);
783 Abort("VAllocVnode: malloc failed\n");
784 memset(buf, 0, 16 * 1024);
785 (void)FDH_WRITE(fdP, buf, 16 * 1024);
791 #ifdef AFS_DEMAND_ATTACH_FS
792 VChangeState_r(vp, vol_state_save);
793 VCancelReservation_r(vp);
799 #ifdef AFS_DEMAND_ATTACH_FS
801 * close the file handle
803 * invalidate the vnode
804 * free up the bitmap entry (although salvager should take care of it)
806 * drop vnode lock and refs
811 VFreeBitMapEntry_r(ec, &vp->vnodeIndex[class], bitNumber);
812 VInvalidateVnode_r(vnp);
813 VnUnlock(vnp, WRITE_LOCK);
814 VnCancelReservation_r(vnp);
815 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
816 VCancelReservation_r(vp);
824 VNLog(4, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
825 #ifndef AFS_DEMAND_ATTACH_FS
830 VNLog(5, 1, (afs_int32) vnp, 0, 0, 0);
831 memset(&vnp->disk, 0, sizeof(vnp->disk));
832 vnp->changed_newTime = 0; /* set this bit when vnode is updated */
833 vnp->changed_oldTime = 0; /* set this on CopyOnWrite. */
835 vnp->disk.vnodeMagic = vcp->magic;
836 vnp->disk.type = type;
837 vnp->disk.uniquifier = unique;
840 vp->header->diskstuff.filecount++;
841 #ifdef AFS_DEMAND_ATTACH_FS
842 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
848 * load a vnode from disk.
850 * @param[out] ec client error code return
851 * @param[in] vp volume object pointer
852 * @param[in] vnp vnode object pointer
853 * @param[in] vcp vnode class info object pointer
854 * @param[in] class vnode class enumeration
856 * @pre vnode is registered in appropriate data structures;
857 * caller holds a ref on vnode; VOL_LOCK is held
859 * @post vnode data is loaded from disk.
860 * vnode state is set to VN_STATE_ONLINE.
861 * on failure, vnode is invalidated.
863 * @internal vnode package internal use only
/* Implementation notes:
 *  - DAFS marks the vnode VN_STATE_LOAD; a write lock is taken with the
 *    non-blocking form.
 *  - The class index file is opened, positioned at the vnode's slot
 *    (vnodeIndexOffset) and vcp->diskSize bytes are read into vnp->disk.
 *    Open, seek and read failures are logged and leave through
 *    error_encountered_nolock.  For a failed read, the log message
 *    distinguishes a bad inumber, a read that returned -1 with errno EIO
 *    (volume needs salvage), and any other short read (treated as
 *    "vnode is not allocated").
 *  - The loaded image is validated: the magic must equal vcp->magic and the
 *    type must not be vNull.  For a vNull type the allocation bitmap is
 *    consulted (byte offset bitNumber >> 3, bit bitNumber & 7) to tell an
 *    unallocated vnode from damage; a bad magic is logged as needing salvage.
 *  - On success the inode handle is initialised with IH_INIT, the lock is
 *    dropped, and DAFS sets VN_STATE_ONLINE.
 *  - The error exits close the file handle, request a salvage (DAFS) or
 *    force the volume offline (non-DAFS), invalidate the vnode and unlock. */
866 VnLoad(Error * ec, Volume * vp, Vnode * vnp,
867 struct VnodeClassInfo * vcp, VnodeClass class)
869 /* vnode not cached */
872 IHandle_t *ihP = vp->vnodeIndex[class].handle;
878 #ifdef AFS_DEMAND_ATTACH_FS
879 VnChangeState_r(vnp, VN_STATE_LOAD);
882 /* This will never block */
883 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
888 Log("VnLoad: can't open index dev=%u, i=%s\n", vp->device,
889 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
891 goto error_encountered_nolock;
892 } else if (FDH_SEEK(fdP, vnodeIndexOffset(vcp, Vn_id(vnp)), SEEK_SET)
894 Log("VnLoad: can't seek on index file vn=%u\n", Vn_id(vnp));
896 goto error_encountered_nolock;
897 } else if ((n = FDH_READ(fdP, (char *)&vnp->disk, vcp->diskSize))
899 /* Don't take volume off line if the inumber is out of range
900 * or the inode table is full. */
902 Log("VnLoad: bad inumber %s\n",
903 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
906 } else if (n == -1 && errno == EIO) {
907 /* disk error; salvage */
908 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
910 /* vnode is not allocated */
912 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); read %d bytes, errno %d\n",
913 Vn_id(vnp), V_id(vp), V_name(vp), n, errno);
917 goto error_encountered_nolock;
922 /* Quick check to see that the data is reasonable */
923 if (vnp->disk.vnodeMagic != vcp->magic || vnp->disk.type == vNull) {
924 if (vnp->disk.type == vNull) {
928 struct vnodeIndex *index = &vp->vnodeIndex[class];
929 unsigned int bitNumber = vnodeIdToBitNumber(Vn_id(vnp));
930 unsigned int offset = bitNumber >> 3;
932 /* Test to see if vnode number is valid. */
933 if ((offset >= index->bitmapSize)
934 || ((*(index->bitmap + offset) & (1 << (bitNumber & 0x7)))
936 Log("VnLoad: Request for unallocated vnode %u, volume %u (%s) denied.\n", Vn_id(vnp), V_id(vp), V_name(vp));
940 Log("VnLoad: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
943 goto error_encountered;
946 IH_INIT(vnp->handle, V_device(vp), V_parentId(vp), VN_GET_INO(vnp));
947 VnUnlock(vnp, WRITE_LOCK);
948 #ifdef AFS_DEMAND_ATTACH_FS
949 VnChangeState_r(vnp, VN_STATE_ONLINE);
954 error_encountered_nolock:
956 FDH_REALLYCLOSE(fdP);
962 #ifdef AFS_DEMAND_ATTACH_FS
963 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
965 VForceOffline_r(vp, 0);
972 VInvalidateVnode_r(vnp);
973 VnUnlock(vnp, WRITE_LOCK);
977 * store a vnode to disk.
979 * @param[out] ec error code output
980 * @param[in] vp volume object pointer
981 * @param[in] vnp vnode object pointer
982 * @param[in] vcp vnode class info object pointer
983 * @param[in] class vnode class enumeration
985 * @pre VOL_LOCK held.
986 * caller holds refs to volume and vnode.
987 * DAFS: caller is responsible for performing state sanity checks.
989 * @post vnode state is stored to disk.
991 * @internal vnode package internal use only
/* Implementation notes:
 *  - DAFS remembers the vnode's current state and switches it to
 *    VN_STATE_STORE for the duration; the saved state is restored on success.
 *  - The vnode's slot offset in the class index file is computed with
 *    vnodeIndexOffset; the file is opened, positioned there, and
 *    vcp->diskSize bytes of vnp->disk are written.
 *  - A write that does not return diskSize closes the handle for real.
 *    A BAD_IGET result is only logged as a bad inumber (DAFS also marks the
 *    vnode VN_STATE_ERROR); any other short write is logged and then either
 *    jumps to the error exit (DAFS) or forces the volume offline (non-DAFS).
 *  - The DAFS error exit sets VN_STATE_ERROR and requests a salvage.
 * NOTE(review): the seek-failure Log call formats fdP with %x and offset
 * with %d; fdP is used as a handle pointer elsewhere in this function, so
 * the specifiers may not match the argument types -- confirm against the
 * declarations. */
994 VnStore(Error * ec, Volume * vp, Vnode * vnp,
995 struct VnodeClassInfo * vcp, VnodeClass class)
998 IHandle_t *ihP = vp->vnodeIndex[class].handle;
1000 #ifdef AFS_DEMAND_ATTACH_FS
1001 VnState vn_state_save;
1006 #ifdef AFS_DEMAND_ATTACH_FS
1007 vn_state_save = VnChangeState_r(vnp, VN_STATE_STORE);
1010 offset = vnodeIndexOffset(vcp, Vn_id(vnp));
1014 Log("VnStore: can't open index file!\n");
1015 goto error_encountered;
1017 if (FDH_SEEK(fdP, offset, SEEK_SET) < 0) {
1018 Log("VnStore: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
1019 fdP, offset, errno);
1020 goto error_encountered;
1023 code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
1024 if (code != vcp->diskSize) {
1025 /* Don't force volume offline if the inumber is out of
1026 * range or the inode table is full.
1028 FDH_REALLYCLOSE(fdP);
1029 if (code == BAD_IGET) {
1030 Log("VnStore: bad inumber %s\n",
1032 vp->vnodeIndex[class].handle->ih_ino));
1035 #ifdef AFS_DEMAND_ATTACH_FS
1036 VnChangeState_r(vnp, VN_STATE_ERROR);
1039 Log("VnStore: Couldn't write vnode %u, volume %u (%s) (error %d)\n", Vn_id(vnp), V_id(Vn_volume(vnp)), V_name(Vn_volume(vnp)), code);
1040 #ifdef AFS_DEMAND_ATTACH_FS
1041 goto error_encountered;
1044 VForceOffline_r(vp, 0);
1054 #ifdef AFS_DEMAND_ATTACH_FS
1055 VnChangeState_r(vnp, vn_state_save);
1060 #ifdef AFS_DEMAND_ATTACH_FS
1061 /* XXX instead of dumping core, let's try to request a salvage
1062 * and just fail the putvnode */
1066 VnChangeState_r(vnp, VN_STATE_ERROR);
1067 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1074 * get a handle to a vnode object.
1076 * @param[out] ec error code
1077 * @param[in] vp volume object
1078 * @param[in] vnodeNumber vnode id
1079 * @param[in] locktype type of lock to acquire
1081 * @return vnode object pointer
/* VGetVnode -- entry point that forwards its four arguments unchanged to
 * VGetVnode_r and keeps the result in retVal.
 * NOTE(review): VGetVnode_r documents VOL_LOCK as a precondition, so this
 * wrapper presumably acquires and releases it around the call -- confirm. */
1086 VGetVnode(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1087 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1090 retVal = VGetVnode_r(ec, vp, vnodeNumber, locktype);
1096 * get a handle to a vnode object.
1098 * @param[out] ec error code
1099 * @param[in] vp volume object
1100 * @param[in] vnodeNumber vnode id
1101 * @param[in] locktype type of lock to acquire
1103 * @return vnode object pointer
1105 * @internal vnode package internal use only
1107 * @pre VOL_LOCK held.
1108 * heavyweight ref held on volume object.
/* Implementation notes (in execution order):
 *  1. Vnode number 0 is special-cased up front.
 *  2. DAFS: wait for the volume to leave any exclusive state and check it
 *     for an error state.
 *  3. On a fileserver, a volume that is not in use yields its specialStatus
 *     (or VOFFLINE when none is set); the VBUSY + READ_LOCK combination is
 *     singled out so that reads of a busy volume do not fail.
 *  4. A WRITE_LOCK request on a non-writeable volume yields VREADONLY; a
 *     WRITE_LOCK on a fileserver puts the volume on the update list.
 *  5. Cache hit: take a reservation; DAFS waits for the vnode to leave
 *     exclusive state (read) or become quiescent (write) and backs out if
 *     the vnode is in an error state.
 *  6. Cache miss: recycle the LRU tail, clear its change flags, label it
 *     with the requested id, reserve it, attach it to the volume's vnode
 *     list and load it from disk with VnLoad; a load error cancels the
 *     reservation.
 *  7. Non-DAFS: acquire the requested lock with the deadlock-aware form.
 *  8. Re-check that the vnode was not removed meanwhile (type vNull or
 *     cacheCheck == 0); if so, unlock and drop the reservation.
 *  9. DAFS: record the access mode in the vnode state (the write path sets
 *     VN_STATE_EXCLUSIVE).
 * 10. On a fileserver, bump the volume usage statistics. */
1111 VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1112 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1113 register Vnode *vnp;
1115 struct VnodeClassInfo *vcp;
1119 if (vnodeNumber == 0) {
1124 VNLog(100, 1, vnodeNumber, 0, 0, 0);
1126 #ifdef AFS_DEMAND_ATTACH_FS
1128 * once a volume has entered an error state, don't permit
1129 * further operations to proceed
1130 * -- tkeiser 11/21/2007
1132 VWaitExclusiveState_r(vp);
1133 if (VIsErrorState(V_attachState(vp))) {
1134 /* XXX is VSALVAGING acceptable here? */
1140 if (programType == fileServer && !V_inUse(vp)) {
1141 *ec = (vp->specialStatus ? vp->specialStatus : VOFFLINE);
1143 /* If the volume is VBUSY (being cloned or dumped) and this is
1144 * a READ operation, then don't fail.
1146 if ((*ec != VBUSY) || (locktype != READ_LOCK)) {
1151 class = vnodeIdToClass(vnodeNumber);
1152 vcp = &VnodeClassInfo[class];
1153 if (locktype == WRITE_LOCK && !VolumeWriteable(vp)) {
1154 *ec = (bit32) VREADONLY;
1158 if (locktype == WRITE_LOCK && programType == fileServer) {
1159 VAddToVolumeUpdateList_r(ec, vp);
1167 /* See whether the vnode is in the cache. */
1168 vnp = VLookupVnode(vp, vnodeNumber);
1170 /* vnode is in cache */
1172 VNLog(101, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
1173 VnCreateReservation_r(vnp);
1175 #ifdef AFS_DEMAND_ATTACH_FS
1177 * this is the one DAFS case where we may run into contention.
1178 * here's the basic control flow:
1180 * if locktype is READ_LOCK:
1181 * wait until vnode is not exclusive
1182 * set to VN_STATE_READ
1183 * increment read count
1186 * wait until vnode is quiescent
1187 * set to VN_STATE_EXCLUSIVE
1190 if (locktype == READ_LOCK) {
1191 VnWaitExclusiveState_r(vnp);
1193 VnWaitQuiescent_r(vnp);
1196 if (VnIsErrorState(Vn_state(vnp))) {
1197 VnCancelReservation_r(vnp);
1201 #endif /* AFS_DEMAND_ATTACH_FS */
1203 /* vnode not cached */
1205 /* Not in cache; tentatively grab most distantly used one from the LRU
1208 vnp = VGetFreeVnode_r(vcp);
1211 vnp->changed_newTime = vnp->changed_oldTime = 0;
1213 Vn_id(vnp) = vnodeNumber;
1214 VnCreateReservation_r(vnp);
1215 AddToVVnList(vp, vnp);
1216 #ifdef AFS_DEMAND_ATTACH_FS
1221 * XXX for non-DAFS, there is a serious
1222 * race condition here:
1224 * two threads can race to load a vnode. the net
1225 * result is two struct Vnodes can be allocated
1226 * and hashed, which point to the same underlying
1227 * disk data store. conflicting vnode locks can
1228 * thus be held concurrently.
1230 * for non-DAFS to be safe, VOL_LOCK really shouldn't
1231 * be dropped in VnLoad. Of course, this would likely
1232 * lead to an unacceptable slow-down.
1235 VnLoad(ec, vp, vnp, vcp, class);
1237 VnCancelReservation_r(vnp);
1240 #ifndef AFS_DEMAND_ATTACH_FS
1245 * there is no possibility for contention. we "own" this vnode.
1251 * it is imperative that nothing drop vol lock between here
1252 * and the VnBeginRead/VnChangeState stanza below
1255 VnLock(vnp, locktype, VOL_LOCK_HELD, MIGHT_DEADLOCK);
1257 /* Check that the vnode hasn't been removed while we were obtaining
1259 VNLog(102, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
1260 if ((vnp->disk.type == vNull) || (Vn_cacheCheck(vnp) == 0)) {
1261 VnUnlock(vnp, locktype);
1262 VnCancelReservation_r(vnp);
1264 /* vnode is labelled correctly by now, so we don't have to invalidate it */
1268 #ifdef AFS_DEMAND_ATTACH_FS
1269 if (locktype == READ_LOCK) {
1272 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
1276 if (programType == fileServer)
1277 VBumpVolumeUsage_r(Vn_volume(vnp)); /* Hack; don't know where it should be
1278 * called from. Maybe VGetVolume */
/* TrustVnodeCacheEntry: global flag, initialised to 1.  Per the original
 * note below, clearing it is known to leave duplicate vnode versions on the
 * hash chains.  NOTE(review): no code in the visible part of this file
 * reads the flag -- check for users elsewhere before changing it. */
1283 int TrustVnodeCacheEntry = 1;
1284 /* This variable is bogus--when it's set to 0, the hash chains fill
1285 up with multiple versions of the same vnode. Should fix this!! */
/* VPutVnode -- entry point that forwards both arguments unchanged to
 * VPutVnode_r.
 * NOTE(review): VPutVnode_r documents VOL_LOCK as a precondition, so this
 * wrapper presumably acquires and releases it around the call -- confirm. */
1287 VPutVnode(Error * ec, register Vnode * vnp)
1290 VPutVnode_r(ec, vnp);
1295 * put back a handle to a vnode object.
1297 * @param[out] ec client error code
1298 * @param[in] vnp vnode object pointer
1300 * @pre VOL_LOCK held.
1301 * ref held on vnode.
1303 * @post ref dropped on vnode.
1304 * if vnode was modified or deleted, it is written out to disk
1305 * (assuming a write lock was held).
1307 * @internal volume package internal use only
/* Implementation notes:
 *  - Preconditions are asserted: non-zero refcount and a disk magic that
 *    matches the vnode's class.
 *  - "Write locked" means VN_STATE_EXCLUSIVE for DAFS, WriteLocked(&lock)
 *    otherwise.
 *  - Write-locked path: the calling thread/process must be vnp->writer,
 *    otherwise Abort.  If any of changed_oldTime / changed_newTime / delete
 *    is set:
 *      * delete: the disk image is zeroed (this frees the vnode on disk);
 *      * changed_newTime: serverModifyTime, the volume's update date and
 *        updateTime are set to "now", and the volume update counter is
 *        incremented while it is below MAXINT;
 *      * the vnode is written out with VnStore;
 *      * for a successful delete, the volume filecount is decremented
 *        (clamped so it does not stay negative) and the bitmap entry is
 *        freed -- only after the store, so the slot cannot be re-allocated
 *        before the zeroed image reaches disk.
 *    Both change flags are cleared afterwards and DAFS returns the vnode to
 *    VN_STATE_ONLINE.
 *  - Not write locked: having any change/delete flag set is fatal.
 *  - Finally the lock matching the observed mode is released and the
 *    reservation is dropped.
 * NOTE(review): in the VNLog(201, ...) argument both changed_newTime and
 * changed_oldTime are shifted left by 1, so the two flags land on the same
 * bit of the trace word; this only affects the debug log. */
1310 VPutVnode_r(Error * ec, register Vnode * vnp)
1314 struct VnodeClassInfo *vcp;
1317 assert(Vn_refcount(vnp) != 0);
1318 class = vnodeIdToClass(Vn_id(vnp));
1319 vcp = &VnodeClassInfo[class];
1320 assert(vnp->disk.vnodeMagic == vcp->magic);
1321 VNLog(200, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
1323 #ifdef AFS_DEMAND_ATTACH_FS
1324 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1326 writeLocked = WriteLocked(&vnp->lock);
1331 #ifdef AFS_PTHREAD_ENV
1332 pthread_t thisProcess = pthread_self();
1333 #else /* AFS_PTHREAD_ENV */
1334 PROCESS thisProcess;
1335 LWP_CurrentProcess(&thisProcess);
1336 #endif /* AFS_PTHREAD_ENV */
1337 VNLog(201, 2, (afs_int32) vnp,
1338 ((vnp->changed_newTime) << 1) | ((vnp->
1339 changed_oldTime) << 1) | vnp->
1341 if (thisProcess != vnp->writer)
1342 Abort("VPutVnode: Vnode at 0x%x locked by another process!\n",
1346 if (vnp->changed_oldTime || vnp->changed_newTime || vnp->delete) {
1347 Volume *vp = Vn_volume(vnp);
1348 afs_uint32 now = FT_ApproxTime();
1349 assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1352 /* No longer any directory entries for this vnode. Free the Vnode */
1353 memset(&vnp->disk, 0, sizeof(vnp->disk));
1354 /* delete flag turned off further down */
1355 VNLog(202, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
1356 } else if (vnp->changed_newTime) {
1357 vnp->disk.serverModifyTime = now;
1359 if (vnp->changed_newTime)
1361 V_updateDate(vp) = vp->updateTime = now;
1362 if(V_volUpCounter(vp)<MAXINT)
1363 V_volUpCounter(vp)++;
1366 /* The vnode has been changed. Write it out to disk */
1368 #ifdef AFS_DEMAND_ATTACH_FS
1369 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1371 assert(V_needsSalvaged(vp));
1375 VnStore(ec, vp, vnp, vcp, class);
1377 /* If the vnode is to be deleted, and we wrote the vnode out,
1378 * free its bitmap entry. Do after the vnode is written so we
1379 * don't allocate from bitmap before the vnode is written
1380 * (doing so could cause a "addled bitmap" message).
1382 if (vnp->delete && !*ec) {
1383 if (Vn_volume(vnp)->header->diskstuff.filecount-- < 1)
1384 Vn_volume(vnp)->header->diskstuff.filecount = 0;
1385 VFreeBitMapEntry_r(ec, &vp->vnodeIndex[class],
1386 vnodeIdToBitNumber(Vn_id(vnp)));
1390 vnp->changed_newTime = vnp->changed_oldTime = 0;
1392 #ifdef AFS_DEMAND_ATTACH_FS
1393 VnChangeState_r(vnp, VN_STATE_ONLINE);
1395 } else { /* Not write locked */
1396 if (vnp->changed_newTime || vnp->changed_oldTime || vnp->delete)
1398 ("VPutVnode: Change or delete flag for vnode 0x%x is set but vnode is not write locked!\n",
1400 #ifdef AFS_DEMAND_ATTACH_FS
1405 /* Do not look at disk portion of vnode after this point; it may
1406 * have been deleted above */
1408 VnUnlock(vnp, ((writeLocked) ? WRITE_LOCK : READ_LOCK));
1409 VnCancelReservation_r(vnp);
1413 * Make an attempt to convert a vnode lock from write to read.
1414 * Do nothing if the vnode isn't write locked or the vnode has
/* VVnodeWriteToRead -- entry point that forwards both arguments unchanged
 * to VVnodeWriteToRead_r and keeps the result in retVal.
 * NOTE(review): VVnodeWriteToRead_r documents VOL_LOCK as a precondition,
 * so this wrapper presumably acquires and releases it around the call --
 * confirm. */
1418 VVnodeWriteToRead(Error * ec, register Vnode * vnp)
1422 retVal = VVnodeWriteToRead_r(ec, vnp);
1428 * convert vnode handle from mutually exclusive to shared access.
1430 * @param[out] ec client error code
1431 * @param[in] vnp vnode object pointer
1433 * @return unspecified use (see out argument 'ec' for error code return)
1435 * @pre VOL_LOCK held.
1436 * ref held on vnode.
1437 * write lock held on vnode.
1439 * @post read lock held on vnode.
1440 * if vnode was modified, it has been written to disk.
1442 * @internal volume package internal use only
/* Implementation notes:
 *  - Preconditions are asserted: non-zero refcount and a disk magic that
 *    matches the vnode's class.
 *  - "Write locked" means VN_STATE_EXCLUSIVE for DAFS, WriteLocked(&lock)
 *    otherwise; the calling thread/process must be vnp->writer, otherwise
 *    Abort.
 *  - If changed_oldTime or changed_newTime is set: for changed_newTime the
 *    vnode's serverModifyTime and the volume's update date / updateTime are
 *    set to "now"; the vnode is then written out with VnStore.  Unlike
 *    VPutVnode_r, the delete flag is not part of this test.
 *  - Both change flags are cleared, DAFS returns the vnode to
 *    VN_STATE_ONLINE, and the lock is downgraded with
 *    ConvertWriteToReadLock.
 * NOTE(review): the Abort text names VPutVnode although it is raised from
 * VVnodeWriteToRead_r.  As in VPutVnode_r, the VNLog(301, ...) argument
 * shifts both change flags by the same amount (debug trace only). */
1445 VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
1449 struct VnodeClassInfo *vcp;
1450 #ifdef AFS_PTHREAD_ENV
1451 pthread_t thisProcess;
1452 #else /* AFS_PTHREAD_ENV */
1453 PROCESS thisProcess;
1454 #endif /* AFS_PTHREAD_ENV */
1457 assert(Vn_refcount(vnp) != 0);
1458 class = vnodeIdToClass(Vn_id(vnp));
1459 vcp = &VnodeClassInfo[class];
1460 assert(vnp->disk.vnodeMagic == vcp->magic);
1461 VNLog(300, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
1463 #ifdef AFS_DEMAND_ATTACH_FS
1464 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1466 writeLocked = WriteLocked(&vnp->lock);
1473 VNLog(301, 2, (afs_int32) vnp,
1474 ((vnp->changed_newTime) << 1) | ((vnp->
1475 changed_oldTime) << 1) | vnp->
1479 #ifdef AFS_PTHREAD_ENV
1480 thisProcess = pthread_self();
1481 #else /* AFS_PTHREAD_ENV */
1482 LWP_CurrentProcess(&thisProcess);
1483 #endif /* AFS_PTHREAD_ENV */
1484 if (thisProcess != vnp->writer)
1485 Abort("VPutVnode: Vnode at 0x%x locked by another process!\n",
1491 if (vnp->changed_oldTime || vnp->changed_newTime) {
1492 Volume *vp = Vn_volume(vnp);
1493 afs_uint32 now = FT_ApproxTime();
1494 assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1495 if (vnp->changed_newTime)
1496 vnp->disk.serverModifyTime = now;
1497 if (vnp->changed_newTime)
1498 V_updateDate(vp) = vp->updateTime = now;
1500 /* The inode has been changed. Write it out to disk */
1502 #ifdef AFS_DEMAND_ATTACH_FS
1503 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1505 assert(V_needsSalvaged(vp));
1509 VnStore(ec, vp, vnp, vcp, class);
1512 vnp->changed_newTime = vnp->changed_oldTime = 0;
1516 #ifdef AFS_DEMAND_ATTACH_FS
1517 VnChangeState_r(vnp, VN_STATE_ONLINE);
1520 ConvertWriteToReadLock(&vnp->lock);
/**
 * initial size of ihandle pointer vector, counted in IHandle_t pointer
 * slots (not bytes).
 *
 * @see VInvalidateVnodesByVolume_r
 */
#define IH_VEC_BASE_SIZE 256

/**
 * increment amount for growing ihandle pointer vector: the number of
 * pointer slots added on each growth step.
 *
 * @see VInvalidateVnodesByVolume_r
 */
#define IH_VEC_INCREMENT 256
1540 * Compile list of ihandles to be released/reallyclosed at a later time.
1542 * @param[in] vp volume object pointer
1543 * @param[out] vec_out vector of ihandle pointers to be released/reallyclosed
1544 * @param[out] vec_len_out number of valid elements in ihandle vector
1546 * @pre - VOL_LOCK is held
1547 * - volume is in appropriate exclusive state (e.g. VOL_STATE_VNODE_CLOSE,
1548 * VOL_STATE_VNODE_RELEASE)
1550 * @post - all vnodes on VVn list are invalidated
1551 * - ih_vec is populated with all valid ihandles
1553 * @return operation status
1555 * @retval ENOMEM out of memory
1557 * @todo we should handle out of memory conditions more gracefully.
1559 * @internal vnode package internal use only
1562 VInvalidateVnodesByVolume_r(Volume * vp,
1563 IHandle_t *** vec_out,
1564 size_t * vec_len_out)
1568 size_t i = 0, vec_len;
1569 IHandle_t **ih_vec, **ih_vec_new;
1571 #ifdef AFS_DEMAND_ATTACH_FS
1573 #endif /* AFS_DEMAND_ATTACH_FS */
1575 vec_len = IH_VEC_BASE_SIZE;
1576 ih_vec = malloc(sizeof(IHandle_t *) * vec_len);
1577 #ifdef AFS_DEMAND_ATTACH_FS
1584 * Traverse the volume's vnode list. Pull all the ihandles out into a
1585 * thread-private array for later asynchronous processing.
1588 for (queue_Scan(&vp->vnode_list, vnp, nvnp, Vnode)) {
1589 if (vnp->handle != NULL) {
1591 #ifdef AFS_DEMAND_ATTACH_FS
1594 vec_len += IH_VEC_INCREMENT;
1595 ih_vec_new = realloc(ih_vec, sizeof(IHandle_t *) * vec_len);
1596 #ifdef AFS_DEMAND_ATTACH_FS
1599 if (ih_vec_new == NULL) {
1603 ih_vec = ih_vec_new;
1604 #ifdef AFS_DEMAND_ATTACH_FS
1606 * Theoretically, the volume's VVn list should not change
1607 * because the volume is in an exclusive state. For the
1608 * sake of safety, we will restart the traversal from the
1609 * the beginning (which is not expensive because we're
1610 * deleting the items from the list as we go).
1612 goto restart_traversal;
1615 ih_vec[i++] = vnp->handle;
1618 DeleteFromVVnList(vnp);
1619 VInvalidateVnode_r(vnp);
1629 /* VCloseVnodeFiles - called when a volume is going off line. All open
1630 * files for vnodes in that volume are closed. This might be excessive,
1631 * since we may only be taking one volume of a volume group offline.
1634 VCloseVnodeFiles_r(Volume * vp)
1636 #ifdef AFS_DEMAND_ATTACH_FS
1637 VolState vol_state_save;
1639 IHandle_t ** ih_vec;
1642 #ifdef AFS_DEMAND_ATTACH_FS
1643 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_CLOSE);
1644 #endif /* AFS_DEMAND_ATTACH_FS */
1646 /* XXX need better error handling here */
1647 assert(VInvalidateVnodesByVolume_r(vp,
1653 * now we drop VOL_LOCK while we perform some potentially very
1654 * expensive operations in the background
1656 #ifdef AFS_DEMAND_ATTACH_FS
1660 for (i = 0; i < vec_len; i++) {
1661 IH_REALLYCLOSE(ih_vec[i]);
1666 #ifdef AFS_DEMAND_ATTACH_FS
1668 VChangeState_r(vp, vol_state_save);
1669 #endif /* AFS_DEMAND_ATTACH_FS */
1674 * shut down all vnode cache state for a given volume.
1676 * @param[in] vp volume object pointer
1678 * @pre VOL_LOCK is held
1680 * @post all file descriptors closed.
1681 * all inode handles released.
1682 * all vnode cache objects disassociated from volume.
1684 * @note for DAFS, these operations are performed outside the vol glock under
1685 * volume exclusive state VOL_STATE_VNODE_RELEASE. Please further note
1686 * that it would be a bug to acquire and release a volume reservation
1687 * during this exclusive operation. This is due to the fact that we are
1688 * generally called during the refcount 1->0 transition.
1690 * @todo we should handle failures in VInvalidateVnodesByVolume_r more
1693 * @see VInvalidateVnodesByVolume_r
1695 * @internal this routine is internal to the volume package
1698 VReleaseVnodeFiles_r(Volume * vp)
1700 #ifdef AFS_DEMAND_ATTACH_FS
1701 VolState vol_state_save;
1703 IHandle_t ** ih_vec;
1706 #ifdef AFS_DEMAND_ATTACH_FS
1707 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_RELEASE);
1708 #endif /* AFS_DEMAND_ATTACH_FS */
1710 /* XXX need better error handling here */
1711 assert(VInvalidateVnodesByVolume_r(vp,
1717 * now we drop VOL_LOCK while we perform some potentially very
1718 * expensive operations in the background
1720 #ifdef AFS_DEMAND_ATTACH_FS
1724 for (i = 0; i < vec_len; i++) {
1725 IH_RELEASE(ih_vec[i]);
1730 #ifdef AFS_DEMAND_ATTACH_FS
1732 VChangeState_r(vp, vol_state_save);
1733 #endif /* AFS_DEMAND_ATTACH_FS */