2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
15 Institution: The Information Technology Center, Carnegie-Mellon University
18 #include <afsconfig.h>
19 #include <afs/param.h>
20 #define MAXINT (~(1<<((sizeof(int)*8)-1)))
27 #include <afs/afs_assert.h>
30 #include "rx/rx_queue.h"
31 #include <afs/afsint.h>
33 #include <afs/errors.h>
36 #include <afs/afssyscalls.h>
40 #include "volume_inline.h"
41 #include "vnode_inline.h"
42 #include "partition.h"
45 #if defined(AFS_SGI_ENV)
46 #include "sys/types.h"
58 #include <sys/fcntl.h>
61 #endif /* AFS_NT40_ENV */
68 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
70 void VNLog(afs_int32 aop, afs_int32 anparms, ... );
77 #define BAD_IGET -1000
79 /* There are two separate vnode queue types defined here:
80 * Each hash conflict chain -- is singly linked, with a single head
81 * pointer. New entries are added at the beginning. Old
82 * entries are removed by linear search, which generally
83  *              only occurs after a disk read.
84 * LRU chain -- is doubly linked, single head pointer.
85 * Entries are added at the head, reclaimed from the tail,
86 * or removed from anywhere in the queue.
90 /* Vnode hash table. Find hash chain by taking lower bits of
91 * (volume_hash_offset + vnode).
92 * This distributes the root inodes of the volumes over the
93 * hash table entries and also distributes the vnodes of
94 * volumes reasonably fairly. The volume_hash_offset field
95 * for each volume is established as the volume comes on line
96 * by using the VOLUME_HASH_OFFSET macro. This distributes the
97 * volumes fairly among the cache entries, both when servicing
98 * a small number of volumes and when servicing a large number.
101 /* logging stuff for finding bugs */
102 #define THELOGSIZE 5120
103 static afs_int32 theLog[THELOGSIZE];
104 static afs_int32 vnLogPtr = 0;
/* Record a debug event in the in-memory ring buffer theLog[].
 * The header word packs the opcode in the high 16 bits and the
 * (clamped) argument count in the low 16 bits; the variadic
 * afs_int32 arguments follow.  The buffer index wraps at THELOGSIZE. */
106 VNLog(afs_int32 aop, afs_int32 anparms, ... )
111     va_start(ap, anparms);
114 	anparms = 4;		/* do bounds checking */
116     temp = (aop << 16) | anparms;	/* header word: opcode | arg count */
117     theLog[vnLogPtr++] = temp;
118     if (vnLogPtr >= THELOGSIZE)
120     for (temp = 0; temp < anparms; temp++) {
121 	theLog[vnLogPtr++] = va_arg(ap, afs_int32);
122 	if (vnLogPtr >= THELOGSIZE)
128 /* VolumeHashOffset -- returns a new value to be stored in the
129 * volumeHashOffset of a Volume structure. Called when a
130 * volume is initialized. Sets the volumeHashOffset so that
131 * vnode cache entries are distributed reasonably between
132 * volumes (the root vnodes of the volumes will hash to
133 * different values, and spacing is maintained between volumes
134 * when there are not many volumes represented), and spread
135 * equally amongst vnodes within a single volume.
/* Return a fresh volumeHashOffset for a volume coming online.
 * hashindex[] is a bit-reversal table, so consecutive calls yield
 * offsets spread evenly over the hash space rather than clustered;
 * the high bits come from the running counter once the table wraps. */
138 VolumeHashOffset_r(void)
140     static int nextVolumeHashOffset = 0;
141     /* hashindex Must be power of two in size */
143 # define hashMask ((1<<hashShift)-1)
144     static byte hashindex[1 << hashShift] =
145 	{ 0, 128, 64, 192, 32, 160, 96, 224 };
147     offset = hashindex[nextVolumeHashOffset & hashMask]
148 	+ (nextVolumeHashOffset >> hashShift);
149     nextVolumeHashOffset++;
153 /* Change hashindex (above) if you change this constant */
154 #define VNODE_HASH_TABLE_SIZE 256
155 private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
156 #define VNODE_HASH(volumeptr,vnodenumber)\
157 ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
161 * add a vnode to the volume's vnode list.
163 * @param[in] vp volume object pointer
164 * @param[in] vnp vnode object pointer
166 * @note for DAFS, it may seem like we should be acquiring a lightweight ref
167 * on vp, but this would actually break things. Right now, this is ok
168  *       because we destroy all vnode cache contents during volume
173 * @internal volume package internal use only
/* Append vnp to vp's per-volume vnode list (no-op if already queued),
 * stamping the vnode with the volume's current cacheCheck generation
 * and marking it VN_ON_VVN. */
176 AddToVVnList(Volume * vp, Vnode * vnp)
178     if (queue_IsOnQueue(vnp))
182     Vn_cacheCheck(vnp) = vp->cacheCheck;	/* record attach generation */
183     queue_Append(&vp->vnode_list, vnp);
184     Vn_stateFlags(vnp) |= VN_ON_VVN;
188 * delete a vnode from the volume's vnode list.
192 * @internal volume package internal use only
/* Disassociate vnp from its volume and remove it from the volume's
 * vnode list, clearing the VN_ON_VVN state flag. */
195 DeleteFromVVnList(Vnode * vnp)
197     Vn_volume(vnp) = NULL;
199     if (!queue_IsOnQueue(vnp))
203     Vn_stateFlags(vnp) &= ~(VN_ON_VVN);
207 * add a vnode to the end of the lru.
209 * @param[in] vcp vnode class info object pointer
210 * @param[in] vnp vnode object pointer
212 * @internal vnode package internal use only
/* Insert vnp into the class's circular LRU list (no-op if already on it).
 * Normally added at the head; a just-deleted vnode instead ends up at
 * the tail position so its slot is reused first.  Sets VN_ON_LRU. */
215 AddToVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
217     if (Vn_stateFlags(vnp) & VN_ON_LRU) {
221     /* Add it to the circular LRU list */
222     if (vcp->lruHead == NULL)
223 	Abort("VPutVnode: vcp->lruHead==NULL");
225     vnp->lruNext = vcp->lruHead;
226     vnp->lruPrev = vcp->lruHead->lruPrev;
227     vcp->lruHead->lruPrev = vnp;
228     vnp->lruPrev->lruNext = vnp;
232     /* If the vnode was just deleted, put it at the end of the chain so it
233      * will be reused immediately */
235 	vcp->lruHead = vnp->lruNext;	/* advancing head makes vnp the tail */
237     Vn_stateFlags(vnp) |= VN_ON_LRU;
241 * delete a vnode from the lru.
243 * @param[in] vcp vnode class info object pointer
244 * @param[in] vnp vnode object pointer
246 * @internal vnode package internal use only
/* Unlink vnp from the class's circular LRU list (no-op if not on it).
 * If vnp is the head, the head is advanced first; the sanity check
 * catches a corrupted (single-element-loop or empty) chain. */
249 DeleteFromVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
251     if (!(Vn_stateFlags(vnp) & VN_ON_LRU)) {
255     if (vnp == vcp->lruHead)
256 	vcp->lruHead = vcp->lruHead->lruNext;
258     if ((vnp == vcp->lruHead) ||
259 	(vcp->lruHead == NULL))
260 	Abort("DeleteFromVnLRU: lru chain addled!\n");
262     vnp->lruPrev->lruNext = vnp->lruNext;
263     vnp->lruNext->lruPrev = vnp->lruPrev;
265     Vn_stateFlags(vnp) &= ~(VN_ON_LRU);
269 * add a vnode to the vnode hash table.
271 * @param[in] vnp vnode object pointer
275 * @post vnode on hash
277 * @internal vnode package internal use only
/* Push vnp onto the front of its hash chain (no-op if already hashed).
 * The chain index is derived from the owning volume's hash offset and
 * the vnode id; the index is cached in vnp->hashIndex for removal. */
280 AddToVnHash(Vnode * vnp)
282     unsigned int newHash;
284     if (!(Vn_stateFlags(vnp) & VN_ON_HASH)) {
285 	newHash = VNODE_HASH(Vn_volume(vnp), Vn_id(vnp));
286 	vnp->hashNext = VnodeHashTable[newHash];
287 	VnodeHashTable[newHash] = vnp;
288 	vnp->hashIndex = newHash;
290 	Vn_stateFlags(vnp) |= VN_ON_HASH;
295 * delete a vnode from the vnode hash table.
302 * @post vnode removed from hash
304 * @internal vnode package internal use only
/* Remove vnp from its singly-linked hash chain (no-op if not hashed).
 * Fast path handles vnp at the chain head; otherwise a linear search
 * finds the predecessor.  Clears VN_ON_HASH. */
307 DeleteFromVnHash(Vnode * vnp)
311     if (Vn_stateFlags(vnp) & VN_ON_HASH) {
312 	tvnp = VnodeHashTable[vnp->hashIndex];
314 	    VnodeHashTable[vnp->hashIndex] = vnp->hashNext;
316 	    while (tvnp && tvnp->hashNext != vnp)
317 		tvnp = tvnp->hashNext;
319 		tvnp->hashNext = vnp->hashNext;
322 	vnp->hashNext = NULL;
324 	Vn_stateFlags(vnp) &= ~(VN_ON_HASH);
330 * invalidate a vnode cache entry.
332 * @param[in] avnode vnode object pointer
336 * @post vnode metadata invalidated.
337 * vnode removed from hash table.
338 * DAFS: vnode state set to VN_STATE_INVALID.
340 * @internal vnode package internal use only
/* Invalidate a cached vnode: clear the change/delete flags so it is
 * never flushed, zero cacheCheck so lookups can no longer match it,
 * and remove it from the hash table.  DAFS additionally transitions
 * the vnode to VN_STATE_INVALID. */
343 VInvalidateVnode_r(struct Vnode *avnode)
345     avnode->changed_newTime = 0;	/* don't let it get flushed out again */
346     avnode->changed_oldTime = 0;
347     avnode->delete = 0;	/* it isn't deleted, really */
348     avnode->cacheCheck = 0;	/* invalid: prevents future vnode searches from working */
349     DeleteFromVnHash(avnode);
350 #ifdef AFS_DEMAND_ATTACH_FS
351     VnChangeState_r(avnode, VN_STATE_INVALID);
357 * initialize vnode cache for a given vnode class.
359 * @param[in] class vnode class
360 * @param[in] nVnodes size of cache
362 * @post vnode cache allocated and initialized
364 * @internal volume package internal use only
366 * @note generally called by VInitVolumePackage_r
368 * @see VInitVolumePackage_r
/* Allocate and initialize the vnode cache for one vnode class.
 * Selects per-class resident/disk sizes and magic, calloc()s nVnodes
 * contiguous Vnode slots, initializes each slot's locks/state, and
 * links every slot into the class's circular LRU list. */
371 VInitVnodes(VnodeClass class, int nVnodes)
374     struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
376     vcp->allocs = vcp->gets = vcp->reads = vcp->writes = 0;
377     vcp->cacheSize = nVnodes;
380 	osi_Assert(CHECKSIZE_SMALLVNODE);
382 	vcp->residentSize = SIZEOF_SMALLVNODE;
383 	vcp->diskSize = SIZEOF_SMALLDISKVNODE;
384 	vcp->magic = SMALLVNODEMAGIC;
388 	vcp->residentSize = SIZEOF_LARGEVNODE;
389 	vcp->diskSize = SIZEOF_LARGEDISKVNODE;
390 	vcp->magic = LARGEVNODEMAGIC;
394 	int s = vcp->diskSize - 1;
404     va = (byte *) calloc(nVnodes, vcp->residentSize);	/* one zeroed slab for all slots */
405     osi_Assert(va != NULL);
407 	Vnode *vnp = (Vnode *) va;
408 	Vn_refcount(vnp) = 0;	/* no context switches */
409 	Vn_stateFlags(vnp) |= VN_ON_LRU;
410 #ifdef AFS_DEMAND_ATTACH_FS
411 	CV_INIT(&Vn_stateCV(vnp), "vnode state", CV_DEFAULT, 0);
412 	Vn_state(vnp) = VN_STATE_INVALID;
414 #else /* !AFS_DEMAND_ATTACH_FS */
415 	Lock_Init(&vnp->lock);
416 #endif /* !AFS_DEMAND_ATTACH_FS */
417 	vnp->changed_oldTime = 0;
418 	vnp->changed_newTime = 0;
419 	Vn_volume(vnp) = NULL;
420 	Vn_cacheCheck(vnp) = 0;
421 	vnp->delete = Vn_id(vnp) = 0;
422 #ifdef AFS_PTHREAD_ENV
423 	vnp->writer = (pthread_t) 0;
424 #else /* AFS_PTHREAD_ENV */
425 	vnp->writer = (PROCESS) 0;
426 #endif /* AFS_PTHREAD_ENV */
	/* splice this slot into the class's circular LRU list */
430 	if (vcp->lruHead == NULL)
431 	    vcp->lruHead = vnp->lruNext = vnp->lruPrev = vnp;
433 	    vnp->lruNext = vcp->lruHead;
434 	    vnp->lruPrev = vcp->lruHead->lruPrev;
435 	    vcp->lruHead->lruPrev = vnp;
436 	    vnp->lruPrev->lruNext = vnp;
439 	va += vcp->residentSize;	/* advance to the next slot */
446 * allocate an unused vnode from the lru chain.
448 * @param[in] vcp vnode class info object pointer
450 * @pre VOL_LOCK is held
452 * @post vnode object is removed from lru, and vnode hash table.
453 * vnode is disassociated from volume object.
454 * state is set to VN_STATE_INVALID.
455 * inode handle is released.
457 * @note we traverse backwards along the lru circlist. It shouldn't
458 * be necessary to specify that nUsers == 0 since if it is in the list,
459 * nUsers should be 0. Things shouldn't be in lruq unless no one is
462 * @warning DAFS: VOL_LOCK is dropped while doing inode handle release
464 * @return vnode object pointer
/* Reclaim the least-recently-used vnode slot from the class's LRU.
 * Aborts if the tail entry is somehow still in use (refs held, locked,
 * or in an exclusive DAFS state).  The slot is fully detached (LRU,
 * hash, volume list) before VOL_LOCK can be dropped, then its inode
 * handle is released outside the lock; DAFS marks the transition with
 * VN_STATE_RELEASING / VN_STATE_INVALID. */
467 VGetFreeVnode_r(struct VnodeClassInfo * vcp)
471     vnp = vcp->lruHead->lruPrev;	/* tail of circular LRU = least recently used */
472 #ifdef AFS_DEMAND_ATTACH_FS
473     if (Vn_refcount(vnp) != 0 || VnIsExclusiveState(Vn_state(vnp)) ||
474 	Vn_readers(vnp) != 0)
475 	Abort("VGetFreeVnode_r: in-use vnode in lruq");
477     if (Vn_refcount(vnp) != 0 || CheckLock(&vnp->lock))
478 	Abort("VGetFreeVnode_r: locked vnode in lruq");
480     VNLog(1, 2, Vn_id(vnp), (intptr_t)vnp, 0, 0);
483      * it's going to be overwritten soon enough.
484      * remove from LRU, delete hash entry, and
485      * disassociate from old parent volume before
486      * we have a chance to drop the vol glock
488     DeleteFromVnLRU(vcp, vnp);
489     DeleteFromVnHash(vnp);
490     if (Vn_volume(vnp)) {
491 	DeleteFromVVnList(vnp);
494     /* drop the file descriptor */
496 #ifdef AFS_DEMAND_ATTACH_FS
497 	VnChangeState_r(vnp, VN_STATE_RELEASING);
500 	/* release is, potentially, a highly latent operation due to a couple
502 	 *   - ihandle package lock contention
503 	 *   - closing file descriptor(s) associated with ih
505 	 * Hence, we perform outside of the volume package lock in order to
506 	 * reduce the probability of contention.
508 	IH_RELEASE(vnp->handle);
509 #ifdef AFS_DEMAND_ATTACH_FS
514 #ifdef AFS_DEMAND_ATTACH_FS
515     VnChangeState_r(vnp, VN_STATE_INVALID);
523 * lookup a vnode in the vnode cache hash table.
525 * @param[in] vp pointer to volume object
526 * @param[in] vnodeId vnode id
530 * @post matching vnode object or NULL is returned
532 * @return vnode object pointer
533 * @retval NULL no matching vnode object was found in the cache
535 * @internal vnode package internal use only
537 * @note this symbol is exported strictly for fssync debug protocol use
/* Search the hash chain for a vnode matching (volume, vnodeId) whose
 * cacheCheck matches the volume's current attach generation; stale
 * entries from a previous attach never match.  Returns NULL on miss. */
540 VLookupVnode(Volume * vp, VnodeId vnodeId)
543     unsigned int newHash;
545     newHash = VNODE_HASH(vp, vnodeId);
546     for (vnp = VnodeHashTable[newHash];
548 	  ((Vn_id(vnp) != vnodeId) ||
549 	   (Vn_volume(vnp) != vp) ||
550 	   (vp->cacheCheck != Vn_cacheCheck(vnp))));
551 	 vnp = vnp->hashNext);
/* Public wrapper: take VOL_LOCK and delegate to VAllocVnode_r(). */
558 VAllocVnode(Error * ec, Volume * vp, VnodeType type)
562     retVal = VAllocVnode_r(ec, vp, type);
568 * allocate a new vnode.
570 * @param[out] ec error code return
571 * @param[in] vp volume object pointer
572 * @param[in] type desired vnode type
574 * @return vnode object pointer
576 * @pre VOL_LOCK held;
577 * heavyweight ref held on vp
579 * @post vnode allocated and returned
/* Allocate a new vnode in volume vp of the given type.
 * Flow: verify volume state (DAFS: wait out exclusive states, refuse
 * error states), check writeability, draw a new uniquifier, claim a
 * bitmap slot to pick the vnode number, then either reuse a cached
 * Vnode slot for that number or reclaim one from the LRU.  The index
 * file region is sanity-checked (must read back as vNull) or grown in
 * 16KB zeroed increments.  On error the bitmap slot is freed, the
 * vnode invalidated, and (DAFS) a salvage is requested. */
582 VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
587     struct VnodeClassInfo *vcp;
590 #ifdef AFS_DEMAND_ATTACH_FS
591     VolState vol_state_save;
596 #ifdef AFS_DEMAND_ATTACH_FS
598      * once a volume has entered an error state, don't permit
599      * further operations to proceed
600      *  -- tkeiser 11/21/2007
602     VWaitExclusiveState_r(vp);
603     if (VIsErrorState(V_attachState(vp))) {
604 	/* XXX is VSALVAGING acceptable here? */
610     if (programType == fileServer && !V_inUse(vp)) {
611 	if (vp->specialStatus) {
612 	    *ec = vp->specialStatus;
618     class = vnodeTypeToClass(type);
619     vcp = &VnodeClassInfo[class];
621     if (!VolumeWriteable(vp)) {
622 	*ec = (bit32) VREADONLY;
626     unique = vp->nextVnodeUnique++;
628 	unique = vp->nextVnodeUnique++;
630     if (vp->nextVnodeUnique > V_uniquifier(vp)) {
	/* uniquifier pool exhausted: persist a bumped uniquifier */
631 	VUpdateVolume_r(ec, vp, 0);
636     if (programType == fileServer) {
637 	VAddToVolumeUpdateList_r(ec, vp);
642     /* Find a slot in the bit map */
643     bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
644 				    VOL_ALLOC_BITMAP_WAIT);
647     vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
651      * at this point we should be assured that V_attachState(vp) is non-exclusive
655     VNLog(2, 1, vnodeNumber, 0, 0, 0);
656     /* Prepare to move it to the new hash chain */
657     vnp = VLookupVnode(vp, vnodeNumber);
659 	/* slot already exists. May even not be in lruq (consider store file locking a file being deleted)
660 	 * so we may have to wait for it below */
661 	VNLog(3, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
663 	VnCreateReservation_r(vnp);
664 	if (Vn_refcount(vnp) == 1) {
665 	    /* we're the only user */
666 	    /* This won't block */
667 	    VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
669 	    /* other users present; follow locking hierarchy */
670 	    VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, MIGHT_DEADLOCK);
672 #ifdef AFS_DEMAND_ATTACH_FS
675 	 * vnode was cached, wait for any existing exclusive ops to finish.
676 	 * once we have reacquired the lock, re-verify volume state.
678 	 * note: any vnode error state is related to the old vnode; disregard.
680 	VnWaitQuiescent_r(vnp);
681 	if (VIsErrorState(V_attachState(vp))) {
682 	    VnUnlock(vnp, WRITE_LOCK);
683 	    VnCancelReservation_r(vnp);
690 	 * verify state of the world hasn't changed
692 	 * (technically, this should never happen because cachecheck
693 	 * is only updated during a volume attach, which should not
694 	 * happen when refs are held)
696 	if (Vn_volume(vnp)->cacheCheck != Vn_cacheCheck(vnp)) {
697 	    VnUnlock(vnp, WRITE_LOCK);
698 	    VnCancelReservation_r(vnp);
704 	/* no such vnode in the cache */
706 	vnp = VGetFreeVnode_r(vcp);
708 	/* Initialize the header fields so no one allocates another
709 	 * vnode with the same number */
710 	Vn_id(vnp) = vnodeNumber;
711 	VnCreateReservation_r(vnp);
712 	AddToVVnList(vp, vnp);
713 #ifdef AFS_DEMAND_ATTACH_FS
717 	/* This will never block (guaranteed by check in VGetFreeVnode_r() */
718 	VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
720 #ifdef AFS_DEMAND_ATTACH_FS
721 	VnChangeState_r(vnp, VN_STATE_ALLOC);
724 	/* Sanity check:  is this vnode really not in use? */
727 	    IHandle_t *ihP = vp->vnodeIndex[class].handle;
729 	    afs_foff_t off = vnodeIndexOffset(vcp, vnodeNumber);
732 	    /* XXX we have a potential race here if two threads
733 	     * allocate new vnodes at the same time, and they
734 	     * both decide it's time to extend the index
737 #ifdef AFS_DEMAND_ATTACH_FS
739 	     * this race has been eliminated for the DAFS case
740 	     * using exclusive state VOL_STATE_VNODE_ALLOC
742 	     * if this becomes a bottleneck, there are ways to
743 	     * improve parallelism for this code path
744 	     *   -- tkeiser 11/28/2007
746 	    VCreateReservation_r(vp);
747 	    VWaitExclusiveState_r(vp);
748 	    vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_ALLOC);
754 		Log("VAllocVnode: can't open index file!\n");
756 		goto error_encountered;
758 	    if ((size = FDH_SIZE(fdP)) < 0) {
759 		Log("VAllocVnode: can't stat index file!\n");
761 		goto error_encountered;
763 	    if (off + vcp->diskSize <= size) {
764 		if (FDH_PREAD(fdP, &vnp->disk, vcp->diskSize, off) != vcp->diskSize) {
765 		    Log("VAllocVnode: can't read index file!\n");
767 		    goto error_encountered;
769 		if (vnp->disk.type != vNull) {
			/* bitmap said free, but index entry is in use */
770 		    Log("VAllocVnode: addled bitmap or index!\n");
772 		    goto error_encountered;
775 		/* growing file - grow in a reasonable increment */
776 		char *buf = (char *)malloc(16 * 1024);
778 		    Log("VAllocVnode: can't grow vnode index: out of memory\n");
780 		    goto error_encountered;
782 		memset(buf, 0, 16 * 1024);
783 		if ((FDH_PWRITE(fdP, buf, 16 * 1024, off)) != 16 * 1024) {
784 		    Log("VAllocVnode: can't grow vnode index: write failed\n");
787 		    goto error_encountered;
794 #ifdef AFS_DEMAND_ATTACH_FS
795 	    VChangeState_r(vp, vol_state_save);	/* restore pre-alloc volume state */
796 	    VCancelReservation_r(vp);
	    /* error path:
803 	     *   close the file handle
805 	     *   invalidate the vnode
806 	     *   free up the bitmap entry (although salvager should take care of it)
808 	     *   drop vnode lock and refs
813 	    VFreeBitMapEntry_r(&tmp, &vp->vnodeIndex[class], bitNumber);
814 	    VInvalidateVnode_r(vnp);
815 	    VnUnlock(vnp, WRITE_LOCK);
816 	    VnCancelReservation_r(vnp);
817 #ifdef AFS_DEMAND_ATTACH_FS
818 	    VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
819 	    VCancelReservation_r(vp);
821 	    VForceOffline_r(vp, 0);
826     VNLog(4, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
827 #ifndef AFS_DEMAND_ATTACH_FS
832     VNLog(5, 1, (intptr_t)vnp, 0, 0, 0);
	/* initialize the new on-disk vnode image */
833     memset(&vnp->disk, 0, sizeof(vnp->disk));
834     vnp->changed_newTime = 0;	/* set this bit when vnode is updated */
835     vnp->changed_oldTime = 0;	/* set this on CopyOnWrite. */
837     vnp->disk.vnodeMagic = vcp->magic;
838     vnp->disk.type = type;
839     vnp->disk.uniquifier = unique;
842     vp->header->diskstuff.filecount++;
843 #ifdef AFS_DEMAND_ATTACH_FS
844     VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
850 * load a vnode from disk.
852 * @param[out] ec client error code return
853 * @param[in] vp volume object pointer
854 * @param[in] vnp vnode object pointer
855 * @param[in] vcp vnode class info object pointer
856 * @param[in] class vnode class enumeration
858 * @pre vnode is registered in appropriate data structures;
859 * caller holds a ref on vnode; VOL_LOCK is held
861 * @post vnode data is loaded from disk.
862 * vnode state is set to VN_STATE_ONLINE.
863 * on failure, vnode is invalidated.
865 * @internal vnode package internal use only
/* Read a vnode's on-disk image from the class index file into vnp.
 * Write-locks the vnode for the duration of the load (DAFS: state
 * VN_STATE_LOAD), distinguishes transient read failures (bad inumber,
 * unallocated vnode) from disk errors that require salvage, validates
 * the magic/type against the class and the allocation bitmap, and on
 * success initializes the inode handle and (DAFS) moves the vnode to
 * VN_STATE_ONLINE.  On failure the vnode is invalidated. */
868 VnLoad(Error * ec, Volume * vp, Vnode * vnp,
869        struct VnodeClassInfo * vcp, VnodeClass class)
871     /* vnode not cached */
875     IHandle_t *ihP = vp->vnodeIndex[class].handle;
882 #ifdef AFS_DEMAND_ATTACH_FS
883     VnChangeState_r(vnp, VN_STATE_LOAD);
886     /* This will never block */
887     VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
892 	Log("VnLoad: can't open index dev=%u, i=%s\n", vp->device,
893 	    PrintInode(stmp, vp->vnodeIndex[class].handle->ih_ino));
895 	goto error_encountered_nolock;
896     } else if ((nBytes = FDH_PREAD(fdP, (char *)&vnp->disk, vcp->diskSize, vnodeIndexOffset(vcp, Vn_id(vnp))))
898 	/* Don't take volume off line if the inumber is out of range
899 	 * or the inode table is full. */
900 	if (nBytes == BAD_IGET) {
901 	    Log("VnLoad: bad inumber %s\n",
902 		PrintInode(stmp, vp->vnodeIndex[class].handle->ih_ino));
905 	} else if (nBytes == -1 && errno == EIO) {
906 	    /* disk error; salvage */
907 	    Log("VnLoad: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
909 	    /* vnode is not allocated */
911 		Log("VnLoad: Couldn't read vnode %u, volume %u (%s); read %d bytes, errno %d\n",
912 		    Vn_id(vnp), V_id(vp), V_name(vp), (int)nBytes, errno);
916 	goto error_encountered_nolock;
921     /* Quick check to see that the data is reasonable */
922     if (vnp->disk.vnodeMagic != vcp->magic || vnp->disk.type == vNull) {
923 	if (vnp->disk.type == vNull) {
927 	    struct vnodeIndex *index = &vp->vnodeIndex[class];
928 	    unsigned int bitNumber = vnodeIdToBitNumber(Vn_id(vnp));
929 	    unsigned int offset = bitNumber >> 3;	/* byte containing this vnode's bit */
931 	    /* Test to see if vnode number is valid. */
932 	    if ((offset >= index->bitmapSize)
933 		|| ((*(index->bitmap + offset) & (1 << (bitNumber & 0x7)))
935 		Log("VnLoad: Request for unallocated vnode %u, volume %u (%s) denied.\n", Vn_id(vnp), V_id(vp), V_name(vp));
939 	    Log("VnLoad: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
942 	goto error_encountered;
945     IH_INIT(vnp->handle, V_device(vp), V_parentId(vp), VN_GET_INO(vnp));
946     VnUnlock(vnp, WRITE_LOCK);
947 #ifdef AFS_DEMAND_ATTACH_FS
948     VnChangeState_r(vnp, VN_STATE_ONLINE);
953  error_encountered_nolock:
955 	FDH_REALLYCLOSE(fdP);
961 #ifdef AFS_DEMAND_ATTACH_FS
962 	VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
964 	VForceOffline_r(vp, 0);
971     VInvalidateVnode_r(vnp);
972     VnUnlock(vnp, WRITE_LOCK);
976 * store a vnode to disk.
978 * @param[out] ec error code output
979 * @param[in] vp volume object pointer
980 * @param[in] vnp vnode object pointer
981 * @param[in] vcp vnode class info object pointer
982 * @param[in] class vnode class enumeration
984 * @pre VOL_LOCK held.
985 * caller holds refs to volume and vnode.
986 * DAFS: caller is responsible for performing state sanity checks.
988 * @post vnode state is stored to disk.
990 * @internal vnode package internal use only
/* Write vnp's on-disk image back to the class index file.
 * DAFS: marks the vnode VN_STATE_STORE for the duration and restores
 * the saved state on success.  A short write is fatal unless it is the
 * out-of-range-inumber case (BAD_IGET); otherwise the volume is forced
 * offline (non-DAFS) or a salvage is requested (DAFS). */
993 VnStore(Error * ec, Volume * vp, Vnode * vnp,
994 	struct VnodeClassInfo * vcp, VnodeClass class)
998     IHandle_t *ihP = vp->vnodeIndex[class].handle;
1001 #ifdef AFS_DEMAND_ATTACH_FS
1002     VnState vn_state_save;
1007 #ifdef AFS_DEMAND_ATTACH_FS
1008     vn_state_save = VnChangeState_r(vnp, VN_STATE_STORE);
1011     offset = vnodeIndexOffset(vcp, Vn_id(vnp));
1015 	Log("VnStore: can't open index file!\n");
1016 	goto error_encountered;
1018     nBytes = FDH_PWRITE(fdP, &vnp->disk, vcp->diskSize, offset);
1019     if (nBytes != vcp->diskSize) {
1020 	/* Don't force volume offline if the inumber is out of
1021 	 * range or the inode table is full.
1023 	FDH_REALLYCLOSE(fdP);
1024 	if (nBytes == BAD_IGET) {
1025 	    Log("VnStore: bad inumber %s\n",
1027 			   vp->vnodeIndex[class].handle->ih_ino));
1030 #ifdef AFS_DEMAND_ATTACH_FS
1031 	    VnChangeState_r(vnp, VN_STATE_ERROR);
1034 	    Log("VnStore: Couldn't write vnode %u, volume %u (%s) (error %d)\n", Vn_id(vnp), V_id(Vn_volume(vnp)), V_name(Vn_volume(vnp)), (int)nBytes);
1035 #ifdef AFS_DEMAND_ATTACH_FS
1036 	    goto error_encountered;
1039 	    VForceOffline_r(vp, 0);
1049 #ifdef AFS_DEMAND_ATTACH_FS
1050     VnChangeState_r(vnp, vn_state_save);	/* restore pre-store vnode state */
1055 #ifdef AFS_DEMAND_ATTACH_FS
1056     /* XXX instead of dumping core, let's try to request a salvage
1057      * and just fail the putvnode */
1061     VnChangeState_r(vnp, VN_STATE_ERROR);
1062     VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1069 * get a handle to a vnode object.
1071 * @param[out] ec error code
1072 * @param[in] vp volume object
1073 * @param[in] vnodeNumber vnode id
1074 * @param[in] locktype type of lock to acquire
1076 * @return vnode object pointer
/* Public wrapper: take VOL_LOCK and delegate to VGetVnode_r(). */
1081 VGetVnode(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1082 {				/* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1085     retVal = VGetVnode_r(ec, vp, vnodeNumber, locktype);
1091 * get a handle to a vnode object.
1093 * @param[out] ec error code
1094 * @param[in] vp volume object
1095 * @param[in] vnodeNumber vnode id
1096 * @param[in] locktype type of lock to acquire
1098 * @return vnode object pointer
1100 * @internal vnode package internal use only
1102 * @pre VOL_LOCK held.
1103 * heavyweight ref held on volume object.
/* Get a referenced, locked handle to vnode vnodeNumber in volume vp.
 * Validates the volume state and lock type (WRITE_LOCK requires a
 * writeable volume), then either finds the vnode in the cache (DAFS:
 * waits for exclusive/quiescent state as appropriate for the lock
 * type) or reclaims an LRU slot and loads it from disk via VnLoad().
 * After acquiring the lock it re-checks that the vnode was not removed
 * in the interim. */
1106 VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1107 {				/* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1110     struct VnodeClassInfo *vcp;
1114     if (vnodeNumber == 0) {
1119     VNLog(100, 1, vnodeNumber, 0, 0, 0);
1121 #ifdef AFS_DEMAND_ATTACH_FS
1123      * once a volume has entered an error state, don't permit
1124      * further operations to proceed
1125      *  -- tkeiser 11/21/2007
1127     VWaitExclusiveState_r(vp);
1128     if (VIsErrorState(V_attachState(vp))) {
1129 	/* XXX is VSALVAGING acceptable here? */
1135     if (programType == fileServer && !V_inUse(vp)) {
1136 	*ec = (vp->specialStatus ? vp->specialStatus : VOFFLINE);
1138 	/* If the volume is VBUSY (being cloned or dumped) and this is
1139 	 * a READ operation, then don't fail.
1141 	if ((*ec != VBUSY) || (locktype != READ_LOCK)) {
1146     class = vnodeIdToClass(vnodeNumber);
1147     vcp = &VnodeClassInfo[class];
1148     if (locktype == WRITE_LOCK && !VolumeWriteable(vp)) {
1149 	*ec = (bit32) VREADONLY;
1153     if (locktype == WRITE_LOCK && programType == fileServer) {
1154 	VAddToVolumeUpdateList_r(ec, vp);
1162     /* See whether the vnode is in the cache. */
1163     vnp = VLookupVnode(vp, vnodeNumber);
1165 	/* vnode is in cache */
1167 	VNLog(101, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
1168 	VnCreateReservation_r(vnp);
1170 #ifdef AFS_DEMAND_ATTACH_FS
1172 	 * this is the one DAFS case where we may run into contention.
1173 	 * here's the basic control flow:
1175 	 * if locktype is READ_LOCK:
1176 	 *   wait until vnode is not exclusive
1177 	 *   set to VN_STATE_READ
1178 	 *   increment read count
1181 	 *   wait until vnode is quiescent
1182 	 *   set to VN_STATE_EXCLUSIVE
1185 	if (locktype == READ_LOCK) {
1186 	    VnWaitExclusiveState_r(vnp);
1188 	    VnWaitQuiescent_r(vnp);
1191 	    if (VnIsErrorState(Vn_state(vnp))) {
1192 		VnCancelReservation_r(vnp);
1196 #endif /* AFS_DEMAND_ATTACH_FS */
1198 	/* vnode not cached */
1200 	/* Not in cache; tentatively grab most distantly used one from the LRU
1203 	vnp = VGetFreeVnode_r(vcp);
1206 	vnp->changed_newTime = vnp->changed_oldTime = 0;
1208 	Vn_id(vnp) = vnodeNumber;
1209 	VnCreateReservation_r(vnp);
1210 	AddToVVnList(vp, vnp);
1211 #ifdef AFS_DEMAND_ATTACH_FS
1216 	 * XXX for non-DAFS, there is a serious
1217 	 * race condition here:
1219 	 * two threads can race to load a vnode.  the net
1220 	 * result is two struct Vnodes can be allocated
1221 	 * and hashed, which point to the same underlying
1222 	 * disk data store.  conflicting vnode locks can
1223 	 * thus be held concurrently.
1225 	 * for non-DAFS to be safe, VOL_LOCK really shouldn't
1226 	 * be dropped in VnLoad.  Of course, this would likely
1227 	 * lead to an unacceptable slow-down.
1230 	VnLoad(ec, vp, vnp, vcp, class);
1232 	    VnCancelReservation_r(vnp);
1235 #ifndef AFS_DEMAND_ATTACH_FS
1240 	 * there is no possibility for contention. we "own" this vnode.
1246      * it is imperative that nothing drop vol lock between here
1247      * and the VnBeginRead/VnChangeState stanza below
1250     VnLock(vnp, locktype, VOL_LOCK_HELD, MIGHT_DEADLOCK);
1252     /* Check that the vnode hasn't been removed while we were obtaining
1254     VNLog(102, 2, vnodeNumber, (intptr_t) vnp, 0, 0);
1255     if ((vnp->disk.type == vNull) || (Vn_cacheCheck(vnp) == 0)) {
1256 	VnUnlock(vnp, locktype);
1257 	VnCancelReservation_r(vnp);
1259 	/* vnode is labelled correctly by now, so we don't have to invalidate it */
1263 #ifdef AFS_DEMAND_ATTACH_FS
1264     if (locktype == READ_LOCK) {
1267 	VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
1271     if (programType == fileServer)
1272 	VBumpVolumeUsage_r(Vn_volume(vnp));	/* Hack; don't know where it should be
1273 						 * called from.  Maybe VGetVolume */
1278 int TrustVnodeCacheEntry = 1;
1279 /* This variable is bogus--when it's set to 0, the hash chains fill
1280 up with multiple versions of the same vnode. Should fix this!! */
/* Public wrapper: take VOL_LOCK and delegate to VPutVnode_r(). */
1282 VPutVnode(Error * ec, Vnode * vnp)
1285     VPutVnode_r(ec, vnp);
1290 * put back a handle to a vnode object.
1292 * @param[out] ec client error code
1293 * @param[in] vnp vnode object pointer
1295 * @pre VOL_LOCK held.
1296 * ref held on vnode.
1298 * @post ref dropped on vnode.
1299 * if vnode was modified or deleted, it is written out to disk
1300 * (assuming a write lock was held).
1302 * @internal volume package internal use only
/* Release a reference to vnp, flushing it to disk first when it was
 * write-locked and changed or deleted.  A deleted vnode has its disk
 * image zeroed, its bitmap slot freed (after the write, to avoid the
 * "addled bitmap" window), and the volume filecount decremented.
 * Aborts if a different thread/process holds the write lock, and warns
 * if change/delete flags are set without a write lock. */
1305 VPutVnode_r(Error * ec, Vnode * vnp)
1309     struct VnodeClassInfo *vcp;
1312     osi_Assert(Vn_refcount(vnp) != 0);
1313     class = vnodeIdToClass(Vn_id(vnp));
1314     vcp = &VnodeClassInfo[class];
1315     osi_Assert(vnp->disk.vnodeMagic == vcp->magic);
1316     VNLog(200, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1318 #ifdef AFS_DEMAND_ATTACH_FS
1319     writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1321     writeLocked = WriteLocked(&vnp->lock);
1326 #ifdef AFS_PTHREAD_ENV
1327 	pthread_t thisProcess = pthread_self();
1328 #else /* AFS_PTHREAD_ENV */
1329 	PROCESS thisProcess;
1330 	LWP_CurrentProcess(&thisProcess);
1331 #endif /* AFS_PTHREAD_ENV */
	/* NOTE(review): both flags are shifted by 1 here, so they land on
	 * the same bit in the log word — looks like an encoding bug in
	 * this debug-log call; harmless to behavior.  Confirm upstream. */
1332 	VNLog(201, 2, (intptr_t) vnp,
1333 	      ((vnp->changed_newTime) << 1) | ((vnp->
1334 						changed_oldTime) << 1) | vnp->
1336 	if (thisProcess != vnp->writer)
1337 	    Abort("VPutVnode: Vnode at %"AFS_PTR_FMT" locked by another process!\n",
1341 	if (vnp->changed_oldTime || vnp->changed_newTime || vnp->delete) {
1342 	    Volume *vp = Vn_volume(vnp);
1343 	    afs_uint32 now = FT_ApproxTime();
1344 	    osi_Assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1347 		/* No longer any directory entries for this vnode. Free the Vnode */
1348 		memset(&vnp->disk, 0, sizeof(vnp->disk));
1349 		/* delete flag turned off further down */
1350 		VNLog(202, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1351 	    } else if (vnp->changed_newTime) {
1352 		vnp->disk.serverModifyTime = now;
1354 	    if (vnp->changed_newTime)
1356 		V_updateDate(vp) = vp->updateTime = now;
1357 		if(V_volUpCounter(vp)<MAXINT)
1358 		    V_volUpCounter(vp)++;
1361 	    /* The vnode has been changed. Write it out to disk */
1363 #ifdef AFS_DEMAND_ATTACH_FS
1364 		VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1366 		osi_Assert(V_needsSalvaged(vp));
1370 		VnStore(ec, vp, vnp, vcp, class);
1372 		/* If the vnode is to be deleted, and we wrote the vnode out,
1373 		 * free its bitmap entry. Do after the vnode is written so we
1374 		 * don't allocate from bitmap before the vnode is written
1375 		 * (doing so could cause a "addled bitmap" message).
1377 		if (vnp->delete && !*ec) {
1378 		    if (Vn_volume(vnp)->header->diskstuff.filecount-- < 1)
1379 			Vn_volume(vnp)->header->diskstuff.filecount = 0;
1380 		    VFreeBitMapEntry_r(ec, &vp->vnodeIndex[class],
1381 				       vnodeIdToBitNumber(Vn_id(vnp)));
1385 	vnp->changed_newTime = vnp->changed_oldTime = 0;
1387 #ifdef AFS_DEMAND_ATTACH_FS
1388 	VnChangeState_r(vnp, VN_STATE_ONLINE);
1390     } else {			/* Not write locked */
1391 	if (vnp->changed_newTime || vnp->changed_oldTime || vnp->delete)
1393 		("VPutVnode: Change or delete flag for vnode "
1394 		 "%"AFS_PTR_FMT" is set but vnode is not write locked!\n",
1396 #ifdef AFS_DEMAND_ATTACH_FS
1401     /* Do not look at disk portion of vnode after this point; it may
1402      * have been deleted above */
1404     VnUnlock(vnp, ((writeLocked) ? WRITE_LOCK : READ_LOCK));
1405     VnCancelReservation_r(vnp);
1409 * Make an attempt to convert a vnode lock from write to read.
1410 * Do nothing if the vnode isn't write locked or the vnode has
/* Public wrapper: take VOL_LOCK and delegate to VVnodeWriteToRead_r(). */
1414 VVnodeWriteToRead(Error * ec, Vnode * vnp)
1418     retVal = VVnodeWriteToRead_r(ec, vnp);
1424 * convert vnode handle from mutually exclusive to shared access.
1426 * @param[out] ec client error code
1427 * @param[in] vnp vnode object pointer
1429 * @return unspecified use (see out argument 'ec' for error code return)
1431 * @pre VOL_LOCK held.
1432 * ref held on vnode.
1433 * write lock held on vnode.
1435 * @post read lock held on vnode.
1436 * if vnode was modified, it has been written to disk.
1438 * @internal volume package internal use only
/* Downgrade vnp's lock from write to read.  No-op unless the vnode is
 * actually write-locked by this thread/process (aborts otherwise).
 * If the vnode changed while write-locked, it is flushed via VnStore()
 * before the downgrade, mirroring the flush logic of VPutVnode_r(). */
1441 VVnodeWriteToRead_r(Error * ec, Vnode * vnp)
1445     struct VnodeClassInfo *vcp;
1446 #ifdef AFS_PTHREAD_ENV
1447     pthread_t thisProcess;
1448 #else /* AFS_PTHREAD_ENV */
1449     PROCESS thisProcess;
1450 #endif /* AFS_PTHREAD_ENV */
1453     osi_Assert(Vn_refcount(vnp) != 0);
1454     class = vnodeIdToClass(Vn_id(vnp));
1455     vcp = &VnodeClassInfo[class];
1456     osi_Assert(vnp->disk.vnodeMagic == vcp->magic);
1457     VNLog(300, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1459 #ifdef AFS_DEMAND_ATTACH_FS
1460     writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1462     writeLocked = WriteLocked(&vnp->lock);
1469     VNLog(301, 2, (intptr_t) vnp,
1470 	  ((vnp->changed_newTime) << 1) | ((vnp->
1471 					    changed_oldTime) << 1) | vnp->
1475 #ifdef AFS_PTHREAD_ENV
1476     thisProcess = pthread_self();
1477 #else /* AFS_PTHREAD_ENV */
1478     LWP_CurrentProcess(&thisProcess);
1479 #endif /* AFS_PTHREAD_ENV */
	/* NOTE(review): message says "VPutVnode" but this is
	 * VVnodeWriteToRead_r — misleading abort text, behavior unchanged. */
1480     if (thisProcess != vnp->writer)
1481 	Abort("VPutVnode: Vnode at %"AFS_PTR_FMT
1482 	      " locked by another process!\n", vnp);
1487     if (vnp->changed_oldTime || vnp->changed_newTime) {
1488 	Volume *vp = Vn_volume(vnp);
1489 	afs_uint32 now = FT_ApproxTime();
1490 	osi_Assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1491 	if (vnp->changed_newTime)
1492 	    vnp->disk.serverModifyTime = now;
1493 	if (vnp->changed_newTime)
1494 	    V_updateDate(vp) = vp->updateTime = now;
1496 	/* The inode has been changed. Write it out to disk */
1498 #ifdef AFS_DEMAND_ATTACH_FS
1499 	    VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1501 	    osi_Assert(V_needsSalvaged(vp));
1505 	    VnStore(ec, vp, vnp, vcp, class);
1508 	vnp->changed_newTime = vnp->changed_oldTime = 0;
1512 #ifdef AFS_DEMAND_ATTACH_FS
1513     VnChangeState_r(vnp, VN_STATE_ONLINE);
1516     ConvertWriteToReadLock(&vnp->lock);
1522 * initial size of ihandle pointer vector.
1524 * @see VInvalidateVnodesByVolume_r
1526 #define IH_VEC_BASE_SIZE 256
1529 * increment amount for growing ihandle pointer vector.
1531 * @see VInvalidateVnodesByVolume_r
1533 #define IH_VEC_INCREMENT 256
1536 * Compile list of ihandles to be released/reallyclosed at a later time.
1538 * @param[in] vp volume object pointer
1539 * @param[out] vec_out vector of ihandle pointers to be released/reallyclosed
1540 * @param[out] vec_len_out number of valid elements in ihandle vector
1542 * @pre - VOL_LOCK is held
1543 * - volume is in appropriate exclusive state (e.g. VOL_STATE_VNODE_CLOSE,
1544 * VOL_STATE_VNODE_RELEASE)
1546 * @post - all vnodes on VVn list are invalidated
1547 * - ih_vec is populated with all valid ihandles
1549 * @return operation status
1551 * @retval ENOMEM out of memory
1553 * @todo we should handle out of memory conditions more gracefully.
1555 * @internal vnode package internal use only
/*
 * Invalidate every cached vnode belonging to volume vp and collect their
 * ihandles into a heap-allocated vector (*vec_out / *vec_len_out) so the
 * caller can release/reallyclose them later, outside VOL_LOCK.
 * See the Doxygen contract above: VOL_LOCK held, volume in an exclusive
 * state.  Caller takes ownership of the returned vector.
 */
1558 VInvalidateVnodesByVolume_r(Volume * vp,
1559 IHandle_t *** vec_out,
1560 size_t * vec_len_out)
/* i counts valid entries; vec_len is the vector's allocated capacity. */
1564 size_t i = 0, vec_len;
1565 IHandle_t **ih_vec, **ih_vec_new;
1567 #ifdef AFS_DEMAND_ATTACH_FS
1569 #endif /* AFS_DEMAND_ATTACH_FS */
/* Start with a fixed-size vector; grown by IH_VEC_INCREMENT on demand. */
1571 vec_len = IH_VEC_BASE_SIZE;
1572 ih_vec = malloc(sizeof(IHandle_t *) * vec_len);
1573 #ifdef AFS_DEMAND_ATTACH_FS
1580 * Traverse the volume's vnode list. Pull all the ihandles out into a
1581 * thread-private array for later asynchronous processing.
1583 #ifdef AFS_DEMAND_ATTACH_FS
1586 for (queue_Scan(&vp->vnode_list, vnp, nvnp, Vnode)) {
/* Only vnodes with an attached ihandle contribute to the vector. */
1587 if (vnp->handle != NULL) {
1589 #ifdef AFS_DEMAND_ATTACH_FS
/* Vector full: grow it.  Correct realloc idiom — the result goes into
 * ih_vec_new so the original pointer survives an allocation failure. */
1592 vec_len += IH_VEC_INCREMENT;
1593 ih_vec_new = realloc(ih_vec, sizeof(IHandle_t *) * vec_len);
1594 #ifdef AFS_DEMAND_ATTACH_FS
1597 if (ih_vec_new == NULL) {
1601 ih_vec = ih_vec_new;
1602 #ifdef AFS_DEMAND_ATTACH_FS
1604 * Theoretically, the volume's VVn list should not change
1605 * because the volume is in an exclusive state. For the
1606 * sake of safety, we will restart the traversal from the
1607 beginning (which is not expensive because we're
1608 * deleting the items from the list as we go).
/* DAFS only: the realloc may have dropped VOL_LOCK (elided lines), so
 * replay the scan from the list head for safety. */
1610 goto restart_traversal;
/* Stash the ihandle; actual release happens later, outside VOL_LOCK. */
1613 ih_vec[i++] = vnp->handle;
/* Detach the vnode from the volume's list and invalidate its cache
 * entry while we still hold VOL_LOCK. */
1616 DeleteFromVVnList(vnp);
1617 VInvalidateVnode_r(vnp);
1627 /* VCloseVnodeFiles - called when a volume is going off line. All open
1628 * files for vnodes in that volume are closed. This might be excessive,
1629 * since we may only be taking one volume of a volume group offline.
/*
 * Close all open files for vnodes of volume vp (volume going offline).
 * Collects the ihandles under VOL_LOCK, then IH_REALLYCLOSEs them with the
 * lock dropped, since closing can be expensive.  See comment above: this
 * may be broader than strictly needed when only one volume of a group is
 * going offline.
 */
1632 VCloseVnodeFiles_r(Volume * vp)
1634 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS: remember the current volume state so it can be restored after
 * the exclusive VNODE_CLOSE section. */
1635 VolState vol_state_save;
1637 IHandle_t ** ih_vec;
1640 #ifdef AFS_DEMAND_ATTACH_FS
1641 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_CLOSE);
1642 #endif /* AFS_DEMAND_ATTACH_FS */
1644 /* XXX need better error handling here */
/* NOTE(review): the invalidate call lives inside osi_Assert — fine only
 * if osi_Assert is never compiled out in this tree; confirm, else the
 * side effect vanishes in that configuration. */
1645 osi_Assert(VInvalidateVnodesByVolume_r(vp,
1651 * now we drop VOL_LOCK while we perform some potentially very
1652 * expensive operations in the background
1654 #ifdef AFS_DEMAND_ATTACH_FS
/* Hard-close every collected ihandle (closes underlying descriptors). */
1658 for (i = 0; i < vec_len; i++) {
1659 IH_REALLYCLOSE(ih_vec[i]);
1664 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS: restore the pre-close volume state. */
1666 VChangeState_r(vp, vol_state_save);
1667 #endif /* AFS_DEMAND_ATTACH_FS */
1672 * shut down all vnode cache state for a given volume.
1674 * @param[in] vp volume object pointer
1676 * @pre VOL_LOCK is held
1678 * @post all file descriptors closed.
1679 * all inode handles released.
1680 * all vnode cache objects disassociated from volume.
1682 * @note for DAFS, these operations are performed outside the vol glock under
1683 * volume exclusive state VOL_STATE_VNODE_RELEASE. Please further note
1684 * that it would be a bug to acquire and release a volume reservation
1685 * during this exclusive operation. This is due to the fact that we are
1686 * generally called during the refcount 1->0 transition.
1688 * @todo we should handle failures in VInvalidateVnodesByVolume_r more
1691 * @see VInvalidateVnodesByVolume_r
1693 * @internal this routine is internal to the volume package
/*
 * Tear down all vnode cache state for volume vp: invalidate cached vnodes
 * and IH_RELEASE their ihandles (vs. VCloseVnodeFiles_r's IH_REALLYCLOSE).
 * Contract in the Doxygen comment above: VOL_LOCK held; for DAFS the
 * expensive part runs under exclusive state VOL_STATE_VNODE_RELEASE.
 */
1696 VReleaseVnodeFiles_r(Volume * vp)
1698 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS: save current state to restore after the exclusive section. */
1699 VolState vol_state_save;
1701 IHandle_t ** ih_vec;
1704 #ifdef AFS_DEMAND_ATTACH_FS
1705 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_RELEASE);
1706 #endif /* AFS_DEMAND_ATTACH_FS */
1708 /* XXX need better error handling here */
/* NOTE(review): side-effecting call inside osi_Assert — same caveat as
 * in VCloseVnodeFiles_r; confirm osi_Assert is never compiled out. */
1709 osi_Assert(VInvalidateVnodesByVolume_r(vp,
1715 * now we drop VOL_LOCK while we perform some potentially very
1716 * expensive operations in the background
1718 #ifdef AFS_DEMAND_ATTACH_FS
/* Release (drop refs on) every collected ihandle, lock-free. */
1722 for (i = 0; i < vec_len; i++) {
1723 IH_RELEASE(ih_vec[i]);
1728 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS: restore the pre-release volume state. */
1730 VChangeState_r(vp, vol_state_save);
1731 #endif /* AFS_DEMAND_ATTACH_FS */