2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
15 Institution: The Information Technology Center, Carnegie-Mellon University
18 #include <afsconfig.h>
19 #include <afs/param.h>
23 #define MAXINT (~(1<<((sizeof(int)*8)-1)))
30 #include <afs/afs_assert.h>
33 #include "rx/rx_queue.h"
34 #include <afs/afsint.h>
36 #include <afs/errors.h>
39 #include <afs/afssyscalls.h>
43 #include "volume_inline.h"
44 #include "vnode_inline.h"
45 #include "partition.h"
48 #if defined(AFS_SGI_ENV)
49 #include "sys/types.h"
61 #include <sys/fcntl.h>
64 #endif /* AFS_NT40_ENV */
/* Per-class (small/large vnode) cache bookkeeping: LRU head, sizes, stats. */
71 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
/* Forward declaration of the in-memory debug logger defined below. */
73 void VNLog(afs_int32 aop, afs_int32 anparms, ... );
/* Sentinel byte count used by the inode layer to signal a bad inumber /
 * full inode table on read/write of the vnode index. */
80 #define BAD_IGET -1000
82 /* There are two separate vnode queue types defined here:
83 * Each hash conflict chain -- is singly linked, with a single head
84 * pointer. New entries are added at the beginning. Old
85 * entries are removed by linear search, which generally
86 * only occurs after a disk read.
87 * LRU chain -- is doubly linked, single head pointer.
88 * Entries are added at the head, reclaimed from the tail,
89 * or removed from anywhere in the queue.
93 /* Vnode hash table. Find hash chain by taking lower bits of
94 * (volume_hash_offset + vnode).
95 * This distributes the root inodes of the volumes over the
96 * hash table entries and also distributes the vnodes of
97 * volumes reasonably fairly. The volume_hash_offset field
98 * for each volume is established as the volume comes on line
99 * by using the VOLUME_HASH_OFFSET macro. This distributes the
100 * volumes fairly among the cache entries, both when servicing
101 * a small number of volumes and when servicing a large number.
104 /* logging stuff for finding bugs */
/* In-memory ring buffer of VNLog() records, for post-mortem debugging. */
105 #define THELOGSIZE 5120
106 static afs_int32 theLog[THELOGSIZE];
/* Next free slot in theLog; wraps to 0 at THELOGSIZE. */
107 static afs_int32 vnLogPtr = 0;
/*
 * VNLog -- append an operation record to the in-memory debug log.
 *
 * aop      operation code (packed into the high 16 bits of the record)
 * anparms  count of afs_int32 varargs that follow (clamped to 4)
 *
 * NOTE(review): interior lines of this function are elided in this view
 * (declarations, the clamp condition, wrap-around resets, va_end).
 */
109 VNLog(afs_int32 aop, afs_int32 anparms, ... )
114 va_start(ap, anparms);
117 anparms = 4; /* do bounds checking */
/* Header word: opcode in high half, parameter count in low half. */
119 temp = (aop << 16) | anparms;
120 theLog[vnLogPtr++] = temp;
121 if (vnLogPtr >= THELOGSIZE)
/* Copy each vararg into the ring buffer, wrapping as needed. */
123 for (temp = 0; temp < anparms; temp++) {
124 theLog[vnLogPtr++] = va_arg(ap, afs_int32);
125 if (vnLogPtr >= THELOGSIZE)
131 /* VolumeHashOffset -- returns a new value to be stored in the
132 * volumeHashOffset of a Volume structure. Called when a
133 * volume is initialized. Sets the volumeHashOffset so that
134 * vnode cache entries are distributed reasonably between
135 * volumes (the root vnodes of the volumes will hash to
136 * different values, and spacing is maintained between volumes
137 * when there are not many volumes represented), and spread
138 * equally amongst vnodes within a single volume.
/*
 * VolumeHashOffset_r -- return the next volumeHashOffset value to assign
 * to a volume coming online, chosen so root vnodes of successive volumes
 * land on well-separated vnode hash chains (bit-reversal-style spreading
 * via the hashindex table). Caller must hold VOL_LOCK (_r suffix).
 */
141 VolumeHashOffset_r(void)
143 static int nextVolumeHashOffset = 0;
144 /* hashindex must be a power of two in size */
146 # define hashMask ((1<<hashShift)-1)
/* Bit-reversed spreading table: consecutive counters map to offsets
 * maximally far apart in the 0..255 range. */
147 static byte hashindex[1 << hashShift] =
148 { 0, 128, 64, 192, 32, 160, 96, 224 };
150 offset = hashindex[nextVolumeHashOffset & hashMask]
151 + (nextVolumeHashOffset >> hashShift);
152 nextVolumeHashOffset++;
156 /* Change hashindex (above) if you change this constant */
157 #define VNODE_HASH_TABLE_SIZE 256
/* Global vnode cache hash table; chains are singly linked via hashNext. */
158 private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
/* Hash on (per-volume offset + vnode number); table size must stay a
 * power of two for this mask to be valid. */
159 #define VNODE_HASH(volumeptr,vnodenumber)\
160 ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
164 * add a vnode to the volume's vnode list.
166 * @param[in] vp volume object pointer
167 * @param[in] vnp vnode object pointer
169 * @note for DAFS, it may seem like we should be acquiring a lightweight ref
170 * on vp, but this would actually break things. Right now, this is ok
171 because we destroy all vnode cache contents during volume
176 * @internal volume package internal use only
/* Append vnp to vp's per-volume vnode list (no-op if already queued),
 * stamping the volume's cacheCheck and setting the VN_ON_VVN flag. */
179 AddToVVnList(Volume * vp, Vnode * vnp)
/* Already on a queue: nothing to do (early return on elided line). */
181 if (queue_IsOnQueue(vnp))
185 Vn_cacheCheck(vnp) = vp->cacheCheck;
186 queue_Append(&vp->vnode_list, vnp);
187 Vn_stateFlags(vnp) |= VN_ON_VVN;
191 * delete a vnode from the volume's vnode list.
195 * @internal volume package internal use only
/* Detach vnp from its volume: clear the back-pointer, remove from the
 * volume's vnode list (removal line elided), and drop VN_ON_VVN. */
198 DeleteFromVVnList(Vnode * vnp)
200 Vn_volume(vnp) = NULL;
/* Not queued: nothing to unlink (early return on elided line). */
202 if (!queue_IsOnQueue(vnp))
206 Vn_stateFlags(vnp) &= ~(VN_ON_VVN);
210 * add a vnode to the end of the lru.
212 * @param[in] vcp vnode class info object pointer
213 * @param[in] vnp vnode object pointer
215 * @internal vnode package internal use only
/* Insert vnp into the class's circular doubly-linked LRU, at the head's
 * tail position; a just-deleted vnode is instead exposed for immediate
 * reuse by advancing lruHead past it. No-op if already on the LRU. */
218 AddToVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
220 if (Vn_stateFlags(vnp) & VN_ON_LRU) {
224 /* Add it to the circular LRU list */
225 if (vcp->lruHead == NULL)
226 Abort("VPutVnode: vcp->lruHead==NULL");
/* Splice vnp in just before lruHead (i.e., at the LRU tail). */
228 vnp->lruNext = vcp->lruHead;
229 vnp->lruPrev = vcp->lruHead->lruPrev;
230 vcp->lruHead->lruPrev = vnp;
231 vnp->lruPrev->lruNext = vnp;
235 /* If the vnode was just deleted, put it at the end of the chain so it
236 * will be reused immediately */
238 vcp->lruHead = vnp->lruNext;
240 Vn_stateFlags(vnp) |= VN_ON_LRU;
244 * delete a vnode from the lru.
246 * @param[in] vcp vnode class info object pointer
247 * @param[in] vnp vnode object pointer
249 * @internal vnode package internal use only
/* Unlink vnp from the class LRU (no-op if not on it). Advancing lruHead
 * first keeps the head valid when vnp is the head; if the list collapses
 * back to vnp or empties, the chain is corrupt and we abort. */
252 DeleteFromVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
254 if (!(Vn_stateFlags(vnp) & VN_ON_LRU)) {
258 if (vnp == vcp->lruHead)
259 vcp->lruHead = vcp->lruHead->lruNext;
/* Still the head after advancing => vnp was the only element, or the
 * head pointer is NULL: either way the chain is addled. */
261 if ((vnp == vcp->lruHead) ||
262 (vcp->lruHead == NULL))
263 Abort("DeleteFromVnLRU: lru chain addled!\n");
265 vnp->lruPrev->lruNext = vnp->lruNext;
266 vnp->lruNext->lruPrev = vnp->lruPrev;
268 Vn_stateFlags(vnp) &= ~(VN_ON_LRU);
272 * add a vnode to the vnode hash table.
274 * @param[in] vnp vnode object pointer
278 * @post vnode on hash
280 * @internal vnode package internal use only
/* Push vnp onto the front of its hash chain (computed from its volume's
 * hash offset and vnode id) and mark it VN_ON_HASH. No-op if hashed. */
283 AddToVnHash(Vnode * vnp)
285 unsigned int newHash;
287 if (!(Vn_stateFlags(vnp) & VN_ON_HASH)) {
288 newHash = VNODE_HASH(Vn_volume(vnp), Vn_id(vnp));
/* Head insertion into the singly linked conflict chain. */
289 vnp->hashNext = VnodeHashTable[newHash];
290 VnodeHashTable[newHash] = vnp;
/* Remember the bucket so DeleteFromVnHash need not recompute it. */
291 vnp->hashIndex = newHash;
293 Vn_stateFlags(vnp) |= VN_ON_HASH;
298 * delete a vnode from the vnode hash table.
305 * @post vnode removed from hash
307 * @internal vnode package internal use only
/* Remove vnp from its hash chain (linear search from the bucket head,
 * using the hashIndex recorded at insert time) and clear VN_ON_HASH. */
310 DeleteFromVnHash(Vnode * vnp)
314 if (Vn_stateFlags(vnp) & VN_ON_HASH) {
315 tvnp = VnodeHashTable[vnp->hashIndex];
/* vnp is the bucket head (condition on elided line): unlink directly. */
317 VnodeHashTable[vnp->hashIndex] = vnp->hashNext;
/* Otherwise walk the chain to find vnp's predecessor. */
319 while (tvnp && tvnp->hashNext != vnp)
320 tvnp = tvnp->hashNext;
322 tvnp->hashNext = vnp->hashNext;
325 vnp->hashNext = NULL;
327 Vn_stateFlags(vnp) &= ~(VN_ON_HASH);
333 * invalidate a vnode cache entry.
335 * @param[in] avnode vnode object pointer
339 * @post vnode metadata invalidated.
340 * vnode removed from hash table.
341 * DAFS: vnode state set to VN_STATE_INVALID.
343 * @internal vnode package internal use only
/* Invalidate a cached vnode: clear change/delete flags so it won't be
 * flushed, zero cacheCheck so lookups can't match it, drop it from the
 * hash, and (DAFS) move it to VN_STATE_INVALID. Caller holds VOL_LOCK. */
346 VInvalidateVnode_r(struct Vnode *avnode)
348 avnode->changed_newTime = 0; /* don't let it get flushed out again */
349 avnode->changed_oldTime = 0;
350 avnode->delete = 0; /* it isn't deleted, really */
351 avnode->cacheCheck = 0; /* invalid: prevents future vnode searches from working */
352 DeleteFromVnHash(avnode);
353 #ifdef AFS_DEMAND_ATTACH_FS
354 VnChangeState_r(avnode, VN_STATE_INVALID);
360 * initialize vnode cache for a given vnode class.
362 * @param[in] class vnode class
363 * @param[in] nVnodes size of cache
365 * @post vnode cache allocated and initialized
367 * @internal volume package internal use only
369 * @note generally called by VInitVolumePackage_r
371 * @see VInitVolumePackage_r
/*
 * VInitVnodes -- allocate and initialize the vnode cache for one class.
 *
 * class    vnodeSmall or vnodeLarge
 * nVnodes  number of cache slots to allocate for this class
 *
 * All slots are zeroed (calloc), marked VN_ON_LRU, and threaded into the
 * class's circular LRU so VGetFreeVnode_r can hand them out.
 */
374 VInitVnodes(VnodeClass class, int nVnodes)
377 struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
379 vcp->allocs = vcp->gets = vcp->reads = vcp->writes = 0;
380 vcp->cacheSize = nVnodes;
/* Small-vnode branch (switch/case lines elided around here). */
383 osi_Assert(CHECKSIZE_SMALLVNODE);
385 vcp->residentSize = SIZEOF_SMALLVNODE;
386 vcp->diskSize = SIZEOF_SMALLDISKVNODE;
387 vcp->magic = SMALLVNODEMAGIC;
/* Large-vnode branch. */
391 vcp->residentSize = SIZEOF_LARGEVNODE;
392 vcp->diskSize = SIZEOF_LARGEDISKVNODE;
393 vcp->magic = LARGEVNODEMAGIC;
397 int s = vcp->diskSize - 1;
/* One contiguous zeroed arena for all nVnodes resident slots. */
407 va = (byte *) calloc(nVnodes, vcp->residentSize);
408 osi_Assert(va != NULL);
/* Per-slot initialization loop (loop header elided). */
410 Vnode *vnp = (Vnode *) va;
411 Vn_refcount(vnp) = 0; /* no context switches */
412 Vn_stateFlags(vnp) |= VN_ON_LRU;
413 #ifdef AFS_DEMAND_ATTACH_FS
414 CV_INIT(&Vn_stateCV(vnp), "vnode state", CV_DEFAULT, 0);
415 Vn_state(vnp) = VN_STATE_INVALID;
417 #else /* !AFS_DEMAND_ATTACH_FS */
418 Lock_Init(&vnp->lock);
419 #endif /* !AFS_DEMAND_ATTACH_FS */
420 vnp->changed_oldTime = 0;
421 vnp->changed_newTime = 0;
422 Vn_volume(vnp) = NULL;
423 Vn_cacheCheck(vnp) = 0;
424 vnp->delete = Vn_id(vnp) = 0;
425 #ifdef AFS_PTHREAD_ENV
426 vnp->writer = (pthread_t) 0;
427 #else /* AFS_PTHREAD_ENV */
428 vnp->writer = (PROCESS) 0;
429 #endif /* AFS_PTHREAD_ENV */
/* Thread the slot into the circular LRU (first slot seeds the ring). */
433 if (vcp->lruHead == NULL)
434 vcp->lruHead = vnp->lruNext = vnp->lruPrev = vnp;
436 vnp->lruNext = vcp->lruHead;
437 vnp->lruPrev = vcp->lruHead->lruPrev;
438 vcp->lruHead->lruPrev = vnp;
439 vnp->lruPrev->lruNext = vnp;
442 va += vcp->residentSize;
449 * allocate an unused vnode from the lru chain.
451 * @param[in] vcp vnode class info object pointer
453 * @pre VOL_LOCK is held
455 * @post vnode object is removed from lru, and vnode hash table.
456 * vnode is disassociated from volume object.
457 * state is set to VN_STATE_INVALID.
458 * inode handle is released.
460 * @note we traverse backwards along the lru circular list. It shouldn't
461 * be necessary to specify that nUsers == 0 since if it is in the list,
462 * nUsers should be 0. Things shouldn't be in lruq unless no one is
465 * @warning DAFS: VOL_LOCK is dropped while doing inode handle release
467 * @return vnode object pointer
/*
 * VGetFreeVnode_r -- reclaim the least-recently-used vnode of a class.
 *
 * Takes the entry just before lruHead (the LRU tail), asserts it is truly
 * idle, detaches it from LRU/hash/volume list, then releases its ihandle.
 * DAFS: VOL_LOCK is dropped around IH_RELEASE (see comment below).
 */
470 VGetFreeVnode_r(struct VnodeClassInfo * vcp)
474 vnp = vcp->lruHead->lruPrev;
475 #ifdef AFS_DEMAND_ATTACH_FS
476 if (Vn_refcount(vnp) != 0 || VnIsExclusiveState(Vn_state(vnp)) ||
477 Vn_readers(vnp) != 0)
478 Abort("VGetFreeVnode_r: in-use vnode in lruq");
/* non-DAFS idle check: refcount and the plain reader/writer lock. */
480 if (Vn_refcount(vnp) != 0 || CheckLock(&vnp->lock))
481 Abort("VGetFreeVnode_r: locked vnode in lruq");
483 VNLog(1, 2, Vn_id(vnp), (intptr_t)vnp, 0, 0);
486 * it's going to be overwritten soon enough.
487 * remove from LRU, delete hash entry, and
488 * disassociate from old parent volume before
489 * we have a chance to drop the vol glock
491 DeleteFromVnLRU(vcp, vnp);
492 DeleteFromVnHash(vnp);
493 if (Vn_volume(vnp)) {
494 DeleteFromVVnList(vnp);
497 /* drop the file descriptor */
499 #ifdef AFS_DEMAND_ATTACH_FS
500 VnChangeState_r(vnp, VN_STATE_RELEASING);
503 /* release is, potentially, a highly latent operation due to a couple
505 * - ihandle package lock contention
506 * - closing file descriptor(s) associated with ih
508 * Hence, we perform outside of the volume package lock in order to
509 * reduce the probability of contention.
511 IH_RELEASE(vnp->handle);
512 #ifdef AFS_DEMAND_ATTACH_FS
517 #ifdef AFS_DEMAND_ATTACH_FS
518 VnChangeState_r(vnp, VN_STATE_INVALID);
526 * lookup a vnode in the vnode cache hash table.
528 * @param[in] vp pointer to volume object
529 * @param[in] vnodeId vnode id
533 * @post matching vnode object or NULL is returned
535 * @return vnode object pointer
536 * @retval NULL no matching vnode object was found in the cache
538 * @internal vnode package internal use only
540 * @note this symbol is exported strictly for fssync debug protocol use
/* Walk the hash chain for (vp, vnodeId); a hit must match id, volume,
 * and the volume's current cacheCheck generation (stale entries from a
 * previous attach of the same volume are skipped). Returns the match or
 * NULL (loop exits with vnp == NULL). */
543 VLookupVnode(Volume * vp, VnodeId vnodeId)
546 unsigned int newHash;
548 newHash = VNODE_HASH(vp, vnodeId);
549 for (vnp = VnodeHashTable[newHash];
551 ((Vn_id(vnp) != vnodeId) ||
552 (Vn_volume(vnp) != vp) ||
553 (vp->cacheCheck != Vn_cacheCheck(vnp))));
554 vnp = vnp->hashNext);
/* Public entry point: presumably wraps VAllocVnode_r in VOL_LOCK
 * (lock/unlock lines elided in this view — TODO confirm). */
561 VAllocVnode(Error * ec, Volume * vp, VnodeType type)
565 retVal = VAllocVnode_r(ec, vp, type);
571 * allocate a new vnode.
573 * @param[out] ec error code return
574 * @param[in] vp volume object pointer
575 * @param[in] type desired vnode type
577 * @return vnode object pointer
579 * @pre VOL_LOCK held;
580 * heavyweight ref held on vp
582 * @post vnode allocated and returned
/*
 * VAllocVnode_r -- allocate a brand new vnode in volume vp.
 *
 * ec    error code out; vp  volume (heavyweight ref held); type  vnode type.
 * Pre: VOL_LOCK held. Returns the new vnode write-locked and reserved.
 *
 * Outline: validate volume state -> pick a free bitmap slot -> reuse a
 * cached Vnode for that number if present (must be blank), else reclaim
 * one from the LRU -> verify/extend the on-disk index -> blank and stamp
 * the new vnode. NOTE(review): many interior lines are elided here.
 */
585 VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
590 struct VnodeClassInfo *vcp;
593 #ifdef AFS_DEMAND_ATTACH_FS
594 VolState vol_state_save;
599 #ifdef AFS_DEMAND_ATTACH_FS
601 * once a volume has entered an error state, don't permit
602 * further operations to proceed
603 * -- tkeiser 11/21/2007
605 VWaitExclusiveState_r(vp);
606 if (VIsErrorState(V_attachState(vp))) {
607 /* XXX is VSALVAGING acceptable here? */
/* Volume not in use by the fileserver: fail with its special status. */
613 if (programType == fileServer && !V_inUse(vp)) {
614 if (vp->specialStatus) {
615 *ec = vp->specialStatus;
621 class = vnodeTypeToClass(type);
622 vcp = &VnodeClassInfo[class];
624 if (!VolumeWriteable(vp)) {
625 *ec = (bit32) VREADONLY;
/* Take the next uniquifier (two call sites from elided branches). */
629 unique = vp->nextVnodeUnique++;
631 unique = vp->nextVnodeUnique++;
/* Uniquifier headroom exhausted: bump V_uniquifier and flush header. */
633 if (vp->nextVnodeUnique > V_uniquifier(vp)) {
634 VUpdateVolume_r(ec, vp, 0);
639 if (programType == fileServer) {
640 VAddToVolumeUpdateList_r(ec, vp);
645 /* Find a slot in the bit map */
646 bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
647 VOL_ALLOC_BITMAP_WAIT);
650 vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
654 * at this point we should be assured that V_attachState(vp) is non-exclusive
658 VNLog(2, 1, vnodeNumber, 0, 0, 0);
659 /* Prepare to move it to the new hash chain */
660 vnp = VLookupVnode(vp, vnodeNumber);
662 /* slot already exists. May even not be in lruq (consider store file locking a file being deleted)
663 * so we may have to wait for it below */
664 VNLog(3, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
666 VnCreateReservation_r(vnp);
667 if (Vn_refcount(vnp) == 1) {
668 /* we're the only user */
669 /* This won't block */
670 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
672 #ifdef AFS_DEMAND_ATTACH_FS
675 * vnode was cached, wait for any existing exclusive ops to finish.
676 * once we have reacquired the lock, re-verify volume state.
678 * note: any vnode error state is related to the old vnode; disregard.
680 VnWaitQuiescent_r(vnp);
681 if (VIsErrorState(V_attachState(vp))) {
682 VnUnlock(vnp, WRITE_LOCK);
683 VnCancelReservation_r(vnp);
689 /* other users present; follow locking hierarchy */
690 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, MIGHT_DEADLOCK);
693 * verify state of the world hasn't changed
695 * (technically, this should never happen because cachecheck
696 * is only updated during a volume attach, which should not
697 * happen when refs are held)
699 if (Vn_volume(vnp)->cacheCheck != Vn_cacheCheck(vnp)) {
700 VnUnlock(vnp, WRITE_LOCK);
701 VnCancelReservation_r(vnp);
706 /* sanity check: vnode should be blank if it was deleted. If it's
707 * not blank, it is still in use somewhere; but the bitmap told us
708 * this vnode number was free, so something is wrong. */
709 if (vnp->disk.type != vNull) {
711 Log("VAllocVnode: addled bitmap or vnode object! (vol %ld, "
712 "vnode %p, number %ld, type %ld)\n", (long)vp->hashid, vnp,
713 (long)Vn_id(vnp), (long)vnp->disk.type);
715 VFreeBitMapEntry_r(&tmp, vp, &vp->vnodeIndex[class], bitNumber,
716 VOL_FREE_BITMAP_WAIT);
717 VInvalidateVnode_r(vnp);
718 VnUnlock(vnp, WRITE_LOCK);
719 VnCancelReservation_r(vnp);
720 #ifdef AFS_DEMAND_ATTACH_FS
721 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
/* non-DAFS fallback: force the damaged volume offline. */
723 VForceOffline_r(vp, 0);
729 /* no such vnode in the cache */
731 vnp = VGetFreeVnode_r(vcp);
733 /* Initialize the header fields so no one allocates another
734 * vnode with the same number */
735 Vn_id(vnp) = vnodeNumber;
736 VnCreateReservation_r(vnp);
737 AddToVVnList(vp, vnp);
738 #ifdef AFS_DEMAND_ATTACH_FS
742 /* This will never block (guaranteed by check in VGetFreeVnode_r() */
743 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
745 #ifdef AFS_DEMAND_ATTACH_FS
746 VnChangeState_r(vnp, VN_STATE_ALLOC);
749 /* Sanity check: is this vnode really not in use? */
752 IHandle_t *ihP = vp->vnodeIndex[class].handle;
754 afs_foff_t off = vnodeIndexOffset(vcp, vnodeNumber);
757 /* XXX we have a potential race here if two threads
758 * allocate new vnodes at the same time, and they
759 * both decide it's time to extend the index
762 #ifdef AFS_DEMAND_ATTACH_FS
764 * this race has been eliminated for the DAFS case
765 * using exclusive state VOL_STATE_VNODE_ALLOC
767 * if this becomes a bottleneck, there are ways to
768 * improve parallelism for this code path
769 * -- tkeiser 11/28/2007
771 VCreateReservation_r(vp);
772 VWaitExclusiveState_r(vp);
773 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_ALLOC);
/* Index-file verification: open, stat, then either read the slot or
 * grow the file. Each failure path jumps to error_encountered. */
779 Log("VAllocVnode: can't open index file!\n");
781 goto error_encountered;
783 if ((size = FDH_SIZE(fdP)) < 0) {
784 Log("VAllocVnode: can't stat index file!\n");
786 goto error_encountered;
788 if (off + vcp->diskSize <= size) {
789 if (FDH_PREAD(fdP, &vnp->disk, vcp->diskSize, off) != vcp->diskSize) {
790 Log("VAllocVnode: can't read index file!\n");
792 goto error_encountered;
/* Slot must be blank on disk, or the bitmap/index disagree. */
794 if (vnp->disk.type != vNull) {
795 Log("VAllocVnode: addled bitmap or index!\n");
797 goto error_encountered;
800 /* growing file - grow in a reasonable increment */
801 char *buf = (char *)malloc(16 * 1024);
803 Log("VAllocVnode: can't grow vnode index: out of memory\n");
805 goto error_encountered;
807 memset(buf, 0, 16 * 1024);
808 if ((FDH_PWRITE(fdP, buf, 16 * 1024, off)) != 16 * 1024) {
809 Log("VAllocVnode: can't grow vnode index: write failed\n");
812 goto error_encountered;
/* Success path: restore the saved DAFS volume state and drop the ref. */
819 #ifdef AFS_DEMAND_ATTACH_FS
820 VChangeState_r(vp, vol_state_save);
821 VCancelReservation_r(vp);
/* error_encountered cleanup (label elided): */
828 * close the file handle
830 * invalidate the vnode
831 * free up the bitmap entry (although salvager should take care of it)
833 * drop vnode lock and refs
838 VFreeBitMapEntry_r(&tmp, vp, &vp->vnodeIndex[class], bitNumber, 0 /*flags*/);
839 VInvalidateVnode_r(vnp);
840 VnUnlock(vnp, WRITE_LOCK);
841 VnCancelReservation_r(vnp);
842 #ifdef AFS_DEMAND_ATTACH_FS
843 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
844 VCancelReservation_r(vp);
846 VForceOffline_r(vp, 0);
/* sane-vnode path: hash it in and blank/stamp the in-memory copy. */
851 VNLog(4, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
852 #ifndef AFS_DEMAND_ATTACH_FS
857 VNLog(5, 1, (intptr_t)vnp, 0, 0, 0);
858 memset(&vnp->disk, 0, sizeof(vnp->disk));
859 vnp->changed_newTime = 0; /* set this bit when vnode is updated */
860 vnp->changed_oldTime = 0; /* set this on CopyOnWrite. */
862 vnp->disk.vnodeMagic = vcp->magic;
863 vnp->disk.type = type;
864 vnp->disk.uniquifier = unique;
867 vp->header->diskstuff.filecount++;
868 #ifdef AFS_DEMAND_ATTACH_FS
869 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
875 * load a vnode from disk.
877 * @param[out] ec client error code return
878 * @param[in] vp volume object pointer
879 * @param[in] vnp vnode object pointer
880 * @param[in] vcp vnode class info object pointer
881 * @param[in] class vnode class enumeration
883 * @pre vnode is registered in appropriate data structures;
884 * caller holds a ref on vnode; VOL_LOCK is held
886 * @post vnode data is loaded from disk.
887 * vnode state is set to VN_STATE_ONLINE.
888 * on failure, vnode is invalidated.
890 * @internal vnode package internal use only
/*
 * VnLoad -- read a vnode's disk image into the cache slot vnp.
 *
 * Pre: caller holds a ref on vnp and VOL_LOCK; vnp already hashed.
 * Post: disk data loaded and handle initialized, vnode unlocked and
 * (DAFS) moved to VN_STATE_ONLINE; on failure the vnode is invalidated
 * and the volume is salvaged (DAFS) or forced offline.
 * NOTE(review): interior lines (FDH_OPEN, error labels, returns) elided.
 */
893 VnLoad(Error * ec, Volume * vp, Vnode * vnp,
894 struct VnodeClassInfo * vcp, VnodeClass class)
896 /* vnode not cached */
900 IHandle_t *ihP = vp->vnodeIndex[class].handle;
907 #ifdef AFS_DEMAND_ATTACH_FS
908 VnChangeState_r(vnp, VN_STATE_LOAD);
911 /* This will never block */
912 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
/* Open failure on the index handle (condition line elided). */
917 Log("VnLoad: can't open index dev=%u, i=%s\n", vp->device,
918 PrintInode(stmp, vp->vnodeIndex[class].handle->ih_ino));
920 goto error_encountered_nolock;
921 } else if ((nBytes = FDH_PREAD(fdP, (char *)&vnp->disk, vcp->diskSize, vnodeIndexOffset(vcp, Vn_id(vnp))))
923 /* Don't take volume off line if the inumber is out of range
924 * or the inode table is full. */
925 if (nBytes == BAD_IGET) {
926 Log("VnLoad: bad inumber %s\n",
927 PrintInode(stmp, vp->vnodeIndex[class].handle->ih_ino));
930 } else if (nBytes == -1 && errno == EIO) {
931 /* disk error; salvage */
932 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
934 /* vnode is not allocated */
936 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); read %d bytes, errno %d\n",
937 Vn_id(vnp), V_id(vp), V_name(vp), (int)nBytes, errno);
941 goto error_encountered_nolock;
946 /* Quick check to see that the data is reasonable */
947 if (vnp->disk.vnodeMagic != vcp->magic || vnp->disk.type == vNull) {
948 if (vnp->disk.type == vNull) {
/* Blank on disk: cross-check against the allocation bitmap to decide
 * whether the request was simply for an unallocated vnode. */
952 struct vnodeIndex *index = &vp->vnodeIndex[class];
953 unsigned int bitNumber = vnodeIdToBitNumber(Vn_id(vnp));
954 unsigned int offset = bitNumber >> 3;
956 #ifdef AFS_DEMAND_ATTACH_FS
957 /* Make sure the volume bitmap isn't getting updated while we are
959 VWaitExclusiveState_r(vp);
962 /* Test to see if vnode number is valid. */
963 if ((offset >= index->bitmapSize)
964 || ((*(index->bitmap + offset) & (1 << (bitNumber & 0x7)))
966 Log("VnLoad: Request for unallocated vnode %u, volume %u (%s) denied.\n", Vn_id(vnp), V_id(vp), V_name(vp));
/* Magic mismatch with non-null type: real corruption, needs salvage. */
970 Log("VnLoad: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
973 goto error_encountered;
/* Success: bind the vnode's ihandle and publish it online. */
976 IH_INIT(vnp->handle, V_device(vp), V_parentId(vp), VN_GET_INO(vnp));
977 VnUnlock(vnp, WRITE_LOCK);
978 #ifdef AFS_DEMAND_ATTACH_FS
979 VnChangeState_r(vnp, VN_STATE_ONLINE);
984 error_encountered_nolock:
986 FDH_REALLYCLOSE(fdP);
992 #ifdef AFS_DEMAND_ATTACH_FS
993 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
995 VForceOffline_r(vp, 0);
1002 VInvalidateVnode_r(vnp);
1003 VnUnlock(vnp, WRITE_LOCK);
1007 * store a vnode to disk.
1009 * @param[out] ec error code output
1010 * @param[in] vp volume object pointer
1011 * @param[in] vnp vnode object pointer
1012 * @param[in] vcp vnode class info object pointer
1013 * @param[in] class vnode class enumeration
1015 * @pre VOL_LOCK held.
1016 * caller holds refs to volume and vnode.
1017 * DAFS: caller is responsible for performing state sanity checks.
1019 * @post vnode state is stored to disk.
1021 * @internal vnode package internal use only
/*
 * VnStore -- write vnp's disk image back to the volume's vnode index.
 *
 * Pre: VOL_LOCK held; caller holds refs on vp and vnp.
 * DAFS: the vnode transits through VN_STATE_STORE and is restored to the
 * saved state on success; on failure we request a salvage rather than
 * dumping core. NOTE(review): open/short-write branch lines are elided.
 */
1024 VnStore(Error * ec, Volume * vp, Vnode * vnp,
1025 struct VnodeClassInfo * vcp, VnodeClass class)
1029 IHandle_t *ihP = vp->vnodeIndex[class].handle;
1032 #ifdef AFS_DEMAND_ATTACH_FS
1033 VnState vn_state_save;
1038 #ifdef AFS_DEMAND_ATTACH_FS
1039 vn_state_save = VnChangeState_r(vnp, VN_STATE_STORE);
1042 offset = vnodeIndexOffset(vcp, Vn_id(vnp));
/* Open failure (condition line elided). */
1046 Log("VnStore: can't open index file!\n");
1047 goto error_encountered;
1049 nBytes = FDH_PWRITE(fdP, &vnp->disk, vcp->diskSize, offset);
1050 if (nBytes != vcp->diskSize) {
1051 /* Don't force volume offline if the inumber is out of
1052 * range or the inode table is full.
1054 FDH_REALLYCLOSE(fdP);
1055 if (nBytes == BAD_IGET) {
1056 Log("VnStore: bad inumber %s\n",
1058 vp->vnodeIndex[class].handle->ih_ino));
1061 #ifdef AFS_DEMAND_ATTACH_FS
1062 VnChangeState_r(vnp, VN_STATE_ERROR);
/* Any other short write: volume needs salvage / forced offline. */
1065 Log("VnStore: Couldn't write vnode %u, volume %u (%s) (error %d)\n", Vn_id(vnp), V_id(Vn_volume(vnp)), V_name(Vn_volume(vnp)), (int)nBytes);
1066 #ifdef AFS_DEMAND_ATTACH_FS
1067 goto error_encountered;
1070 VForceOffline_r(vp, 0);
/* Success path: restore the DAFS vnode state saved above. */
1080 #ifdef AFS_DEMAND_ATTACH_FS
1081 VnChangeState_r(vnp, vn_state_save);
1086 #ifdef AFS_DEMAND_ATTACH_FS
1087 /* XXX instead of dumping core, let's try to request a salvage
1088 * and just fail the putvnode */
1092 VnChangeState_r(vnp, VN_STATE_ERROR);
1093 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1100 * get a handle to a vnode object.
1102 * @param[out] ec error code
1103 * @param[in] vp volume object
1104 * @param[in] vnodeNumber vnode id
1105 * @param[in] locktype type of lock to acquire
1107 * @return vnode object pointer
/* Public entry point: presumably wraps VGetVnode_r in VOL_LOCK
 * (lock/unlock lines elided in this view — TODO confirm). */
1112 VGetVnode(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1113 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1116 retVal = VGetVnode_r(ec, vp, vnodeNumber, locktype);
1122 * get a handle to a vnode object.
1124 * @param[out] ec error code
1125 * @param[in] vp volume object
1126 * @param[in] vnodeNumber vnode id
1127 * @param[in] locktype type of lock to acquire
1129 * @return vnode object pointer
1131 * @internal vnode package internal use only
1133 * @pre VOL_LOCK held.
1134 * heavyweight ref held on volume object.
/*
 * VGetVnode_r -- fetch a handle to vnode vnodeNumber in volume vp,
 * locked per locktype (READ_LOCK or WRITE_LOCK).
 *
 * Pre: VOL_LOCK held; heavyweight ref on vp.
 * Cache hit: take a reservation and (DAFS) wait out conflicting states.
 * Cache miss: reclaim an LRU slot and load from disk via VnLoad.
 * NOTE(review): many interior lines (returns, else branches) are elided.
 */
1137 VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1138 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1141 struct VnodeClassInfo *vcp;
/* vnode 0 is never valid. */
1145 if (vnodeNumber == 0) {
1150 VNLog(100, 1, vnodeNumber, 0, 0, 0);
1152 #ifdef AFS_DEMAND_ATTACH_FS
1154 * once a volume has entered an error state, don't permit
1155 * further operations to proceed
1156 * -- tkeiser 11/21/2007
1158 VWaitExclusiveState_r(vp);
1159 if (VIsErrorState(V_attachState(vp))) {
1160 /* XXX is VSALVAGING acceptable here? */
1166 if (programType == fileServer && !V_inUse(vp)) {
1167 *ec = (vp->specialStatus ? vp->specialStatus : VOFFLINE);
1169 /* If the volume is VBUSY (being cloned or dumped) and this is
1170 * a READ operation, then don't fail.
1172 if ((*ec != VBUSY) || (locktype != READ_LOCK)) {
1177 class = vnodeIdToClass(vnodeNumber);
1178 vcp = &VnodeClassInfo[class];
1179 if (locktype == WRITE_LOCK && !VolumeWriteable(vp)) {
1180 *ec = (bit32) VREADONLY;
1184 if (locktype == WRITE_LOCK && programType == fileServer) {
1185 VAddToVolumeUpdateList_r(ec, vp);
1193 /* See whether the vnode is in the cache. */
1194 vnp = VLookupVnode(vp, vnodeNumber);
1196 /* vnode is in cache */
1198 VNLog(101, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
1199 VnCreateReservation_r(vnp);
1201 #ifdef AFS_DEMAND_ATTACH_FS
1203 * this is the one DAFS case where we may run into contention.
1204 * here's the basic control flow:
1206 * if locktype is READ_LOCK:
1207 * wait until vnode is not exclusive
1208 * set to VN_STATE_READ
1209 * increment read count
1212 * wait until vnode is quiescent
1213 * set to VN_STATE_EXCLUSIVE
1216 if (locktype == READ_LOCK) {
1217 VnWaitExclusiveState_r(vnp);
/* WRITE_LOCK branch (else line elided): wait for full quiescence. */
1219 VnWaitQuiescent_r(vnp);
1222 if (VnIsErrorState(Vn_state(vnp))) {
1223 VnCancelReservation_r(vnp);
1227 #endif /* AFS_DEMAND_ATTACH_FS */
1229 /* vnode not cached */
1231 /* Not in cache; tentatively grab most distantly used one from the LRU
1234 vnp = VGetFreeVnode_r(vcp);
1237 vnp->changed_newTime = vnp->changed_oldTime = 0;
1239 Vn_id(vnp) = vnodeNumber;
1240 VnCreateReservation_r(vnp);
1241 AddToVVnList(vp, vnp);
1242 #ifdef AFS_DEMAND_ATTACH_FS
1247 * XXX for non-DAFS, there is a serious
1248 * race condition here:
1250 * two threads can race to load a vnode. the net
1251 * result is two struct Vnodes can be allocated
1252 * and hashed, which point to the same underlying
1253 * disk data store. conflicting vnode locks can
1254 * thus be held concurrently.
1256 * for non-DAFS to be safe, VOL_LOCK really shouldn't
1257 * be dropped in VnLoad. Of course, this would likely
1258 * lead to an unacceptable slow-down.
1261 VnLoad(ec, vp, vnp, vcp, class);
/* Load failed (condition line elided): drop our reservation. */
1263 VnCancelReservation_r(vnp);
1266 #ifndef AFS_DEMAND_ATTACH_FS
1271 * there is no possibility for contention. we "own" this vnode.
1277 * it is imperative that nothing drop vol lock between here
1278 * and the VnBeginRead/VnChangeState stanza below
1281 VnLock(vnp, locktype, VOL_LOCK_HELD, MIGHT_DEADLOCK);
1283 /* Check that the vnode hasn't been removed while we were obtaining
1285 VNLog(102, 2, vnodeNumber, (intptr_t) vnp, 0, 0);
1286 if ((vnp->disk.type == vNull) || (Vn_cacheCheck(vnp) == 0)) {
1287 VnUnlock(vnp, locktype);
1288 VnCancelReservation_r(vnp);
1290 /* vnode is labelled correctly by now, so we don't have to invalidate it */
1294 #ifdef AFS_DEMAND_ATTACH_FS
1295 if (locktype == READ_LOCK) {
1298 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
1302 if (programType == fileServer)
1303 VBumpVolumeUsage_r(Vn_volume(vnp)); /* Hack; don't know where it should be
1304 * called from. Maybe VGetVolume */
/* When nonzero (the default), cached vnode entries are trusted on hit. */
1309 int TrustVnodeCacheEntry = 1;
1310 /* This variable is bogus--when it's set to 0, the hash chains fill
1311 up with multiple versions of the same vnode. Should fix this!! */
/* Public entry point: presumably wraps VPutVnode_r in VOL_LOCK
 * (lock/unlock lines elided in this view — TODO confirm). */
1313 VPutVnode(Error * ec, Vnode * vnp)
1316 VPutVnode_r(ec, vnp);
1321 * put back a handle to a vnode object.
1323 * @param[out] ec client error code
1324 * @param[in] vnp vnode object pointer
1326 * @pre VOL_LOCK held.
1327 * ref held on vnode.
1329 * @post ref dropped on vnode.
1330 * if vnode was modified or deleted, it is written out to disk
1331 * (assuming a write lock was held).
1333 * @internal volume package internal use only
/*
 * VPutVnode_r -- release a vnode handle; if it was write-locked and
 * modified or deleted, flush it to disk (and free its bitmap entry on
 * delete) before dropping the lock and reservation.
 *
 * Pre: VOL_LOCK held; caller holds a ref on vnp.
 * NOTE(review): several interior lines (declarations, returns) elided.
 */
1336 VPutVnode_r(Error * ec, Vnode * vnp)
1340 struct VnodeClassInfo *vcp;
1343 osi_Assert(Vn_refcount(vnp) != 0);
1344 class = vnodeIdToClass(Vn_id(vnp));
1345 vcp = &VnodeClassInfo[class];
1346 osi_Assert(vnp->disk.vnodeMagic == vcp->magic);
1347 VNLog(200, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1349 #ifdef AFS_DEMAND_ATTACH_FS
1350 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
/* non-DAFS: consult the plain reader/writer lock. */
1352 writeLocked = WriteLocked(&vnp->lock);
/* write-locked path: verify we are the writer, then flush if dirty. */
1357 #ifdef AFS_PTHREAD_ENV
1358 pthread_t thisProcess = pthread_self();
1359 #else /* AFS_PTHREAD_ENV */
1360 PROCESS thisProcess;
1361 LWP_CurrentProcess(&thisProcess);
1362 #endif /* AFS_PTHREAD_ENV */
/* NOTE(review): both changed flags are shifted by the same amount (<< 1)
 * in this packed log word — looks suspicious, but matches the code as
 * written; log-only, no functional impact. */
1363 VNLog(201, 2, (intptr_t) vnp,
1364 ((vnp->changed_newTime) << 1) | ((vnp->
1365 changed_oldTime) << 1) | vnp->
1367 if (thisProcess != vnp->writer)
1368 Abort("VPutVnode: Vnode at %"AFS_PTR_FMT" locked by another process!\n",
1372 if (vnp->changed_oldTime || vnp->changed_newTime || vnp->delete) {
1373 Volume *vp = Vn_volume(vnp);
1374 afs_uint32 now = FT_ApproxTime();
1375 osi_Assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1378 /* No longer any directory entries for this vnode. Free the Vnode */
1379 memset(&vnp->disk, 0, sizeof(vnp->disk));
1380 /* delete flag turned off further down */
1381 VNLog(202, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1382 } else if (vnp->changed_newTime) {
1383 vnp->disk.serverModifyTime = now;
1385 if (vnp->changed_newTime)
1387 V_updateDate(vp) = vp->updateTime = now;
/* Bump the volume update counter, saturating at MAXINT. */
1388 if(V_volUpCounter(vp)<MAXINT)
1389 V_volUpCounter(vp)++;
1392 /* The vnode has been changed. Write it out to disk */
1394 #ifdef AFS_DEMAND_ATTACH_FS
1395 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1397 osi_Assert(V_needsSalvaged(vp));
1401 VnStore(ec, vp, vnp, vcp, class);
1403 /* If the vnode is to be deleted, and we wrote the vnode out,
1404 * free its bitmap entry. Do after the vnode is written so we
1405 * don't allocate from bitmap before the vnode is written
1406 * (doing so could cause a "addled bitmap" message).
1408 if (vnp->delete && !*ec) {
/* Decrement the volume file count, clamping at zero. */
1409 if (Vn_volume(vnp)->header->diskstuff.filecount-- < 1)
1410 Vn_volume(vnp)->header->diskstuff.filecount = 0;
1411 VFreeBitMapEntry_r(ec, vp, &vp->vnodeIndex[class],
1412 vnodeIdToBitNumber(Vn_id(vnp)),
1413 VOL_FREE_BITMAP_WAIT);
1417 vnp->changed_newTime = vnp->changed_oldTime = 0;
1419 #ifdef AFS_DEMAND_ATTACH_FS
1420 VnChangeState_r(vnp, VN_STATE_ONLINE);
1422 } else { /* Not write locked */
1423 if (vnp->changed_newTime || vnp->changed_oldTime || vnp->delete)
1425 ("VPutVnode: Change or delete flag for vnode "
1426 "%"AFS_PTR_FMT" is set but vnode is not write locked!\n",
1428 #ifdef AFS_DEMAND_ATTACH_FS
1433 /* Do not look at disk portion of vnode after this point; it may
1434 * have been deleted above */
1436 VnUnlock(vnp, ((writeLocked) ? WRITE_LOCK : READ_LOCK));
1437 VnCancelReservation_r(vnp);
1441 * Make an attempt to convert a vnode lock from write to read.
1442 * Do nothing if the vnode isn't write locked or the vnode has
/* Public entry point: presumably wraps VVnodeWriteToRead_r in VOL_LOCK
 * (lock/unlock lines elided in this view — TODO confirm). */
1446 VVnodeWriteToRead(Error * ec, Vnode * vnp)
1450 retVal = VVnodeWriteToRead_r(ec, vnp);
1456 * convert vnode handle from mutually exclusive to shared access.
1458 * @param[out] ec client error code
1459 * @param[in] vnp vnode object pointer
1461 * @return unspecified use (see out argument 'ec' for error code return)
1463 * @pre VOL_LOCK held.
1464 * ref held on vnode.
1465 * write lock held on vnode.
1467 * @post read lock held on vnode.
1468 * if vnode was modified, it has been written to disk.
1470 * @internal volume package internal use only
1473 VVnodeWriteToRead_r(Error * ec, Vnode * vnp)
1477 struct VnodeClassInfo *vcp;
/* Writer-identity type depends on the threading build: pthread id under
 * AFS_PTHREAD_ENV, LWP PROCESS otherwise. */
1478 #ifdef AFS_PTHREAD_ENV
1479 pthread_t thisProcess;
1480 #else /* AFS_PTHREAD_ENV */
1481 PROCESS thisProcess;
1482 #endif /* AFS_PTHREAD_ENV */
/* Sanity checks: caller must hold a reference, and the cached disk image
 * must carry the magic number for this vnode's class (large/small). */
1485 osi_Assert(Vn_refcount(vnp) != 0);
1486 class = vnodeIdToClass(Vn_id(vnp));
1487 vcp = &VnodeClassInfo[class];
1488 osi_Assert(vnp->disk.vnodeMagic == vcp->magic);
1489 VNLog(300, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
/* Under demand-attach, exclusivity is tracked as a vnode state rather than
 * as a bit in the rwlock. */
1491 #ifdef AFS_DEMAND_ATTACH_FS
1492 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1494 writeLocked = WriteLocked(&vnp->lock);
/* NOTE(review): both changed flags are shifted left by 1, so the
 * changed_newTime and changed_oldTime bits collide in the logged mask
 * (and neither can be distinguished from the other).  Looks like a latent
 * logging bug -- the second shift was probably meant to differ; confirm
 * the intended bit encoding. */
1501 VNLog(301, 2, (intptr_t) vnp,
1502 ((vnp->changed_newTime) << 1) | ((vnp->
1503 changed_oldTime) << 1) | vnp->
1507 #ifdef AFS_PTHREAD_ENV
1508 thisProcess = pthread_self();
1509 #else /* AFS_PTHREAD_ENV */
1510 LWP_CurrentProcess(&thisProcess);
1511 #endif /* AFS_PTHREAD_ENV */
/* Only the thread/process that took the write lock may demote it.
 * NOTE(review): the Abort message says "VPutVnode" but this is
 * VVnodeWriteToRead_r -- stale copy/paste; the message text should be
 * corrected in the full source. */
1512 if (thisProcess != vnp->writer)
1513 Abort("VPutVnode: Vnode at %"AFS_PTR_FMT
1514 " locked by another process!\n", vnp);
/* If the vnode was dirtied while write-locked, stamp modify/update times
 * and flush the on-disk image before demoting the lock. */
1519 if (vnp->changed_oldTime || vnp->changed_newTime) {
1520 Volume *vp = Vn_volume(vnp);
1521 afs_uint32 now = FT_ApproxTime();
1522 osi_Assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
/* NOTE(review): changed_newTime is tested twice in a row here; lines are
 * elided in this view, but if the full source matches, the two ifs could
 * be merged (or the second was meant to test a different flag). */
1523 if (vnp->changed_newTime)
1524 vnp->disk.serverModifyTime = now;
1525 if (vnp->changed_newTime)
1526 V_updateDate(vp) = vp->updateTime = now;
1528 /* The inode has been changed. Write it out to disk */
/* On a write error under demand-attach, schedule a salvage of the volume.
 * NOTE(review): an elided guard presumably checks *ec before this call --
 * confirm against the full source. */
1530 #ifdef AFS_DEMAND_ATTACH_FS
1531 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1533 osi_Assert(V_needsSalvaged(vp));
1537 VnStore(ec, vp, vnp, vcp, class);
/* Clear the dirty flags now that the image is (or should be) on disk. */
1540 vnp->changed_newTime = vnp->changed_oldTime = 0;
1544 #ifdef AFS_DEMAND_ATTACH_FS
1545 VnChangeState_r(vnp, VN_STATE_ONLINE);
/* Finally demote the heavyweight lock from write to read access. */
1548 ConvertWriteToReadLock(&vnp->lock);
/* Tuning knobs for the scratch vector of ihandle pointers collected while a
 * volume's vnode cache is being torn down (see VInvalidateVnodesByVolume_r).
 * The vector starts at the base size and grows by the increment on demand. */
1554 * initial size of ihandle pointer vector.
1556 * @see VInvalidateVnodesByVolume_r
1558 #define IH_VEC_BASE_SIZE 256
1561 * increment amount for growing ihandle pointer vector.
1563 * @see VInvalidateVnodesByVolume_r
1565 #define IH_VEC_INCREMENT 256
1568 * Compile list of ihandles to be released/reallyclosed at a later time.
1570 * @param[in] vp volume object pointer
1571 * @param[out] vec_out vector of ihandle pointers to be released/reallyclosed
1572 * @param[out] vec_len_out number of valid elements in ihandle vector
1574 * @pre - VOL_LOCK is held
1575 * - volume is in appropriate exclusive state (e.g. VOL_STATE_VNODE_CLOSE,
1576 * VOL_STATE_VNODE_RELEASE)
1578 * @post - all vnodes on VVn list are invalidated
1579 * - ih_vec is populated with all valid ihandles
1581 * @return operation status
1583 * @retval ENOMEM out of memory
1585 * @todo we should handle out of memory conditions more gracefully.
1587 * @internal vnode package internal use only
1590 VInvalidateVnodesByVolume_r(Volume * vp,
1591 IHandle_t *** vec_out,
1592 size_t * vec_len_out)
1596 size_t i = 0, vec_len;
1597 IHandle_t **ih_vec, **ih_vec_new;
1599 #ifdef AFS_DEMAND_ATTACH_FS
1601 #endif /* AFS_DEMAND_ATTACH_FS */
/* Start with IH_VEC_BASE_SIZE slots; grown below by IH_VEC_INCREMENT.
 * NOTE(review): an elided line presumably checks the malloc result and
 * returns ENOMEM -- confirm against the full source. */
1603 vec_len = IH_VEC_BASE_SIZE;
1604 ih_vec = malloc(sizeof(IHandle_t *) * vec_len);
1605 #ifdef AFS_DEMAND_ATTACH_FS
1612 * Traverse the volume's vnode list. Pull all the ihandles out into a
1613 * thread-private array for later asynchronous processing.
1615 #ifdef AFS_DEMAND_ATTACH_FS
1618 for (queue_Scan(&vp->vnode_list, vnp, nvnp, Vnode)) {
1619 if (vnp->handle != NULL) {
/* Vector is full: grow it.  Under demand-attach the glock is presumably
 * dropped around realloc (elided), which is why the traversal below may
 * need to restart.  The realloc result is correctly kept in a temporary
 * so ih_vec is not leaked on failure. */
1621 #ifdef AFS_DEMAND_ATTACH_FS
1624 vec_len += IH_VEC_INCREMENT;
1625 ih_vec_new = realloc(ih_vec, sizeof(IHandle_t *) * vec_len);
1626 #ifdef AFS_DEMAND_ATTACH_FS
1629 if (ih_vec_new == NULL) {
1633 ih_vec = ih_vec_new;
1634 #ifdef AFS_DEMAND_ATTACH_FS
1636 * Theoretically, the volume's VVn list should not change
1637 * because the volume is in an exclusive state. For the
1638 * sake of safety, we will restart the traversal from the
1639 * beginning (which is not expensive because we're
1640 * deleting the items from the list as we go).
1642 goto restart_traversal;
/* Steal the ihandle pointer for deferred close/release, then detach the
 * vnode cache object from the volume. */
1645 ih_vec[i++] = vnp->handle;
1648 DeleteFromVVnList(vnp);
1649 VInvalidateVnode_r(vnp);
1659 /* VCloseVnodeFiles - called when a volume is going off line. All open
1660 * files for vnodes in that volume are closed. This might be excessive,
1661 * since we may only be taking one volume of a volume group offline.
1664 VCloseVnodeFiles_r(Volume * vp)
1666 #ifdef AFS_DEMAND_ATTACH_FS
1667 VolState vol_state_save;
1669 IHandle_t ** ih_vec;
/* Under demand-attach, park the volume in an exclusive state so the vnode
 * list cannot change while we work; the prior state is restored on exit. */
1672 #ifdef AFS_DEMAND_ATTACH_FS
1673 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_CLOSE);
1674 #endif /* AFS_DEMAND_ATTACH_FS */
/* NOTE(review): the invalidation call (which has essential side effects)
 * is wrapped in osi_Assert; if osi_Assert were ever compiled out this
 * work would be skipped.  The XXX below acknowledges the error handling
 * needs rework. */
1676 /* XXX need better error handling here */
1677 osi_Assert(VInvalidateVnodesByVolume_r(vp,
1683 * now we drop VOL_LOCK while we perform some potentially very
1684 * expensive operations in the background
1686 #ifdef AFS_DEMAND_ATTACH_FS
/* Force each collected handle's file descriptors closed, then drop our
 * reference on the handle. */
1690 for (i = 0; i < vec_len; i++) {
1691 IH_REALLYCLOSE(ih_vec[i]);
1692 IH_RELEASE(ih_vec[i]);
1697 #ifdef AFS_DEMAND_ATTACH_FS
1699 VChangeState_r(vp, vol_state_save);
1700 #endif /* AFS_DEMAND_ATTACH_FS */
1705 * shut down all vnode cache state for a given volume.
1707 * @param[in] vp volume object pointer
1709 * @pre VOL_LOCK is held
1711 * @post all file descriptors closed.
1712 * all inode handles released.
1713 * all vnode cache objects disassociated from volume.
1715 * @note for DAFS, these operations are performed outside the vol glock under
1716 * volume exclusive state VOL_STATE_VNODE_RELEASE. Please further note
1717 * that it would be a bug to acquire and release a volume reservation
1718 * during this exclusive operation. This is due to the fact that we are
1719 * generally called during the refcount 1->0 transition.
1721 * @todo we should handle failures in VInvalidateVnodesByVolume_r more
1724 * @see VInvalidateVnodesByVolume_r
1726 * @internal this routine is internal to the volume package
1729 VReleaseVnodeFiles_r(Volume * vp)
1731 #ifdef AFS_DEMAND_ATTACH_FS
1732 VolState vol_state_save;
1734 IHandle_t ** ih_vec;
/* Same shape as VCloseVnodeFiles_r, but uses the VNODE_RELEASE exclusive
 * state and, below, only IH_RELEASE (no IH_REALLYCLOSE) on each handle. */
1737 #ifdef AFS_DEMAND_ATTACH_FS
1738 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_RELEASE);
1739 #endif /* AFS_DEMAND_ATTACH_FS */
/* NOTE(review): as in VCloseVnodeFiles_r, a side-effecting call is wrapped
 * in osi_Assert; see the @todo above about improving failure handling. */
1741 /* XXX need better error handling here */
1742 osi_Assert(VInvalidateVnodesByVolume_r(vp,
1748 * now we drop VOL_LOCK while we perform some potentially very
1749 * expensive operations in the background
1751 #ifdef AFS_DEMAND_ATTACH_FS
/* Drop our reference on each collected ihandle; descriptors are closed as
 * references drain rather than being forced closed here. */
1755 for (i = 0; i < vec_len; i++) {
1756 IH_RELEASE(ih_vec[i]);
1761 #ifdef AFS_DEMAND_ATTACH_FS
1763 VChangeState_r(vp, vol_state_save);
1764 #endif /* AFS_DEMAND_ATTACH_FS */