2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
15 Institution: The Information Technology Center, Carnegie-Mellon University
18 #include <afsconfig.h>
19 #include <afs/param.h>
/* Largest positive int value.  The shift is done on an unsigned 1 because
 * shifting a signed 1 into the sign bit is undefined behaviour; the result
 * (all bits set except the top one) is then converted back to int. */
#define MAXINT ((int)(~(1U << ((sizeof(int) * 8) - 1))))
28 #ifdef AFS_PTHREAD_ENV
30 #else /* AFS_PTHREAD_ENV */
31 #include <afs/assert.h>
32 #endif /* AFS_PTHREAD_ENV */
35 #include "rx/rx_queue.h"
36 #include <afs/afsint.h>
38 #include <afs/errors.h>
41 #include <afs/afssyscalls.h>
45 #include "volume_inline.h"
46 #include "vnode_inline.h"
47 #include "partition.h"
49 #if defined(AFS_SGI_ENV)
50 #include "sys/types.h"
62 #include <sys/fcntl.h>
65 #endif /* AFS_NT40_ENV */
68 /*@printflike@*/ extern void Log(const char *format, ...);
70 /*@printflike@*/ extern void Abort(const char *format, ...);
73 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
75 private void StickOnLruChain_r(register Vnode * vnp,
76 register struct VnodeClassInfo *vcp);
83 #define BAD_IGET -1000
85 /* There are two separate vnode queue types defined here:
86 * Each hash conflict chain -- is singly linked, with a single head
87 * pointer. New entries are added at the beginning. Old
88 * entries are removed by linear search, which generally
89 * only occurs after a disk read.
90 * LRU chain -- is doubly linked, single head pointer.
91 * Entries are added at the head, reclaimed from the tail,
92 * or removed from anywhere in the queue.
96 /* Vnode hash table. Find hash chain by taking lower bits of
97 * (volume_hash_offset + vnode).
98 * This distributes the root inodes of the volumes over the
99 * hash table entries and also distributes the vnodes of
100 * volumes reasonably fairly. The volume_hash_offset field
101 * for each volume is established as the volume comes on line
102 * by using the VOLUME_HASH_OFFSET macro. This distributes the
103 * volumes fairly among the cache entries, both when servicing
104 * a small number of volumes and when servicing a large number.
107 /* logging stuff for finding bugs */
108 #define THELOGSIZE 5120
109 static afs_int32 theLog[THELOGSIZE];
110 static afs_int32 vnLogPtr = 0;
112 VNLog(afs_int32 aop, afs_int32 anparms, afs_int32 av1, afs_int32 av2,
113 afs_int32 av3, afs_int32 av4)
115 register afs_int32 temp;
118 /* copy data to array */
124 anparms = 4; /* do bounds checking */
126 temp = (aop << 16) | anparms;
127 theLog[vnLogPtr++] = temp;
128 if (vnLogPtr >= THELOGSIZE)
130 for (temp = 0; temp < anparms; temp++) {
131 theLog[vnLogPtr++] = data[temp];
132 if (vnLogPtr >= THELOGSIZE)
137 /* VolumeHashOffset -- returns a new value to be stored in the
138 * volumeHashOffset of a Volume structure. Called when a
139 * volume is initialized. Sets the volumeHashOffset so that
140 * vnode cache entries are distributed reasonably between
141 * volumes (the root vnodes of the volumes will hash to
142 * different values, and spacing is maintained between volumes
143 * when there are not many volumes represented), and spread
144 * equally amongst vnodes within a single volume.
147 VolumeHashOffset_r(void)
149 static int nextVolumeHashOffset = 0;
150 /* hashindex Must be power of two in size */
152 # define hashMask ((1<<hashShift)-1)
153 static byte hashindex[1 << hashShift] =
154 { 0, 128, 64, 192, 32, 160, 96, 224 };
156 offset = hashindex[nextVolumeHashOffset & hashMask]
157 + (nextVolumeHashOffset >> hashShift);
158 nextVolumeHashOffset++;
162 /* Change hashindex (above) if you change this constant */
163 #define VNODE_HASH_TABLE_SIZE 256
164 private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
165 #define VNODE_HASH(volumeptr,vnodenumber)\
166 ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
170 * add a vnode to the volume's vnode list.
172 * @param[in] vp volume object pointer
173 * @param[in] vnp vnode object pointer
175 * @note for DAFS, it may seem like we should be acquiring a lightweight ref
176 * on vp, but this would actually break things. Right now, this is ok
177 * because we destroy all vnode cache contents during volume
182 * @internal volume package internal use only
185 AddToVVnList(Volume * vp, Vnode * vnp)
187 if (queue_IsOnQueue(vnp))
191 Vn_cacheCheck(vnp) = vp->cacheCheck;
192 queue_Append(&vp->vnode_list, vnp);
193 Vn_stateFlags(vnp) |= VN_ON_VVN;
197 * delete a vnode from the volume's vnode list.
201 * @internal volume package internal use only
204 DeleteFromVVnList(register Vnode * vnp)
206 Vn_volume(vnp) = NULL;
208 if (!queue_IsOnQueue(vnp))
212 Vn_stateFlags(vnp) &= ~(VN_ON_VVN);
216 * add a vnode to the end of the lru.
218 * @param[in] vcp vnode class info object pointer
219 * @param[in] vnp vnode object pointer
221 * @internal vnode package internal use only
224 AddToVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
226 if (Vn_stateFlags(vnp) & VN_ON_LRU) {
230 /* Add it to the circular LRU list */
231 if (vcp->lruHead == NULL)
232 Abort("VPutVnode: vcp->lruHead==NULL");
234 vnp->lruNext = vcp->lruHead;
235 vnp->lruPrev = vcp->lruHead->lruPrev;
236 vcp->lruHead->lruPrev = vnp;
237 vnp->lruPrev->lruNext = vnp;
241 /* If the vnode was just deleted, put it at the end of the chain so it
242 * will be reused immediately */
244 vcp->lruHead = vnp->lruNext;
246 Vn_stateFlags(vnp) |= VN_ON_LRU;
250 * delete a vnode from the lru.
252 * @param[in] vcp vnode class info object pointer
253 * @param[in] vnp vnode object pointer
255 * @internal vnode package internal use only
258 DeleteFromVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
260 if (!(Vn_stateFlags(vnp) & VN_ON_LRU)) {
264 if (vnp == vcp->lruHead)
265 vcp->lruHead = vcp->lruHead->lruNext;
267 if ((vnp == vcp->lruHead) ||
268 (vcp->lruHead == NULL))
269 Abort("DeleteFromVnLRU: lru chain addled!\n");
271 vnp->lruPrev->lruNext = vnp->lruNext;
272 vnp->lruNext->lruPrev = vnp->lruPrev;
274 Vn_stateFlags(vnp) &= ~(VN_ON_LRU);
278 * add a vnode to the vnode hash table.
280 * @param[in] vnp vnode object pointer
284 * @post vnode on hash
286 * @internal vnode package internal use only
289 AddToVnHash(Vnode * vnp)
291 unsigned int newHash;
293 if (!(Vn_stateFlags(vnp) & VN_ON_HASH)) {
294 newHash = VNODE_HASH(Vn_volume(vnp), Vn_id(vnp));
295 vnp->hashNext = VnodeHashTable[newHash];
296 VnodeHashTable[newHash] = vnp;
297 vnp->hashIndex = newHash;
299 Vn_stateFlags(vnp) |= VN_ON_HASH;
304 * delete a vnode from the vnode hash table.
311 * @post vnode removed from hash
313 * @internal vnode package internal use only
316 DeleteFromVnHash(Vnode * vnp)
320 if (Vn_stateFlags(vnp) & VN_ON_HASH) {
321 tvnp = VnodeHashTable[vnp->hashIndex];
323 VnodeHashTable[vnp->hashIndex] = vnp->hashNext;
325 while (tvnp && tvnp->hashNext != vnp)
326 tvnp = tvnp->hashNext;
328 tvnp->hashNext = vnp->hashNext;
331 vnp->hashNext = NULL;
333 Vn_stateFlags(vnp) &= ~(VN_ON_HASH);
339 * invalidate a vnode cache entry.
341 * @param[in] avnode vnode object pointer
345 * @post vnode metadata invalidated.
346 * vnode removed from hash table.
347 * DAFS: vnode state set to VN_STATE_INVALID.
349 * @internal vnode package internal use only
352 VInvalidateVnode_r(register struct Vnode *avnode)
354 avnode->changed_newTime = 0; /* don't let it get flushed out again */
355 avnode->changed_oldTime = 0;
356 avnode->delete = 0; /* it isn't deleted, really */
357 avnode->cacheCheck = 0; /* invalid: prevents future vnode searches from working */
358 DeleteFromVnHash(avnode);
359 #ifdef AFS_DEMAND_ATTACH_FS
360 VnChangeState_r(avnode, VN_STATE_INVALID);
366 * initialize vnode cache for a given vnode class.
368 * @param[in] class vnode class
369 * @param[in] nVnodes size of cache
371 * @post vnode cache allocated and initialized
373 * @internal volume package internal use only
375 * @note generally called by VInitVolumePackage_r
377 * @see VInitVolumePackage_r
380 VInitVnodes(VnodeClass class, int nVnodes)
383 register struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
385 vcp->allocs = vcp->gets = vcp->reads = vcp->writes = 0;
386 vcp->cacheSize = nVnodes;
389 assert(CHECKSIZE_SMALLVNODE);
391 vcp->residentSize = SIZEOF_SMALLVNODE;
392 vcp->diskSize = SIZEOF_SMALLDISKVNODE;
393 vcp->magic = SMALLVNODEMAGIC;
397 vcp->residentSize = SIZEOF_LARGEVNODE;
398 vcp->diskSize = SIZEOF_LARGEDISKVNODE;
399 vcp->magic = LARGEVNODEMAGIC;
403 int s = vcp->diskSize - 1;
413 va = (byte *) calloc(nVnodes, vcp->residentSize);
416 Vnode *vnp = (Vnode *) va;
417 Vn_refcount(vnp) = 0; /* no context switches */
418 Vn_stateFlags(vnp) |= VN_ON_LRU;
419 #ifdef AFS_DEMAND_ATTACH_FS
420 assert(pthread_cond_init(&Vn_stateCV(vnp), NULL) == 0);
421 Vn_state(vnp) = VN_STATE_INVALID;
423 #else /* !AFS_DEMAND_ATTACH_FS */
424 Lock_Init(&vnp->lock);
425 #endif /* !AFS_DEMAND_ATTACH_FS */
426 vnp->changed_oldTime = 0;
427 vnp->changed_newTime = 0;
428 Vn_volume(vnp) = NULL;
429 Vn_cacheCheck(vnp) = 0;
430 vnp->delete = Vn_id(vnp) = 0;
431 #ifdef AFS_PTHREAD_ENV
432 vnp->writer = (pthread_t) 0;
433 #else /* AFS_PTHREAD_ENV */
434 vnp->writer = (PROCESS) 0;
435 #endif /* AFS_PTHREAD_ENV */
439 if (vcp->lruHead == NULL)
440 vcp->lruHead = vnp->lruNext = vnp->lruPrev = vnp;
442 vnp->lruNext = vcp->lruHead;
443 vnp->lruPrev = vcp->lruHead->lruPrev;
444 vcp->lruHead->lruPrev = vnp;
445 vnp->lruPrev->lruNext = vnp;
448 va += vcp->residentSize;
455 * allocate an unused vnode from the lru chain.
457 * @param[in] vcp vnode class info object pointer
459 * @pre VOL_LOCK is held
461 * @post vnode object is removed from lru, and vnode hash table.
462 * vnode is disassociated from volume object.
463 * state is set to VN_STATE_INVALID.
464 * inode handle is released.
466 * @note we traverse backwards along the lru circlist. It shouldn't
467 * be necessary to specify that nUsers == 0 since if it is in the list,
468 * nUsers should be 0. Things shouldn't be in lruq unless no one is
471 * @warning DAFS: VOL_LOCK is dropped while doing inode handle release
473 * @return vnode object pointer
476 VGetFreeVnode_r(struct VnodeClassInfo * vcp)
480 vnp = vcp->lruHead->lruPrev;
481 #ifdef AFS_DEMAND_ATTACH_FS
482 if (Vn_refcount(vnp) != 0 || VnIsExclusiveState(Vn_state(vnp)) ||
483 Vn_readers(vnp) != 0)
484 Abort("VGetFreeVnode_r: in-use vnode in lruq");
486 if (Vn_refcount(vnp) != 0 || CheckLock(&vnp->lock))
487 Abort("VGetFreeVnode_r: locked vnode in lruq");
489 VNLog(1, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
492 * it's going to be overwritten soon enough.
493 * remove from LRU, delete hash entry, and
494 * disassociate from old parent volume before
495 * we have a chance to drop the vol glock
497 DeleteFromVnLRU(vcp, vnp);
498 DeleteFromVnHash(vnp);
499 if (Vn_volume(vnp)) {
500 DeleteFromVVnList(vnp);
503 /* drop the file descriptor */
505 #ifdef AFS_DEMAND_ATTACH_FS
506 VnChangeState_r(vnp, VN_STATE_RELEASING);
509 /* release is, potentially, a highly latent operation due to a couple
511 * - ihandle package lock contention
512 * - closing file descriptor(s) associated with ih
514 * Hence, we perform the release outside of the volume package lock in order to
515 * reduce the probability of contention.
517 IH_RELEASE(vnp->handle);
518 #ifdef AFS_DEMAND_ATTACH_FS
523 #ifdef AFS_DEMAND_ATTACH_FS
524 VnChangeState_r(vnp, VN_STATE_INVALID);
532 * lookup a vnode in the vnode cache hash table.
534 * @param[in] vp pointer to volume object
535 * @param[in] vnodeId vnode id
539 * @post matching vnode object or NULL is returned
541 * @return vnode object pointer
542 * @retval NULL no matching vnode object was found in the cache
544 * @internal vnode package internal use only
546 * @note this symbol is exported strictly for fssync debug protocol use
549 VLookupVnode(Volume * vp, VnodeId vnodeId)
552 unsigned int newHash;
554 newHash = VNODE_HASH(vp, vnodeId);
555 for (vnp = VnodeHashTable[newHash];
557 ((Vn_id(vnp) != vnodeId) ||
558 (Vn_volume(vnp) != vp) ||
559 (vp->cacheCheck != Vn_cacheCheck(vnp))));
560 vnp = vnp->hashNext);
567 VAllocVnode(Error * ec, Volume * vp, VnodeType type)
571 retVal = VAllocVnode_r(ec, vp, type);
577 * allocate a new vnode.
579 * @param[out] ec error code return
580 * @param[in] vp volume object pointer
581 * @param[in] type desired vnode type
583 * @return vnode object pointer
585 * @pre VOL_LOCK held;
586 * heavyweight ref held on vp
588 * @post vnode allocated and returned
591 VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
596 register struct VnodeClassInfo *vcp;
599 #ifdef AFS_DEMAND_ATTACH_FS
600 VolState vol_state_save;
605 #ifdef AFS_DEMAND_ATTACH_FS
607 * once a volume has entered an error state, don't permit
608 * further operations to proceed
609 * -- tkeiser 11/21/2007
611 VWaitExclusiveState_r(vp);
612 if (VIsErrorState(V_attachState(vp))) {
613 /* XXX is VSALVAGING acceptable here? */
619 if (programType == fileServer && !V_inUse(vp)) {
620 if (vp->specialStatus) {
621 *ec = vp->specialStatus;
627 class = vnodeTypeToClass(type);
628 vcp = &VnodeClassInfo[class];
630 if (!VolumeWriteable(vp)) {
631 *ec = (bit32) VREADONLY;
635 unique = vp->nextVnodeUnique++;
637 unique = vp->nextVnodeUnique++;
639 if (vp->nextVnodeUnique > V_uniquifier(vp)) {
640 VUpdateVolume_r(ec, vp, 0);
645 if (programType == fileServer) {
646 VAddToVolumeUpdateList_r(ec, vp);
651 /* Find a slot in the bit map */
652 bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
653 VOL_ALLOC_BITMAP_WAIT);
656 vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
660 * at this point we should be assured that V_attachState(vp) is non-exclusive
664 VNLog(2, 1, vnodeNumber, 0, 0, 0);
665 /* Prepare to move it to the new hash chain */
666 vnp = VLookupVnode(vp, vnodeNumber);
668 /* slot already exists. May even not be in lruq (consider store file locking a file being deleted)
669 * so we may have to wait for it below */
670 VNLog(3, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
672 VnCreateReservation_r(vnp);
673 if (Vn_refcount(vnp) == 1) {
674 /* we're the only user */
675 /* This won't block */
676 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
678 /* other users present; follow locking hierarchy */
679 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, MIGHT_DEADLOCK);
681 #ifdef AFS_DEMAND_ATTACH_FS
684 * vnode was cached, wait for any existing exclusive ops to finish.
685 * once we have reacquired the lock, re-verify volume state.
687 * note: any vnode error state is related to the old vnode; disregard.
689 VnWaitQuiescent_r(vnp);
690 if (VIsErrorState(V_attachState(vp))) {
691 VnUnlock(vnp, WRITE_LOCK);
692 VnCancelReservation_r(vnp);
699 * verify state of the world hasn't changed
701 * (technically, this should never happen because cachecheck
702 * is only updated during a volume attach, which should not
703 * happen when refs are held)
705 if (Vn_volume(vnp)->cacheCheck != Vn_cacheCheck(vnp)) {
706 VnUnlock(vnp, WRITE_LOCK);
707 VnCancelReservation_r(vnp);
713 /* no such vnode in the cache */
715 vnp = VGetFreeVnode_r(vcp);
717 /* Initialize the header fields so no one allocates another
718 * vnode with the same number */
719 Vn_id(vnp) = vnodeNumber;
720 VnCreateReservation_r(vnp);
721 AddToVVnList(vp, vnp);
722 #ifdef AFS_DEMAND_ATTACH_FS
726 /* This will never block (guaranteed by check in VGetFreeVnode_r()) */
727 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
729 #ifdef AFS_DEMAND_ATTACH_FS
730 VnChangeState_r(vnp, VN_STATE_ALLOC);
733 /* Sanity check: is this vnode really not in use? */
736 IHandle_t *ihP = vp->vnodeIndex[class].handle;
738 off_t off = vnodeIndexOffset(vcp, vnodeNumber);
740 /* XXX we have a potential race here if two threads
741 * allocate new vnodes at the same time, and they
742 * both decide it's time to extend the index
745 #ifdef AFS_DEMAND_ATTACH_FS
747 * this race has been eliminated for the DAFS case
748 * using exclusive state VOL_STATE_VNODE_ALLOC
750 * if this becomes a bottleneck, there are ways to
751 * improve parallelism for this code path
752 * -- tkeiser 11/28/2007
754 VCreateReservation_r(vp);
755 VWaitExclusiveState_r(vp);
756 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_ALLOC);
762 Log("VAllocVnode: can't open index file!\n");
763 goto error_encountered;
765 if ((size = FDH_SIZE(fdP)) < 0) {
766 Log("VAllocVnode: can't stat index file!\n");
767 goto error_encountered;
769 if (FDH_SEEK(fdP, off, SEEK_SET) < 0) {
770 Log("VAllocVnode: can't seek on index file!\n");
771 goto error_encountered;
773 if (off + vcp->diskSize <= size) {
774 if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) != vcp->diskSize) {
775 Log("VAllocVnode: can't read index file!\n");
776 goto error_encountered;
778 if (vnp->disk.type != vNull) {
779 Log("VAllocVnode: addled bitmap or index!\n");
780 goto error_encountered;
783 /* growing file - grow in a reasonable increment */
784 char *buf = (char *)malloc(16 * 1024);
786 Abort("VAllocVnode: malloc failed\n");
787 memset(buf, 0, 16 * 1024);
788 (void)FDH_WRITE(fdP, buf, 16 * 1024);
794 #ifdef AFS_DEMAND_ATTACH_FS
795 VChangeState_r(vp, vol_state_save);
796 VCancelReservation_r(vp);
802 #ifdef AFS_DEMAND_ATTACH_FS
804 * close the file handle
806 * invalidate the vnode
807 * free up the bitmap entry (although salvager should take care of it)
809 * drop vnode lock and refs
814 VFreeBitMapEntry_r(ec, &vp->vnodeIndex[class], bitNumber);
815 VInvalidateVnode_r(vnp);
816 VnUnlock(vnp, WRITE_LOCK);
817 VnCancelReservation_r(vnp);
818 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
819 VCancelReservation_r(vp);
827 VNLog(4, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
828 #ifndef AFS_DEMAND_ATTACH_FS
833 VNLog(5, 1, (afs_int32) vnp, 0, 0, 0);
834 memset(&vnp->disk, 0, sizeof(vnp->disk));
835 vnp->changed_newTime = 0; /* set this bit when vnode is updated */
836 vnp->changed_oldTime = 0; /* set this on CopyOnWrite. */
838 vnp->disk.vnodeMagic = vcp->magic;
839 vnp->disk.type = type;
840 vnp->disk.uniquifier = unique;
843 vp->header->diskstuff.filecount++;
844 #ifdef AFS_DEMAND_ATTACH_FS
845 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
851 * load a vnode from disk.
853 * @param[out] ec client error code return
854 * @param[in] vp volume object pointer
855 * @param[in] vnp vnode object pointer
856 * @param[in] vcp vnode class info object pointer
857 * @param[in] class vnode class enumeration
859 * @pre vnode is registered in appropriate data structures;
860 * caller holds a ref on vnode; VOL_LOCK is held
862 * @post vnode data is loaded from disk.
863 * vnode state is set to VN_STATE_ONLINE.
864 * on failure, vnode is invalidated.
866 * @internal vnode package internal use only
869 VnLoad(Error * ec, Volume * vp, Vnode * vnp,
870 struct VnodeClassInfo * vcp, VnodeClass class)
872 /* vnode not cached */
875 IHandle_t *ihP = vp->vnodeIndex[class].handle;
881 #ifdef AFS_DEMAND_ATTACH_FS
882 VnChangeState_r(vnp, VN_STATE_LOAD);
885 /* This will never block */
886 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
891 Log("VnLoad: can't open index dev=%u, i=%s\n", vp->device,
892 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
894 goto error_encountered_nolock;
895 } else if (FDH_SEEK(fdP, vnodeIndexOffset(vcp, Vn_id(vnp)), SEEK_SET)
897 Log("VnLoad: can't seek on index file vn=%u\n", Vn_id(vnp));
899 goto error_encountered_nolock;
900 } else if ((n = FDH_READ(fdP, (char *)&vnp->disk, vcp->diskSize))
902 /* Don't take volume off line if the inumber is out of range
903 * or the inode table is full. */
905 Log("VnLoad: bad inumber %s\n",
906 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
909 } else if (n == -1 && errno == EIO) {
910 /* disk error; salvage */
911 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
913 /* vnode is not allocated */
915 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); read %d bytes, errno %d\n",
916 Vn_id(vnp), V_id(vp), V_name(vp), n, errno);
920 goto error_encountered_nolock;
925 /* Quick check to see that the data is reasonable */
926 if (vnp->disk.vnodeMagic != vcp->magic || vnp->disk.type == vNull) {
927 if (vnp->disk.type == vNull) {
931 struct vnodeIndex *index = &vp->vnodeIndex[class];
932 unsigned int bitNumber = vnodeIdToBitNumber(Vn_id(vnp));
933 unsigned int offset = bitNumber >> 3;
935 /* Test to see if vnode number is valid. */
936 if ((offset >= index->bitmapSize)
937 || ((*(index->bitmap + offset) & (1 << (bitNumber & 0x7)))
939 Log("VnLoad: Request for unallocated vnode %u, volume %u (%s) denied.\n", Vn_id(vnp), V_id(vp), V_name(vp));
943 Log("VnLoad: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
946 goto error_encountered;
949 IH_INIT(vnp->handle, V_device(vp), V_parentId(vp), VN_GET_INO(vnp));
950 VnUnlock(vnp, WRITE_LOCK);
951 #ifdef AFS_DEMAND_ATTACH_FS
952 VnChangeState_r(vnp, VN_STATE_ONLINE);
957 error_encountered_nolock:
959 FDH_REALLYCLOSE(fdP);
965 #ifdef AFS_DEMAND_ATTACH_FS
966 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
968 VForceOffline_r(vp, 0);
975 VInvalidateVnode_r(vnp);
976 VnUnlock(vnp, WRITE_LOCK);
980 * store a vnode to disk.
982 * @param[out] ec error code output
983 * @param[in] vp volume object pointer
984 * @param[in] vnp vnode object pointer
985 * @param[in] vcp vnode class info object pointer
986 * @param[in] class vnode class enumeration
988 * @pre VOL_LOCK held.
989 * caller holds refs to volume and vnode.
990 * DAFS: caller is responsible for performing state sanity checks.
992 * @post vnode state is stored to disk.
994 * @internal vnode package internal use only
997 VnStore(Error * ec, Volume * vp, Vnode * vnp,
998 struct VnodeClassInfo * vcp, VnodeClass class)
1001 IHandle_t *ihP = vp->vnodeIndex[class].handle;
1003 #ifdef AFS_DEMAND_ATTACH_FS
1004 VnState vn_state_save;
1009 #ifdef AFS_DEMAND_ATTACH_FS
1010 vn_state_save = VnChangeState_r(vnp, VN_STATE_STORE);
1013 offset = vnodeIndexOffset(vcp, Vn_id(vnp));
1017 Log("VnStore: can't open index file!\n");
1018 goto error_encountered;
1020 if (FDH_SEEK(fdP, offset, SEEK_SET) < 0) {
1021 Log("VnStore: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
1022 fdP, offset, errno);
1023 goto error_encountered;
1026 code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
1027 if (code != vcp->diskSize) {
1028 /* Don't force volume offline if the inumber is out of
1029 * range or the inode table is full.
1031 FDH_REALLYCLOSE(fdP);
1032 if (code == BAD_IGET) {
1033 Log("VnStore: bad inumber %s\n",
1035 vp->vnodeIndex[class].handle->ih_ino));
1038 #ifdef AFS_DEMAND_ATTACH_FS
1039 VnChangeState_r(vnp, VN_STATE_ERROR);
1042 Log("VnStore: Couldn't write vnode %u, volume %u (%s) (error %d)\n", Vn_id(vnp), V_id(Vn_volume(vnp)), V_name(Vn_volume(vnp)), code);
1043 #ifdef AFS_DEMAND_ATTACH_FS
1044 goto error_encountered;
1047 VForceOffline_r(vp, 0);
1057 #ifdef AFS_DEMAND_ATTACH_FS
1058 VnChangeState_r(vnp, vn_state_save);
1063 #ifdef AFS_DEMAND_ATTACH_FS
1064 /* XXX instead of dumping core, let's try to request a salvage
1065 * and just fail the putvnode */
1069 VnChangeState_r(vnp, VN_STATE_ERROR);
1070 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1077 * get a handle to a vnode object.
1079 * @param[out] ec error code
1080 * @param[in] vp volume object
1081 * @param[in] vnodeNumber vnode id
1082 * @param[in] locktype type of lock to acquire
1084 * @return vnode object pointer
1089 VGetVnode(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1090 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1093 retVal = VGetVnode_r(ec, vp, vnodeNumber, locktype);
1099 * get a handle to a vnode object.
1101 * @param[out] ec error code
1102 * @param[in] vp volume object
1103 * @param[in] vnodeNumber vnode id
1104 * @param[in] locktype type of lock to acquire
1106 * @return vnode object pointer
1108 * @internal vnode package internal use only
1110 * @pre VOL_LOCK held.
1111 * heavyweight ref held on volume object.
1114 VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1115 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1116 register Vnode *vnp;
1119 struct VnodeClassInfo *vcp;
1120 Volume * oldvp = NULL;
1124 if (vnodeNumber == 0) {
1129 VNLog(100, 1, vnodeNumber, 0, 0, 0);
1131 #ifdef AFS_DEMAND_ATTACH_FS
1133 * once a volume has entered an error state, don't permit
1134 * further operations to proceed
1135 * -- tkeiser 11/21/2007
1137 VWaitExclusiveState_r(vp);
1138 if (VIsErrorState(V_attachState(vp))) {
1139 /* XXX is VSALVAGING acceptable here? */
1145 if (programType == fileServer && !V_inUse(vp)) {
1146 *ec = (vp->specialStatus ? vp->specialStatus : VOFFLINE);
1148 /* If the volume is VBUSY (being cloned or dumped) and this is
1149 * a READ operation, then don't fail.
1151 if ((*ec != VBUSY) || (locktype != READ_LOCK)) {
1156 class = vnodeIdToClass(vnodeNumber);
1157 vcp = &VnodeClassInfo[class];
1158 if (locktype == WRITE_LOCK && !VolumeWriteable(vp)) {
1159 *ec = (bit32) VREADONLY;
1163 if (locktype == WRITE_LOCK && programType == fileServer) {
1164 VAddToVolumeUpdateList_r(ec, vp);
1172 /* See whether the vnode is in the cache. */
1173 vnp = VLookupVnode(vp, vnodeNumber);
1175 /* vnode is in cache */
1177 VNLog(101, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
1178 VnCreateReservation_r(vnp);
1180 #ifdef AFS_DEMAND_ATTACH_FS
1182 * this is the one DAFS case where we may run into contention.
1183 * here's the basic control flow:
1185 * if locktype is READ_LOCK:
1186 * wait until vnode is not exclusive
1187 * set to VN_STATE_READ
1188 * increment read count
1191 * wait until vnode is quiescent
1192 * set to VN_STATE_EXCLUSIVE
1195 if (locktype == READ_LOCK) {
1196 VnWaitExclusiveState_r(vnp);
1198 VnWaitQuiescent_r(vnp);
1201 if (VnIsErrorState(Vn_state(vnp))) {
1202 VnCancelReservation_r(vnp);
1206 #endif /* AFS_DEMAND_ATTACH_FS */
1208 /* vnode not cached */
1210 /* Not in cache; tentatively grab most distantly used one from the LRU
1213 vnp = VGetFreeVnode_r(vcp);
1216 vnp->changed_newTime = vnp->changed_oldTime = 0;
1218 Vn_id(vnp) = vnodeNumber;
1219 VnCreateReservation_r(vnp);
1220 AddToVVnList(vp, vnp);
1221 #ifdef AFS_DEMAND_ATTACH_FS
1226 * XXX for non-DAFS, there is a serious
1227 * race condition here:
1229 * two threads can race to load a vnode. the net
1230 * result is two struct Vnodes can be allocated
1231 * and hashed, which point to the same underlying
1232 * disk data store. conflicting vnode locks can
1233 * thus be held concurrently.
1235 * for non-DAFS to be safe, VOL_LOCK really shouldn't
1236 * be dropped in VnLoad. Of course, this would likely
1237 * lead to an unacceptable slow-down.
1240 VnLoad(ec, vp, vnp, vcp, class);
1242 VnCancelReservation_r(vnp);
1245 #ifndef AFS_DEMAND_ATTACH_FS
1250 * there is no possibility for contention. we "own" this vnode.
1256 * it is imperative that nothing drop vol lock between here
1257 * and the VnBeginRead/VnChangeState stanza below
1260 VnLock(vnp, locktype, VOL_LOCK_HELD, MIGHT_DEADLOCK);
1262 /* Check that the vnode hasn't been removed while we were obtaining
1264 VNLog(102, 2, vnodeNumber, (afs_int32) vnp, 0, 0);
1265 if ((vnp->disk.type == vNull) || (Vn_cacheCheck(vnp) == 0)) {
1266 VnUnlock(vnp, locktype);
1267 VnCancelReservation_r(vnp);
1269 /* vnode is labelled correctly by now, so we don't have to invalidate it */
1273 #ifdef AFS_DEMAND_ATTACH_FS
1274 if (locktype == READ_LOCK) {
1277 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
1281 if (programType == fileServer)
1282 VBumpVolumeUsage_r(Vn_volume(vnp)); /* Hack; don't know where it should be
1283 * called from. Maybe VGetVolume */
1288 int TrustVnodeCacheEntry = 1;
1289 /* This variable is bogus--when it's set to 0, the hash chains fill
1290 up with multiple versions of the same vnode. Should fix this!! */
1292 VPutVnode(Error * ec, register Vnode * vnp)
1295 VPutVnode_r(ec, vnp);
1300 * put back a handle to a vnode object.
1302 * @param[out] ec client error code
1303 * @param[in] vnp vnode object pointer
1305 * @pre VOL_LOCK held.
1306 * ref held on vnode.
1308 * @post ref dropped on vnode.
1309 * if vnode was modified or deleted, it is written out to disk
1310 * (assuming a write lock was held).
1312 * @internal volume package internal use only
1315 VPutVnode_r(Error * ec, register Vnode * vnp)
1319 struct VnodeClassInfo *vcp;
1323 assert(Vn_refcount(vnp) != 0);
1324 class = vnodeIdToClass(Vn_id(vnp));
1325 vcp = &VnodeClassInfo[class];
1326 assert(vnp->disk.vnodeMagic == vcp->magic);
1327 VNLog(200, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
1329 #ifdef AFS_DEMAND_ATTACH_FS
1330 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1332 writeLocked = WriteLocked(&vnp->lock);
1337 #ifdef AFS_PTHREAD_ENV
1338 pthread_t thisProcess = pthread_self();
1339 #else /* AFS_PTHREAD_ENV */
1340 PROCESS thisProcess;
1341 LWP_CurrentProcess(&thisProcess);
1342 #endif /* AFS_PTHREAD_ENV */
1343 VNLog(201, 2, (afs_int32) vnp,
1344 ((vnp->changed_newTime) << 1) | ((vnp->
1345 changed_oldTime) << 1) | vnp->
1347 if (thisProcess != vnp->writer)
1348 Abort("VPutVnode: Vnode at 0x%x locked by another process!\n",
1352 if (vnp->changed_oldTime || vnp->changed_newTime || vnp->delete) {
1353 Volume *vp = Vn_volume(vnp);
1354 afs_uint32 now = FT_ApproxTime();
1355 assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1358 /* No longer any directory entries for this vnode. Free the Vnode */
1359 memset(&vnp->disk, 0, sizeof(vnp->disk));
1360 /* delete flag turned off further down */
1361 VNLog(202, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
1362 } else if (vnp->changed_newTime) {
1363 vnp->disk.serverModifyTime = now;
1365 if (vnp->changed_newTime)
1367 V_updateDate(vp) = vp->updateTime = now;
1368 if(V_volUpCounter(vp)<MAXINT)
1369 V_volUpCounter(vp)++;
1372 /* The vnode has been changed. Write it out to disk */
1374 #ifdef AFS_DEMAND_ATTACH_FS
1375 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1377 assert(V_needsSalvaged(vp));
1381 VnStore(ec, vp, vnp, vcp, class);
1383 /* If the vnode is to be deleted, and we wrote the vnode out,
1384 * free its bitmap entry. Do after the vnode is written so we
1385 * don't allocate from bitmap before the vnode is written
1386 * (doing so could cause an "addled bitmap" message).
1388 if (vnp->delete && !*ec) {
1389 if (Vn_volume(vnp)->header->diskstuff.filecount-- < 1)
1390 Vn_volume(vnp)->header->diskstuff.filecount = 0;
1391 VFreeBitMapEntry_r(ec, &vp->vnodeIndex[class],
1392 vnodeIdToBitNumber(Vn_id(vnp)));
1396 vnp->changed_newTime = vnp->changed_oldTime = 0;
1398 #ifdef AFS_DEMAND_ATTACH_FS
1399 VnChangeState_r(vnp, VN_STATE_ONLINE);
1401 } else { /* Not write locked */
1402 if (vnp->changed_newTime || vnp->changed_oldTime || vnp->delete)
1404 ("VPutVnode: Change or delete flag for vnode 0x%x is set but vnode is not write locked!\n",
1406 #ifdef AFS_DEMAND_ATTACH_FS
1411 /* Do not look at disk portion of vnode after this point; it may
1412 * have been deleted above */
1414 VnUnlock(vnp, ((writeLocked) ? WRITE_LOCK : READ_LOCK));
1415 VnCancelReservation_r(vnp);
1419 * Make an attempt to convert a vnode lock from write to read.
1420 * Do nothing if the vnode isn't write locked or the vnode has
1424 VVnodeWriteToRead(Error * ec, register Vnode * vnp)
1428 retVal = VVnodeWriteToRead_r(ec, vnp);
1434 * convert vnode handle from mutually exclusive to shared access.
1436 * @param[out] ec client error code
1437 * @param[in] vnp vnode object pointer
1439 * @return unspecified use (see out argument 'ec' for error code return)
1441 * @pre VOL_LOCK held.
1442 * ref held on vnode.
1443 * write lock held on vnode.
1445 * @post read lock held on vnode.
1446 * if vnode was modified, it has been written to disk.
1448 * @internal volume package internal use only
1451 VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
1455 struct VnodeClassInfo *vcp;
1457 #ifdef AFS_PTHREAD_ENV
1458 pthread_t thisProcess;
1459 #else /* AFS_PTHREAD_ENV */
1460 PROCESS thisProcess;
1461 #endif /* AFS_PTHREAD_ENV */
1464 assert(Vn_refcount(vnp) != 0);
1465 class = vnodeIdToClass(Vn_id(vnp));
1466 vcp = &VnodeClassInfo[class];
1467 assert(vnp->disk.vnodeMagic == vcp->magic);
1468 VNLog(300, 2, Vn_id(vnp), (afs_int32) vnp, 0, 0);
1470 #ifdef AFS_DEMAND_ATTACH_FS
1471 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1473 writeLocked = WriteLocked(&vnp->lock);
1480 VNLog(301, 2, (afs_int32) vnp,
1481 ((vnp->changed_newTime) << 1) | ((vnp->
1482 changed_oldTime) << 1) | vnp->
1486 #ifdef AFS_PTHREAD_ENV
1487 thisProcess = pthread_self();
1488 #else /* AFS_PTHREAD_ENV */
1489 LWP_CurrentProcess(&thisProcess);
1490 #endif /* AFS_PTHREAD_ENV */
1491 if (thisProcess != vnp->writer)
1492 Abort("VPutVnode: Vnode at 0x%x locked by another process!\n",
1498 if (vnp->changed_oldTime || vnp->changed_newTime) {
1499 Volume *vp = Vn_volume(vnp);
1500 afs_uint32 now = FT_ApproxTime();
1501 assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1502 if (vnp->changed_newTime)
1503 vnp->disk.serverModifyTime = now;
1504 if (vnp->changed_newTime)
1505 V_updateDate(vp) = vp->updateTime = now;
1507 /* The inode has been changed. Write it out to disk */
1509 #ifdef AFS_DEMAND_ATTACH_FS
1510 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1512 assert(V_needsSalvaged(vp));
1516 VnStore(ec, vp, vnp, vcp, class);
1520 vnp->changed_newTime = vnp->changed_oldTime = 0;
1524 #ifdef AFS_DEMAND_ATTACH_FS
1525 VnChangeState_r(vnp, VN_STATE_ONLINE);
1528 ConvertWriteToReadLock(&vnp->lock);
1534 * initial size of ihandle pointer vector.
1536 * @see VInvalidateVnodesByVolume_r
1538 #define IH_VEC_BASE_SIZE 256
1541 * increment amount for growing ihandle pointer vector.
1543 * @see VInvalidateVnodesByVolume_r
1545 #define IH_VEC_INCREMENT 256
1548 * Compile list of ihandles to be released/reallyclosed at a later time.
1550 * @param[in] vp volume object pointer
1551 * @param[out] vec_out vector of ihandle pointers to be released/reallyclosed
1552 * @param[out] vec_len_out number of valid elements in ihandle vector
1554 * @pre - VOL_LOCK is held
1555 * - volume is in appropriate exclusive state (e.g. VOL_STATE_VNODE_CLOSE,
1556 * VOL_STATE_VNODE_RELEASE)
1558 * @post - all vnodes on VVn list are invalidated
1559 * - ih_vec is populated with all valid ihandles
1561 * @return operation status
1563 * @retval ENOMEM out of memory
1565 * @todo we should handle out of memory conditions more gracefully.
1567 * @internal vnode package internal use only
1570 VInvalidateVnodesByVolume_r(Volume * vp,
1571 IHandle_t *** vec_out,
1572 size_t * vec_len_out)
1576 size_t i = 0, vec_len;
1577 IHandle_t **ih_vec, **ih_vec_new;
1579 #ifdef AFS_DEMAND_ATTACH_FS
1581 #endif /* AFS_DEMAND_ATTACH_FS */
1583 vec_len = IH_VEC_BASE_SIZE;
1584 ih_vec = malloc(sizeof(IHandle_t *) * vec_len);
1585 #ifdef AFS_DEMAND_ATTACH_FS
1592 * Traverse the volume's vnode list. Pull all the ihandles out into a
1593 * thread-private array for later asynchronous processing.
1596 for (queue_Scan(&vp->vnode_list, vnp, nvnp, Vnode)) {
1597 if (vnp->handle != NULL) {
1599 #ifdef AFS_DEMAND_ATTACH_FS
1602 vec_len += IH_VEC_INCREMENT;
1603 ih_vec_new = realloc(ih_vec, sizeof(IHandle_t *) * vec_len);
1604 #ifdef AFS_DEMAND_ATTACH_FS
1607 if (ih_vec_new == NULL) {
1611 ih_vec = ih_vec_new;
1612 #ifdef AFS_DEMAND_ATTACH_FS
1614 * Theoretically, the volume's VVn list should not change
1615 * because the volume is in an exclusive state. For the
1616 * sake of safety, we will restart the traversal from the
1617 * beginning (which is not expensive because we're
1618 * deleting the items from the list as we go).
1620 goto restart_traversal;
1623 ih_vec[i++] = vnp->handle;
1626 DeleteFromVVnList(vnp);
1627 VInvalidateVnode_r(vnp);
1637 /* VCloseVnodeFiles_r - called when a volume is going offline. All open
1638 * files for vnodes in that volume are closed. This might be excessive,
1639 * since we may only be taking one volume of a volume group offline.
1642 VCloseVnodeFiles_r(Volume * vp)
1644 #ifdef AFS_DEMAND_ATTACH_FS
1645 VolState vol_state_save;
1647 IHandle_t ** ih_vec;
1650 #ifdef AFS_DEMAND_ATTACH_FS
1651 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_CLOSE);
1652 #endif /* AFS_DEMAND_ATTACH_FS */
1654 /* XXX need better error handling here */
1655 assert(VInvalidateVnodesByVolume_r(vp,
1661 * now we drop VOL_LOCK while we perform some potentially very
1662 * expensive operations in the background
1664 #ifdef AFS_DEMAND_ATTACH_FS
1668 for (i = 0; i < vec_len; i++) {
1669 IH_REALLYCLOSE(ih_vec[i]);
1670 IH_RELEASE(ih_vec[i]);
1675 #ifdef AFS_DEMAND_ATTACH_FS
1677 VChangeState_r(vp, vol_state_save);
1678 #endif /* AFS_DEMAND_ATTACH_FS */
1683 * shut down all vnode cache state for a given volume.
1685 * @param[in] vp volume object pointer
1687 * @pre VOL_LOCK is held
1689 * @post all file descriptors closed.
1690 * all inode handles released.
1691 * all vnode cache objects disassociated from volume.
1693 * @note for DAFS, these operations are performed outside the vol glock under
1694 * volume exclusive state VOL_STATE_VNODE_RELEASE. Please further note
1695 * that it would be a bug to acquire and release a volume reservation
1696 * during this exclusive operation. This is due to the fact that we are
1697 * generally called during the refcount 1->0 transition.
1699 * @todo we should handle failures in VInvalidateVnodesByVolume_r more
1702 * @see VInvalidateVnodesByVolume_r
1704 * @internal this routine is internal to the volume package
1707 VReleaseVnodeFiles_r(Volume * vp)
1709 #ifdef AFS_DEMAND_ATTACH_FS
1710 VolState vol_state_save;
1712 IHandle_t ** ih_vec;
1715 #ifdef AFS_DEMAND_ATTACH_FS
1716 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_RELEASE);
1717 #endif /* AFS_DEMAND_ATTACH_FS */
1719 /* XXX need better error handling here */
1720 assert(VInvalidateVnodesByVolume_r(vp,
1726 * now we drop VOL_LOCK while we perform some potentially very
1727 * expensive operations in the background
1729 #ifdef AFS_DEMAND_ATTACH_FS
1733 for (i = 0; i < vec_len; i++) {
1734 IH_RELEASE(ih_vec[i]);
1739 #ifdef AFS_DEMAND_ATTACH_FS
1741 VChangeState_r(vp, vol_state_save);
1742 #endif /* AFS_DEMAND_ATTACH_FS */