2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
15 Institution: The Information Technology Center, Carnegie-Mellon University
18 #include <afsconfig.h>
19 #include <afs/param.h>
/* Largest value representable in an int.  Built from an unsigned all-ones
 * pattern so that no signed shift into the sign bit (undefined behaviour)
 * is needed and no assumption about 8-bit bytes is made. */
#define MAXINT ((int)(~0U >> 1))
27 #ifdef AFS_PTHREAD_ENV
29 #else /* AFS_PTHREAD_ENV */
30 #include <afs/assert.h>
31 #endif /* AFS_PTHREAD_ENV */
34 #include "rx/rx_queue.h"
35 #include <afs/afsint.h>
37 #include <afs/errors.h>
40 #include <afs/afssyscalls.h>
44 #include "volume_inline.h"
45 #include "vnode_inline.h"
46 #include "partition.h"
48 #if defined(AFS_SGI_ENV)
49 #include "sys/types.h"
61 #include <sys/fcntl.h>
64 #endif /* AFS_NT40_ENV */
71 /*@printflike@*/ extern void Log(const char *format, ...);
73 /*@printflike@*/ extern void Abort(const char *format, ...) AFS_NORETURN;
76 struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
78 void VNLog(afs_int32 aop, afs_int32 anparms, ... );
85 #define BAD_IGET -1000
87 /* There are two separate vnode queue types defined here:
88 * Each hash conflict chain -- is singly linked, with a single head
89 * pointer. New entries are added at the beginning. Old
90 * entries are removed by linear search, which generally
91 * only occurs after a disk read.
92 * LRU chain -- is doubly linked, single head pointer.
93 * Entries are added at the head, reclaimed from the tail,
94 * or removed from anywhere in the queue.
98 /* Vnode hash table. Find hash chain by taking lower bits of
99 * (volume_hash_offset + vnode).
100 * This distributes the root inodes of the volumes over the
101 * hash table entries and also distributes the vnodes of
102 * volumes reasonably fairly. The volume_hash_offset field
103 * for each volume is established as the volume comes on line
104 * by using the VOLUME_HASH_OFFSET macro. This distributes the
105 * volumes fairly among the cache entries, both when servicing
106 * a small number of volumes and when servicing a large number.
109 /* logging stuff for finding bugs */
110 #define THELOGSIZE 5120
111 static afs_int32 theLog[THELOGSIZE];
112 static afs_int32 vnLogPtr = 0;
114 VNLog(afs_int32 aop, afs_int32 anparms, ... )
116 register afs_int32 temp;
119 va_start(ap, anparms);
122 anparms = 4; /* do bounds checking */
124 temp = (aop << 16) | anparms;
125 theLog[vnLogPtr++] = temp;
126 if (vnLogPtr >= THELOGSIZE)
128 for (temp = 0; temp < anparms; temp++) {
129 theLog[vnLogPtr++] = va_arg(ap, afs_int32);
130 if (vnLogPtr >= THELOGSIZE)
136 /* VolumeHashOffset -- returns a new value to be stored in the
137 * volumeHashOffset of a Volume structure. Called when a
138 * volume is initialized. Sets the volumeHashOffset so that
139 * vnode cache entries are distributed reasonably between
140 * volumes (the root vnodes of the volumes will hash to
141 * different values, and spacing is maintained between volumes
142 * when there are not many volumes represented), and spread
143 * equally amongst vnodes within a single volume.
/* Return the next volume hash offset.  Successive calls walk the hashindex
 * table (0, 128, 64, ...) and, after every full pass over it, shift all
 * returned values up by one. */
int
VolumeHashOffset_r(void)
{
    /* unsigned: the ever-increasing counter wraps instead of overflowing */
    static unsigned int nextVolumeHashOffset = 0;
    /* hashindex Must be power of two in size */
#   define hashShift 3
#   define hashMask ((1<<hashShift)-1)
    static const unsigned char hashindex[1 << hashShift] =
	{ 0, 128, 64, 192, 32, 160, 96, 224 };
    int offset;

    offset = hashindex[nextVolumeHashOffset & hashMask]
	+ (int)(nextVolumeHashOffset >> hashShift);
    nextVolumeHashOffset++;
    return offset;
}
161 /* Change hashindex (above) if you change this constant */
162 #define VNODE_HASH_TABLE_SIZE 256
163 private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
/* Map a (volume pointer, vnode number) pair to an index into VnodeHashTable.
 * The mask only works because VNODE_HASH_TABLE_SIZE is a power of two.
 * Both arguments are parenthesized so arbitrary expressions can be passed. */
#define VNODE_HASH(volumeptr,vnodenumber)\
    (((volumeptr)->vnodeHashOffset + (vnodenumber))&(VNODE_HASH_TABLE_SIZE-1))
169 * add a vnode to the volume's vnode list.
171 * @param[in] vp volume object pointer
172 * @param[in] vnp vnode object pointer
174 * @note for DAFS, it may seem like we should be acquiring a lightweight ref
175 * on vp, but this would actually break things. Right now, this is ok
176 * because we destroy all vnode cache contents during volume
181 * @internal volume package internal use only
184 AddToVVnList(Volume * vp, Vnode * vnp)
186 if (queue_IsOnQueue(vnp))
190 Vn_cacheCheck(vnp) = vp->cacheCheck;
191 queue_Append(&vp->vnode_list, vnp);
192 Vn_stateFlags(vnp) |= VN_ON_VVN;
196 * delete a vnode from the volume's vnode list.
200 * @internal volume package internal use only
203 DeleteFromVVnList(register Vnode * vnp)
205 Vn_volume(vnp) = NULL;
207 if (!queue_IsOnQueue(vnp))
211 Vn_stateFlags(vnp) &= ~(VN_ON_VVN);
215 * add a vnode to the end of the lru.
217 * @param[in] vcp vnode class info object pointer
218 * @param[in] vnp vnode object pointer
220 * @internal vnode package internal use only
223 AddToVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
225 if (Vn_stateFlags(vnp) & VN_ON_LRU) {
229 /* Add it to the circular LRU list */
230 if (vcp->lruHead == NULL)
231 Abort("VPutVnode: vcp->lruHead==NULL");
233 vnp->lruNext = vcp->lruHead;
234 vnp->lruPrev = vcp->lruHead->lruPrev;
235 vcp->lruHead->lruPrev = vnp;
236 vnp->lruPrev->lruNext = vnp;
240 /* If the vnode was just deleted, put it at the end of the chain so it
241 * will be reused immediately */
243 vcp->lruHead = vnp->lruNext;
245 Vn_stateFlags(vnp) |= VN_ON_LRU;
249 * delete a vnode from the lru.
251 * @param[in] vcp vnode class info object pointer
252 * @param[in] vnp vnode object pointer
254 * @internal vnode package internal use only
257 DeleteFromVnLRU(struct VnodeClassInfo * vcp, Vnode * vnp)
259 if (!(Vn_stateFlags(vnp) & VN_ON_LRU)) {
263 if (vnp == vcp->lruHead)
264 vcp->lruHead = vcp->lruHead->lruNext;
266 if ((vnp == vcp->lruHead) ||
267 (vcp->lruHead == NULL))
268 Abort("DeleteFromVnLRU: lru chain addled!\n");
270 vnp->lruPrev->lruNext = vnp->lruNext;
271 vnp->lruNext->lruPrev = vnp->lruPrev;
273 Vn_stateFlags(vnp) &= ~(VN_ON_LRU);
277 * add a vnode to the vnode hash table.
279 * @param[in] vnp vnode object pointer
283 * @post vnode on hash
285 * @internal vnode package internal use only
288 AddToVnHash(Vnode * vnp)
290 unsigned int newHash;
292 if (!(Vn_stateFlags(vnp) & VN_ON_HASH)) {
293 newHash = VNODE_HASH(Vn_volume(vnp), Vn_id(vnp));
294 vnp->hashNext = VnodeHashTable[newHash];
295 VnodeHashTable[newHash] = vnp;
296 vnp->hashIndex = newHash;
298 Vn_stateFlags(vnp) |= VN_ON_HASH;
303 * delete a vnode from the vnode hash table.
310 * @post vnode removed from hash
312 * @internal vnode package internal use only
315 DeleteFromVnHash(Vnode * vnp)
319 if (Vn_stateFlags(vnp) & VN_ON_HASH) {
320 tvnp = VnodeHashTable[vnp->hashIndex];
322 VnodeHashTable[vnp->hashIndex] = vnp->hashNext;
324 while (tvnp && tvnp->hashNext != vnp)
325 tvnp = tvnp->hashNext;
327 tvnp->hashNext = vnp->hashNext;
330 vnp->hashNext = NULL;
332 Vn_stateFlags(vnp) &= ~(VN_ON_HASH);
338 * invalidate a vnode cache entry.
340 * @param[in] avnode vnode object pointer
344 * @post vnode metadata invalidated.
345 * vnode removed from hash table.
346 * DAFS: vnode state set to VN_STATE_INVALID.
348 * @internal vnode package internal use only
351 VInvalidateVnode_r(register struct Vnode *avnode)
353 avnode->changed_newTime = 0; /* don't let it get flushed out again */
354 avnode->changed_oldTime = 0;
355 avnode->delete = 0; /* it isn't deleted, really */
356 avnode->cacheCheck = 0; /* invalid: prevents future vnode searches from working */
357 DeleteFromVnHash(avnode);
358 #ifdef AFS_DEMAND_ATTACH_FS
359 VnChangeState_r(avnode, VN_STATE_INVALID);
365 * initialize vnode cache for a given vnode class.
367 * @param[in] class vnode class
368 * @param[in] nVnodes size of cache
370 * @post vnode cache allocated and initialized
372 * @internal volume package internal use only
374 * @note generally called by VInitVolumePackage_r
376 * @see VInitVolumePackage_r
379 VInitVnodes(VnodeClass class, int nVnodes)
382 register struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
384 vcp->allocs = vcp->gets = vcp->reads = vcp->writes = 0;
385 vcp->cacheSize = nVnodes;
388 assert(CHECKSIZE_SMALLVNODE);
390 vcp->residentSize = SIZEOF_SMALLVNODE;
391 vcp->diskSize = SIZEOF_SMALLDISKVNODE;
392 vcp->magic = SMALLVNODEMAGIC;
396 vcp->residentSize = SIZEOF_LARGEVNODE;
397 vcp->diskSize = SIZEOF_LARGEDISKVNODE;
398 vcp->magic = LARGEVNODEMAGIC;
402 int s = vcp->diskSize - 1;
412 va = (byte *) calloc(nVnodes, vcp->residentSize);
415 Vnode *vnp = (Vnode *) va;
416 Vn_refcount(vnp) = 0; /* no context switches */
417 Vn_stateFlags(vnp) |= VN_ON_LRU;
418 #ifdef AFS_DEMAND_ATTACH_FS
419 assert(pthread_cond_init(&Vn_stateCV(vnp), NULL) == 0);
420 Vn_state(vnp) = VN_STATE_INVALID;
422 #else /* !AFS_DEMAND_ATTACH_FS */
423 Lock_Init(&vnp->lock);
424 #endif /* !AFS_DEMAND_ATTACH_FS */
425 vnp->changed_oldTime = 0;
426 vnp->changed_newTime = 0;
427 Vn_volume(vnp) = NULL;
428 Vn_cacheCheck(vnp) = 0;
429 vnp->delete = Vn_id(vnp) = 0;
430 #ifdef AFS_PTHREAD_ENV
431 vnp->writer = (pthread_t) 0;
432 #else /* AFS_PTHREAD_ENV */
433 vnp->writer = (PROCESS) 0;
434 #endif /* AFS_PTHREAD_ENV */
438 if (vcp->lruHead == NULL)
439 vcp->lruHead = vnp->lruNext = vnp->lruPrev = vnp;
441 vnp->lruNext = vcp->lruHead;
442 vnp->lruPrev = vcp->lruHead->lruPrev;
443 vcp->lruHead->lruPrev = vnp;
444 vnp->lruPrev->lruNext = vnp;
447 va += vcp->residentSize;
454 * allocate an unused vnode from the lru chain.
456 * @param[in] vcp vnode class info object pointer
458 * @pre VOL_LOCK is held
460 * @post vnode object is removed from lru, and vnode hash table.
461 * vnode is disassociated from volume object.
462 * state is set to VN_STATE_INVALID.
463 * inode handle is released.
465 * @note we traverse backwards along the lru circlist. It shouldn't
466 * be necessary to specify that nUsers == 0 since if it is in the list,
467 * nUsers should be 0. Things shouldn't be in lruq unless no one is
470 * @warning DAFS: VOL_LOCK is dropped while doing inode handle release
472 * @return vnode object pointer
475 VGetFreeVnode_r(struct VnodeClassInfo * vcp)
479 vnp = vcp->lruHead->lruPrev;
480 #ifdef AFS_DEMAND_ATTACH_FS
481 if (Vn_refcount(vnp) != 0 || VnIsExclusiveState(Vn_state(vnp)) ||
482 Vn_readers(vnp) != 0)
483 Abort("VGetFreeVnode_r: in-use vnode in lruq");
485 if (Vn_refcount(vnp) != 0 || CheckLock(&vnp->lock))
486 Abort("VGetFreeVnode_r: locked vnode in lruq");
488 VNLog(1, 2, Vn_id(vnp), (intptr_t)vnp, 0, 0);
491 * it's going to be overwritten soon enough.
492 * remove from LRU, delete hash entry, and
493 * disassociate from old parent volume before
494 * we have a chance to drop the vol glock
496 DeleteFromVnLRU(vcp, vnp);
497 DeleteFromVnHash(vnp);
498 if (Vn_volume(vnp)) {
499 DeleteFromVVnList(vnp);
502 /* drop the file descriptor */
504 #ifdef AFS_DEMAND_ATTACH_FS
505 VnChangeState_r(vnp, VN_STATE_RELEASING);
508 /* release is, potentially, a highly latent operation due to a couple
510 * - ihandle package lock contention
511 * - closing file descriptor(s) associated with ih
513 * Hence, we perform it outside of the volume package lock in order to
514 * reduce the probability of contention.
516 IH_RELEASE(vnp->handle);
517 #ifdef AFS_DEMAND_ATTACH_FS
522 #ifdef AFS_DEMAND_ATTACH_FS
523 VnChangeState_r(vnp, VN_STATE_INVALID);
531 * lookup a vnode in the vnode cache hash table.
533 * @param[in] vp pointer to volume object
534 * @param[in] vnodeId vnode id
538 * @post matching vnode object or NULL is returned
540 * @return vnode object pointer
541 * @retval NULL no matching vnode object was found in the cache
543 * @internal vnode package internal use only
545 * @note this symbol is exported strictly for fssync debug protocol use
548 VLookupVnode(Volume * vp, VnodeId vnodeId)
551 unsigned int newHash;
553 newHash = VNODE_HASH(vp, vnodeId);
554 for (vnp = VnodeHashTable[newHash];
556 ((Vn_id(vnp) != vnodeId) ||
557 (Vn_volume(vnp) != vp) ||
558 (vp->cacheCheck != Vn_cacheCheck(vnp))));
559 vnp = vnp->hashNext);
566 VAllocVnode(Error * ec, Volume * vp, VnodeType type)
570 retVal = VAllocVnode_r(ec, vp, type);
576 * allocate a new vnode.
578 * @param[out] ec error code return
579 * @param[in] vp volume object pointer
580 * @param[in] type desired vnode type
582 * @return vnode object pointer
584 * @pre VOL_LOCK held;
585 * heavyweight ref held on vp
587 * @post vnode allocated and returned
590 VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
595 register struct VnodeClassInfo *vcp;
598 #ifdef AFS_DEMAND_ATTACH_FS
599 VolState vol_state_save;
604 #ifdef AFS_DEMAND_ATTACH_FS
606 * once a volume has entered an error state, don't permit
607 * further operations to proceed
608 * -- tkeiser 11/21/2007
610 VWaitExclusiveState_r(vp);
611 if (VIsErrorState(V_attachState(vp))) {
612 /* XXX is VSALVAGING acceptable here? */
618 if (programType == fileServer && !V_inUse(vp)) {
619 if (vp->specialStatus) {
620 *ec = vp->specialStatus;
626 class = vnodeTypeToClass(type);
627 vcp = &VnodeClassInfo[class];
629 if (!VolumeWriteable(vp)) {
630 *ec = (bit32) VREADONLY;
634 unique = vp->nextVnodeUnique++;
636 unique = vp->nextVnodeUnique++;
638 if (vp->nextVnodeUnique > V_uniquifier(vp)) {
639 VUpdateVolume_r(ec, vp, 0);
644 if (programType == fileServer) {
645 VAddToVolumeUpdateList_r(ec, vp);
650 /* Find a slot in the bit map */
651 bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
652 VOL_ALLOC_BITMAP_WAIT);
655 vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
659 * at this point we should be assured that V_attachState(vp) is non-exclusive
663 VNLog(2, 1, vnodeNumber, 0, 0, 0);
664 /* Prepare to move it to the new hash chain */
665 vnp = VLookupVnode(vp, vnodeNumber);
667 /* slot already exists. It may not even be in lruq (consider store file locking a file being deleted)
668 * so we may have to wait for it below */
669 VNLog(3, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
671 VnCreateReservation_r(vnp);
672 if (Vn_refcount(vnp) == 1) {
673 /* we're the only user */
674 /* This won't block */
675 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
677 /* other users present; follow locking hierarchy */
678 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, MIGHT_DEADLOCK);
680 #ifdef AFS_DEMAND_ATTACH_FS
683 * vnode was cached, wait for any existing exclusive ops to finish.
684 * once we have reacquired the lock, re-verify volume state.
686 * note: any vnode error state is related to the old vnode; disregard.
688 VnWaitQuiescent_r(vnp);
689 if (VIsErrorState(V_attachState(vp))) {
690 VnUnlock(vnp, WRITE_LOCK);
691 VnCancelReservation_r(vnp);
698 * verify state of the world hasn't changed
700 * (technically, this should never happen because cachecheck
701 * is only updated during a volume attach, which should not
702 * happen when refs are held)
704 if (Vn_volume(vnp)->cacheCheck != Vn_cacheCheck(vnp)) {
705 VnUnlock(vnp, WRITE_LOCK);
706 VnCancelReservation_r(vnp);
712 /* no such vnode in the cache */
714 vnp = VGetFreeVnode_r(vcp);
716 /* Initialize the header fields so no one allocates another
717 * vnode with the same number */
718 Vn_id(vnp) = vnodeNumber;
719 VnCreateReservation_r(vnp);
720 AddToVVnList(vp, vnp);
721 #ifdef AFS_DEMAND_ATTACH_FS
725 /* This will never block (guaranteed by check in VGetFreeVnode_r()) */
726 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
728 #ifdef AFS_DEMAND_ATTACH_FS
729 VnChangeState_r(vnp, VN_STATE_ALLOC);
732 /* Sanity check: is this vnode really not in use? */
735 IHandle_t *ihP = vp->vnodeIndex[class].handle;
737 afs_foff_t off = vnodeIndexOffset(vcp, vnodeNumber);
740 /* XXX we have a potential race here if two threads
741 * allocate new vnodes at the same time, and they
742 * both decide it's time to extend the index
745 #ifdef AFS_DEMAND_ATTACH_FS
747 * this race has been eliminated for the DAFS case
748 * using exclusive state VOL_STATE_VNODE_ALLOC
750 * if this becomes a bottleneck, there are ways to
751 * improve parallelism for this code path
752 * -- tkeiser 11/28/2007
754 VCreateReservation_r(vp);
755 VWaitExclusiveState_r(vp);
756 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_ALLOC);
762 Log("VAllocVnode: can't open index file!\n");
764 goto error_encountered;
766 if ((size = FDH_SIZE(fdP)) < 0) {
767 Log("VAllocVnode: can't stat index file!\n");
769 goto error_encountered;
771 if (FDH_SEEK(fdP, off, SEEK_SET) < 0) {
772 Log("VAllocVnode: can't seek on index file!\n");
774 goto error_encountered;
776 if (off + vcp->diskSize <= size) {
777 if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) != vcp->diskSize) {
778 Log("VAllocVnode: can't read index file!\n");
780 goto error_encountered;
782 if (vnp->disk.type != vNull) {
783 Log("VAllocVnode: addled bitmap or index!\n");
785 goto error_encountered;
788 /* growing file - grow in a reasonable increment */
789 char *buf = (char *)malloc(16 * 1024);
791 Log("VAllocVnode: can't grow vnode index: out of memory\n");
793 goto error_encountered;
795 memset(buf, 0, 16 * 1024);
796 if ((FDH_WRITE(fdP, buf, 16 * 1024)) != 16 * 1024) {
797 Log("VAllocVnode: can't grow vnode index: write failed\n");
800 goto error_encountered;
807 #ifdef AFS_DEMAND_ATTACH_FS
808 VChangeState_r(vp, vol_state_save);
809 VCancelReservation_r(vp);
816 * close the file handle
818 * invalidate the vnode
819 * free up the bitmap entry (although salvager should take care of it)
821 * drop vnode lock and refs
826 VFreeBitMapEntry_r(&tmp, &vp->vnodeIndex[class], bitNumber);
827 VInvalidateVnode_r(vnp);
828 VnUnlock(vnp, WRITE_LOCK);
829 VnCancelReservation_r(vnp);
830 #ifdef AFS_DEMAND_ATTACH_FS
831 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
832 VCancelReservation_r(vp);
834 VForceOffline_r(vp, 0);
839 VNLog(4, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
840 #ifndef AFS_DEMAND_ATTACH_FS
845 VNLog(5, 1, (intptr_t)vnp, 0, 0, 0);
846 memset(&vnp->disk, 0, sizeof(vnp->disk));
847 vnp->changed_newTime = 0; /* set this bit when vnode is updated */
848 vnp->changed_oldTime = 0; /* set this on CopyOnWrite. */
850 vnp->disk.vnodeMagic = vcp->magic;
851 vnp->disk.type = type;
852 vnp->disk.uniquifier = unique;
855 vp->header->diskstuff.filecount++;
856 #ifdef AFS_DEMAND_ATTACH_FS
857 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
863 * load a vnode from disk.
865 * @param[out] ec client error code return
866 * @param[in] vp volume object pointer
867 * @param[in] vnp vnode object pointer
868 * @param[in] vcp vnode class info object pointer
869 * @param[in] class vnode class enumeration
871 * @pre vnode is registered in appropriate data structures;
872 * caller holds a ref on vnode; VOL_LOCK is held
874 * @post vnode data is loaded from disk.
875 * vnode state is set to VN_STATE_ONLINE.
876 * on failure, vnode is invalidated.
878 * @internal vnode package internal use only
881 VnLoad(Error * ec, Volume * vp, Vnode * vnp,
882 struct VnodeClassInfo * vcp, VnodeClass class)
884 /* vnode not cached */
888 IHandle_t *ihP = vp->vnodeIndex[class].handle;
894 #ifdef AFS_DEMAND_ATTACH_FS
895 VnChangeState_r(vnp, VN_STATE_LOAD);
898 /* This will never block */
899 VnLock(vnp, WRITE_LOCK, VOL_LOCK_HELD, WILL_NOT_DEADLOCK);
904 Log("VnLoad: can't open index dev=%u, i=%s\n", vp->device,
905 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
907 goto error_encountered_nolock;
908 } else if (FDH_SEEK(fdP, vnodeIndexOffset(vcp, Vn_id(vnp)), SEEK_SET)
910 Log("VnLoad: can't seek on index file vn=%u\n", Vn_id(vnp));
912 goto error_encountered_nolock;
913 } else if ((nBytes = FDH_READ(fdP, (char *)&vnp->disk, vcp->diskSize))
915 /* Don't take volume off line if the inumber is out of range
916 * or the inode table is full. */
917 if (nBytes == BAD_IGET) {
918 Log("VnLoad: bad inumber %s\n",
919 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
922 } else if (nBytes == -1 && errno == EIO) {
923 /* disk error; salvage */
924 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
926 /* vnode is not allocated */
928 Log("VnLoad: Couldn't read vnode %u, volume %u (%s); read %d bytes, errno %d\n",
929 Vn_id(vnp), V_id(vp), V_name(vp), (int)nBytes, errno);
933 goto error_encountered_nolock;
938 /* Quick check to see that the data is reasonable */
939 if (vnp->disk.vnodeMagic != vcp->magic || vnp->disk.type == vNull) {
940 if (vnp->disk.type == vNull) {
944 struct vnodeIndex *index = &vp->vnodeIndex[class];
945 unsigned int bitNumber = vnodeIdToBitNumber(Vn_id(vnp));
946 unsigned int offset = bitNumber >> 3;
948 /* Test to see if vnode number is valid. */
949 if ((offset >= index->bitmapSize)
950 || ((*(index->bitmap + offset) & (1 << (bitNumber & 0x7)))
952 Log("VnLoad: Request for unallocated vnode %u, volume %u (%s) denied.\n", Vn_id(vnp), V_id(vp), V_name(vp));
956 Log("VnLoad: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", Vn_id(vnp), V_id(vp), V_name(vp));
959 goto error_encountered;
962 IH_INIT(vnp->handle, V_device(vp), V_parentId(vp), VN_GET_INO(vnp));
963 VnUnlock(vnp, WRITE_LOCK);
964 #ifdef AFS_DEMAND_ATTACH_FS
965 VnChangeState_r(vnp, VN_STATE_ONLINE);
970 error_encountered_nolock:
972 FDH_REALLYCLOSE(fdP);
978 #ifdef AFS_DEMAND_ATTACH_FS
979 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
981 VForceOffline_r(vp, 0);
988 VInvalidateVnode_r(vnp);
989 VnUnlock(vnp, WRITE_LOCK);
993 * store a vnode to disk.
995 * @param[out] ec error code output
996 * @param[in] vp volume object pointer
997 * @param[in] vnp vnode object pointer
998 * @param[in] vcp vnode class info object pointer
999 * @param[in] class vnode class enumeration
1001 * @pre VOL_LOCK held.
1002 * caller holds refs to volume and vnode.
1003 * DAFS: caller is responsible for performing state sanity checks.
1005 * @post vnode state is stored to disk.
1007 * @internal vnode package internal use only
1010 VnStore(Error * ec, Volume * vp, Vnode * vnp,
1011 struct VnodeClassInfo * vcp, VnodeClass class)
1015 IHandle_t *ihP = vp->vnodeIndex[class].handle;
1017 #ifdef AFS_DEMAND_ATTACH_FS
1018 VnState vn_state_save;
1023 #ifdef AFS_DEMAND_ATTACH_FS
1024 vn_state_save = VnChangeState_r(vnp, VN_STATE_STORE);
1027 offset = vnodeIndexOffset(vcp, Vn_id(vnp));
1031 Log("VnStore: can't open index file!\n");
1032 goto error_encountered;
1034 if (FDH_SEEK(fdP, offset, SEEK_SET) < 0) {
1035 Log("VnStore: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
1036 fdP, offset, errno);
1037 goto error_encountered;
1040 nBytes = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
1041 if (nBytes != vcp->diskSize) {
1042 /* Don't force volume offline if the inumber is out of
1043 * range or the inode table is full.
1045 FDH_REALLYCLOSE(fdP);
1046 if (nBytes == BAD_IGET) {
1047 Log("VnStore: bad inumber %s\n",
1049 vp->vnodeIndex[class].handle->ih_ino));
1052 #ifdef AFS_DEMAND_ATTACH_FS
1053 VnChangeState_r(vnp, VN_STATE_ERROR);
1056 Log("VnStore: Couldn't write vnode %u, volume %u (%s) (error %d)\n", Vn_id(vnp), V_id(Vn_volume(vnp)), V_name(Vn_volume(vnp)), (int)nBytes);
1057 #ifdef AFS_DEMAND_ATTACH_FS
1058 goto error_encountered;
1061 VForceOffline_r(vp, 0);
1071 #ifdef AFS_DEMAND_ATTACH_FS
1072 VnChangeState_r(vnp, vn_state_save);
1077 #ifdef AFS_DEMAND_ATTACH_FS
1078 /* XXX instead of dumping core, let's try to request a salvage
1079 * and just fail the putvnode */
1083 VnChangeState_r(vnp, VN_STATE_ERROR);
1084 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1091 * get a handle to a vnode object.
1093 * @param[out] ec error code
1094 * @param[in] vp volume object
1095 * @param[in] vnodeNumber vnode id
1096 * @param[in] locktype type of lock to acquire
1098 * @return vnode object pointer
1103 VGetVnode(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1104 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1107 retVal = VGetVnode_r(ec, vp, vnodeNumber, locktype);
1113 * get a handle to a vnode object.
1115 * @param[out] ec error code
1116 * @param[in] vp volume object
1117 * @param[in] vnodeNumber vnode id
1118 * @param[in] locktype type of lock to acquire
1120 * @return vnode object pointer
1122 * @internal vnode package internal use only
1124 * @pre VOL_LOCK held.
1125 * heavyweight ref held on volume object.
1128 VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
1129 { /* READ_LOCK or WRITE_LOCK, as defined in lock.h */
1130 register Vnode *vnp;
1132 struct VnodeClassInfo *vcp;
1136 if (vnodeNumber == 0) {
1141 VNLog(100, 1, vnodeNumber, 0, 0, 0);
1143 #ifdef AFS_DEMAND_ATTACH_FS
1145 * once a volume has entered an error state, don't permit
1146 * further operations to proceed
1147 * -- tkeiser 11/21/2007
1149 VWaitExclusiveState_r(vp);
1150 if (VIsErrorState(V_attachState(vp))) {
1151 /* XXX is VSALVAGING acceptable here? */
1157 if (programType == fileServer && !V_inUse(vp)) {
1158 *ec = (vp->specialStatus ? vp->specialStatus : VOFFLINE);
1160 /* If the volume is VBUSY (being cloned or dumped) and this is
1161 * a READ operation, then don't fail.
1163 if ((*ec != VBUSY) || (locktype != READ_LOCK)) {
1168 class = vnodeIdToClass(vnodeNumber);
1169 vcp = &VnodeClassInfo[class];
1170 if (locktype == WRITE_LOCK && !VolumeWriteable(vp)) {
1171 *ec = (bit32) VREADONLY;
1175 if (locktype == WRITE_LOCK && programType == fileServer) {
1176 VAddToVolumeUpdateList_r(ec, vp);
1184 /* See whether the vnode is in the cache. */
1185 vnp = VLookupVnode(vp, vnodeNumber);
1187 /* vnode is in cache */
1189 VNLog(101, 2, vnodeNumber, (intptr_t)vnp, 0, 0);
1190 VnCreateReservation_r(vnp);
1192 #ifdef AFS_DEMAND_ATTACH_FS
1194 * this is the one DAFS case where we may run into contention.
1195 * here's the basic control flow:
1197 * if locktype is READ_LOCK:
1198 * wait until vnode is not exclusive
1199 * set to VN_STATE_READ
1200 * increment read count
1203 * wait until vnode is quiescent
1204 * set to VN_STATE_EXCLUSIVE
1207 if (locktype == READ_LOCK) {
1208 VnWaitExclusiveState_r(vnp);
1210 VnWaitQuiescent_r(vnp);
1213 if (VnIsErrorState(Vn_state(vnp))) {
1214 VnCancelReservation_r(vnp);
1218 #endif /* AFS_DEMAND_ATTACH_FS */
1220 /* vnode not cached */
1222 /* Not in cache; tentatively grab most distantly used one from the LRU
1225 vnp = VGetFreeVnode_r(vcp);
1228 vnp->changed_newTime = vnp->changed_oldTime = 0;
1230 Vn_id(vnp) = vnodeNumber;
1231 VnCreateReservation_r(vnp);
1232 AddToVVnList(vp, vnp);
1233 #ifdef AFS_DEMAND_ATTACH_FS
1238 * XXX for non-DAFS, there is a serious
1239 * race condition here:
1241 * two threads can race to load a vnode. the net
1242 * result is two struct Vnodes can be allocated
1243 * and hashed, which point to the same underlying
1244 * disk data store. conflicting vnode locks can
1245 * thus be held concurrently.
1247 * for non-DAFS to be safe, VOL_LOCK really shouldn't
1248 * be dropped in VnLoad. Of course, this would likely
1249 * lead to an unacceptable slow-down.
1252 VnLoad(ec, vp, vnp, vcp, class);
1254 VnCancelReservation_r(vnp);
1257 #ifndef AFS_DEMAND_ATTACH_FS
1262 * there is no possibility for contention. we "own" this vnode.
1268 * it is imperative that nothing drop vol lock between here
1269 * and the VnBeginRead/VnChangeState stanza below
1272 VnLock(vnp, locktype, VOL_LOCK_HELD, MIGHT_DEADLOCK);
1274 /* Check that the vnode hasn't been removed while we were obtaining
1276 VNLog(102, 2, vnodeNumber, (intptr_t) vnp, 0, 0);
1277 if ((vnp->disk.type == vNull) || (Vn_cacheCheck(vnp) == 0)) {
1278 VnUnlock(vnp, locktype);
1279 VnCancelReservation_r(vnp);
1281 /* vnode is labelled correctly by now, so we don't have to invalidate it */
1285 #ifdef AFS_DEMAND_ATTACH_FS
1286 if (locktype == READ_LOCK) {
1289 VnChangeState_r(vnp, VN_STATE_EXCLUSIVE);
1293 if (programType == fileServer)
1294 VBumpVolumeUsage_r(Vn_volume(vnp)); /* Hack; don't know where it should be
1295 * called from. Maybe VGetVolume */
1300 int TrustVnodeCacheEntry = 1;
1301 /* This variable is bogus--when it's set to 0, the hash chains fill
1302 up with multiple versions of the same vnode. Should fix this!! */
1304 VPutVnode(Error * ec, register Vnode * vnp)
1307 VPutVnode_r(ec, vnp);
1312 * put back a handle to a vnode object.
1314 * @param[out] ec client error code
1315 * @param[in] vnp vnode object pointer
1317 * @pre VOL_LOCK held.
1318 * ref held on vnode.
1320 * @post ref dropped on vnode.
1321 * if vnode was modified or deleted, it is written out to disk
1322 * (assuming a write lock was held).
1324 * @internal volume package internal use only
1327 VPutVnode_r(Error * ec, register Vnode * vnp)
1331 struct VnodeClassInfo *vcp;
1334 assert(Vn_refcount(vnp) != 0);
1335 class = vnodeIdToClass(Vn_id(vnp));
1336 vcp = &VnodeClassInfo[class];
1337 assert(vnp->disk.vnodeMagic == vcp->magic);
1338 VNLog(200, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1340 #ifdef AFS_DEMAND_ATTACH_FS
1341 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1343 writeLocked = WriteLocked(&vnp->lock);
1348 #ifdef AFS_PTHREAD_ENV
1349 pthread_t thisProcess = pthread_self();
1350 #else /* AFS_PTHREAD_ENV */
1351 PROCESS thisProcess;
1352 LWP_CurrentProcess(&thisProcess);
1353 #endif /* AFS_PTHREAD_ENV */
1354 VNLog(201, 2, (intptr_t) vnp,
1355 ((vnp->changed_newTime) << 1) | ((vnp->
1356 changed_oldTime) << 1) | vnp->
1358 if (thisProcess != vnp->writer)
1359 Abort("VPutVnode: Vnode at 0x%x locked by another process!\n",
1363 if (vnp->changed_oldTime || vnp->changed_newTime || vnp->delete) {
1364 Volume *vp = Vn_volume(vnp);
1365 afs_uint32 now = FT_ApproxTime();
1366 assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
1369 /* No longer any directory entries for this vnode. Free the Vnode */
1370 memset(&vnp->disk, 0, sizeof(vnp->disk));
1371 /* delete flag turned off further down */
1372 VNLog(202, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
1373 } else if (vnp->changed_newTime) {
1374 vnp->disk.serverModifyTime = now;
1376 if (vnp->changed_newTime)
1378 V_updateDate(vp) = vp->updateTime = now;
1379 if(V_volUpCounter(vp)<MAXINT)
1380 V_volUpCounter(vp)++;
1383 /* The vnode has been changed. Write it out to disk */
1385 #ifdef AFS_DEMAND_ATTACH_FS
1386 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1388 assert(V_needsSalvaged(vp));
1392 VnStore(ec, vp, vnp, vcp, class);
1394 /* If the vnode is to be deleted, and we wrote the vnode out,
1395 * free its bitmap entry. Do after the vnode is written so we
1396 * don't allocate from bitmap before the vnode is written
1397 * (doing so could cause an "addled bitmap" message).
1399 if (vnp->delete && !*ec) {
1400 if (Vn_volume(vnp)->header->diskstuff.filecount-- < 1)
1401 Vn_volume(vnp)->header->diskstuff.filecount = 0;
1402 VFreeBitMapEntry_r(ec, &vp->vnodeIndex[class],
1403 vnodeIdToBitNumber(Vn_id(vnp)));
1407 vnp->changed_newTime = vnp->changed_oldTime = 0;
1409 #ifdef AFS_DEMAND_ATTACH_FS
1410 VnChangeState_r(vnp, VN_STATE_ONLINE);
1412 } else { /* Not write locked */
1413 if (vnp->changed_newTime || vnp->changed_oldTime || vnp->delete)
1415 ("VPutVnode: Change or delete flag for vnode 0x%x is set but vnode is not write locked!\n",
1417 #ifdef AFS_DEMAND_ATTACH_FS
1422 /* Do not look at disk portion of vnode after this point; it may
1423 * have been deleted above */
1425 VnUnlock(vnp, ((writeLocked) ? WRITE_LOCK : READ_LOCK));
1426 VnCancelReservation_r(vnp);
1430 * Make an attempt to convert a vnode lock from write to read.
1431 * Do nothing if the vnode isn't write locked or the vnode has
1435 VVnodeWriteToRead(Error * ec, register Vnode * vnp)
1439 retVal = VVnodeWriteToRead_r(ec, vnp);
1445 * convert vnode handle from mutually exclusive to shared access.
1447 * @param[out] ec client error code
1448 * @param[in] vnp vnode object pointer
1450 * @return unspecified use (see out argument 'ec' for error code return)
1452 * @pre VOL_LOCK held.
1453 * ref held on vnode.
1454 * write lock held on vnode.
1456 * @post read lock held on vnode.
1457 * if vnode was modified, it has been written to disk.
1459 * @internal volume package internal use only
/*
 * VVnodeWriteToRead_r -- write out a modified vnode and downgrade its lock.
 *
 * ec   error code slot; handed on to VRequestSalvage_r and VnStore.
 * vnp  vnode whose write lock is being converted.
 *
 * Visible flow: sanity-check the vnode, work out whether it is write
 * locked, verify that the caller is the recorded writer, store the vnode
 * if a change flag is set, clear the change flags, then convert the lock.
 *
 * NOTE(review): several short lines (early returns, #else / #endif,
 * closing braces, trailing call arguments) are not visible in this
 * listing; the comments below describe only what can be seen.
 */
1462 VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
1466 struct VnodeClassInfo *vcp;
/*
 * thisProcess identifies the caller (a pthread_t under AFS_PTHREAD_ENV,
 * an LWP PROCESS otherwise); it is compared with vnp->writer further down.
 */
1467 #ifdef AFS_PTHREAD_ENV
1468 pthread_t thisProcess;
1469 #else /* AFS_PTHREAD_ENV */
1470 PROCESS thisProcess;
1471 #endif /* AFS_PTHREAD_ENV */
/*
 * Sanity checks: the caller must hold a reference on the vnode, and the
 * in-memory disk image must carry the magic number of its vnode class.
 */
1474 assert(Vn_refcount(vnp) != 0);
1475 class = vnodeIdToClass(Vn_id(vnp));
1476 vcp = &VnodeClassInfo[class];
1477 assert(vnp->disk.vnodeMagic == vcp->magic);
1478 VNLog(300, 2, Vn_id(vnp), (intptr_t) vnp, 0, 0);
/*
 * Work out whether the vnode is write locked. With AFS_DEMAND_ATTACH_FS
 * that means the VN_STATE_EXCLUSIVE state; the WriteLocked() query is
 * presumably the #else branch (the #else line is not visible here).
 */
1480 #ifdef AFS_DEMAND_ATTACH_FS
1481 writeLocked = (Vn_state(vnp) == VN_STATE_EXCLUSIVE);
1483 writeLocked = WriteLocked(&vnp->lock);
/*
 * NOTE(review): changed_newTime and changed_oldTime are both shifted left
 * by 1 below, so they occupy the same bit of the logged value; possibly
 * one of them was meant to use a different shift -- confirm.
 */
1490 VNLog(301, 2, (intptr_t) vnp,
1491 ((vnp->changed_newTime) << 1) | ((vnp->
1492 changed_oldTime) << 1) | vnp->
1496 #ifdef AFS_PTHREAD_ENV
1497 thisProcess = pthread_self();
1498 #else /* AFS_PTHREAD_ENV */
1499 LWP_CurrentProcess(&thisProcess);
1500 #endif /* AFS_PTHREAD_ENV */
/*
 * Abort if the caller is not the recorded writer of this vnode.
 * NOTE(review): under AFS_PTHREAD_ENV thisProcess is a pthread_t compared
 * with !=; POSIX provides pthread_equal for this -- confirm portability.
 * NOTE(review): the Abort text names VPutVnode although this function is
 * VVnodeWriteToRead_r, and it formats with %x; the argument line is not
 * visible here -- confirm the argument type matches the conversion.
 */
1501 if (thisProcess != vnp->writer)
1502 Abort("VPutVnode: Vnode at 0x%x locked by another process!\n",
/* The vnode is only written out when one of the change flags is set. */
1508 if (vnp->changed_oldTime || vnp->changed_newTime) {
1509 Volume *vp = Vn_volume(vnp);
1510 afs_uint32 now = FT_ApproxTime();
1511 assert(Vn_cacheCheck(vnp) == vp->cacheCheck);
/*
 * Both statements below are guarded by the same changed_newTime test:
 * the vnode and the volume are stamped with the FT_ApproxTime() value.
 */
1512 if (vnp->changed_newTime)
1513 vnp->disk.serverModifyTime = now;
1514 if (vnp->changed_newTime)
1515 V_updateDate(vp) = vp->updateTime = now;
1517 /* The inode has been changed. Write it out to disk */
/*
 * NOTE(review): the condition that chooses between the salvage path and
 * VnStore is not visible in this listing. With AFS_DEMAND_ATTACH_FS a
 * salvage is requested through VRequestSalvage_r; the assert presumably
 * belongs to the other branch of the conditional.
 */
1519 #ifdef AFS_DEMAND_ATTACH_FS
1520 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0);
1522 assert(V_needsSalvaged(vp));
1526 VnStore(ec, vp, vnp, vcp, class);
/* The pending changes have been dealt with; clear both change flags. */
1529 vnp->changed_newTime = vnp->changed_oldTime = 0;
/*
 * With AFS_DEMAND_ATTACH_FS the vnode state is moved to VN_STATE_ONLINE.
 * ConvertWriteToReadLock presumably belongs to the other branch of this
 * conditional (the #else line is not visible here) -- confirm.
 */
1533 #ifdef AFS_DEMAND_ATTACH_FS
1534 VnChangeState_r(vnp, VN_STATE_ONLINE);
1537 ConvertWriteToReadLock(&vnp->lock);
1543 * initial size of ihandle pointer vector.
1545 * @see VInvalidateVnodesByVolume_r
/*
 * Number of IHandle_t pointer slots that VInvalidateVnodesByVolume_r
 * allocates for its vector before scanning the vnode list.
 */
1547 #define IH_VEC_BASE_SIZE 256
1550 * increment amount for growing ihandle pointer vector.
1552 * @see VInvalidateVnodesByVolume_r
/*
 * Number of slots VInvalidateVnodesByVolume_r adds to the vector length
 * before each realloc of the vector.
 */
1554 #define IH_VEC_INCREMENT 256
1557 * Compile list of ihandles to be released/reallyclosed at a later time.
1559 * @param[in] vp volume object pointer
1560 * @param[out] vec_out vector of ihandle pointers to be released/reallyclosed
1561 * @param[out] vec_len_out number of valid elements in ihandle vector
1563 * @pre - VOL_LOCK is held
1564 * - volume is in appropriate exclusive state (e.g. VOL_STATE_VNODE_CLOSE,
1565 * VOL_STATE_VNODE_RELEASE)
1567 * @post - all vnodes on VVn list are invalidated
1568 * - ih_vec is populated with all valid ihandles
1570 * @return operation status
1572 * @retval ENOMEM out of memory
1574 * @todo we should handle out of memory conditions more gracefully.
1576 * @internal vnode package internal use only
/*
 * VInvalidateVnodesByVolume_r -- walk vp->vnode_list and gather ihandles.
 *
 * vp           volume whose vnode list is scanned.
 * vec_out      out parameter for the vector of ihandle pointers.
 * vec_len_out  out parameter for the number of valid vector entries.
 *
 * Visible flow:
 *  - a vector of IH_VEC_BASE_SIZE IHandle_t pointers is obtained from
 *    malloc;
 *  - the list is scanned with queue_Scan, and a vnode whose handle is not
 *    NULL has that handle appended at ih_vec[i++];
 *  - the vector is lengthened by IH_VEC_INCREMENT slots through realloc
 *    into ih_vec_new, so ih_vec stays valid if realloc returns NULL;
 *  - with AFS_DEMAND_ATTACH_FS the code jumps back to restart_traversal,
 *    apparently after the vector has been grown;
 *  - DeleteFromVVnList and VInvalidateVnode_r are applied to the scanned
 *    vnode.
 *
 * NOTE(review): the return type, the declarations of vnp and nvnp, the
 * check of the malloc result, the condition that triggers growth, the
 * failure path after realloc, and the stores through vec_out and
 * vec_len_out are not visible in this listing -- confirm them against
 * the full source.
 */
1579 VInvalidateVnodesByVolume_r(Volume * vp,
1580 IHandle_t *** vec_out,
1581 size_t * vec_len_out)
/* i counts the handles stored so far; vec_len is the current capacity. */
1585 size_t i = 0, vec_len;
/* ih_vec_new receives the realloc result before it replaces ih_vec. */
1586 IHandle_t **ih_vec, **ih_vec_new;
1588 #ifdef AFS_DEMAND_ATTACH_FS
1590 #endif /* AFS_DEMAND_ATTACH_FS */
1592 vec_len = IH_VEC_BASE_SIZE;
1593 ih_vec = malloc(sizeof(IHandle_t *) * vec_len);
1594 #ifdef AFS_DEMAND_ATTACH_FS
1601 * Traverse the volume's vnode list. Pull all the ihandles out into a
1602 * thread-private array for later asynchronous processing.
1604 #ifdef AFS_DEMAND_ATTACH_FS
1607 for (queue_Scan(&vp->vnode_list, vnp, nvnp, Vnode)) {
1608 if (vnp->handle != NULL) {
1610 #ifdef AFS_DEMAND_ATTACH_FS
1613 vec_len += IH_VEC_INCREMENT;
1614 ih_vec_new = realloc(ih_vec, sizeof(IHandle_t *) * vec_len);
1615 #ifdef AFS_DEMAND_ATTACH_FS
1618 if (ih_vec_new == NULL) {
1622 ih_vec = ih_vec_new;
1623 #ifdef AFS_DEMAND_ATTACH_FS
1625 * Theoretically, the volume's VVn list should not change
1626 * because the volume is in an exclusive state. For the
1627 * sake of safety, we will restart the traversal from the
1628 * the beginning (which is not expensive because we're
1629 * deleting the items from the list as we go).
1631 goto restart_traversal;
1634 ih_vec[i++] = vnp->handle;
1637 DeleteFromVVnList(vnp);
1638 VInvalidateVnode_r(vnp);
1648 /* VCloseVnodeFiles_r - called when a volume is going off line. All open
1649 * files for vnodes in that volume are closed. This might be excessive,
1650 * since we may only be taking one volume of a volume group offline.
/*
 * VCloseVnodeFiles_r -- close the files behind the vnodes of volume vp.
 *
 * Visible flow: with AFS_DEMAND_ATTACH_FS the volume is switched to
 * VOL_STATE_VNODE_CLOSE and the value returned by VChangeState_r is kept;
 * the ihandles are collected through VInvalidateVnodesByVolume_r;
 * IH_REALLYCLOSE is applied to each collected handle; finally the kept
 * state value is handed back to VChangeState_r.
 *
 * NOTE(review): the return type, the declarations of i and vec_len, the
 * remaining arguments of the VInvalidateVnodesByVolume_r call and any
 * release of ih_vec are not visible in this listing.
 */
1653 VCloseVnodeFiles_r(Volume * vp)
1655 #ifdef AFS_DEMAND_ATTACH_FS
1656 VolState vol_state_save;
1658 IHandle_t ** ih_vec;
1661 #ifdef AFS_DEMAND_ATTACH_FS
1662 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_CLOSE);
1663 #endif /* AFS_DEMAND_ATTACH_FS */
/*
 * NOTE(review): the call below sits inside assert(). If this assert is
 * the standard macro from <assert.h> and NDEBUG is defined, the whole
 * call is compiled out and ih_vec is never filled in -- confirm which
 * assert is in effect for this build.
 */
1665 /* XXX need better error handling here */
1666 assert(VInvalidateVnodesByVolume_r(vp,
1672 * now we drop VOL_LOCK while we perform some potentially very
1673 * expensive operations in the background
1675 #ifdef AFS_DEMAND_ATTACH_FS
/* Apply IH_REALLYCLOSE to every handle that was collected. */
1679 for (i = 0; i < vec_len; i++) {
1680 IH_REALLYCLOSE(ih_vec[i]);
/* Hand the state value saved on entry back to VChangeState_r. */
1685 #ifdef AFS_DEMAND_ATTACH_FS
1687 VChangeState_r(vp, vol_state_save);
1688 #endif /* AFS_DEMAND_ATTACH_FS */
1693 * shut down all vnode cache state for a given volume.
1695 * @param[in] vp volume object pointer
1697 * @pre VOL_LOCK is held
1699 * @post all file descriptors closed.
1700 * all inode handles released.
1701 * all vnode cache objects disassociated from volume.
1703 * @note for DAFS, these operations are performed outside the vol glock under
1704 * volume exclusive state VOL_STATE_VNODE_RELEASE. Please further note
1705 * that it would be a bug to acquire and release a volume reservation
1706 * during this exclusive operation. This is due to the fact that we are
1707 * generally called during the refcount 1->0 transition.
1709 * @todo we should handle failures in VInvalidateVnodesByVolume_r more
1712 * @see VInvalidateVnodesByVolume_r
1714 * @internal this routine is internal to the volume package
/*
 * VReleaseVnodeFiles_r -- release the ihandles of the vnodes of volume vp.
 *
 * Visible flow: with AFS_DEMAND_ATTACH_FS the volume is switched to
 * VOL_STATE_VNODE_RELEASE and the value returned by VChangeState_r is
 * kept; the ihandles are collected through VInvalidateVnodesByVolume_r;
 * IH_RELEASE is applied to each collected handle; finally the kept state
 * value is handed back to VChangeState_r.
 *
 * NOTE(review): the return type, the declarations of i and vec_len, the
 * remaining arguments of the VInvalidateVnodesByVolume_r call, any
 * release of ih_vec and the closing brace are not visible in this
 * listing.
 */
1717 VReleaseVnodeFiles_r(Volume * vp)
1719 #ifdef AFS_DEMAND_ATTACH_FS
1720 VolState vol_state_save;
1722 IHandle_t ** ih_vec;
1725 #ifdef AFS_DEMAND_ATTACH_FS
1726 vol_state_save = VChangeState_r(vp, VOL_STATE_VNODE_RELEASE);
1727 #endif /* AFS_DEMAND_ATTACH_FS */
/*
 * NOTE(review): the call below sits inside assert(). If this assert is
 * the standard macro from <assert.h> and NDEBUG is defined, the whole
 * call is compiled out and ih_vec is never filled in -- confirm which
 * assert is in effect for this build.
 */
1729 /* XXX need better error handling here */
1730 assert(VInvalidateVnodesByVolume_r(vp,
1736 * now we drop VOL_LOCK while we perform some potentially very
1737 * expensive operations in the background
1739 #ifdef AFS_DEMAND_ATTACH_FS
/* Apply IH_RELEASE to every handle that was collected. */
1743 for (i = 0; i < vec_len; i++) {
1744 IH_RELEASE(ih_vec[i]);
/* Hand the state value saved on entry back to VChangeState_r. */
1749 #ifdef AFS_DEMAND_ATTACH_FS
1751 VChangeState_r(vp, vol_state_save);
1752 #endif /* AFS_DEMAND_ATTACH_FS */