2 * Copyright 2000, International Business Machines Corporation and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
9 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
12 /* 1/1/89: NB: this stuff is all going to be replaced. Don't take it too seriously */
17 Institution: The Information Technology Center, Carnegie-Mellon University
21 #include <afsconfig.h>
22 #include <afs/param.h>
28 #include <afs/afsint.h>
31 #include <sys/param.h>
32 #if !defined(AFS_SGI_ENV)
35 #else /* AFS_OSF_ENV */
36 #ifdef AFS_VFSINCL_ENV
39 #include <sys/fs/ufs_fs.h>
41 #if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV)
42 #include <ufs/ufs/dinode.h>
43 #include <ufs/ffs/fs.h>
48 #else /* AFS_VFSINCL_ENV */
49 #if !defined(AFS_AIX_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV)
52 #endif /* AFS_VFSINCL_ENV */
53 #endif /* AFS_OSF_ENV */
54 #endif /* AFS_SGI_ENV */
55 #endif /* AFS_NT40_ENV */
73 #if defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV)
75 #include <sys/mnttab.h>
76 #include <sys/mntent.h>
82 #if defined(AFS_SGI_ENV)
85 #ifdef AFS_SGI_EFS_IOPS_ENV
86 #define ROOTINO EFS_ROOTINO
87 #include <sys/fs/efs.h>
88 #include "sgiefs/efs.h" /* until 5.1 release */
93 #ifndef AFS_LINUX20_ENV
94 #include <fstab.h> /* Need to find in libc 5, present in libc 6 */
97 #endif /* AFS_SGI_ENV */
99 #endif /* AFS_HPUX_ENV */
103 #include <netinet/in.h>
104 #include <sys/wait.h>
107 #include <sys/time.h>
108 #endif /* ITIMER_REAL */
109 #endif /* AFS_NT40_ENV */
110 #if defined(AFS_SUN5_ENV) || defined(AFS_NT40_ENV) || defined(AFS_LINUX20_ENV)
117 #include <afs/errors.h>
120 #include <afs/afssyscalls.h>
122 #include <afs/afsutil.h>
126 #include "daemon_com.h"
128 #include "salvsync.h"
131 #include "partition.h"
132 #include "volume_inline.h"
133 #ifdef AFS_PTHREAD_ENV
135 #else /* AFS_PTHREAD_ENV */
136 #include "afs/assert.h"
137 #endif /* AFS_PTHREAD_ENV */
144 #if !defined(offsetof)
149 #define afs_stat stat64
150 #define afs_fstat fstat64
151 #define afs_open open64
152 #else /* !O_LARGEFILE */
153 #define afs_stat stat
154 #define afs_fstat fstat
155 #define afs_open open
156 #endif /* !O_LARGEFILE */
158 #ifdef AFS_PTHREAD_ENV
159 pthread_mutex_t vol_glock_mutex;
160 pthread_mutex_t vol_trans_mutex;
161 pthread_cond_t vol_put_volume_cond;
162 pthread_cond_t vol_sleep_cond;
163 int vol_attach_threads = 1;
164 #endif /* AFS_PTHREAD_ENV */
166 #ifdef AFS_DEMAND_ATTACH_FS
167 pthread_mutex_t vol_salvsync_mutex;
168 #endif /* AFS_DEMAND_ATTACH_FS */
171 extern void *calloc(), *realloc();
174 /*@printflike@*/ extern void Log(const char *format, ...);
176 /* Forward declarations */
177 static Volume *attach2(Error * ec, VolId vid, char *path,
178 register struct VolumeHeader *header,
179 struct DiskPartition64 *partp, Volume * vp,
180 int isbusy, int mode);
181 static void ReallyFreeVolume(Volume * vp);
182 #ifdef AFS_DEMAND_ATTACH_FS
183 static void FreeVolume(Volume * vp);
184 #else /* !AFS_DEMAND_ATTACH_FS */
185 #define FreeVolume(vp) ReallyFreeVolume(vp)
186 static void VScanUpdateList(void);
187 #endif /* !AFS_DEMAND_ATTACH_FS */
188 static void VInitVolumeHeaderCache(afs_uint32 howMany);
189 static int GetVolumeHeader(register Volume * vp);
190 static void ReleaseVolumeHeader(register struct volHeader *hd);
191 static void FreeVolumeHeader(register Volume * vp);
192 static void AddVolumeToHashTable(register Volume * vp, int hashid);
193 static void DeleteVolumeFromHashTable(register Volume * vp);
194 static int VHold(Volume * vp);
195 static int VHold_r(Volume * vp);
196 static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class);
197 static void VReleaseVolumeHandles_r(Volume * vp);
198 static void VCloseVolumeHandles_r(Volume * vp);
199 static void LoadVolumeHeader(Error * ec, Volume * vp);
200 static int VCheckOffline(register Volume * vp);
201 static int VCheckDetach(register Volume * vp);
202 static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags);
203 static int VolumeExternalName_r(VolumeId volumeId, char * name, size_t len);
205 int LogLevel; /* Vice loglevel--not defined as extern so that it will be
206 * defined when not linked with vice, XXXX */
207 ProgramType programType; /* The type of program using the package */
209 /* extended volume package statistics */
212 #ifdef VOL_LOCK_DEBUG
213 pthread_t vol_glock_holder = 0;
217 #define VOLUME_BITMAP_GROWSIZE 16 /* bytes, => 128vnodes */
218 /* Must be a multiple of 4 (1 word) !! */
220 /* this parameter needs to be tunable at runtime.
221 * 128 was really inadequate for largish servers -- at 16384 volumes this
222 * puts average chain length at 128, thus an average 65 deref's to find a volptr.
223 * talk about bad spatial locality...
225 * an AVL or splay tree might work a lot better, but we'll just increase
226 * the default hash table size for now
228 #define DEFAULT_VOLUME_HASH_SIZE 256 /* Must be a power of 2!! */
229 #define DEFAULT_VOLUME_HASH_MASK (DEFAULT_VOLUME_HASH_SIZE-1)
230 #define VOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask))
233 * turn volume hash chains into partially ordered lists.
234 * when the threshold is exceeded between two adjacent elements,
235 * perform a chain rebalancing operation.
237 * keep the threshold high in order to keep cache line invalidates
238 * low "enough" on SMPs
240 #define VOLUME_HASH_REORDER_THRESHOLD 200
243 * when possible, don't just reorder single elements, but reorder
244 * entire chains of elements at once. a chain of elements that
245 * exceed the element previous to the pivot by at least CHAIN_THRESH
246 * accesses are moved in front of the chain whose elements have at
247 * least CHAIN_THRESH less accesses than the pivot element
249 #define VOLUME_HASH_REORDER_CHAIN_THRESH (VOLUME_HASH_REORDER_THRESHOLD / 2)
251 #include "rx/rx_queue.h"
254 VolumeHashTable_t VolumeHashTable = {
255 DEFAULT_VOLUME_HASH_SIZE,
256 DEFAULT_VOLUME_HASH_MASK,
261 static void VInitVolumeHash(void);
265 /* This macro is used where an ffs() call does not exist. Was in util/ffs.c */
269 afs_int32 ffs_tmp = x;
273 for (ffs_i = 1;; ffs_i++) {
280 #endif /* !AFS_HAVE_FFS */
282 #ifdef AFS_PTHREAD_ENV
/* Work-queue node handing one disk partition to a volume-attach worker
 * thread; linked via the embedded rx_queue. */
283 typedef struct diskpartition_queue_t {
284 struct rx_queue queue;
285 struct DiskPartition64 * diskP;
286 } diskpartition_queue_t;
/* Shared state for parallel volume-package startup: the rx_queue head of
 * pending diskpartition_queue_t work items, plus a condvar/counter pair the
 * workers use to report completion back to VInitVolumePackage. */
287 typedef struct vinitvolumepackage_thread_t {
288 struct rx_queue queue;
289 pthread_cond_t thread_done_cv;
290 int n_threads_complete;
291 } vinitvolumepackage_thread_t;
292 static void * VInitVolumePackageThread(void * args);
293 #endif /* AFS_PTHREAD_ENV */
295 static int VAttachVolumesByPartition(struct DiskPartition64 *diskP,
296 int * nAttached, int * nUnattached);
299 #ifdef AFS_DEMAND_ATTACH_FS
300 /* demand attach fileserver extensions */
303 * in the future we will support serialization of VLRU state into the fs_state
306 * these structures are the beginning of that effort
/* NOTE(review): this excerpt is elided — the closing braces of the VLRU_*
 * structs below are not visible here; confirm layout against upstream. */
/* On-disk header for a serialized VLRU state dump (future fs_state work). */
308 struct VLRU_DiskHeader {
309 struct versionStamp stamp; /* magic and structure version number */
310 afs_uint32 mtime; /* time of dump to disk */
311 afs_uint32 num_records; /* number of VLRU_DiskEntry records */
/* One per-volume record in a serialized VLRU dump. */
314 struct VLRU_DiskEntry {
315 afs_uint32 vid; /* volume ID */
316 afs_uint32 idx; /* generation */
317 afs_uint32 last_get; /* timestamp of last get */
/* In-core staging area for VLRU_DiskEntry records read at startup. */
320 struct VLRU_StartupQueue {
321 struct VLRU_DiskEntry * entry;
/* Shared state for the multithreaded demand-attach shutdown: a lock/condvar
 * pair for worker<->controller handshaking, per-partition thread allocation
 * targets and completion flags (indexed by partition index), resume cursors
 * into each partition's volume list, and per-pass/per-partition counters. */
326 typedef struct vshutdown_thread_t {
328 pthread_mutex_t lock;
330 pthread_cond_t master_cv;
332 int n_threads_complete;
334 int schedule_version;
337 byte n_parts_done_pass;
338 byte part_thread_target[VOLMAXPARTS+1];
339 byte part_done_pass[VOLMAXPARTS+1];
340 struct rx_queue * part_pass_head[VOLMAXPARTS+1];
341 int stats[4][VOLMAXPARTS+1]; /* [pass][partition] volumes shut down */
342 } vshutdown_thread_t;
343 static void * VShutdownThread(void * args);
346 static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode);
347 static int VCheckFree(Volume * vp);
350 static void AddVolumeToVByPList_r(Volume * vp);
351 static void DeleteVolumeFromVByPList_r(Volume * vp);
352 static void VVByPListBeginExclusive_r(struct DiskPartition64 * dp);
353 static void VVByPListEndExclusive_r(struct DiskPartition64 * dp);
354 static void VVByPListWait_r(struct DiskPartition64 * dp);
356 /* online salvager */
357 static int VCheckSalvage(register Volume * vp);
358 static int VUpdateSalvagePriority_r(Volume * vp);
359 static int VScheduleSalvage_r(Volume * vp);
360 static int VCancelSalvage_r(Volume * vp, int reason);
362 /* Volume hash table */
363 static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp);
364 static void VHashBeginExclusive_r(VolumeHashChainHead * head);
365 static void VHashEndExclusive_r(VolumeHashChainHead * head);
366 static void VHashWait_r(VolumeHashChainHead * head);
369 static int ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass);
370 static int ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
371 struct rx_queue ** idx);
372 static void ShutdownController(vshutdown_thread_t * params);
373 static void ShutdownCreateSchedule(vshutdown_thread_t * params);
376 static void VLRU_ComputeConstants(void);
377 static void VInitVLRU(void);
378 static void VLRU_Init_Node_r(volatile Volume * vp);
379 static void VLRU_Add_r(volatile Volume * vp);
380 static void VLRU_Delete_r(volatile Volume * vp);
381 static void VLRU_UpdateAccess_r(volatile Volume * vp);
382 static void * VLRU_ScannerThread(void * args);
383 static void VLRU_Scan_r(int idx);
384 static void VLRU_Promote_r(int idx);
385 static void VLRU_Demote_r(int idx);
386 static void VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append);
389 static int VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh);
390 static int VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh);
391 static int VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh);
392 #endif /* AFS_DEMAND_ATTACH_FS */
395 struct Lock vol_listLock; /* Lock obtained when listing volumes:
396 * prevents a volume from being missed
397 * if the volume is attached during a
401 static int TimeZoneCorrection; /* Number of seconds west of GMT */
403 /* Common message used when the volume goes off line */
404 char *VSalvageMessage =
405 "Files in this volume are currently unavailable; call operations";
407 int VInit; /* 0 - uninitialized,
408 * 1 - initialized but not all volumes have been attached,
409 * 2 - initialized and all volumes have been attached,
410 * 3 - initialized, all volumes have been attached, and
411 * VConnectFS() has completed. */
414 bit32 VolumeCacheCheck; /* Incremented everytime a volume goes on line--
415 * used to stamp volume headers and in-core
416 * vnodes. When the volume goes on-line the
417 * vnode will be invalidated
418 * access only with VOL_LOCK held */
423 /***************************************************/
424 /* Startup routines */
425 /***************************************************/
/* Initialize the volume package: stats, partition package, global locks and
 * condvars, the volume-header cache, vnode caches, and then attach (or, for
 * demand-attach builds, pre-attach) all volumes on every vice partition —
 * in parallel via VInitVolumePackageThread workers when pthreads are
 * available. Sets VInit = 2 once all volumes are attached.
 *
 * FIX(review): restored `&params` where HTML-entity mojibake had corrupted
 * the address-of expressions to `¶ms` (5 sites). No other token changed;
 * this excerpt is elided (non-contiguous source lines), so a broader rewrite
 * is unsafe — verify against upstream OpenAFS vol/volume.c. */
428 VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVnodes,
429 int connect, afs_uint32 volcache)
431 int errors = 0; /* Number of errors while finding vice partitions. */
437 memset(&VStats, 0, sizeof(VStats));
438 VStats.hdr_cache_size = 200;
440 VInitPartitionPackage();
442 #ifdef AFS_DEMAND_ATTACH_FS
443 if (programType == fileServer) {
446 VLRU_SetOptions(VLRU_SET_ENABLED, 0);
450 #ifdef AFS_PTHREAD_ENV
451 assert(pthread_mutex_init(&vol_glock_mutex, NULL) == 0);
452 assert(pthread_mutex_init(&vol_trans_mutex, NULL) == 0);
453 assert(pthread_cond_init(&vol_put_volume_cond, NULL) == 0);
454 assert(pthread_cond_init(&vol_sleep_cond, NULL) == 0);
455 #else /* AFS_PTHREAD_ENV */
457 #endif /* AFS_PTHREAD_ENV */
458 Lock_Init(&vol_listLock);
460 srandom(time(0)); /* For VGetVolumeInfo */
461 gettimeofday(&tv, &tz);
462 TimeZoneCorrection = tz.tz_minuteswest * 60;
464 #ifdef AFS_DEMAND_ATTACH_FS
465 assert(pthread_mutex_init(&vol_salvsync_mutex, NULL) == 0);
466 #endif /* AFS_DEMAND_ATTACH_FS */
468 /* Ok, we have done enough initialization that fileserver can
469 * start accepting calls, even though the volumes may not be
470 * available just yet.
474 #if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_SERVER)
475 if (programType == salvageServer) {
478 #endif /* AFS_DEMAND_ATTACH_FS */
479 #ifdef FSSYNC_BUILD_SERVER
480 if (programType == fileServer) {
484 #if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT)
485 if (programType == fileServer) {
486 /* establish a connection to the salvager at this point */
487 assert(VConnectSALV() != 0);
489 #endif /* AFS_DEMAND_ATTACH_FS */
/* caller-requested cache size wins only when larger than the default */
491 if (volcache > VStats.hdr_cache_size)
492 VStats.hdr_cache_size = volcache;
493 VInitVolumeHeaderCache(VStats.hdr_cache_size);
495 VInitVnodes(vLarge, nLargeVnodes);
496 VInitVnodes(vSmall, nSmallVnodes);
499 errors = VAttachPartitions();
503 if (programType == fileServer) {
504 struct DiskPartition64 *diskP;
505 #ifdef AFS_PTHREAD_ENV
506 struct vinitvolumepackage_thread_t params;
507 struct diskpartition_queue_t * dpq;
508 int i, threads, parts;
510 pthread_attr_t attrs;
512 assert(pthread_cond_init(&params.thread_done_cv,NULL) == 0);
514 params.n_threads_complete = 0;
516 /* create partition work queue */
517 for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
518 dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
521 queue_Append(&params,dpq);
/* never spawn more workers than there are partitions to attach */
524 threads = MIN(parts, vol_attach_threads);
527 /* spawn off a bunch of initialization threads */
528 assert(pthread_attr_init(&attrs) == 0);
529 assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
531 Log("VInitVolumePackage: beginning parallel fileserver startup\n");
532 #ifdef AFS_DEMAND_ATTACH_FS
533 Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n",
535 #else /* AFS_DEMAND_ATTACH_FS */
536 Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n",
538 #endif /* AFS_DEMAND_ATTACH_FS */
541 for (i=0; i < threads; i++) {
542 assert(pthread_create
543 (&tid, &attrs, &VInitVolumePackageThread,
/* block until every worker has reported completion via thread_done_cv */
547 while(params.n_threads_complete < threads) {
548 VOL_CV_WAIT(&params.thread_done_cv);
552 assert(pthread_attr_destroy(&attrs) == 0);
554 /* if we're only going to run one init thread, don't bother creating
556 Log("VInitVolumePackage: beginning single-threaded fileserver startup\n");
557 #ifdef AFS_DEMAND_ATTACH_FS
558 Log("VInitVolumePackage: using 1 thread to pre-attach volumes on %d partition(s)\n",
560 #else /* AFS_DEMAND_ATTACH_FS */
561 Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n",
563 #endif /* AFS_DEMAND_ATTACH_FS */
565 VInitVolumePackageThread(&params);
568 assert(pthread_cond_destroy(&params.thread_done_cv) == 0);
570 #else /* AFS_PTHREAD_ENV */
574 /* Attach all the volumes in this partition */
575 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
576 int nAttached = 0, nUnattached = 0;
577 assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
579 #endif /* AFS_PTHREAD_ENV */
582 VInit = 2; /* Initialized, and all volumes have been attached */
583 #ifdef FSSYNC_BUILD_CLIENT
584 if (programType == volumeUtility && connect) {
586 Log("Unable to connect to file server; aborted\n");
590 #ifdef AFS_DEMAND_ATTACH_FS
591 else if (programType == salvageServer) {
593 Log("Unable to connect to file server; aborted\n");
597 #endif /* AFS_DEMAND_ATTACH_FS */
598 #endif /* FSSYNC_BUILD_CLIENT */
602 #ifdef AFS_PTHREAD_ENV
/* Worker thread for parallel startup: repeatedly pops a partition off the
 * shared vinitvolumepackage_thread_t work queue and attaches (or pre-attaches)
 * all of its volumes, then bumps n_threads_complete and signals the
 * coordinator via thread_done_cv.
 *
 * FIX(review): restored `&params->thread_done_cv` where mojibake had
 * corrupted it to `¶ms->thread_done_cv`. No other token changed; this
 * excerpt is elided — verify against upstream OpenAFS vol/volume.c. */
604 VInitVolumePackageThread(void * args) {
605 int errors = 0; /* Number of errors while finding vice partitions. */
609 struct DiskPartition64 *diskP;
610 struct vinitvolumepackage_thread_t * params;
611 struct diskpartition_queue_t * dpq;
613 params = (vinitvolumepackage_thread_t *) args;
617 /* Attach all the volumes in this partition */
618 while (queue_IsNotEmpty(params)) {
619 int nAttached = 0, nUnattached = 0;
621 dpq = queue_First(params,diskpartition_queue_t);
627 assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
/* report completion to the coordinator waiting in VInitVolumePackage */
632 params->n_threads_complete++;
633 pthread_cond_signal(&params->thread_done_cv);
637 #endif /* AFS_PTHREAD_ENV */
640 * attach all volumes on a given disk partition
/* Attach every volume on one disk partition by scanning its directory for
 * volume-header files (VHDREXT suffix). On demand-attach builds volumes are
 * only pre-attached; otherwise they are fully attached. Increments *nAttached
 * or *nUnattached per volume depending on success.
 * NOTE(review): excerpt is elided (dirp/closedir, error return paths not
 * visible) — confirm full body against upstream. */
643 VAttachVolumesByPartition(struct DiskPartition64 *diskP, int * nAttached, int * nUnattached)
649 Log("Partition %s: attaching volumes\n", diskP->name);
650 dirp = opendir(VPartitionPath(diskP));
652 Log("opendir on Partition %s failed!\n", diskP->name);
656 while ((dp = readdir(dirp))) {
/* only entries whose extension matches the volume-header suffix count */
658 p = strrchr(dp->d_name, '.');
659 if (p != NULL && strcmp(p, VHDREXT) == 0) {
662 #ifdef AFS_DEMAND_ATTACH_FS
663 vp = VPreAttachVolumeByName(&error, diskP->name, dp->d_name);
664 #else /* AFS_DEMAND_ATTACH_FS */
665 vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
667 #endif /* AFS_DEMAND_ATTACH_FS */
/* non-NULL vp means the (pre-)attach succeeded */
668 (*(vp ? nAttached : nUnattached))++;
669 if (error == VOFFLINE)
670 Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
671 else if (LogLevel >= 5) {
672 Log("Partition %s: attached volume %d (%s)\n",
673 diskP->name, VolumeNumber(dp->d_name),
676 #if !defined(AFS_DEMAND_ATTACH_FS)
680 #endif /* AFS_DEMAND_ATTACH_FS */
684 Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached);
690 /***************************************************/
691 /* Shutdown routines */
692 /***************************************************/
696 * highly multithreaded volume package shutdown
698 * with the demand attach fileserver extensions,
699 * VShutdown has been modified to be multithreaded.
700 * In order to achieve optimal use of many threads,
701 * the shutdown code involves one control thread and
702 * n shutdown worker threads. The control thread
703 * periodically examines the number of volumes available
704 * for shutdown on each partition, and produces a worker
705 * thread allocation schedule. The idea is to eliminate
706 * redundant scheduling computation on the workers by
707 * having a single master scheduler.
709 * The scheduler's objectives are:
711 * each partition with volumes remaining gets allocated
712 * at least 1 thread (assuming sufficient threads)
714 * threads are allocated proportional to the number of
715 * volumes remaining to be offlined. This ensures that
716 * the OS I/O scheduler has many requests to elevator
717 * seek on partitions that will (presumably) take the
718 * longest amount of time (from now) to finish shutdown
719 * (3) keep threads busy
720 * when there are extra threads, they are assigned to
721 * partitions using a simple round-robin algorithm
723 * In the future, we may wish to add the ability to adapt
724 * to the relative performance patterns of each disk
729 * multi-step shutdown process
731 * demand attach shutdown is a four-step process. Each
732 * shutdown "pass" shuts down increasingly more difficult
733 * volumes. The main purpose is to achieve better cache
734 * utilization during shutdown.
737 * shutdown volumes in the unattached, pre-attached
740 * shutdown attached volumes with cached volume headers
742 * shutdown all volumes in non-exclusive states
744 * shutdown all remaining volumes
/* Body of the volume-package shutdown routine (function header elided from
 * this excerpt). Demand-attach builds run the multithreaded four-pass
 * shutdown described in the comment block above: spawn vol_attach_threads
 * VShutdownThread workers, run pass 0 from a per-partition work queue, then
 * hand scheduling to ShutdownController for passes 1-3. Non-demand-attach
 * builds simply walk the volume hash table and VOffline_r each volume.
 *
 * FIX(review): restored `&params` where mojibake had corrupted the
 * address-of expressions to `¶ms` (15 sites). No other token changed;
 * excerpt is elided — verify against upstream OpenAFS vol/volume.c. */
751 register Volume *vp, *np;
752 register afs_int32 code;
753 #ifdef AFS_DEMAND_ATTACH_FS
754 struct DiskPartition64 * diskP;
755 struct diskpartition_queue_t * dpq;
756 vshutdown_thread_t params;
758 pthread_attr_t attrs;
760 memset(&params, 0, sizeof(vshutdown_thread_t));
/* count the partitions (empty loop body) */
762 for (params.n_parts=0, diskP = DiskPartitionList;
763 diskP; diskP = diskP->next, params.n_parts++);
765 Log("VShutdown: shutting down on-line volumes on %d partition%s...\n",
766 params.n_parts, params.n_parts > 1 ? "s" : "");
768 if (vol_attach_threads > 1) {
769 /* prepare for parallel shutdown */
770 params.n_threads = vol_attach_threads;
771 assert(pthread_mutex_init(&params.lock, NULL) == 0);
772 assert(pthread_cond_init(&params.cv, NULL) == 0);
773 assert(pthread_cond_init(&params.master_cv, NULL) == 0);
774 assert(pthread_attr_init(&attrs) == 0);
775 assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
778 /* setup the basic partition information structures for
779 * parallel shutdown */
780 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
782 struct rx_queue * qp, * nqp;
/* take exclusive ownership of this partition's volume-by-partition list */
786 VVByPListWait_r(diskP);
787 VVByPListBeginExclusive_r(diskP);
790 for (queue_Scan(&diskP->vol_list, qp, nqp, rx_queue)) {
791 vp = (Volume *)((char *)qp - offsetof(Volume, vol_list));
795 Log("VShutdown: partition %s has %d volumes with attached headers\n",
796 VPartitionPath(diskP), count);
799 /* build up the pass 0 shutdown work queue */
800 dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
803 queue_Prepend(&params, dpq);
/* per-partition resume cursor for the pass walkers */
805 params.part_pass_head[diskP->index] = queue_First(&diskP->vol_list, rx_queue);
808 Log("VShutdown: beginning parallel fileserver shutdown\n");
809 Log("VShutdown: using %d threads to offline volumes on %d partition%s\n",
810 vol_attach_threads, params.n_parts, params.n_parts > 1 ? "s" : "" );
812 /* do pass 0 shutdown */
813 assert(pthread_mutex_lock(&params.lock) == 0);
814 for (i=0; i < params.n_threads; i++) {
815 assert(pthread_create
816 (&tid, &attrs, &VShutdownThread,
820 /* wait for all the pass 0 shutdowns to complete */
821 while (params.n_threads_complete < params.n_threads) {
822 assert(pthread_cond_wait(&params.master_cv, &params.lock) == 0);
824 params.n_threads_complete = 0;
/* release the workers into passes 1-3 */
826 assert(pthread_cond_broadcast(&params.cv) == 0);
827 assert(pthread_mutex_unlock(&params.lock) == 0);
829 Log("VShutdown: pass 0 completed using the 1 thread per partition algorithm\n");
830 Log("VShutdown: starting passes 1 through 3 using finely-granular mp-fast algorithm\n");
832 /* run the parallel shutdown scheduler. it will drop the glock internally */
833 ShutdownController(&params);
835 /* wait for all the workers to finish pass 3 and terminate */
836 while (params.pass < 4) {
837 VOL_CV_WAIT(&params.cv);
840 assert(pthread_attr_destroy(&attrs) == 0);
841 assert(pthread_cond_destroy(&params.cv) == 0);
842 assert(pthread_cond_destroy(&params.master_cv) == 0);
843 assert(pthread_mutex_destroy(&params.lock) == 0);
845 /* drop the VByPList exclusive reservations */
846 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
847 VVByPListEndExclusive_r(diskP);
848 Log("VShutdown: %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
849 VPartitionPath(diskP),
850 params.stats[0][diskP->index],
851 params.stats[1][diskP->index],
852 params.stats[2][diskP->index],
853 params.stats[3][diskP->index]);
856 Log("VShutdown: shutdown finished using %d threads\n", params.n_threads);
858 /* if we're only going to run one shutdown thread, don't bother creating
860 Log("VShutdown: beginning single-threaded fileserver shutdown\n");
862 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
863 VShutdownByPartition_r(diskP);
867 Log("VShutdown: complete.\n");
868 #else /* AFS_DEMAND_ATTACH_FS */
869 Log("VShutdown: shutting down on-line volumes...\n");
870 for (i = 0; i < VolumeHashTable.Size; i++) {
871 /* try to hold first volume in the hash table */
872 for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
876 Log("VShutdown: Attempting to take volume %u offline.\n",
879 /* next, take the volume offline (drops reference count) */
880 VOffline_r(vp, "File server was shut down");
884 Log("VShutdown: complete.\n");
885 #endif /* AFS_DEMAND_ATTACH_FS */
896 #ifdef AFS_DEMAND_ATTACH_FS
899 * shutdown control thread
/* Master scheduler for the parallel shutdown: periodically recomputes the
 * per-partition thread allocation (ShutdownCreateSchedule) while workers
 * progress through passes 1-3, and logs a snapshot of the shared state.
 * NOTE(review): excerpt is elided — the sleep/loop plumbing between the
 * snapshot and the reschedule is not visible here. */
902 ShutdownController(vshutdown_thread_t * params)
905 struct DiskPartition64 * diskP;
907 vshutdown_thread_t shadow;
909 ShutdownCreateSchedule(params);
911 while ((params->pass < 4) &&
912 (params->n_threads_complete < params->n_threads)) {
913 /* recompute schedule once per second */
/* snapshot shared state so logging doesn't race the workers */
915 memcpy(&shadow, params, sizeof(vshutdown_thread_t));
919 Log("ShutdownController: schedule version=%d, vol_remaining=%d, pass=%d\n",
920 shadow.schedule_version, shadow.vol_remaining, shadow.pass);
921 Log("ShutdownController: n_threads_complete=%d, n_parts_done_pass=%d\n",
922 shadow.n_threads_complete, shadow.n_parts_done_pass);
923 for (diskP = DiskPartitionList; diskP; diskP=diskP->next) {
925 Log("ShutdownController: part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n",
928 shadow.part_thread_target[id],
929 shadow.part_done_pass[id],
930 shadow.part_pass_head[id]);
936 ShutdownCreateSchedule(params);
940 /* create the shutdown thread work schedule.
941 * this scheduler tries to implement fairness
942 * by allocating at least 1 thread to each
943 * partition with volumes to be shutdown,
944 * and then it attempts to allocate remaining
945 * threads based upon the amount of work left
/* NOTE(review): excerpt is elided — several closing braces and the
 * schedule-publication tail are not visible; confirm against upstream. */
948 ShutdownCreateSchedule(vshutdown_thread_t * params)
950 struct DiskPartition64 * diskP;
951 int sum, thr_workload, thr_left;
952 int part_residue[VOLMAXPARTS+1];
955 /* compute the total number of outstanding volumes */
957 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
958 sum += diskP->vol_list.len;
/* bump the version so workers notice the schedule changed */
961 params->schedule_version++;
962 params->vol_remaining = sum;
967 /* compute average per-thread workload */
968 thr_workload = sum / params->n_threads;
969 if (sum % params->n_threads)
972 thr_left = params->n_threads;
973 memset(&part_residue, 0, sizeof(part_residue));
975 /* for fairness, give every partition with volumes remaining
976 * at least one thread */
977 for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
979 if (diskP->vol_list.len) {
980 params->part_thread_target[id] = 1;
983 params->part_thread_target[id] = 0;
987 if (thr_left && thr_workload) {
988 /* compute length-weighted workloads */
991 for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
/* extra threads this partition deserves beyond what it already has */
993 delta = (diskP->vol_list.len / thr_workload) -
994 params->part_thread_target[id];
998 if (delta < thr_left) {
999 params->part_thread_target[id] += delta;
1002 params->part_thread_target[id] += thr_left;
1010 /* try to assign any leftover threads to partitions that
1011 * had volume lengths closer to needing thread_target+1 */
1012 int max_residue, max_id;
1014 /* compute the residues */
1015 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
1017 part_residue[id] = diskP->vol_list.len -
1018 (params->part_thread_target[id] * thr_workload);
1021 /* now try to allocate remaining threads to partitions with the
1022 * highest residues */
1025 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
1027 if (part_residue[id] > max_residue) {
1028 max_residue = part_residue[id];
1037 params->part_thread_target[max_id]++;
/* consume this residue so the next leftover thread goes elsewhere */
1039 part_residue[max_id] = 0;
1044 /* punt and give any remaining threads equally to each partition */
1046 if (thr_left >= params->n_parts) {
1047 alloc = thr_left / params->n_parts;
1048 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
1050 params->part_thread_target[id] += alloc;
1055 /* finish off the last of the threads */
1056 for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) {
1058 params->part_thread_target[id]++;
1064 /* worker thread for parallel shutdown */
/* Each worker first drains one pass-0 work item (a whole partition of
 * easy-to-shutdown volumes), synchronizes with its peers through
 * params->lock / cv / master_cv, then loops through passes 1-3 claiming
 * per-partition thread slots from the controller's schedule and walking
 * volumes via ShutdownVolumeWalk_r. The last thread to finish a pass
 * reinitializes the shared state, builds the next schedule, and broadcasts.
 *
 * FIX(review): restored `&params->...` where mojibake had corrupted the
 * address-of expressions to `¶ms->...` (11 sites). No other token
 * changed; excerpt is elided — verify against upstream vol/volume.c. */
1066 VShutdownThread(void * args)
1068 struct rx_queue *qp;
1070 vshutdown_thread_t * params;
1071 int part, code, found, pass, schedule_version_save, count;
1072 struct DiskPartition64 *diskP;
1073 struct diskpartition_queue_t * dpq;
1076 params = (vshutdown_thread_t *) args;
1078 /* acquire the shutdown pass 0 lock */
1079 assert(pthread_mutex_lock(&params->lock) == 0);
1081 /* if there's still pass 0 work to be done,
1082 * get a work entry, and do a pass 0 shutdown */
1083 if (queue_IsNotEmpty(params)) {
1084 dpq = queue_First(params, diskpartition_queue_t);
1086 assert(pthread_mutex_unlock(&params->lock) == 0);
1092 while (ShutdownVolumeWalk_r(diskP, 0, &params->part_pass_head[id]))
1094 params->stats[0][diskP->index] = count;
1095 assert(pthread_mutex_lock(&params->lock) == 0);
1098 params->n_threads_complete++;
1099 if (params->n_threads_complete == params->n_threads) {
1100 /* notify control thread that all workers have completed pass 0 */
1101 assert(pthread_cond_signal(&params->master_cv) == 0);
/* park until the coordinator advances us past pass 0 */
1103 while (params->pass == 0) {
1104 assert(pthread_cond_wait(&params->cv, &params->lock) == 0);
1108 assert(pthread_mutex_unlock(&params->lock) == 0);
1111 pass = params->pass;
1114 /* now escalate through the more complicated shutdowns */
1116 schedule_version_save = params->schedule_version;
1118 /* find a disk partition to work on */
1119 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
/* claim one of this partition's allocated thread slots */
1121 if (params->part_thread_target[id] && !params->part_done_pass[id]) {
1122 params->part_thread_target[id]--;
1129 /* hmm. for some reason the controller thread couldn't find anything for
1130 * us to do. let's see if there's anything we can do */
1131 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
1133 if (diskP->vol_list.len && !params->part_done_pass[id]) {
1136 } else if (!params->part_done_pass[id]) {
1137 params->part_done_pass[id] = 1;
1138 params->n_parts_done_pass++;
1140 Log("VShutdown: done shutting down volumes on partition %s.\n",
1141 VPartitionPath(diskP));
1147 /* do work on this partition until either the controller
1148 * creates a new schedule, or we run out of things to do
1149 * on this partition */
1152 while (!params->part_done_pass[id] &&
1153 (schedule_version_save == params->schedule_version)) {
1154 /* ShutdownVolumeWalk_r will drop the glock internally */
1155 if (!ShutdownVolumeWalk_r(diskP, pass, &params->part_pass_head[id])) {
1156 if (!params->part_done_pass[id]) {
1157 params->part_done_pass[id] = 1;
1158 params->n_parts_done_pass++;
1160 Log("VShutdown: done shutting down volumes on partition %s.\n",
1161 VPartitionPath(diskP));
1169 params->stats[pass][id] += count;
1171 /* ok, everyone is done this pass, proceed */
1174 params->n_threads_complete++;
1175 while (params->pass == pass) {
1176 if (params->n_threads_complete == params->n_threads) {
1177 /* we are the last thread to complete, so we will
1178 * reinitialize worker pool state for the next pass */
1179 params->n_threads_complete = 0;
1180 params->n_parts_done_pass = 0;
1182 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
1184 params->part_done_pass[id] = 0;
1185 params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue);
1188 /* compute a new thread schedule before releasing all the workers */
1189 ShutdownCreateSchedule(params);
1191 /* wake up all the workers */
1192 assert(pthread_cond_broadcast(&params->cv) == 0);
1195 Log("VShutdown: pass %d completed using %d threads on %d partitions\n",
1196 pass, params->n_threads, params->n_parts);
1199 VOL_CV_WAIT(&params->cv);
1202 pass = params->pass;
1216 /* shut down all volumes on a given disk partition
1218 * note that this function will not allow mp-fast
1219 * shutdown of a partition */
/* Single-threaded per-partition shutdown: takes the VByPList exclusively,
 * then runs passes 0-3 via ShutdownVByPForPass_r, tallying per-pass counts. */
1221 VShutdownByPartition_r(struct DiskPartition64 * dp)
1227 /* wait for other exclusive ops to finish */
1228 VVByPListWait_r(dp);
1230 /* begin exclusive access */
1231 VVByPListBeginExclusive_r(dp);
1233 /* pick the low-hanging fruit first,
1234 * then do the complicated ones last
1235 * (has the advantage of keeping
1236 * in-use volumes up until the bitter end) */
1237 for (pass = 0, total=0; pass < 4; pass++) {
1238 pass_stats[pass] = ShutdownVByPForPass_r(dp, pass);
1239 total += pass_stats[pass];
1242 /* end exclusive access */
1243 VVByPListEndExclusive_r(dp);
1245 Log("VShutdownByPartition: shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
1246 total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]);
1251 /* internal shutdown functionality
1253 * for multi-pass shutdown:
1254 * 0 to only "shutdown" {pre,un}attached and error state volumes
1255 * 1 to also shutdown attached volumes w/ volume header loaded
1256 * 2 to also shutdown attached volumes w/o volume header loaded
1257 * 3 to also shutdown exclusive state volumes
1259 * caller MUST hold exclusive access on the hash chain
1260 * because we drop vol_glock_mutex internally
1262 * this function is reentrant for passes 1--3
1263 * (e.g. multiple threads can cooperate to
1264 * shutdown a partition mp-fast)
1266 * pass 0 is not scaleable because the volume state data is
1267 * synchronized by vol_glock mutex, and the locking overhead
1268 * is too high to drop the lock long enough to do linked list
/* Run one shutdown pass over a partition's volume list by repeatedly calling
 * ShutdownVolumeWalk_r with a cursor starting at the list head.
 * NOTE(review): excerpt is elided — the pass counter/return are not
 * visible; presumably returns the number of volumes shut down (confirm). */
1272 ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass)
1274 struct rx_queue * q = queue_First(&dp->vol_list, rx_queue);
1277 while (ShutdownVolumeWalk_r(dp, pass, &q))
1283 /* conditionally shutdown one volume on partition dp
1284  * returns 1 if a volume was shutdown in this pass,
/* Walks dp->vol_list starting from *idx, skipping volumes not eligible for
 * the given pass number (see ShutdownVByPForPass_r for pass semantics), and
 * shuts down the first eligible volume found. *idx is the resumable cursor
 * into the list, advanced across calls by the caller's loop. */
1287 ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
1288 struct rx_queue ** idx)
1290 struct rx_queue *qp, *nqp;
1295 for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) {
/* recover the Volume from its embedded vol_list queue node */
1296 vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list));
/* pass-eligibility filters: each test presumably skips (continue) volumes
 * reserved for a later pass -- the skip bodies are elided from this view */
1300 if ((V_attachState(vp) != VOL_STATE_UNATTACHED) &&
1301 (V_attachState(vp) != VOL_STATE_ERROR) &&
1302 (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
1306 if ((V_attachState(vp) == VOL_STATE_ATTACHED) &&
1307 (vp->header == NULL)) {
1311 if (VIsExclusiveState(V_attachState(vp))) {
/* remove from the by-partition list before shutdown so other walkers
 * do not revisit this volume */
1316 DeleteVolumeFromVByPList_r(vp);
1317 VShutdownVolume_r(vp);
1327  * shutdown a specific volume
1329 /* caller MUST NOT hold a heavyweight ref on vp */
/* Takes a lightweight reservation, waits for any exclusive operation on the
 * volume to complete, then drives the volume to a shut-down state according
 * to its current attach state. */
1331 VShutdownVolume_r(Volume * vp)
1335 VCreateReservation_r(vp);
1337 if (LogLevel >= 5) {
1338 Log("VShutdownVolume_r: vid=%u, device=%d, state=%hu\n",
1339 vp->hashid, vp->partition->device, V_attachState(vp));
1342 /* wait for other blocking ops to finish */
1343 VWaitExclusiveState_r(vp);
1345 assert(VIsValidState(V_attachState(vp)));
1347 switch(V_attachState(vp)) {
1348 case VOL_STATE_SALVAGING:
1349 /* make sure salvager knows we don't want
1350 * the volume back */
1351 VCancelSalvage_r(vp, SALVSYNC_SHUTDOWN);
/* fallthrough: after cancelling the salvage, treat like preattached/error
 * and mark the volume unattached */
1352 case VOL_STATE_PREATTACHED:
1353 case VOL_STATE_ERROR:
1354 VChangeState_r(vp, VOL_STATE_UNATTACHED);
1355 case VOL_STATE_UNATTACHED:
1357 case VOL_STATE_GOING_OFFLINE:
1358 case VOL_STATE_SHUTTING_DOWN:
1359 case VOL_STATE_ATTACHED:
1363 Log("VShutdown: Attempting to take volume %u offline.\n",
1366 /* take the volume offline (drops reference count) */
1367 VOffline_r(vp, "File server was shut down");
1372 VCancelReservation_r(vp);
1376 #endif /* AFS_DEMAND_ATTACH_FS */
1379 /***************************************************/
1380 /* Header I/O routines */
1381 /***************************************************/
1383 /* open a descriptor for the inode (h),
1384  * read in an on-disk structure into buffer (to) of size (size),
1385  * verify versionstamp in structure has magic (magic) and
1386  * optionally verify version (version) if (version) is nonzero
/* On any I/O or validation failure the descriptor is really-closed (not
 * cached) and *ec is presumably set by the elided error paths. */
1389 ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
1392 struct versionStamp *vsn;
1407 if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
1409 FDH_REALLYCLOSE(fdP);
/* the versionStamp is assumed to be the first member of the structure
 * being read, so 'to' can be reinterpreted directly */
1412 vsn = (struct versionStamp *)to;
1413 if (FDH_READ(fdP, to, size) != size || vsn->magic != magic) {
1415 FDH_REALLYCLOSE(fdP);
1420 /* Check is conditional, in case caller wants to inspect version himself */
1421 if (version && vsn->version != version) {
/* Write the in-core volume disk data (V_disk(vp)) back to the volume info
 * file via the volume's disk-data inode handle. On seek or short-write
 * failure the descriptor is really-closed; *ec is presumably set by the
 * elided error paths. */
1427 WriteVolumeHeader_r(Error * ec, Volume * vp)
1429 IHandle_t *h = V_diskDataHandle(vp);
1439 if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
1441 FDH_REALLYCLOSE(fdP);
1444 if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)))
1445 != sizeof(V_disk(vp))) {
1447 FDH_REALLYCLOSE(fdP);
1453 /* VolumeHeaderToDisk
1454  * Allows for storing 64 bit inode numbers in on-disk volume header
1457 /* convert in-memory representation of a volume header to the
1458  * on-disk representation of a volume header */
/* dh is fully zeroed first so fields not copied (and pad bytes) are
 * deterministic on disk. With 64-bit inode support, each Inode is split
 * into lo/hi 32-bit halves; otherwise only the lo words are used. */
1460 VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
1463 memset((char *)dh, 0, sizeof(VolumeDiskHeader_t));
1464 dh->stamp = h->stamp;
1466 dh->parent = h->parent;
1468 #ifdef AFS_64BIT_IOPS_ENV
1469 dh->volumeInfo_lo = (afs_int32) h->volumeInfo & 0xffffffff;
1470 dh->volumeInfo_hi = (afs_int32) (h->volumeInfo >> 32) & 0xffffffff;
1471 dh->smallVnodeIndex_lo = (afs_int32) h->smallVnodeIndex & 0xffffffff;
1472 dh->smallVnodeIndex_hi =
1473 (afs_int32) (h->smallVnodeIndex >> 32) & 0xffffffff;
1474 dh->largeVnodeIndex_lo = (afs_int32) h->largeVnodeIndex & 0xffffffff;
1475 dh->largeVnodeIndex_hi =
1476 (afs_int32) (h->largeVnodeIndex >> 32) & 0xffffffff;
1477 dh->linkTable_lo = (afs_int32) h->linkTable & 0xffffffff;
1478 dh->linkTable_hi = (afs_int32) (h->linkTable >> 32) & 0xffffffff;
1480 dh->volumeInfo_lo = h->volumeInfo;
1481 dh->smallVnodeIndex_lo = h->smallVnodeIndex;
1482 dh->largeVnodeIndex_lo = h->largeVnodeIndex;
1483 dh->linkTable_lo = h->linkTable;
1487 /* DiskToVolumeHeader
1488  * Converts an on-disk representation of a volume header to
1489  * the in-memory representation of a volume header.
1491  * Makes the assumption that AFS has *always*
1492  * zero'd the volume header file so that high parts of inode
1493  * numbers are 0 in older (SGI EFS) volume header files.
/* Inverse of VolumeHeaderToDisk: with 64-bit inode support, reassemble each
 * Inode from its lo/hi 32-bit halves; otherwise take the lo words directly. */
1496 DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
1498 memset((char *)h, 0, sizeof(VolumeHeader_t));
1499 h->stamp = dh->stamp;
1501 h->parent = dh->parent;
1503 #ifdef AFS_64BIT_IOPS_ENV
1505 (Inode) dh->volumeInfo_lo | ((Inode) dh->volumeInfo_hi << 32);
1507 h->smallVnodeIndex =
1508 (Inode) dh->smallVnodeIndex_lo | ((Inode) dh->
1509 smallVnodeIndex_hi << 32);
1511 h->largeVnodeIndex =
1512 (Inode) dh->largeVnodeIndex_lo | ((Inode) dh->
1513 largeVnodeIndex_hi << 32);
1515 (Inode) dh->linkTable_lo | ((Inode) dh->linkTable_hi << 32);
1517 h->volumeInfo = dh->volumeInfo_lo;
1518 h->smallVnodeIndex = dh->smallVnodeIndex_lo;
1519 h->largeVnodeIndex = dh->largeVnodeIndex_lo;
1520 h->linkTable = dh->linkTable_lo;
1525 /***************************************************/
1526 /* Volume Attachment routines */
1527 /***************************************************/
1529 #ifdef AFS_DEMAND_ATTACH_FS
1531  * pre-attach a volume given its path.
1533  * @param[out] ec outbound error code
1534  * @param[in]  partition partition path string
1535  * @param[in]  name volume id string
1537  * @return volume object pointer
1539  * @note A pre-attached volume will only have its partition
1540  *       and hashid fields initialized.  At first call to
1541  *       VGetVolume, the volume will be fully attached.
/* Locking wrapper around VPreAttachVolumeByName_r; presumably acquires and
 * releases VOL_LOCK around the call (lock lines elided from this view). */
1545 VPreAttachVolumeByName(Error * ec, char *partition, char *name)
1549 vp = VPreAttachVolumeByName_r(ec, partition, name);
1555  * pre-attach a volume given its path.
1557  * @param[out] ec outbound error code
1558  * @param[in]  partition path to vice partition
1559  * @param[in]  name volume id string
1561  * @return volume object pointer
1563  * @pre VOL_LOCK held
1565  * @internal volume package internal use only.
/* Thin adapter: parses the numeric volume id out of the name string and
 * delegates to VPreAttachVolumeById_r. */
1568 VPreAttachVolumeByName_r(Error * ec, char *partition, char *name)
1570 return VPreAttachVolumeById_r(ec,
1572 VolumeNumber(name));
1576  * pre-attach a volume given its path and numeric volume id.
1578  * @param[out] ec error code return
1579  * @param[in]  partition path to vice partition
1580  * @param[in]  volumeId numeric volume id
1582  * @return volume object pointer
1584  * @pre VOL_LOCK held
1586  * @internal volume package internal use only.
/* Resolves the partition object, looks up any existing Volume for this id
 * in the hash table (vp may be NULL), and hands both to
 * VPreAttachVolumeByVp_r to do the actual pre-attach. */
1589 VPreAttachVolumeById_r(Error * ec,
1594 struct DiskPartition64 *partp;
/* pre-attach is a fileserver-only code path */
1598 assert(programType == fileServer);
1600 if (!(partp = VGetPartition_r(partition, 0))) {
1602 Log("VPreAttachVolumeById_r: Error getting partition (%s)\n", partition);
1606 vp = VLookupVolume_r(ec, volumeId, NULL);
1611 return VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
1615  * preattach a volume.
1617  * @param[out] ec      outbound error code
1618  * @param[in]  partp   pointer to partition object
1619  * @param[in]  vp      pointer to volume object
1620  * @param[in]  vid     volume id
1622  * @return volume object pointer
1624  * @pre VOL_LOCK is held.
1626  * @warning Returned volume object pointer does not have to
1627  *          equal the pointer passed in as argument vp.  There
1628  *          are potential race conditions which can result in
1629  *          the pointers having different values.  It is up to
1630  *          the caller to make sure that references are handled
1631  *          properly in this case.
1633  * @note If there is already a volume object registered with
1634  *       the same volume id, its pointer MUST be passed as
1635  *       argument vp.  Failure to do so will result in a silent
1636  *       failure to preattach.
1638  * @internal volume package internal use only.
1641 VPreAttachVolumeByVp_r(Error * ec,
1642 struct DiskPartition64 * partp,
1650 /* check to see if pre-attach already happened */
1652 (V_attachState(vp) != VOL_STATE_UNATTACHED) &&
1653 (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
1654 !VIsErrorState(V_attachState(vp))) {
1656 * pre-attach is a no-op in all but the following cases:
1658 *   - volume is unattached
1659 *   - volume is in an error state
1660 *   - volume is pre-attached
/* NOTE(review): message spells the function "VPreattachVolumeByVp_r"
 * (lowercase 'a') while the function is VPreAttachVolumeByVp_r; left
 * unchanged here since log strings are runtime-visible text */
1662 Log("VPreattachVolumeByVp_r: volume %u not in quiescent state\n", vid);
1665 /* we're re-attaching a volume; clear out some old state */
1666 memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage));
1668 if (V_partition(vp) != partp) {
1669 /* XXX potential race */
1670 DeleteVolumeFromVByPList_r(vp);
1673 /* if we need to allocate a new Volume struct,
1674 * go ahead and drop the vol glock, otherwise
1675 * do the basic setup synchronised, as it's
1676 * probably not worth dropping the lock */
1679 /* allocate the volume structure */
1680 vp = nvp = (Volume *) malloc(sizeof(Volume));
1682 memset(vp, 0, sizeof(Volume));
1683 queue_Init(&vp->vnode_list);
1684 assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
1687 /* link the volume with its associated vice partition */
1688 vp->device = partp->device;
1689 vp->partition = partp;
1692 /* if we dropped the lock, reacquire the lock,
1693 * check for pre-attach races, and then add
1694 * the volume to the hash table */
1697 nvp = VLookupVolume_r(ec, vid, NULL);
1702 } else if (nvp) {	/* race detected */
1707 /* hack to make up for VChangeState_r() decrementing
1708 * the old state counter */
1709 VStats.state_levels[0]++;
1713 /* put pre-attached volume onto the hash table
1714 * and bring it up to the pre-attached state */
1715 AddVolumeToHashTable(vp, vp->hashid);
1716 AddVolumeToVByPList_r(vp);
1717 VLRU_Init_Node_r(vp);
1718 VChangeState_r(vp, VOL_STATE_PREATTACHED);
1721 Log("VPreAttachVolumeByVp_r: volume %u pre-attached\n", vp->hashid);
1729 #endif /* AFS_DEMAND_ATTACH_FS */
1731 /* Attach an existing volume, given its pathname, and return a
1732    pointer to the volume header information.  The volume also
1733    normally goes online at this time.  An offline volume
1734    must be reattached to make it go online */
/* Locking wrapper around VAttachVolumeByName_r; presumably acquires and
 * releases VOL_LOCK around the call (lock lines elided from this view). */
1736 VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
1740 retVal = VAttachVolumeByName_r(ec, partition, name, mode);
/* Attach a volume by partition path + name, with VOL_LOCK held on entry.
 * For the fileserver under demand-attach (DAFS) this handles the full
 * pre-attach/race/ATTACHING protocol; for utilities it locks the partition
 * and negotiates volume checkout with the fileserver over FSYNC. The disk
 * header is read and validated here, then attach2() does the rest. */
1746 VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
1748 register Volume *vp = NULL, *svp = NULL;
1750 struct afs_stat status;
1751 struct VolumeDiskHeader diskHeader;
1752 struct VolumeHeader iheader;
1753 struct DiskPartition64 *partp;
1757 #ifdef AFS_DEMAND_ATTACH_FS
1758 VolumeStats stats_save;
1759 #endif /* AFS_DEMAND_ATTACH_FS */
1763 volumeId = VolumeNumber(name);
1765 if (!(partp = VGetPartition_r(partition, 0))) {
1767 Log("VAttachVolume: Error getting partition (%s)\n", partition);
1771 if (programType == volumeUtility) {
1773 VLockPartition_r(partition);
1774 } else if (programType == fileServer) {
1775 #ifdef AFS_DEMAND_ATTACH_FS
1776 /* lookup the volume in the hash table */
1777 vp = VLookupVolume_r(ec, volumeId, NULL);
1783 /* save any counters that are supposed to
1784 * be monotonically increasing over the
1785 * lifetime of the fileserver */
1786 memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
1788 memset(&stats_save, 0, sizeof(VolumeStats));
1791 /* if there's something in the hash table, and it's not
1792 * in the pre-attach state, then we may need to detach
1793 * it before proceeding */
1794 if (vp && (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
1795 VCreateReservation_r(vp);
1796 VWaitExclusiveState_r(vp);
1798 /* at this point state must be one of:
1807 if (vp->specialStatus == VBUSY)
1810 /* if it's already attached, see if we can return it */
1811 if (V_attachState(vp) == VOL_STATE_ATTACHED) {
1812 VGetVolumeByVp_r(ec, vp);
1813 if (V_inUse(vp) == fileServer) {
1814 VCancelReservation_r(vp);
1818 /* otherwise, we need to detach, and attempt to re-attach */
1819 VDetachVolume_r(ec, vp);
1821 Log("VAttachVolume: Error detaching old volume instance (%s)\n", name);
1824 /* if it isn't fully attached, delete from the hash tables,
1825 and let the refcounter handle the rest */
1826 DeleteVolumeFromHashTable(vp);
1827 DeleteVolumeFromVByPList_r(vp);
1830 VCancelReservation_r(vp);
1834 /* pre-attach volume if it hasn't been done yet */
1836 (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
1837 (V_attachState(vp) == VOL_STATE_ERROR)) {
/* svp (elided assignment nearby) presumably saves the pre-existing vp so a
 * pre-attach race can be detected below -- TODO confirm */
1839 vp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
1847 /* handle pre-attach races
1849 * multiple threads can race to pre-attach a volume,
1850 * but we can't let them race beyond that
1852 * our solution is to let the first thread to bring
1853 * the volume into an exclusive state win; the other
1854 * threads just wait until it finishes bringing the
1855 * volume online, and then they do a vgetvolumebyvp
1857 if (svp && (svp != vp)) {
1858 /* wait for other exclusive ops to finish */
1859 VCreateReservation_r(vp);
1860 VWaitExclusiveState_r(vp);
1862 /* get a heavyweight ref, kill the lightweight ref, and return */
1863 VGetVolumeByVp_r(ec, vp);
1864 VCancelReservation_r(vp);
1868 /* at this point, we are chosen as the thread to do
1869 * demand attachment for this volume. all other threads
1870 * doing a getvolume on vp->hashid will block until we finish */
1872 /* make sure any old header cache entries are invalidated
1873 * before proceeding */
1874 FreeVolumeHeader(vp);
1876 VChangeState_r(vp, VOL_STATE_ATTACHING);
1878 /* restore any saved counters */
1879 memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
1880 #else /* AFS_DEMAND_ATTACH_FS */
1881 vp = VGetVolume_r(ec, volumeId);
1883 if (V_inUse(vp) == fileServer)
1885 if (vp->specialStatus == VBUSY)
1887 VDetachVolume_r(ec, vp);
1889 Log("VAttachVolume: Error detaching volume (%s)\n", name);
1893 #endif /* AFS_DEMAND_ATTACH_FS */
1897 strcpy(path, VPartitionPath(partp));
/* read and validate the small on-disk volume header file */
1903 if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
1904 Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
1911 n = read(fd, &diskHeader, sizeof(diskHeader));
1913 if (n != sizeof(diskHeader)
1914 || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
1915 Log("VAttachVolume: Error reading volume header %s\n", path);
1920 if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
1921 Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
1927 DiskToVolumeHeader(&iheader, &diskHeader);
1928 #ifdef FSSYNC_BUILD_CLIENT
/* utilities must check the volume out from the fileserver first, except in
 * V_SECRETLY/V_PEEK modes which bypass the FSYNC handshake */
1929 if (programType == volumeUtility && mode != V_SECRETLY && mode != V_PEEK) {
1931 if (FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_NEEDVOLUME, mode, NULL)
1933 Log("VAttachVolume: attach of volume %u apparently denied by file server\n", iheader.id);
1934 *ec = VNOVOL;	/* XXXX */
1942 vp = (Volume *) calloc(1, sizeof(Volume));
1944 vp->device = partp->device;
1945 vp->partition = partp;
1946 queue_Init(&vp->vnode_list);
1947 #ifdef AFS_DEMAND_ATTACH_FS
1948 assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
1949 #endif /* AFS_DEMAND_ATTACH_FS */
1952 /* attach2 is entered without any locks, and returns
1953 * with vol_glock_mutex held */
1954 vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
1956 if (programType == volumeUtility && vp) {
1957 if ((mode == V_VOLUPD) || (VolumeWriteable(vp) && (mode == V_CLONE))) {
1958 /* mark volume header as in use so that volser crashes lead to a
1959 * salvage attempt */
1960 VUpdateVolume_r(ec, vp, 0);
1962 #ifdef AFS_DEMAND_ATTACH_FS
1963 /* for dafs, we should tell the fileserver, except for V_PEEK
1964 * where we know it is not necessary */
1965 if (mode == V_PEEK) {
1966 vp->needsPutBack = 0;
1968 vp->needsPutBack = 1;
1970 #else /* !AFS_DEMAND_ATTACH_FS */
1971 /* duplicate computation in fssync.c about whether the server
1972 * takes the volume offline or not.  If the volume isn't
1973 * offline, we must not return it when we detach the volume,
1974 * or the server will abort */
1975 if (mode == V_READONLY || mode == V_PEEK
1976 || (!VolumeWriteable(vp) && (mode == V_CLONE || mode == V_DUMP)))
1977 vp->needsPutBack = 0;
1979 vp->needsPutBack = 1;
1980 #endif /* !AFS_DEMAND_ATTACH_FS */
1982 /* OK, there's a problem here, but one that I don't know how to
1983 * fix right now, and that I don't think should arise often.
1984 * Basically, we should only put back this volume to the server if
1985 * it was given to us by the server, but since we don't have a vp,
1986 * we can't run the VolumeWriteable function to find out as we do
1987 * above when computing vp->needsPutBack.  So we send it back, but
1988 * there's a path in VAttachVolume on the server which may abort
1989 * if this volume doesn't have a header.  Should be pretty rare
1990 * for all of that to happen, but if it does, probably the right
1991 * fix is for the server to allow the return of readonly volumes
1992 * that it doesn't think are really checked out. */
1993 #ifdef FSSYNC_BUILD_CLIENT
1994 if (programType == volumeUtility && vp == NULL &&
1995 mode != V_SECRETLY && mode != V_PEEK) {
1996 FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_ON, 0, NULL);
1999 if (programType == fileServer && vp) {
2000 #ifdef AFS_DEMAND_ATTACH_FS
2002 * we can get here in cases where we don't "own"
2003 * the volume (e.g. volume owned by a utility).
2004 * short circuit around potential disk header races.
2006 if (V_attachState(vp) != VOL_STATE_ATTACHED) {
2010 V_needsCallback(vp) = 0;
2012 if (VInit >= 2 && V_BreakVolumeCallbacks) {
2013 Log("VAttachVolume: Volume %u was changed externally; breaking callbacks\n", V_id(vp));
2014 (*V_BreakVolumeCallbacks) (V_id(vp));
2017 VUpdateVolume_r(ec, vp, 0);
2019 Log("VAttachVolume: Error updating volume\n");
2024 if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
2025 #ifndef AFS_DEMAND_ATTACH_FS
2026 /* This is a hack: by temporarily setting the incore
2027 * dontSalvage flag ON, the volume will be put back on the
2028 * Update list (with dontSalvage OFF again).  It will then
2029 * come back in N minutes with DONT_SALVAGE eventually
2030 * set.  This is the way that volumes that have never had
2031 * it set get it set; or that volumes that have been
2032 * offline without DONT SALVAGE having been set also
2033 * eventually get it set */
2034 V_dontSalvage(vp) = DONT_SALVAGE;
2035 #endif /* !AFS_DEMAND_ATTACH_FS */
2036 VAddToVolumeUpdateList_r(ec, vp);
2038 Log("VAttachVolume: Error adding volume to update list\n");
2045 Log("VOnline:  volume %u (%s) attached and online\n", V_id(vp),
2050 if (programType == volumeUtility) {
2051 VUnlockPartition_r(partition);
2054 #ifdef AFS_DEMAND_ATTACH_FS
2055 /* attach failed; make sure we're in error state */
2056 if (vp && !VIsErrorState(V_attachState(vp))) {
2057 VChangeState_r(vp, VOL_STATE_ERROR);
2059 #endif /* AFS_DEMAND_ATTACH_FS */
2066 #ifdef AFS_DEMAND_ATTACH_FS
2067 /* VAttachVolumeByVp_r
2069  * finish attaching a volume that is
2070  * in a less than fully attached state
2072 /* caller MUST hold a ref count on vp */
/* DAFS-only (fileserver-only) path: takes an existing, partially attached
 * Volume object and drives it to fully attached, re-reading and validating
 * the on-disk header and delegating to attach2(). Mirrors the fileServer
 * branch of VAttachVolumeByName_r. */
2074 VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
2076 char name[VMAXPATHLEN];
2077 int fd, n, reserve = 0;
2078 struct afs_stat status;
2079 struct VolumeDiskHeader diskHeader;
2080 struct VolumeHeader iheader;
2081 struct DiskPartition64 *partp;
2086 VolumeStats stats_save;
2089 /* volume utility should never call AttachByVp */
2090 assert(programType == fileServer);
2092 volumeId = vp->hashid;
2093 partp = vp->partition;
2094 VolumeExternalName_r(volumeId, name, sizeof(name));
2097 /* if another thread is performing a blocking op, wait */
2098 VWaitExclusiveState_r(vp);
/* preserve monotonically increasing per-volume stats across the re-attach */
2100 memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
2102 /* if it's already attached, see if we can return it */
2103 if (V_attachState(vp) == VOL_STATE_ATTACHED) {
2104 VGetVolumeByVp_r(ec, vp);
2105 if (V_inUse(vp) == fileServer) {
2108 if (vp->specialStatus == VBUSY)
2110 VDetachVolume_r(ec, vp);
2112 Log("VAttachVolume: Error detaching volume (%s)\n", name);
2118 /* pre-attach volume if it hasn't been done yet */
2120 (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
2121 (V_attachState(vp) == VOL_STATE_ERROR)) {
2122 nvp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
2128 VCreateReservation_r(nvp);
2134 VChangeState_r(vp, VOL_STATE_ATTACHING);
2136 /* restore monotonically increasing stats */
2137 memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
2142 /* compute path to disk header,
2144 * and verify magic and version stamps */
2145 strcpy(path, VPartitionPath(partp));
2151 if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
2152 Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
2159 n = read(fd, &diskHeader, sizeof(diskHeader));
2161 if (n != sizeof(diskHeader)
2162 || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
2163 Log("VAttachVolume: Error reading volume header %s\n", path);
2168 if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
2169 Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
2175 /* convert on-disk header format to in-memory header format */
2176 DiskToVolumeHeader(&iheader, &diskHeader);
2180 * NOTE: attach2 is entered without any locks, and returns
2181 * with vol_glock_mutex held */
2182 vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
2185 * the event that an error was encountered, or
2186 * the volume was not brought to an attached state
2187 * for any reason, skip to the end.  We cannot
2188 * safely call VUpdateVolume unless we "own" it.
2192 (V_attachState(vp) != VOL_STATE_ATTACHED)) {
2196 V_needsCallback(vp) = 0;
2197 VUpdateVolume_r(ec, vp, 0);
2199 Log("VAttachVolume: Error updating volume %u\n", vp->hashid);
2203 if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
2204 #ifndef AFS_DEMAND_ATTACH_FS
2205 /* This is a hack: by temporarily setting the incore
2206 * dontSalvage flag ON, the volume will be put back on the
2207 * Update list (with dontSalvage OFF again).  It will then
2208 * come back in N minutes with DONT_SALVAGE eventually
2209 * set.  This is the way that volumes that have never had
2210 * it set get it set; or that volumes that have been
2211 * offline without DONT SALVAGE having been set also
2212 * eventually get it set */
2213 V_dontSalvage(vp) = DONT_SALVAGE;
2214 #endif /* !AFS_DEMAND_ATTACH_FS */
2215 VAddToVolumeUpdateList_r(ec, vp);
2217 Log("VAttachVolume: Error adding volume %u to update list\n", vp->hashid);
2224 Log("VOnline:  volume %u (%s) attached and online\n", V_id(vp),
2228 VCancelReservation_r(nvp);
2231 if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) {
2232 if (vp && !VIsErrorState(V_attachState(vp))) {
2233 VChangeState_r(vp, VOL_STATE_ERROR);
2240 #endif /* AFS_DEMAND_ATTACH_FS */
2243  * called without any locks held
2244  * returns with vol_glock_mutex held
/* Second stage of volume attachment, shared by VAttachVolumeByName_r and
 * VAttachVolumeByVp_r: initializes inode handles from the decoded volume
 * header, loads and validates the volume info, vnode index, and link table
 * headers, handles salvage/destroy flags, builds vnode bitmaps, and finally
 * marks the volume in use and hashes it. Error paths (elided here) set *ec
 * and presumably return NULL. */
2247 attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * header,
2248 struct DiskPartition64 * partp, register Volume * vp, int isbusy, int mode)
2250 vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
/* bind each on-disk component of the volume to an inode handle */
2251 IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent,
2252 header->largeVnodeIndex);
2253 IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header->parent,
2254 header->smallVnodeIndex);
2255 IH_INIT(vp->diskDataHandle, partp->device, header->parent,
2256 header->volumeInfo);
2257 IH_INIT(vp->linkHandle, partp->device, header->parent, header->linkTable);
2258 vp->shuttingDown = 0;
2259 vp->goingOffline = 0;
2261 #ifdef AFS_DEMAND_ATTACH_FS
2262 vp->stats.last_attach = FT_ApproxTime();
2263 vp->stats.attaches++;
2267 IncUInt64(&VStats.attaches);
2268 vp->cacheCheck = ++VolumeCacheCheck;
2269 /* just in case this ever rolls over */
2270 if (!vp->cacheCheck)
2271 vp->cacheCheck = ++VolumeCacheCheck;
2272 GetVolumeHeader(vp);
2275 #if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
2276 /* demand attach changes the V_PEEK mechanism
2278 * we can now suck the current disk data structure over
2279 * the fssync interface without going to disk
2281 * (technically, we don't need to restrict this feature
2282 * to demand attach fileservers.  However, I'm trying
2283 * to limit the number of common code changes)
2285 if (programType != fileServer && mode == V_PEEK) {
2287 res.payload.len = sizeof(VolumeDiskData);
2288 res.payload.buf = &vp->header->diskstuff;
2290 if (FSYNC_VolOp(volumeId,
2291 VPartitionPath(partp),
2292 FSYNC_VOL_QUERY_HDR,
2295 goto disk_header_loaded;
2298 #endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */
2299 (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
2300 sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
2302 #ifdef AFS_DEMAND_ATTACH_FS
2305 IncUInt64(&VStats.hdr_loads);
2306 IncUInt64(&vp->stats.hdr_loads);
2308 #endif /* AFS_DEMAND_ATTACH_FS */
2311 Log("VAttachVolume: Error reading diskDataHandle vol header %s; error=%u\n", path, *ec);
2316 #ifdef AFS_DEMAND_ATTACH_FS
2319 /* check for pending volume operations */
2320 if (vp->pending_vol_op) {
2321 /* see if the pending volume op requires exclusive access */
2322 if (!VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
2323 /* mark the volume down */
2325 VChangeState_r(vp, VOL_STATE_UNATTACHED);
2326 if (V_offlineMessage(vp)[0] == '\0')
2327 strlcpy(V_offlineMessage(vp),
2328 "A volume utility is running.",
2329 sizeof(V_offlineMessage(vp)));
2330 V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
2332 /* check to see if we should set the specialStatus flag */
2333 if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) {
2334 vp->specialStatus = VBUSY;
2339 V_attachFlags(vp) |= VOL_HDR_LOADED;
2340 vp->stats.last_hdr_load = vp->stats.last_attach;
2342 #endif /* AFS_DEMAND_ATTACH_FS */
2345 struct IndexFileHeader iHead;
2347 #if OPENAFS_VOL_STATS
2349 * We just read in the diskstuff part of the header.  If the detailed
2350 * volume stats area has not yet been initialized, we should bzero the
2351 * area and mark it as initialized.
2353 if (!(V_stat_initialized(vp))) {
2354 memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
2355 V_stat_initialized(vp) = 1;
2357 #endif /* OPENAFS_VOL_STATS */
/* validate the small-vnode index file header */
2359 (void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle,
2360 (char *)&iHead, sizeof(iHead),
2361 SMALLINDEXMAGIC, SMALLINDEXVERSION);
2364 Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec);
2369 struct IndexFileHeader iHead;
/* validate the large-vnode (directory) index file header */
2371 (void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle,
2372 (char *)&iHead, sizeof(iHead),
2373 LARGEINDEXMAGIC, LARGEINDEXVERSION);
2376 Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec);
2380 #ifdef AFS_NAMEI_ENV
2382 struct versionStamp stamp;
/* namei backend keeps a link table; validate its version stamp too */
2384 (void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp,
2385 sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION);
2388 Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec);
2391 #endif /* AFS_NAMEI_ENV */
2393 #if defined(AFS_DEMAND_ATTACH_FS)
2394 if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) {
2396 if (programType == fileServer) {
2397 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
2400 Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
2406 /* volume operation in progress */
2410 #else /* AFS_DEMAND_ATTACH_FS */
2412 Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
2417 #endif /* AFS_DEMAND_ATTACH_FS */
2419 if (V_needsSalvaged(vp)) {
2420 if (vp->specialStatus)
2421 vp->specialStatus = 0;
2423 #if defined(AFS_DEMAND_ATTACH_FS)
2424 if (programType == fileServer) {
2425 VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
2428 Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
2432 #else /* AFS_DEMAND_ATTACH_FS */
2435 #endif /* AFS_DEMAND_ATTACH_FS */
2440 if (programType == fileServer) {
2441 #ifndef FAST_RESTART
/* a writable volume found marked in-use was not cleanly detached; flag it
 * for salvage rather than serving possibly-inconsistent data */
2442 if (V_inUse(vp) && VolumeWriteable(vp)) {
2443 if (!V_needsSalvaged(vp)) {
2444 V_needsSalvaged(vp) = 1;
2445 VUpdateVolume_r(ec, vp, 0);
2447 #if defined(AFS_DEMAND_ATTACH_FS)
2448 VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
2450 #else /* AFS_DEMAND_ATTACH_FS */
2451 Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
2454 #endif /* AFS_DEMAND_ATTACH_FS */
2457 #endif /* FAST_RESTART */
2459 if (V_destroyMe(vp) == DESTROY_ME) {
2460 #if defined(AFS_DEMAND_ATTACH_FS)
2461 /* schedule a salvage so the volume goes away on disk */
2462 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
2463 VChangeState_r(vp, VOL_STATE_ERROR);
2465 #endif /* AFS_DEMAND_ATTACH_FS */
2467 Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
2473 vp->nextVnodeUnique = V_uniquifier(vp);
2474 vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL;
2475 #ifndef BITMAP_LATER
/* eagerly build the vnode allocation bitmaps for writable volumes */
2476 if (programType == fileServer && VolumeWriteable(vp)) {
2478 for (i = 0; i < nVNODECLASSES; i++) {
2479 VGetBitmap_r(ec, vp, i);
2481 #ifdef AFS_DEMAND_ATTACH_FS
2482 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
2484 #else /* AFS_DEMAND_ATTACH_FS */
2486 #endif /* AFS_DEMAND_ATTACH_FS */
2487 Log("VAttachVolume: error getting bitmap for volume (%s)\n",
2493 #endif /* BITMAP_LATER */
2495 if (programType == fileServer) {
2496 if (vp->specialStatus)
2497 vp->specialStatus = 0;
/* only blessed, in-service, non-salvage-pending volumes go online */
2498 if (V_blessed(vp) && V_inService(vp) && !V_needsSalvaged(vp)) {
2499 V_inUse(vp) = fileServer;
2500 V_offlineMessage(vp)[0] = '\0';
2503 V_inUse(vp) = programType;
2504 V_checkoutMode(vp) = mode;
2507 AddVolumeToHashTable(vp, V_id(vp));
2508 #ifdef AFS_DEMAND_ATTACH_FS
2509 AddVolumeToVByPList_r(vp);
2511 if ((programType != fileServer) ||
2512 (V_inUse(vp) == fileServer)) {
2513 VChangeState_r(vp, VOL_STATE_ATTACHED);
2515 VChangeState_r(vp, VOL_STATE_UNATTACHED);
2521 /* Attach an existing volume.
2522    The volume also normally goes online at this time.
2523    An offline volume must be reattached to make it go online.
/* Locking wrapper around VAttachVolume_r; presumably acquires and releases
 * VOL_LOCK around the call (lock lines elided from this view). */
2527 VAttachVolume(Error * ec, VolumeId volumeId, int mode)
2531 retVal = VAttachVolume_r(ec, volumeId, mode);
/* Attach a volume by numeric id only: resolve its partition and name via
 * VGetVolumePath, detach any stale unused instance, then delegate to
 * VAttachVolumeByName_r. */
2537 VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
2540 VGetVolumePath(ec, volumeId, &part, &name);
2542 register Volume *vp;
2544 vp = VGetVolume_r(&error, volumeId);
/* an existing instance must be unreferenced before we re-attach it */
2546 assert(V_inUse(vp) == 0);
2547 VDetachVolume_r(ec, vp);
2551 return VAttachVolumeByName_r(ec, part, name, mode);
2554 /* Increment a reference count to a volume, sans context swaps. Requires
2555 * possibly reading the volume header in from the disk, since there's
2556 * an invariant in the volume package that nUsers>0 ==> vp->header is valid.
2558 * N.B. This call can fail if we can't read in the header!! In this case
2559 * we still guarantee we won't context swap, but the ref count won't be
2560 * incremented (otherwise we'd violate the invariant).
2562 /* NOTE: with the demand attach fileserver extensions, the global lock
2563 * is dropped within VHold */
2564 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS variant: take a lightweight reservation, wait out any exclusive
 * state, load the volume header (this is where VOL_LOCK may be dropped --
 * see the note above), and bump the ref count only on success; the
 * reservation is cancelled on both paths. */
2566 VHold_r(register Volume * vp)
2570 VCreateReservation_r(vp);
2571 VWaitExclusiveState_r(vp);
2573 LoadVolumeHeader(&error, vp);
2575 VCancelReservation_r(vp);
2579 VCancelReservation_r(vp);
2582 #else /* AFS_DEMAND_ATTACH_FS */
/* non-DAFS variant: just load the header (failure leaves the ref count
 * unchanged, per the invariant that nUsers>0 implies a valid header) */
2584 VHold_r(register Volume * vp)
2588 LoadVolumeHeader(&error, vp);
2594 #endif /* AFS_DEMAND_ATTACH_FS */
/* Locking wrapper around VHold_r; presumably acquires and releases VOL_LOCK
 * around the call (lock lines elided from this view). */
2597 VHold(register Volume * vp)
2601 retVal = VHold_r(vp);
2607 /***************************************************/
2608 /* get and put volume routines */
2609 /***************************************************/
2612 * put back a heavyweight reference to a volume object.
2614 * @param[in] vp volume object pointer
2616 * @pre VOL_LOCK held
2618 * @post heavyweight volume reference put back.
2619 * depending on state, volume may have been taken offline,
2620 * detached, salvaged, freed, etc.
2622 * @internal volume package internal use only
/* Put back a heavyweight reference to a volume object (VOL_LOCK held).
 * When the last user reference drops, the cached header is released and
 * detach/offline checks run.  NOTE(review): excerpt is elided; the
 * VCheckOffline/signal paths inside the #else arm are missing. */
2625 VPutVolume_r(register Volume * vp)
/* refcount must never go negative */
2627 assert(--vp->nUsers >= 0);
2628 if (vp->nUsers == 0) {
2630 ReleaseVolumeHeader(vp->header);
2631 #ifdef AFS_DEMAND_ATTACH_FS
/* VCheckDetach returns nonzero if it freed the volume; only if it did
 * NOT detach do we fall through to further checks (elided here) */
2632 if (!VCheckDetach(vp)) {
2636 #else /* AFS_DEMAND_ATTACH_FS */
2638 #endif /* AFS_DEMAND_ATTACH_FS */
2643 VPutVolume(register Volume * vp)
2651 /* Get a pointer to an attached volume.  The pointer is returned regardless
2652 of whether or not the volume is in service or on/off line.  An error
2653 code, however, is returned with an indication of the volume's status */
/* public wrapper: delegates to GetVolume with no hint and no flags;
 * lock acquire/release elided from this view */
2655 VGetVolume(Error * ec, Error * client_ec, VolId volumeId)
2659 retVal = GetVolume(ec, client_ec, volumeId, NULL, 0);
/* Locked variant of VGetVolume: no client error out-param, no hint. */
2665 VGetVolume_r(Error * ec, VolId volumeId)
2667 return GetVolume(ec, NULL, volumeId, NULL, 0);
2670 /* try to get a volume we've previously looked up */
2671 /* for demand attach fs, caller MUST NOT hold a ref count on vp */
/* re-acquires the volume using the previously-resolved struct as a hash hint */
2673 VGetVolumeByVp_r(Error * ec, Volume * vp)
2675 return GetVolume(ec, NULL, vp->hashid, vp, 0);
2678 /* private interface for getting a volume handle
2679 * volumeId must be provided.
2680 * hint is an optional parameter to speed up hash lookups
2681 * flags is not used at this time
2683 /* for demand attach fs, caller MUST NOT hold a ref count on hint */
/* Core volume lookup/acquire routine behind VGetVolume*().
 * Resolves volumeId (optionally via 'hint') in the hash table, loads the
 * volume header, and applies availability policy (offline, salvaging,
 * pending vol ops, shutdown) before bumping nUsers.
 * NOTE(review): this excerpt is heavily elided -- loop structure, gotos,
 * and many error assignments are missing; comments below describe only
 * what the visible lines establish. */
2685 GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags)
2688 /* pull this profiling/debugging code out of regular builds */
2690 #define VGET_CTR_INC(x) x++
2691 unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V5 = 0, V6 =
2692 0, V7 = 0, V8 = 0, V9 = 0;
2693 unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0;
2695 #define VGET_CTR_INC(x)
2697 #ifdef AFS_DEMAND_ATTACH_FS
2698 Volume *avp, * rvp = hint;
2701 #ifdef AFS_DEMAND_ATTACH_FS
/* hold a lightweight reservation on the hint so it cannot be freed
 * out from under us during the lookup */
2703 VCreateReservation_r(rvp);
2705 #endif /* AFS_DEMAND_ATTACH_FS */
2713 vp = VLookupVolume_r(ec, volumeId, vp);
2719 #ifdef AFS_DEMAND_ATTACH_FS
2720 if (rvp && (rvp != vp)) {
2721 /* break reservation on old vp */
2722 VCancelReservation_r(rvp);
2725 #endif /* AFS_DEMAND_ATTACH_FS */
2731 /* Until we have reached an initialization level of 2
2732 * we don't know whether this volume exists or not.
2733 * We can't sleep and retry later because before a volume
2734 * is attached, the caller tries to get it first. Just
2735 * return VOFFLINE and the caller can choose whether to
2736 * retry the command or not. */
2746 IncUInt64(&VStats.hdr_gets);
2748 #ifdef AFS_DEMAND_ATTACH_FS
2749 /* block if someone else is performing an exclusive op on this volume */
2752 VCreateReservation_r(rvp);
2754 VWaitExclusiveState_r(vp);
2756 /* short circuit with VNOVOL in the following circumstances:
2759 * VOL_STATE_SHUTTING_DOWN
2761 if ((V_attachState(vp) == VOL_STATE_ERROR) ||
2762 (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN)) {
2769 * short circuit with VOFFLINE in the following circumstances:
2771 * VOL_STATE_UNATTACHED
2773 if (V_attachState(vp) == VOL_STATE_UNATTACHED) {
2779 /* allowable states:
2787 if (vp->salvage.requested) {
/* volume is being requested frequently; raise its salvage priority */
2788 VUpdateSalvagePriority_r(vp);
2791 if (V_attachState(vp) == VOL_STATE_PREATTACHED) {
2792 avp = VAttachVolumeByVp_r(ec, vp, 0);
2795 /* VAttachVolumeByVp_r can return a pointer
2796 * != the vp passed to it under certain
2797 * conditions; make sure we don't leak
2798 * reservations if that happens */
2800 VCancelReservation_r(rvp);
2802 VCreateReservation_r(rvp);
2812 if (!vp->pending_vol_op) {
2827 if ((V_attachState(vp) == VOL_STATE_SALVAGING) ||
2828 (*ec == VSALVAGING)) {
2830 /* see CheckVnode() in afsfileprocs.c for an explanation
2831 * of this error code logic */
2832 afs_uint32 now = FT_ApproxTime();
/* within 10 minutes of the last salvage, report VRESTARTING so
 * clients retry instead of failing hard */
2833 if ((vp->stats.last_salvage + (10 * 60)) >= now) {
2836 *client_ec = VRESTARTING;
2845 LoadVolumeHeader(ec, vp);
2848 /* Only log the error if it was a totally unexpected error.  Simply
2849 * a missing inode is likely to be caused by the volume being deleted */
2850 if (errno != ENXIO || LogLevel)
2851 Log("Volume %u: couldn't reread volume header\n",
2853 #ifdef AFS_DEMAND_ATTACH_FS
2854 if (programType == fileServer) {
2855 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
2860 #else /* AFS_DEMAND_ATTACH_FS */
2863 #endif /* AFS_DEMAND_ATTACH_FS */
2867 #ifdef AFS_DEMAND_ATTACH_FS
2869 * this test MUST happen after the volume header is loaded
2871 if (vp->pending_vol_op && !VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
2873 /* see CheckVnode() in afsfileprocs.c for an explanation
2874 * of this error code logic */
2875 afs_uint32 now = FT_ApproxTime();
2876 if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
2879 *client_ec = VRESTARTING;
2883 ReleaseVolumeHeader(vp->header);
2887 #endif /* AFS_DEMAND_ATTACH_FS */
2890 if (vp->shuttingDown) {
2897 if (programType == fileServer) {
2899 if (vp->goingOffline) {
2901 #ifdef AFS_DEMAND_ATTACH_FS
2902 /* wait for the volume to go offline */
2903 if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
2904 VWaitStateChange_r(vp);
2906 #elif defined(AFS_PTHREAD_ENV)
2907 VOL_CV_WAIT(&vol_put_volume_cond);
2908 #else /* AFS_PTHREAD_ENV */
2909 LWP_WaitProcess(VPutVolume);
2910 #endif /* AFS_PTHREAD_ENV */
2913 if (vp->specialStatus) {
2915 *ec = vp->specialStatus;
2916 } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
2919 } else if (V_inUse(vp) == 0) {
2930 #ifdef AFS_DEMAND_ATTACH_FS
2931 /* if no error, bump nUsers */
2934 VLRU_UpdateAccess_r(vp);
2937 VCancelReservation_r(rvp);
2940 if (client_ec && !*client_ec) {
2943 #else /* AFS_DEMAND_ATTACH_FS */
2944 /* if no error, bump nUsers */
2951 #endif /* AFS_DEMAND_ATTACH_FS */
2958 /***************************************************/
2959 /* Volume offline/detach routines */
2960 /***************************************************/
2962 /* caller MUST hold a heavyweight ref on vp */
/* Force a volume offline and mark it as needing salvage (DAFS variant).
 * Caller MUST hold a heavyweight ref on vp.  Waits out any exclusive
 * state under a lightweight reservation before flagging the volume. */
2963 #ifdef AFS_DEMAND_ATTACH_FS
2965 VTakeOffline_r(register Volume * vp)
2969 assert(vp->nUsers > 0);
2970 assert(programType == fileServer);
2972 VCreateReservation_r(vp);
2973 VWaitExclusiveState_r(vp);
2975 vp->goingOffline = 1;
2976 V_needsSalvaged(vp) = 1;
/* schedule a salvage; actual scheduling happens once refcounts drain */
2978 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
2979 VCancelReservation_r(vp);
2981 #else /* AFS_DEMAND_ATTACH_FS */
/* non-DAFS variant: just set the offline + needs-salvage flags; the
 * actual offline transition happens when the refcount drains */
2983 VTakeOffline_r(register Volume * vp)
2985 assert(vp->nUsers > 0);
2986 assert(programType == fileServer);
2988 vp->goingOffline = 1;
2989 V_needsSalvaged(vp) = 1;
2991 #endif /* AFS_DEMAND_ATTACH_FS */
2994 VTakeOffline(register Volume * vp)
3002 * force a volume offline.
3004 * @param[in] vp volume object pointer
3005 * @param[in] flags flags (see note below)
3007 * @note the flag VOL_FORCEOFF_NOUPDATE is a recursion control flag
3008 * used when VUpdateVolume_r needs to call VForceOffline_r
3009 * (which in turn would normally call VUpdateVolume_r)
3011 * @see VUpdateVolume_r
3013 * @pre VOL_LOCK must be held.
3014 * for DAFS, caller must hold ref.
3016 * @note for DAFS, it _is safe_ to call this function from an
3019 * @post needsSalvaged flag is set.
3020 * for DAFS, salvage is requested.
3021 * no further references to the volume through the volume
3022 * package will be honored.
3023 * all file descriptor and vnode caches are invalidated.
3025 * @warning this is a heavy-handed interface. it results in
3026 * a volume going offline regardless of the current
3027 * reference count state.
3029 * @internal volume package internal use only
/* Force a volume offline regardless of refcount state (see doc comment
 * above).  VOL_FORCEOFF_NOUPDATE suppresses the header write to break
 * the VUpdateVolume_r <-> VForceOffline_r recursion. */
3032 VForceOffline_r(Volume * vp, int flags)
3036 #ifdef AFS_DEMAND_ATTACH_FS
3037 VChangeState_r(vp, VOL_STATE_ERROR);
3042 strcpy(V_offlineMessage(vp),
3043 "Forced offline due to internal error: volume needs to be salvaged");
3044 Log("Volume %u forced offline: it needs salvaging!\n", V_id(vp));
3047 vp->goingOffline = 0;
3048 V_needsSalvaged(vp) = 1;
3049 if (!(flags & VOL_FORCEOFF_NOUPDATE)) {
/* persist the needsSalvaged flag, telling the update path not to
 * recurse back into us */
3050 VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF);
3053 #ifdef AFS_DEMAND_ATTACH_FS
3054 VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
3055 #endif /* AFS_DEMAND_ATTACH_FS */
/* wake any threads blocked waiting for this volume to be put back */
3057 #ifdef AFS_PTHREAD_ENV
3058 assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
3059 #else /* AFS_PTHREAD_ENV */
3060 LWP_NoYieldSignal(VPutVolume);
3061 #endif /* AFS_PTHREAD_ENV */
/* invalidate all fd/inode handles so no further I/O hits the volume */
3063 VReleaseVolumeHandles_r(vp);
3067 * force a volume offline.
3069 * @param[in] vp volume object pointer
3071 * @see VForceOffline_r
/* Public (locking) wrapper for VForceOffline_r with no flags. */
3074 VForceOffline(Volume * vp)
3077 VForceOffline_r(vp, 0);
3081 /* The opposite of VAttachVolume. The volume header is written to disk, with
3082 the inUse bit turned off. A copy of the header is maintained in memory,
3083 however (which is why this is VOffline, not VDetach).
/* Take a volume offline, recording 'message' in the on-disk header for
 * clients (see doc comment above: header stays cached in memory, which
 * is why this is VOffline and not VDetach). */
3086 VOffline_r(Volume * vp, char *message)
3089 VolumeId vid = V_id(vp);
3091 assert(programType != volumeUtility);
3096 if (V_offlineMessage(vp)[0] == '\0')
3097 strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
/* strncpy does not guarantee NUL-termination; force it here */
3098 V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
3100 vp->goingOffline = 1;
3101 #ifdef AFS_DEMAND_ATTACH_FS
3102 VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
3103 VCreateReservation_r(vp);
3106 /* wait for the volume to go offline */
3107 if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
3108 VWaitStateChange_r(vp);
3110 VCancelReservation_r(vp);
3111 #else /* AFS_DEMAND_ATTACH_FS */
3113 vp = VGetVolume_r(&error, vid); /* Wait for it to go offline */
3114 if (vp) /* In case it was reattached... */
3116 #endif /* AFS_DEMAND_ATTACH_FS */
/* Public (locking) wrapper for VOffline_r. */
3120 VOffline(Volume * vp, char *message)
3123 VOffline_r(vp, message);
3127 /* This gets used for the most part by utility routines that don't want
3128 * to keep all the volume headers around. Generally, the file server won't
3129 * call this routine, because then the offline message in the volume header
3130 * (or other information) won't be available to clients. For NAMEI, also
3131 * close the file handles. However, the fileserver does call this during
3132 * an attach following a volume operation.
/* Fully detach a volume: remove it from the hash table, mark it shutting
 * down, and (for volume utilities) notify the fileserver via FSSYNC so it
 * can take the volume back.  See the comment block above for when this is
 * used instead of VOffline.  NOTE(review): excerpt is elided; the
 * 'volume' local used below is declared/assigned in lines missing from
 * this view -- presumably volume = V_id(vp) before the hash delete. */
3135 VDetachVolume_r(Error * ec, Volume * vp)
3138 struct DiskPartition64 *tpartp;
3139 int notifyServer, useDone = FSYNC_VOL_ON;
3141 *ec = 0; /* always "succeeds" */
3142 if (programType == volumeUtility) {
3143 notifyServer = vp->needsPutBack;
/* DESTROY_ME checkout: tell the fileserver the volume is gone for good */
3144 if (V_destroyMe(vp) == DESTROY_ME)
3145 useDone = FSYNC_VOL_DONE;
3146 #ifdef AFS_DEMAND_ATTACH_FS
/* unblessed / out-of-service volumes are handed back offline */
3147 else if (!V_blessed(vp) || !V_inService(vp))
3148 useDone = FSYNC_VOL_LEAVE_OFF;
3151 tpartp = vp->partition;
3153 DeleteVolumeFromHashTable(vp);
3154 vp->shuttingDown = 1;
3155 #ifdef AFS_DEMAND_ATTACH_FS
3156 DeleteVolumeFromVByPList_r(vp);
3158 VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN);
3159 #endif /* AFS_DEMAND_ATTACH_FS */
3161 /* Will be detached sometime in the future--this is OK since volume is offline */
3163 /* XXX the following code should really be moved to VCheckDetach() since the volume
3164 * is not technically detached until the refcounts reach zero
3166 #ifdef FSSYNC_BUILD_CLIENT
3167 if (programType == volumeUtility && notifyServer) {
3169 * Note: The server is not notified in the case of a bogus volume
3170 * explicitly to make it possible to create a volume, do a partial
3171 * restore, then abort the operation without ever putting the volume
3172 * online. This is essential in the case of a volume move operation
3173 * between two partitions on the same server. In that case, there
3174 * would be two instances of the same volume, one of them bogus,
3175 * which the file server would attempt to put on line
3177 FSYNC_VolOp(volume, tpartp->name, useDone, 0, NULL);
3178 /* XXX this code path is only hit by volume utilities, thus
3179 * V_BreakVolumeCallbacks will always be NULL. if we really
3180 * want to break callbacks in this path we need to use FSYNC_VolOp() */
3182 /* Detaching it so break all callbacks on it */
3183 if (V_BreakVolumeCallbacks) {
3184 Log("volume %u detached; breaking all call backs\n", volume);
3185 (*V_BreakVolumeCallbacks) (volume);
3189 #endif /* FSSYNC_BUILD_CLIENT */
/* Public (locking) wrapper for VDetachVolume_r. */
3193 VDetachVolume(Error * ec, Volume * vp)
3196 VDetachVolume_r(ec, vp);
3201 /***************************************************/
3202 /* Volume fd/inode handle closing routines */
3203 /***************************************************/
3205 /* For VDetachVolume, we close all cached file descriptors, but keep
3206 * the Inode handles in case we need to read from a busy volume.
3208 /* for demand attach, caller MUST hold ref count on vp */
/* Close all cached file descriptors for a volume while keeping the Inode
 * handles open (so a busy volume can still be read -- see comment above).
 * DAFS: runs in the exclusive OFFLINING state, restored on exit. */
3210 VCloseVolumeHandles_r(Volume * vp)
3212 #ifdef AFS_DEMAND_ATTACH_FS
3213 VolState state_save;
3215 state_save = VChangeState_r(vp, VOL_STATE_OFFLINING);
3220 * XXX need to investigate whether we can perform
3221 * DFlushVolume outside of vol_glock_mutex...
3223 * VCloseVnodeFiles_r drops the glock internally */
/* flush dirty directory buffers before closing the vnode index files */
3224 DFlushVolume(V_id(vp));
3225 VCloseVnodeFiles_r(vp);
3227 #ifdef AFS_DEMAND_ATTACH_FS
3231 /* Too time consuming and unnecessary for the volserver */
3232 if (programType != volumeUtility) {
3233 IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
3234 IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
3235 IH_CONDSYNC(vp->diskDataHandle);
3237 IH_CONDSYNC(vp->linkHandle);
3238 #endif /* AFS_NT40_ENV */
/* close the fds but keep the IHandle_t objects alive */
3241 IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
3242 IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
3243 IH_REALLYCLOSE(vp->diskDataHandle);
3244 IH_REALLYCLOSE(vp->linkHandle);
3246 #ifdef AFS_DEMAND_ATTACH_FS
3248 VChangeState_r(vp, state_save);
3252 /* For both VForceOffline and VOffline, we close all relevant handles.
3253 * For VOffline, if we re-attach the volume, the files may possible be
3254 * different than before.
3256 /* for demand attach, caller MUST hold a ref count on vp */
/* Release (not just close) all fd/inode handles for a volume; used by
 * VForceOffline/VOffline since on re-attach the files may differ (see
 * comment above).  DAFS: runs in the exclusive DETACHING state. */
3258 VReleaseVolumeHandles_r(Volume * vp)
3260 #ifdef AFS_DEMAND_ATTACH_FS
3261 VolState state_save;
3263 state_save = VChangeState_r(vp, VOL_STATE_DETACHING);
3266 /* XXX need to investigate whether we can perform
3267 * DFlushVolume outside of vol_glock_mutex... */
3268 DFlushVolume(V_id(vp));
3270 VReleaseVnodeFiles_r(vp); /* releases the glock internally */
3272 #ifdef AFS_DEMAND_ATTACH_FS
3276 /* Too time consuming and unnecessary for the volserver */
3277 if (programType != volumeUtility) {
3278 IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
3279 IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
3280 IH_CONDSYNC(vp->diskDataHandle);
3282 IH_CONDSYNC(vp->linkHandle);
3283 #endif /* AFS_NT40_ENV */
/* unlike VCloseVolumeHandles_r, drop the IHandle_t references entirely */
3286 IH_RELEASE(vp->vnodeIndex[vLarge].handle);
3287 IH_RELEASE(vp->vnodeIndex[vSmall].handle);
3288 IH_RELEASE(vp->diskDataHandle);
3289 IH_RELEASE(vp->linkHandle);
3291 #ifdef AFS_DEMAND_ATTACH_FS
3293 VChangeState_r(vp, state_save);
3298 /***************************************************/
3299 /* Volume write and fsync routines */
3300 /***************************************************/
/* Write the in-core volume header to disk.  On failure the volume is
 * forced offline, with VOL_FORCEOFF_NOUPDATE passed to break recursion
 * (VForceOffline_r would otherwise call back into us). */
3303 VUpdateVolume_r(Error * ec, Volume * vp, int flags)
3305 #ifdef AFS_DEMAND_ATTACH_FS
3306 VolState state_save;
3308 if (flags & VOL_UPDATE_WAIT) {
3309 VCreateReservation_r(vp);
3310 VWaitExclusiveState_r(vp);
3315 if (programType == fileServer)
/* advance the uniquifier; +200 headroom while in use -- presumably so
 * vnode uniquifiers handed out since the last write stay unique even
 * after a crash (TODO confirm against elided context) */
3317 (V_inUse(vp) ? V_nextVnodeUnique(vp) +
3318 200 : V_nextVnodeUnique(vp));
3320 #ifdef AFS_DEMAND_ATTACH_FS
3321 state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
3325 WriteVolumeHeader_r(ec, vp);
3327 #ifdef AFS_DEMAND_ATTACH_FS
3329 VChangeState_r(vp, state_save);
3330 if (flags & VOL_UPDATE_WAIT) {
3331 VCancelReservation_r(vp);
3336 Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
3337 V_id(vp), V_name(vp));
3338 /* try to update on-disk header,
3339 * while preventing infinite recursion */
3340 if (!(flags & VOL_UPDATE_NOFORCEOFF)) {
3341 VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE);
/* Public (locking) wrapper: synchronous header update. */
3347 VUpdateVolume(Error * ec, Volume * vp)
3350 VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
/* Update the volume header and fsync the disk-data file to stable
 * storage.  VOL_SYNC_WAIT makes the header update synchronous too. */
3355 VSyncVolume_r(Error * ec, Volume * vp, int flags)
3359 #ifdef AFS_DEMAND_ATTACH_FS
3360 VolState state_save;
3363 if (flags & VOL_SYNC_WAIT) {
3364 VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
3366 VUpdateVolume_r(ec, vp, 0);
3369 #ifdef AFS_DEMAND_ATTACH_FS
3370 state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
3373 fdP = IH_OPEN(V_diskDataHandle(vp));
3374 assert(fdP != NULL);
3375 code = FDH_SYNC(fdP);
3378 #ifdef AFS_DEMAND_ATTACH_FS
3380 VChangeState_r(vp, state_save);
/* Public (locking) wrapper: synchronous volume sync. */
3386 VSyncVolume(Error * ec, Volume * vp)
3389 VSyncVolume_r(ec, vp, VOL_SYNC_WAIT);
3394 /***************************************************/
3395 /* Volume deallocation routines */
3396 /***************************************************/
3398 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS-only: free a volume struct if no refs remain; otherwise detach it
 * from the lookup structures so the last ref-drop frees it. */
3400 FreeVolume(Volume * vp)
3402 /* free the heap space, iff it's safe.
3403 * otherwise, pull it out of the hash table, so it
3404 * will get deallocated when all refs to it go away */
3405 if (!VCheckFree(vp)) {
3406 DeleteVolumeFromHashTable(vp);
3407 DeleteVolumeFromVByPList_r(vp);
3409 /* make sure we invalidate the header cache entry */
3410 FreeVolumeHeader(vp);
3413 #endif /* AFS_DEMAND_ATTACH_FS */
/* Unconditionally tear down a volume struct: free the pending vol op
 * (DAFS), the vnode bitmaps, and the cached header; remove from the
 * hash table in the non-DAFS build (DAFS removes it elsewhere). */
3416 ReallyFreeVolume(Volume * vp)
3421 #ifdef AFS_DEMAND_ATTACH_FS
3423 VChangeState_r(vp, VOL_STATE_FREED);
3424 if (vp->pending_vol_op)
3425 free(vp->pending_vol_op);
3426 #endif /* AFS_DEMAND_ATTACH_FS */
3427 for (i = 0; i < nVNODECLASSES; i++)
3428 if (vp->vnodeIndex[i].bitmap)
3429 free(vp->vnodeIndex[i].bitmap);
3430 FreeVolumeHeader(vp);
3431 #ifndef AFS_DEMAND_ATTACH_FS
3432 DeleteVolumeFromHashTable(vp);
3433 #endif /* AFS_DEMAND_ATTACH_FS */
3437 /* check to see if we should shutdown this volume
3438 * returns 1 if volume was freed, 0 otherwise */
3439 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS variant: complete a pending shutdown once both nUsers and
 * nWaiters reach zero.  For utilities that checked the volume out for
 * update (or a writeable clone), the inUse flag is cleared on disk
 * before the struct is freed. */
3441 VCheckDetach(register Volume * vp)
/* cannot detach while anyone still holds a ref or reservation */
3446 if (vp->nUsers || vp->nWaiters)
3449 if (vp->shuttingDown) {
3451 if ((programType != fileServer) &&
3452 (V_inUse(vp) == programType) &&
3453 ((V_checkoutMode(vp) == V_VOLUPD) ||
3454 ((V_checkoutMode(vp) == V_CLONE) &&
3455 (VolumeWriteable(vp))))) {
3457 VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
3459 Log("VCheckDetach: failed to clear inUse failed during detachment of volid %u\n",
3463 VReleaseVolumeHandles_r(vp);
3465 ReallyFreeVolume(vp);
3466 if (programType == fileServer) {
/* wake threads blocked in VGetVolume waiting for this volume */
3467 assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
3472 #else /* AFS_DEMAND_ATTACH_FS */
/* Non-DAFS variant of VCheckDetach: same shutdown completion logic,
 * with an LWP fallback for the wakeup when pthreads are unavailable. */
3474 VCheckDetach(register Volume * vp)
3482 if (vp->shuttingDown) {
3484 if ((programType != fileServer) &&
3485 (V_inUse(vp) == programType) &&
3486 ((V_checkoutMode(vp) == V_VOLUPD) ||
3487 ((V_checkoutMode(vp) == V_CLONE) &&
3488 (VolumeWriteable(vp))))) {
3490 VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
3492 Log("VCheckDetach: failed to clear inUse failed during detachment of volid %u\n",
3496 VReleaseVolumeHandles_r(vp);
3497 ReallyFreeVolume(vp);
3498 if (programType == fileServer) {
3499 #if defined(AFS_PTHREAD_ENV)
3500 assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
3501 #else /* AFS_PTHREAD_ENV */
3502 LWP_NoYieldSignal(VPutVolume);
3503 #endif /* AFS_PTHREAD_ENV */
3508 #endif /* AFS_DEMAND_ATTACH_FS */
3510 /* check to see if we should offline this volume
3511 * return 1 if volume went offline, 0 otherwise */
3512 #ifdef AFS_DEMAND_ATTACH_FS
/* DAFS variant: complete a pending offline transition once the refcount
 * drains.  Performs the header update and handle close in the exclusive
 * OFFLINING state, then drops to UNATTACHED unless an error state took
 * over in the meantime. */
3514 VCheckOffline(register Volume * vp)
3516 Volume * rvp = NULL;
3519 if (vp->goingOffline && !vp->nUsers) {
3521 assert(programType == fileServer);
3522 assert((V_attachState(vp) != VOL_STATE_ATTACHED) &&
3523 (V_attachState(vp) != VOL_STATE_FREED) &&
3524 (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
3525 (V_attachState(vp) != VOL_STATE_UNATTACHED));
3529 * VOL_STATE_GOING_OFFLINE
3530 * VOL_STATE_SHUTTING_DOWN
3531 * VIsErrorState(V_attachState(vp))
3532 * VIsExclusiveState(V_attachState(vp))
3535 VCreateReservation_r(vp);
3536 VChangeState_r(vp, VOL_STATE_OFFLINING);
3539 /* must clear the goingOffline flag before we drop the glock */
3540 vp->goingOffline = 0;
3545 /* perform async operations */
3546 VUpdateVolume_r(&error, vp, 0);
3547 VCloseVolumeHandles_r(vp);
3550 Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
3552 if (V_offlineMessage(vp)[0])
3553 Log(" (%s)", V_offlineMessage(vp));
3557 /* invalidate the volume header cache entry */
3558 FreeVolumeHeader(vp);
3560 /* if nothing changed state to error or salvaging,
3561 * drop state to unattached */
3562 if (!VIsErrorState(V_attachState(vp))) {
3563 VChangeState_r(vp, VOL_STATE_UNATTACHED);
3565 VCancelReservation_r(vp);
3566 /* no usage of vp is safe beyond this point */
3570 #else /* AFS_DEMAND_ATTACH_FS */
/* Non-DAFS variant of VCheckOffline: same offline completion, with a
 * pthread/LWP wakeup instead of DAFS state machinery. */
3572 VCheckOffline(register Volume * vp)
3574 Volume * rvp = NULL;
3577 if (vp->goingOffline && !vp->nUsers) {
3579 assert(programType == fileServer);
3582 vp->goingOffline = 0;
3584 VUpdateVolume_r(&error, vp, 0);
3585 VCloseVolumeHandles_r(vp);
3587 Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
3589 if (V_offlineMessage(vp)[0])
3590 Log(" (%s)", V_offlineMessage(vp));
3593 FreeVolumeHeader(vp);
/* wake threads blocked waiting for this volume to finish going offline */
3594 #ifdef AFS_PTHREAD_ENV
3595 assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
3596 #else /* AFS_PTHREAD_ENV */
3597 LWP_NoYieldSignal(VPutVolume);
3598 #endif /* AFS_PTHREAD_ENV */
3602 #endif /* AFS_DEMAND_ATTACH_FS */
3604 /***************************************************/
3605 /* demand attach fs ref counting routines */
3606 /***************************************************/
3608 #ifdef AFS_DEMAND_ATTACH_FS
3609 /* the following two functions handle reference counting for
3610 * asynchronous operations on volume structs.
3612 * their purpose is to prevent a VDetachVolume or VShutdown
3613 * from free()ing the Volume struct during an async i/o op */
3615 /* register with the async volume op ref counter */
3616 /* VCreateReservation_r moved into inline code header because it
3617 * is now needed in vnode.c -- tkeiser 11/20/2007
3621 * decrement volume-package internal refcount.
3623 * @param vp volume object pointer
3625 * @internal volume package internal use only
3628 * @arg VOL_LOCK is held
3629 * @arg lightweight refcount held
3631 * @post volume waiters refcount is decremented; volume may
3632 * have been deallocated/shutdown/offlined/salvaged/
3633 * whatever during the process
3635 * @warning once you have tossed your last reference (you can acquire
3636 * lightweight refs recursively) it is NOT SAFE to reference
3637 * a volume object pointer ever again
3639 * @see VCreateReservation_r
3641 * @note DEMAND_ATTACH_FS only
/* Drop a lightweight (async-op) reference; when the last one goes away,
 * run the deferred detach/offline/salvage/free event handlers (see the
 * doc comment above -- vp must not be touched after this returns). */
3644 VCancelReservation_r(Volume * vp)
3646 assert(--vp->nWaiters >= 0);
3647 if (vp->nWaiters == 0) {
/* detach takes priority; only if it did not fire do the others run */
3649 if (!VCheckDetach(vp)) {
3656 /* check to see if we should free this volume now
3657 * return 1 if volume was freed, 0 otherwise */
/* Free the volume struct iff it holds no refs, no reservations, and is
 * no longer linked into any lookup structure (hash/VByP/etc.).
 * Returns 1 if freed, 0 otherwise (per comment above). */
3659 VCheckFree(Volume * vp)
3662 if ((vp->nUsers == 0) &&
3663 (vp->nWaiters == 0) &&
3664 !(V_attachFlags(vp) & (VOL_IN_HASH |
3668 ReallyFreeVolume(vp);
3673 #endif /* AFS_DEMAND_ATTACH_FS */
3676 /***************************************************/
3677 /* online volume operations routines */
3678 /***************************************************/
3680 #ifdef AFS_DEMAND_ATTACH_FS
3682 * register a volume operation on a given volume.
3684 * @param[in] vp volume object
3685 * @param[in] vopinfo volume operation info object
3687 * @pre VOL_LOCK is held
3689 * @post volume operation info object attached to volume object.
3690 * volume operation statistics updated.
3692 * @note by "attached" we mean a copy of the passed in object is made
3694 * @internal volume package internal use only
/* Attach a copy of 'vopinfo' to vp as the pending volume operation and
 * update vol-op statistics (VOL_LOCK held; see doc comment above). */
3697 VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
3699 FSSYNC_VolOp_info * info;
3701 /* attach a vol op info node to the volume struct */
3702 info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info));
3703 assert(info != NULL);
/* deep-copy so the caller's object can go away */
3704 memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info));
3705 vp->pending_vol_op = info;
3708 vp->stats.last_vol_op = FT_ApproxTime();
3709 vp->stats.vol_ops++;
3710 IncUInt64(&VStats.vol_ops);
3716 * deregister the volume operation attached to this volume.
3718 * @param[in] vp volume object pointer
3720 * @pre VOL_LOCK is held
3722 * @post the volume operation info object is detached from the volume object
3724 * @internal volume package internal use only
/* Detach and free the pending vol-op object, if any (VOL_LOCK held). */
3727 VDeregisterVolOp_r(Volume * vp)
3729 if (vp->pending_vol_op) {
3730 free(vp->pending_vol_op);
/* prevent dangling pointer / double free */
3731 vp->pending_vol_op = NULL;
3735 #endif /* AFS_DEMAND_ATTACH_FS */
3738 * determine whether it is safe to leave a volume online during
3739 * the volume operation described by the vopinfo object.
3741 * @param[in] vp volume object
3742 * @param[in] vopinfo volume operation info object
3744 * @return whether it is safe to leave volume online
3745 * @retval 0 it is NOT SAFE to leave the volume online
3746 * @retval 1 it is safe to leave the volume online during the operation
3749 * @arg VOL_LOCK is held
3750 * @arg disk header attached to vp (heavyweight ref on vp will guarantee
3751 * this condition is met)
3753 * @internal volume package internal use only
/* Return 1 if the volume may stay online during this vol op: only for a
 * NEEDVOLUME checkout that is read-only, or a clone/dump of a volume
 * that is not writeable (see doc comment above). */
3756 VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
3758 return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
3759 (vopinfo->com.reason == V_READONLY ||
3760 (!VolumeWriteable(vp) &&
3761 (vopinfo->com.reason == V_CLONE ||
3762 vopinfo->com.reason == V_DUMP))));
3766 * determine whether VBUSY should be set during this volume operation.
3768 * @param[in] vp volume object
3769 * @param[in] vopinfo volume operation info object
3771 * @return whether VBUSY should be set
3772 * @retval 0 VBUSY does NOT need to be set
3773 * @retval 1 VBUSY SHOULD be set
3775 * @pre VOL_LOCK is held
3777 * @internal volume package internal use only
/* Return 1 if VBUSY should be reported while this vol op runs: a
 * NEEDVOLUME checkout for a clone or dump (see doc comment above). */
3780 VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
3782 return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
3783 (vopinfo->com.reason == V_CLONE ||
3784 vopinfo->com.reason == V_DUMP));
3788 /***************************************************/
3789 /* online salvager routines */
3790 /***************************************************/
3791 #if defined(AFS_DEMAND_ATTACH_FS)
3792 #define SALVAGE_PRIO_UPDATE_INTERVAL 3 /**< number of seconds between prio updates */
3793 #define SALVAGE_COUNT_MAX 16 /**< number of online salvages we
3794 * allow before moving the volume
3795 * into a permanent error state
3797 * once this threshold is reached,
3798 * the operator will have to manually
3799 * issue a 'bos salvage' to bring
3800 * the volume back online
3804 * check whether a salvage needs to be performed on this volume.
3806 * @param[in] vp pointer to volume object
3808 * @return status code
3809 * @retval 0 no salvage scheduled
3810 * @retval 1 a salvage has been scheduled with the salvageserver
3812 * @pre VOL_LOCK is held
3814 * @post if salvage request flag is set and nUsers and nWaiters are zero,
3815 * then a salvage will be requested
3817 * @note this is one of the event handlers called by VCancelReservation_r
3819 * @see VCancelReservation_r
3821 * @internal volume package internal use only.
/* Ref-drop event handler: once nUsers and nWaiters are both zero, act on
 * a pending salvage request by scheduling it with the salvageserver
 * (SALVSYNC client builds only; see doc comment above). */
3824 VCheckSalvage(register Volume * vp)
3827 #ifdef SALVSYNC_BUILD_CLIENT
3828 if (vp->nUsers || vp->nWaiters)
3830 if (vp->salvage.requested) {
3831 VScheduleSalvage_r(vp);
3834 #endif /* SALVSYNC_BUILD_CLIENT */
3839 * request volume salvage.
3841 * @param[out] ec computed client error code
3842 * @param[in] vp volume object pointer
3843 * @param[in] reason reason code (passed to salvageserver via SALVSYNC)
3844 * @param[in] flags see flags note below
3847 * VOL_SALVAGE_INVALIDATE_HEADER causes volume header cache entry
3848 * to be invalidated.
3850 * @pre VOL_LOCK is held.
3852 * @post volume state is changed.
3853 * for fileserver, salvage will be requested once refcount reaches zero.
3855 * @return operation status code
3856 * @retval 0 volume salvage will occur
3857 * @retval 1 volume salvage could not be scheduled
3859 * @note DAFS fileserver only
3861 * @note this call does not synchronously schedule a volume salvage. rather,
3862 * it sets volume state so that when volume refcounts reach zero, a
3863 * volume salvage will occur. by "refcounts", we mean both nUsers and
3864 * nWaiters must be zero.
3866 * @internal volume package internal use only.
/* Flag a volume for salvage (DAFS; see doc comment above).  The salvage
 * is scheduled later, when refcounts reach zero.  After SALVAGE_COUNT_MAX
 * online salvages the volume is parked in a hard error state instead. */
3869 VRequestSalvage_r(Error * ec, Volume * vp, int reason, int flags)
3873 * for DAFS volume utilities, transition to error state
3874 * (at some point in the future, we should consider
3875 * making volser talk to salsrv)
3877 if (programType != fileServer) {
3878 VChangeState_r(vp, VOL_STATE_ERROR);
/* only record the first request; repeats just keep the original reason */
3883 if (!vp->salvage.requested) {
3884 vp->salvage.requested = 1;
3885 vp->salvage.reason = reason;
3886 vp->stats.last_salvage = FT_ApproxTime();
3887 if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
3888 /* XXX this should likely be changed to FreeVolumeHeader() */
3889 ReleaseVolumeHeader(vp->header);
3891 if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
3892 VChangeState_r(vp, VOL_STATE_SALVAGING);
3895 Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
3896 VChangeState_r(vp, VOL_STATE_ERROR);
3905 * update salvageserver scheduling priority for a volume.
3907 * @param[in] vp pointer to volume object
3909 * @return operation status
3911 * @retval 1 request denied, or SALVSYNC communications failure
3913 * @pre VOL_LOCK is held.
3915 * @post in-core salvage priority counter is incremented. if at least
3916 * SALVAGE_PRIO_UPDATE_INTERVAL seconds have elapsed since the
3917 * last SALVSYNC_RAISEPRIO request, we contact the salvageserver
3918 * to update its priority queue. if no salvage is scheduled,
3919 * this function is a no-op.
3921 * @note DAFS fileserver only
3923 * @note this should be called whenever a VGetVolume fails due to a
3924 * pending salvage request
3926 * @todo should set exclusive state and drop glock around salvsync call
3928 * @internal volume package internal use only.
/* Bump the salvageserver's priority for this volume's pending salvage,
 * rate-limited to once per SALVAGE_PRIO_UPDATE_INTERVAL seconds (see the
 * doc comment above; no-op if no salvage is scheduled). */
3931 VUpdateSalvagePriority_r(Volume * vp)
3936 #ifdef SALVSYNC_BUILD_CLIENT
3938 now = FT_ApproxTime();
3940 /* update the salvageserver priority queue occasionally so that
3941 * frequently requested volumes get moved to the head of the queue
3943 if ((vp->salvage.scheduled) &&
3944 (vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) {
3945 code = SALVSYNC_SalvageVolume(vp->hashid,
3946 VPartitionPath(vp->partition),
3951 vp->stats.last_salvage_req = now;
3952 if (code != SYNC_OK) {
3956 #endif /* SALVSYNC_BUILD_CLIENT */
3962 * schedule a salvage with the salvage server.
3964 * @param[in] vp pointer to volume object
3966 * @return operation status
3967 * @retval 0 salvage scheduled successfully
3968 * @retval 1 salvage not scheduled, or SALVSYNC com error
3971 * @arg VOL_LOCK is held.
3972 * @arg nUsers and nWaiters should be zero.
3974 * @post salvageserver is sent a salvage request
3976 * @note DAFS fileserver only
3978 * @internal volume package internal use only.
/* Send the actual salvage request to the salvageserver (see doc comment
 * above; pre: nUsers == nWaiters == 0).  The volume is placed in the
 * exclusive SALVSYNC_REQ state around the SALVSYNC call -- deliberately
 * without taking a reservation, to avoid unbounded recursion through the
 * ref-drop event handlers. */
3981 VScheduleSalvage_r(Volume * vp)
3984 #ifdef SALVSYNC_BUILD_CLIENT
3985 VolState state_save;
3988 if (vp->nWaiters || vp->nUsers) {
3992 /* prevent endless salvage,attach,salvage,attach,... loops */
3993 if (vp->stats.salvages >= SALVAGE_COUNT_MAX)
3996 if (!vp->salvage.scheduled) {
3997 /* if we haven't previously scheduled a salvage, do so now
3999 * set the volume to an exclusive state and drop the lock
4000 * around the SALVSYNC call
4002 * note that we do NOT acquire a reservation here -- doing so
4003 * could result in unbounded recursion
/* copy partName while we still hold the glock; vp->partition may not
 * be safe to touch once the lock is dropped */
4005 strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName));
4006 state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
4007 V_attachFlags(vp) |= VOL_IS_BUSY;
4010 /* can't use V_id() since there's no guarantee
4011 * we have the disk data header at this point */
4012 code = SALVSYNC_SalvageVolume(vp->hashid,
4019 VChangeState_r(vp, state_save);
4020 V_attachFlags(vp) &= ~(VOL_IS_BUSY);
4022 if (code == SYNC_OK) {
4023 vp->salvage.scheduled = 1;
4024 vp->stats.salvages++;
4025 vp->stats.last_salvage_req = FT_ApproxTime();
4026 IncUInt64(&VStats.salvages);
4030 case SYNC_BAD_COMMAND:
4031 case SYNC_COM_ERROR:
4034 Log("VScheduleSalvage_r: SALVSYNC request denied\n");
4037 Log("VScheduleSalvage_r: SALVSYNC unknown protocol error\n");
4042 #endif /* SALVSYNC_BUILD_CLIENT */
4047 * ask salvageserver to cancel a scheduled salvage operation.
4049 * @param[in] vp pointer to volume object
4050 * @param[in] reason SALVSYNC protocol reason code
4052 * @return operation status
4054 * @retval 1 request failed
4056 * @pre VOL_LOCK is held.
4058 * @post salvageserver is sent a request to cancel the volume salvage
4060 * @todo should set exclusive state and drop glock around salvsync call
4062 * @internal volume package internal use only.
/* Ask the salvageserver to cancel a previously scheduled salvage of vp.
 * Clears vp->salvage.scheduled only when the SALVSYNC call succeeds.
 * NOTE(review): listing omits intervening lines (call arguments, braces). */
4065 VCancelSalvage_r(Volume * vp, int reason)
4069 #ifdef SALVSYNC_BUILD_CLIENT
/* only talk to the salvageserver if a salvage is actually scheduled */
4070 if (vp->salvage.scheduled) {
4071 code = SALVSYNC_SalvageVolume(vp->hashid,
4072 VPartitionPath(vp->partition),
4077 if (code == SYNC_OK) {
4078 vp->salvage.scheduled = 0;
4083 #endif /* SALVSYNC_BUILD_CLIENT */
4088 #ifdef SALVSYNC_BUILD_CLIENT
4090 * connect to the salvageserver SYNC service.
4092 * @return operation status
4096 * @post connection to salvageserver SYNC service established
4098 * @see VConnectSALV_r
4099 * @see VDisconnectSALV
4100 * @see VReconnectSALV
/* non-glock wrapper: delegates to VConnectSALV_r (surrounding lines
 * elided in this listing; presumably takes/drops VOL_LOCK -- TODO confirm) */
4107 retVal = VConnectSALV_r();
4113 * connect to the salvageserver SYNC service.
4115 * @return operation status
4119 * @pre VOL_LOCK is held.
4121 * @post connection to salvageserver SYNC service established
4124 * @see VDisconnectSALV_r
4125 * @see VReconnectSALV_r
4126 * @see SALVSYNC_clientInit
4128 * @internal volume package internal use only.
/* establish the SYNC connection to the salvageserver (VOL_LOCK held) */
4131 VConnectSALV_r(void)
4133 return SALVSYNC_clientInit();
4137 * disconnect from the salvageserver SYNC service.
4139 * @return operation status
4142 * @pre client should have a live connection to the salvageserver
4144 * @post connection to salvageserver SYNC service destroyed
4146 * @see VDisconnectSALV_r
4148 * @see VReconnectSALV
/* non-glock wrapper around VDisconnectSALV_r (lock handling elided in listing) */
4151 VDisconnectSALV(void)
4155 VDisconnectSALV_r();
4161 * disconnect from the salvageserver SYNC service.
4163 * @return operation status
4167 * @arg VOL_LOCK is held.
4168 * @arg client should have a live connection to the salvageserver.
4170 * @post connection to salvageserver SYNC service destroyed
4172 * @see VDisconnectSALV
4173 * @see VConnectSALV_r
4174 * @see VReconnectSALV_r
4175 * @see SALVSYNC_clientFinis
4177 * @internal volume package internal use only.
/* tear down the SYNC connection to the salvageserver (VOL_LOCK held) */
4180 VDisconnectSALV_r(void)
4182 return SALVSYNC_clientFinis();
4186 * disconnect and then re-connect to the salvageserver SYNC service.
4188 * @return operation status
4192 * @pre client should have a live connection to the salvageserver
4194 * @post old connection is dropped, and a new one is established
4197 * @see VDisconnectSALV
4198 * @see VReconnectSALV_r
/* non-glock wrapper around VReconnectSALV_r (lock handling elided in listing) */
4201 VReconnectSALV(void)
4205 retVal = VReconnectSALV_r();
4211 * disconnect and then re-connect to the salvageserver SYNC service.
4213 * @return operation status
4218 * @arg VOL_LOCK is held.
4219 * @arg client should have a live connection to the salvageserver.
4221 * @post old connection is dropped, and a new one is established
4223 * @see VConnectSALV_r
4224 * @see VDisconnectSALV
4225 * @see VReconnectSALV
4226 * @see SALVSYNC_clientReconnect
4228 * @internal volume package internal use only.
/* drop and re-establish the salvageserver SYNC connection (VOL_LOCK held) */
4231 VReconnectSALV_r(void)
4233 return SALVSYNC_clientReconnect();
4235 #endif /* SALVSYNC_BUILD_CLIENT */
4236 #endif /* AFS_DEMAND_ATTACH_FS */
4239 /***************************************************/
4240 /* FSSYNC routines */
4241 /***************************************************/
4243 /* This must be called by any volume utility which needs to run while the
4244 file server is also running. This is separated from VInitVolumePackage so
4245 that a utility can fork--and each of the children can independently
4246 initialize communication with the file server */
4247 #ifdef FSSYNC_BUILD_CLIENT
4249 * connect to the fileserver SYNC service.
4251 * @return operation status
4256 * @arg VInit must equal 2.
4257 * @arg Program Type must not be fileserver or salvager.
4259 * @post connection to fileserver SYNC service established
4262 * @see VDisconnectFS
4263 * @see VChildProcReconnectFS
/* non-glock wrapper: delegates to VConnectFS_r (surrounding lines elided
 * in this listing; presumably takes/drops VOL_LOCK -- TODO confirm) */
4270 retVal = VConnectFS_r();
4276 * connect to the fileserver SYNC service.
4278 * @return operation status
4283 * @arg VInit must equal 2.
4284 * @arg Program Type must not be fileserver or salvager.
4285 * @arg VOL_LOCK is held.
4287 * @post connection to fileserver SYNC service established
4290 * @see VDisconnectFS_r
4291 * @see VChildProcReconnectFS_r
4293 * @internal volume package internal use only.
/* establish the SYNC connection to the fileserver. Asserts the
 * preconditions from the doc comment above: attach completed (VInit == 2)
 * and caller is neither the fileserver nor the salvager. */
4299 assert((VInit == 2) &&
4300 (programType != fileServer) &&
4301 (programType != salvager));
4302 rc = FSYNC_clientInit();
4309 * disconnect from the fileserver SYNC service.
4312 * @arg client should have a live connection to the fileserver.
4313 * @arg VOL_LOCK is held.
4314 * @arg Program Type must not be fileserver or salvager.
4316 * @post connection to fileserver SYNC service destroyed
4318 * @see VDisconnectFS
4320 * @see VChildProcReconnectFS_r
4322 * @internal volume package internal use only.
/* tear down the SYNC connection to the fileserver (VOL_LOCK held);
 * fileserver/salvager must never call this -- they are the servers */
4325 VDisconnectFS_r(void)
4327 assert((programType != fileServer) &&
4328 (programType != salvager));
4329 FSYNC_clientFinis();
4334 * disconnect from the fileserver SYNC service.
4337 * @arg client should have a live connection to the fileserver.
4338 * @arg Program Type must not be fileserver or salvager.
4340 * @post connection to fileserver SYNC service destroyed
4342 * @see VDisconnectFS_r
4344 * @see VChildProcReconnectFS
4355 * connect to the fileserver SYNC service from a child process following a fork.
4357 * @return operation status
4362 * @arg VOL_LOCK is held.
4363 * @arg current FSYNC handle is shared with a parent process
4365 * @post current FSYNC handle is discarded and a new connection to the
4366 * fileserver SYNC service is established
4368 * @see VChildProcReconnectFS
4370 * @see VDisconnectFS_r
4372 * @internal volume package internal use only.
/* after a fork, discard the FSYNC handle inherited from the parent and
 * open a fresh connection to the fileserver SYNC service (VOL_LOCK held) */
4375 VChildProcReconnectFS_r(void)
4377 return FSYNC_clientChildProcReconnect();
4381 * connect to the fileserver SYNC service from a child process following a fork.
4383 * @return operation status
4387 * @pre current FSYNC handle is shared with a parent process
4389 * @post current FSYNC handle is discarded and a new connection to the
4390 * fileserver SYNC service is established
4392 * @see VChildProcReconnectFS_r
4394 * @see VDisconnectFS
/* non-glock wrapper around VChildProcReconnectFS_r (lock handling elided) */
4397 VChildProcReconnectFS(void)
4401 ret = VChildProcReconnectFS_r();
4405 #endif /* FSSYNC_BUILD_CLIENT */
4408 /***************************************************/
4409 /* volume bitmap routines */
4410 /***************************************************/
4413 * For demand attach fs, flags parameter controls
4414 * locking behavior. If (flags & VOL_ALLOC_BITMAP_WAIT)
4415 * is set, then this function will create a reservation
4416 * and block on any other exclusive operations. Otherwise,
4417 * this function assumes the caller already has exclusive
4418 * access to vp, and we just change the volume state.
/* Allocate a free vnode-bitmap slot for the given vnode index, growing the
 * bitmap when it is full. Returns the allocated bit number (a VnodeId).
 * DAFS uses exclusive volume states for mutual exclusion; non-DAFS uses
 * vp->specialStatus = VBUSY.
 * NOTE(review): this numbered listing omits many intervening lines --
 * comments describe only the visible statements. */
4421 VAllocBitmapEntry_r(Error * ec, Volume * vp,
4422 struct vnodeIndex *index, int flags)
4425 register byte *bp, *ep;
4426 #ifdef AFS_DEMAND_ATTACH_FS
4427 VolState state_save;
4428 #endif /* AFS_DEMAND_ATTACH_FS */
4432 /* This test is probably redundant */
/* read-only volumes never hand out new vnode slots */
4433 if (!VolumeWriteable(vp)) {
4434 *ec = (bit32) VREADONLY;
4438 #ifdef AFS_DEMAND_ATTACH_FS
/* per the header comment: with VOL_ALLOC_BITMAP_WAIT we take a
 * reservation and wait out other exclusive operations before
 * entering the exclusive GET_BITMAP state */
4439 if (flags & VOL_ALLOC_BITMAP_WAIT) {
4440 VCreateReservation_r(vp);
4441 VWaitExclusiveState_r(vp);
4443 state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
4444 #endif /* AFS_DEMAND_ATTACH_FS */
/* lazily build the bitmaps on first use in the fileserver */
4447 if ((programType == fileServer) && !index->bitmap) {
4449 #ifndef AFS_DEMAND_ATTACH_FS
4450 /* demand attach fs uses the volume state to avoid races.
4451 * specialStatus field is not used at all */
4453 if (vp->specialStatus == VBUSY) {
4454 if (vp->goingOffline) { /* vos dump waiting for the volume to
4455 * go offline. We probably come here
4456 * from AddNewReadableResidency */
/* spin/sleep until the competing thread finishes with the volume */
4459 while (vp->specialStatus == VBUSY) {
4460 #ifdef AFS_PTHREAD_ENV
4464 #else /* !AFS_PTHREAD_ENV */
4466 #endif /* !AFS_PTHREAD_ENV */
4470 #endif /* !AFS_DEMAND_ATTACH_FS */
4472 if (!index->bitmap) {
4473 #ifndef AFS_DEMAND_ATTACH_FS
4474 vp->specialStatus = VBUSY; /* Stop anyone else from using it. */
4475 #endif /* AFS_DEMAND_ATTACH_FS */
/* load bitmaps for every vnode class, not just the requested index */
4476 for (i = 0; i < nVNODECLASSES; i++) {
4477 VGetBitmap_r(ec, vp, i);
/* error path (elided in listing): on failure DAFS requests a salvage,
 * non-DAFS unhashes the volume and lets the last user free it */
4479 #ifdef AFS_DEMAND_ATTACH_FS
4480 VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
4481 #else /* AFS_DEMAND_ATTACH_FS */
4482 DeleteVolumeFromHashTable(vp);
4483 vp->shuttingDown = 1; /* Let who has it free it. */
4484 vp->specialStatus = 0;
4485 #endif /* AFS_DEMAND_ATTACH_FS */
4490 #ifndef AFS_DEMAND_ATTACH_FS
4492 vp->specialStatus = 0; /* Allow others to have access. */
4493 #endif /* AFS_DEMAND_ATTACH_FS */
4496 #endif /* BITMAP_LATER */
4498 #ifdef AFS_DEMAND_ATTACH_FS
4500 #endif /* AFS_DEMAND_ATTACH_FS */
/* scan the bitmap a bit32 word at a time, starting from the cached
 * offset of the lowest possibly-free word */
4501 bp = index->bitmap + index->bitmapOffset;
4502 ep = index->bitmap + index->bitmapSize;
4504 if ((*(bit32 *) bp) != (bit32) 0xffffffff) {
4506 index->bitmapOffset = (afs_uint32) (bp - index->bitmap);
4509 o = ffs(~*bp) - 1; /* ffs is documented in BSTRING(3) */
/* bit number = byte offset * 8 + bit-within-byte */
4511 ret = (VnodeId) ((bp - index->bitmap) * 8 + o);
4512 #ifdef AFS_DEMAND_ATTACH_FS
4514 #endif /* AFS_DEMAND_ATTACH_FS */
4517 bp += sizeof(bit32) /* i.e. 4 */ ;
4519 /* No bit map entry--must grow bitmap */
/* NOTE(review): listing elides the realloc result check; the grown
 * region is zeroed and the first new bit is handed out */
4521 realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE);
4524 bp += index->bitmapSize;
4525 memset(bp, 0, VOLUME_BITMAP_GROWSIZE);
4526 index->bitmapOffset = index->bitmapSize;
4527 index->bitmapSize += VOLUME_BITMAP_GROWSIZE;
4529 ret = index->bitmapOffset * 8;
4530 #ifdef AFS_DEMAND_ATTACH_FS
4532 #endif /* AFS_DEMAND_ATTACH_FS */
4535 #ifdef AFS_DEMAND_ATTACH_FS
/* restore pre-call state and drop the reservation taken above */
4536 VChangeState_r(vp, state_save);
4537 if (flags & VOL_ALLOC_BITMAP_WAIT) {
4538 VCancelReservation_r(vp);
4540 #endif /* AFS_DEMAND_ATTACH_FS */
/* non-glock wrapper: allocates a bitmap entry with the blocking
 * (VOL_ALLOC_BITMAP_WAIT) semantics of VAllocBitmapEntry_r */
4545 VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
4549 retVal = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT);
/* Mark a vnode-bitmap bit as free again and pull bitmapOffset back so the
 * allocator rescans from the freed word. Out-of-range bit numbers are
 * ignored (bounds check below; error handling elided in listing). */
4555 VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
4558 unsigned int offset;
4564 #endif /* BITMAP_LATER */
/* byte containing the bit */
4565 offset = bitNumber >> 3;
4566 if (offset >= index->bitmapSize) {
/* keep the allocator's scan hint at or before the freed slot */
4570 if (offset < index->bitmapOffset)
4571 index->bitmapOffset = offset & ~3; /* Truncate to nearest bit32 */
4572 *(index->bitmap + offset) &= ~(1 << (bitNumber & 0x7));
/* non-glock wrapper around VFreeBitMapEntry_r (lock handling elided in listing) */
4576 VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
4580 VFreeBitMapEntry_r(ec, index, bitNumber);
4584 /* this function will drop the glock internally.
4585 * for old pthread fileservers, this is safe thanks to vbusy.
4587 * for demand attach fs, caller must have already called
4588 * VCreateReservation_r and VWaitExclusiveState_r */
/* Build the in-memory allocation bitmap for one vnode class by scanning the
 * on-disk vnode index file: every non-vNull vnode gets its bit set, and the
 * volume's next uniquifier is advanced past the largest one seen.
 * Per the comment above: drops the glock internally, so DAFS callers must
 * hold a reservation and have waited out exclusive states.
 * NOTE(review): this numbered listing omits many intervening lines. */
4590 VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
4592 StreamHandle_t *file;
4595 struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
4596 struct vnodeIndex *vip = &vp->vnodeIndex[class];
4597 struct VnodeDiskObject *vnode;
/* largest vnode uniquifier observed during the scan, plus one */
4598 unsigned int unique = 0;
4602 #endif /* BITMAP_LATER */
4603 #ifdef AFS_DEMAND_ATTACH_FS
4604 VolState state_save;
4605 #endif /* AFS_DEMAND_ATTACH_FS */
4609 #ifdef AFS_DEMAND_ATTACH_FS
/* enter an exclusive state while the glock is dropped for disk I/O */
4610 state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
4611 #endif /* AFS_DEMAND_ATTACH_FS */
4614 fdP = IH_OPEN(vip->handle);
4615 assert(fdP != NULL);
4616 file = FDH_FDOPEN(fdP, "r");
4617 assert(file != NULL);
4618 vnode = (VnodeDiskObject *) malloc(vcp->diskSize);
4619 assert(vnode != NULL);
4620 size = OS_SIZE(fdP->fd_fd);
/* number of vnode slots = (file size minus header) / slot size
 * (divisor elided in listing) */
4622 nVnodes = (size <= vcp->diskSize ? 0 : size - vcp->diskSize)
4624 vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4; /* The 10 is a little extra so
4625 * a few files can be created in this volume,
4626 * the whole thing is rounded up to nearest 4
4627 * bytes, because the bit map allocator likes
4630 BitMap = (byte *) calloc(1, vip->bitmapSize);
4631 assert(BitMap != NULL);
4632 #else /* BITMAP_LATER */
4633 vip->bitmap = (byte *) calloc(1, vip->bitmapSize);
4634 assert(vip->bitmap != NULL);
4635 vip->bitmapOffset = 0;
4636 #endif /* BITMAP_LATER */
/* skip the index header, then read vnodes one slot at a time */
4637 if (STREAM_SEEK(file, vcp->diskSize, 0) != -1) {
4639 for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) {
4640 if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1)
4642 if (vnode->type != vNull) {
4643 if (vnode->vnodeMagic != vcp->magic) {
4644 Log("GetBitmap: addled vnode index in volume %s; volume needs salvage\n", V_name(vp));
/* mark the slot as in use */
4649 *(BitMap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
4650 #else /* BITMAP_LATER */
4651 *(vip->bitmap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
4652 #endif /* BITMAP_LATER */
4653 if (unique <= vnode->uniquifier)
4654 unique = vnode->uniquifier + 1;
4656 #ifndef AFS_PTHREAD_ENV
/* LWP builds yield periodically so the scan doesn't starve other threads */
4657 if ((bitNumber & 0x00ff) == 0x0ff) { /* every 256 iterations */
4660 #endif /* !AFS_PTHREAD_ENV */
/* an on-disk uniquifier beyond the header's counter means corruption */
4663 if (vp->nextVnodeUnique < unique) {
4664 Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
4667 /* Paranoia, partly justified--I think fclose after fdopen
4668 * doesn't seem to close fd. In any event, the documentation
4669 * doesn't specify, so it's safer to close it twice.
4677 /* There may have been a racing condition with some other thread, both
4678 * creating the bitmaps for this volume. If the other thread was faster
4679 * the pointer to bitmap should already be filled and we can free ours.
4681 if (vip->bitmap == NULL) {
4682 vip->bitmap = BitMap;
4683 vip->bitmapOffset = 0;
4685 free((byte *) BitMap);
4686 #endif /* BITMAP_LATER */
4687 #ifdef AFS_DEMAND_ATTACH_FS
/* leave the exclusive state entered on the way in */
4688 VChangeState_r(vp, state_save);
4689 #endif /* AFS_DEMAND_ATTACH_FS */
4693 /***************************************************/
4694 /* Volume Path and Volume Number utility routines */
4695 /***************************************************/
4698 * find the first occurrence of a volume header file and return the path.
4700 * @param[out] ec outbound error code
4701 * @param[in] volumeId volume id to find
4702 * @param[out] partitionp pointer to disk partition path string
4703 * @param[out] namep pointer to volume header file name string
4705 * @post path to first occurrence of volume header is returned in partitionp
4706 * and namep, or ec is set accordingly.
4708 * @warning this function is NOT re-entrant -- partitionp and namep point to
4709 * static data segments
4711 * @note if a volume utility inadvertently leaves behind a stale volume header
4712 * on a vice partition, it is possible for callers to get the wrong one,
4713 * depending on the order of the disk partition linked list.
/* Walk the disk partition list looking for the first partition containing a
 * header file for volumeId; return the partition name and header filename
 * through the out-parameters. Both point at static buffers (not re-entrant,
 * per the warning above). On failure both are set to NULL (ec handling
 * elided in this listing). */
4717 VGetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
4719 static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
4720 char path[VMAXPATHLEN];
4722 struct DiskPartition64 *dp;
/* name[0] is presumably '/' (set in an elided line); format the
 * VFORMAT header filename after it -- TODO confirm against full source */
4726 (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, volumeId);
4727 for (dp = DiskPartitionList; dp; dp = dp->next) {
4728 struct afs_stat status;
4729 strcpy(path, VPartitionPath(dp));
/* stat() success means the header file exists on this partition */
4731 if (afs_stat(path, &status) == 0) {
4732 strcpy(partition, dp->name);
4739 *partitionp = *namep = NULL;
4741 *partitionp = partition;
4747 * extract a volume number from a volume header filename string.
4749 * @param[in] name volume header filename string
4751 * @return volume number
4753 * @note the string must be of the form VFORMAT. the only permissible
4754 * deviation is a leading '/' character.
/* extract the volume id from a VFORMAT header filename; skips the single
 * leading character (the permissible leading '/', per the note above) */
4759 VolumeNumber(char *name)
4763 return atoi(name + 1);
4767 * compute the volume header filename.
4769 * @param[in] volumeId
4771 * @return volume header filename
4773 * @post volume header filename string is constructed
4775 * @warning this function is NOT re-entrant -- the returned string is
4776 * stored in a static char array. see VolumeExternalName_r
4777 * for a re-entrant equivalent.
4779 * @see VolumeExternalName_r
4781 * @deprecated due to the above re-entrancy warning, this interface should
4782 * be considered deprecated. Please use VolumeExternalName_r
/* format the volume header filename into a static buffer -- NOT re-entrant;
 * prefer VolumeExternalName_r (see deprecation note above) */
4786 VolumeExternalName(VolumeId volumeId)
4788 static char name[VMAXPATHLEN];
4789 (void)afs_snprintf(name, sizeof name, VFORMAT, volumeId);
4794 * compute the volume header filename.
4796 * @param[in] volumeId
4797 * @param[inout] name array in which to store filename
4798 * @param[in] len length of name array
4800 * @return result code from afs_snprintf
4802 * @see VolumeExternalName
4805 * @note re-entrant equivalent of VolumeExternalName
4807 * @internal volume package internal use only.
/* re-entrant variant: format the header filename into the caller's buffer */
4810 VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
4812 return afs_snprintf(name, len, VFORMAT, volumeId);
4816 /***************************************************/
4817 /* Volume Usage Statistics routines */
4818 /***************************************************/
4820 #if OPENAFS_VOL_STATS
4821 #define OneDay (86400) /* 24 hours' worth of seconds */
4823 #define OneDay (24*60*60) /* 24 hours */
4824 #endif /* OPENAFS_VOL_STATS */
4826 #define Midnight(date) ((date-TimeZoneCorrection)/OneDay*OneDay+TimeZoneCorrection)
4828 /*------------------------------------------------------------------------
4829 * [export] VAdjustVolumeStatistics
4832 * If we've passed midnight, we need to update all the day use
4833 * statistics as well as zeroing the detailed volume statistics
4834 * (if we are implementing them).
4837 * vp : Pointer to the volume structure describing the lucky
4838 * volume being considered for update.
4844 * Nothing interesting.
4848 *------------------------------------------------------------------------*/
/* Roll the per-day usage statistics forward if more than a day has passed
 * since the last update (see the descriptive comment above).
 * NOTE(review): listing omits some intervening lines. */
4851 VAdjustVolumeStatistics_r(register Volume * vp)
4853 unsigned int now = FT_ApproxTime();
4855 if (now - V_dayUseDate(vp) > OneDay) {
4856 register int ndays, i;
/* shift the 7-slot weekUse history by the number of elapsed days,
 * zero-filling days with no recorded activity */
4858 ndays = (now - V_dayUseDate(vp)) / OneDay;
4859 for (i = 6; i > ndays - 1; i--)
4860 V_weekUse(vp)[i] = V_weekUse(vp)[i - ndays];
4861 for (i = 0; i < ndays - 1 && i < 7; i++)
4862 V_weekUse(vp)[i] = 0;
4864 V_weekUse(vp)[ndays - 1] = V_dayUse(vp);
/* restart today's counter from the most recent midnight */
4866 V_dayUseDate(vp) = Midnight(now);
4868 #if OPENAFS_VOL_STATS
4870 * All we need to do is bzero the entire VOL_STATS_BYTES of
4871 * the detailed volume statistics area.
4873 memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
4874 #endif /* OPENAFS_VOL_STATS */
4877 /*It's been more than a day of collection */
4879 * Always return happily.
4882 } /*VAdjustVolumeStatistics */
/* non-glock wrapper around VAdjustVolumeStatistics_r (lock handling elided) */
4885 VAdjustVolumeStatistics(register Volume * vp)
4889 retVal = VAdjustVolumeStatistics_r(vp);
/* Record one access against the volume's dayUse counter, rolling the daily
 * statistics first if a day boundary has passed. */
4895 VBumpVolumeUsage_r(register Volume * vp)
4897 unsigned int now = FT_ApproxTime();
4898 if (now - V_dayUseDate(vp) > OneDay)
4899 VAdjustVolumeStatistics_r(vp);
4901 * Save the volume header image to disk after every 128 bumps to dayUse.
/* (post-increment: the write happens on bump 0, 128, 256, ...) */
4903 if ((V_dayUse(vp)++ & 127) == 0) {
4905 VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT);
/* non-glock wrapper around VBumpVolumeUsage_r (lock handling elided in listing) */
4910 VBumpVolumeUsage(register Volume * vp)
4913 VBumpVolumeUsage_r(vp);
/* Periodic disk-usage refresh. On non-DAFS builds, every third call
 * (i.e. presumably every 15 minutes given a 5-minute caller -- TODO
 * confirm cadence against the caller) triggers extra work elided from
 * this listing. */
4918 VSetDiskUsage_r(void)
4920 #ifndef AFS_DEMAND_ATTACH_FS
4921 static int FifteenMinuteCounter = 0;
4925 /* NOTE: Don't attempt to access the partitions list until the
4926 * initialization level indicates that all volumes are attached,
4927 * which implies that all partitions are initialized. */
4928 #ifdef AFS_PTHREAD_ENV
4930 #else /* AFS_PTHREAD_ENV */
4932 #endif /* AFS_PTHREAD_ENV */
4935 VResetDiskUsage_r();
4937 #ifndef AFS_DEMAND_ATTACH_FS
4938 if (++FifteenMinuteCounter == 3) {
4939 FifteenMinuteCounter = 0;
4942 #endif /* !AFS_DEMAND_ATTACH_FS */
4954 /***************************************************/
4955 /* Volume Update List routines */
4956 /***************************************************/
4958 /* The number of minutes that a volume hasn't been updated before the
4959 * "Dont salvage" flag in the volume header will be turned on */
4960 #define SALVAGE_INTERVAL (10*60)
4965 * volume update list functionality has been moved into the VLRU
4966 * the DONT_SALVAGE flag is now set during VLRU demotion
4969 #ifndef AFS_DEMAND_ATTACH_FS
4970 static VolumeId *UpdateList = NULL; /* Pointer to array of Volume ID's */
4971 static int nUpdatedVolumes = 0; /* Updated with entry in UpdateList, salvage after crash flag on */
4972 static int updateSize = 0; /* number of entries possible */
4973 #define UPDATE_LIST_SIZE 128 /* initial size increment (must be a power of 2!) */
4974 #endif /* !AFS_DEMAND_ATTACH_FS */
/* Note that vp was just updated: clear DONT_SALVAGE in the header and, on
 * non-DAFS builds, append the volume id to the UpdateList so the scanner
 * can re-set DONT_SALVAGE once the volume has been quiet for a while.
 * NOTE(review): listing omits intervening lines (early return, realloc
 * doubling logic, etc.). */
4977 VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
4980 vp->updateTime = FT_ApproxTime();
4981 if (V_dontSalvage(vp) == 0)
4983 V_dontSalvage(vp) = 0;
4984 VSyncVolume_r(ec, vp, 0);
4985 #ifdef AFS_DEMAND_ATTACH_FS
4986 V_attachFlags(vp) &= ~(VOL_HDR_DONTSALV);
4987 #else /* !AFS_DEMAND_ATTACH_FS */
/* lazily allocate, then grow, the update list */
4990 if (UpdateList == NULL) {
4991 updateSize = UPDATE_LIST_SIZE;
4992 UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize);
4994 if (nUpdatedVolumes == updateSize) {
/* a list this large suggests entries are never being drained */
4996 if (updateSize > 524288) {
4997 Log("warning: there is likely a bug in the volume update scanner\n");
5001 (VolumeId *) realloc(UpdateList,
5002 sizeof(VolumeId) * updateSize);
5005 assert(UpdateList != NULL);
5006 UpdateList[nUpdatedVolumes++] = V_id(vp);
5007 #endif /* !AFS_DEMAND_ATTACH_FS */
5010 #ifndef AFS_DEMAND_ATTACH_FS
/* Sweep the UpdateList (non-DAFS only): volumes idle longer than
 * SALVAGE_INTERVAL get DONT_SALVAGE re-set and are compacted out of the
 * list via the 'gap' counter.
 * NOTE(review): listing omits intervening lines of the loop body. */
5012 VScanUpdateList(void)
5014 register int i, gap;
5015 register Volume *vp;
5017 afs_uint32 now = FT_ApproxTime();
5018 /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */
5019 for (i = gap = 0; i < nUpdatedVolumes; i++) {
/* slide surviving entries left over the 'gap' removed so far */
5021 UpdateList[i - gap] = UpdateList[i];
5023 /* XXX this routine needlessly messes up the Volume LRU by
5024 * breaking the LRU temporal-locality assumptions.....
5025 * we should use a special volume header allocator here */
5026 vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]);
/* only the scanner's own reference (nUsers == 1) and a quiet interval
 * qualify the volume for the DONT_SALVAGE fast-restart flag */
5029 } else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) {
5030 V_dontSalvage(vp) = DONT_SALVAGE;
5031 VUpdateVolume_r(&error, vp, 0); /* No need to fsync--not critical */
5039 #ifndef AFS_PTHREAD_ENV
5041 #endif /* !AFS_PTHREAD_ENV */
5043 nUpdatedVolumes -= gap;
5045 #endif /* !AFS_DEMAND_ATTACH_FS */
5048 /***************************************************/
5049 /* Volume LRU routines */
5050 /***************************************************/
5055 * with demand attach fs, we attempt to soft detach(1)
5056 * volumes which have not been accessed in a long time
5057 * in order to speed up fileserver shutdown
5059 * (1) by soft detach we mean a process very similar
5060 * to VOffline, except the final state of the
5061 * Volume will be VOL_STATE_PREATTACHED, instead
5062 * of the usual VOL_STATE_UNATTACHED
5064 #ifdef AFS_DEMAND_ATTACH_FS
5066 /* implementation is reminiscent of a generational GC
5068 * queue 0 is newly attached volumes. this queue is
5069 * sorted by attach timestamp
5071 * queue 1 is volumes that have been around a bit
5072 * longer than queue 0. this queue is sorted by
5075 * queue 2 is volumes that have been around the longest.
5076 * this queue is unsorted
5078 * queue 3 is volumes that have been marked as
5079 * candidates for soft detachment. this queue is
5082 #define VLRU_GENERATIONS 3 /**< number of generations in VLRU */
5083 #define VLRU_QUEUES 5 /**< total number of VLRU queues */
5086 * definition of a VLRU queue.
5089 volatile struct rx_queue q;
5096 * main VLRU data structure.
/* (member lines for VLRU_q's len/busy/cv fields are elided in this listing) */
5099 struct VLRU_q q[VLRU_QUEUES]; /**< VLRU queues */
5102 /** time interval (in seconds) between promotion passes for
5103 * each young generation queue. */
5104 afs_uint32 promotion_interval[VLRU_GENERATIONS-1];
5106 /** time interval (in seconds) between soft detach candidate
5107 * scans for each generation queue.
5109 * scan_interval[VLRU_QUEUE_CANDIDATE] defines how frequently
5110 * we perform a soft detach pass. */
5111 afs_uint32 scan_interval[VLRU_GENERATIONS+1];
5113 /* scheduler state */
5114 int next_idx; /**< next queue to receive attention */
5115 afs_uint32 last_promotion[VLRU_GENERATIONS-1]; /**< timestamp of last promotion scan */
5116 afs_uint32 last_scan[VLRU_GENERATIONS+1]; /**< timestamp of last detach scan */
5118 int scanner_state; /**< state of scanner thread */
5119 pthread_cond_t cv; /**< state transition CV */
5122 /** global VLRU state */
/* single instance shared by all VLRU routines below; protected by VOL_LOCK
 * plus the per-queue busy/cv protocol -- TODO confirm against full source */
5123 static struct VLRU volume_LRU;
5126 * defined states for VLRU scanner thread.
5129 VLRU_SCANNER_STATE_OFFLINE = 0, /**< vlru scanner thread is offline */
5130 VLRU_SCANNER_STATE_ONLINE = 1, /**< vlru scanner thread is online */
5131 VLRU_SCANNER_STATE_SHUTTING_DOWN = 2, /**< vlru scanner thread is shutting down */
5132 VLRU_SCANNER_STATE_PAUSING = 3, /**< vlru scanner thread is getting ready to pause */
5133 VLRU_SCANNER_STATE_PAUSED = 4 /**< vlru scanner thread is paused */
5134 } vlru_thread_state_t;
5136 /* vlru disk data header stuff */
5137 #define VLRU_DISK_MAGIC 0x7a8b9cad /**< vlru disk entry magic number */
5138 #define VLRU_DISK_VERSION 1 /**< vlru disk entry version number */
5140 /** vlru default expiration time (for eventual fs state serialization of vlru data) */
5141 #define VLRU_DUMP_EXPIRATION_TIME (60*60*24*7) /* expire vlru data after 1 week */
5144 /** minimum volume inactivity (in seconds) before a volume becomes eligible for
5145 * soft detachment. */
/* user-tunable VLRU parameters (set via VLRU_SetOptions before init) */
5146 static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH;
5148 /** time interval (in seconds) between VLRU scanner thread soft detach passes. */
5149 static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL;
5151 /** maximum number of volumes to soft detach in a VLRU soft detach pass. */
5152 static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX;
5154 /** VLRU control flag. non-zero value implies VLRU subsystem is activated. */
5155 static afs_uint32 VLRU_enabled = 1;
5157 /* queue synchronization routines */
5158 static void VLRU_BeginExclusive_r(struct VLRU_q * q);
5159 static void VLRU_EndExclusive_r(struct VLRU_q * q);
5160 static void VLRU_Wait_r(struct VLRU_q * q);
5163 * set VLRU subsystem tunable parameters.
5165 * @param[in] option tunable option to modify
5166 * @param[in] val new value for tunable parameter
5168 * @pre @c VInitVolumePackage has not yet been called.
5170 * @post tunable parameter is modified
5174 * @note valid option parameters are:
5175 * @arg @c VLRU_SET_THRESH
5176 * set the period of inactivity after which
5177 * volumes are eligible for soft detachment
5178 * @arg @c VLRU_SET_INTERVAL
5179 * set the time interval between calls
5180 * to the volume LRU "garbage collector"
5181 * @arg @c VLRU_SET_MAX
5182 * set the max number of volumes to deallocate
/* Set one VLRU tunable (threshold, interval, max, or enabled flag) and
 * recompute the derived timing constants. Must run before
 * VInitVolumePackage per the precondition above. */
5186 VLRU_SetOptions(int option, afs_uint32 val)
5188 if (option == VLRU_SET_THRESH) {
5189 VLRU_offline_thresh = val;
5190 } else if (option == VLRU_SET_INTERVAL) {
5191 VLRU_offline_interval = val;
5192 } else if (option == VLRU_SET_MAX) {
5193 VLRU_offline_max = val;
5194 } else if (option == VLRU_SET_ENABLED) {
/* derived intervals depend on thresh/interval, so refresh them here
 * (the VLRU_SET_ENABLED assignment line is elided in this listing) */
5197 VLRU_ComputeConstants();
5201 * compute VLRU internal timing parameters.
5203 * @post VLRU scanner thread internal timing parameters are computed
5205 * @note computes internal timing parameters based upon user-modifiable
5206 * tunable parameters.
5210 * @internal volume package internal use only.
/* Derive the scanner's internal scan/promotion intervals from the
 * user-tunable threshold and interval values. */
5213 VLRU_ComputeConstants(void)
/* factor presumably selects between the two gen-0 formulas below --
 * the branch line is elided in this listing; TODO confirm */
5215 afs_uint32 factor = VLRU_offline_thresh / VLRU_offline_interval;
5217 /* compute the candidate scan interval */
5218 volume_LRU.scan_interval[VLRU_QUEUE_CANDIDATE] = VLRU_offline_interval;
5220 /* compute the promotion intervals */
5221 volume_LRU.promotion_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh * 2;
5222 volume_LRU.promotion_interval[VLRU_QUEUE_MID] = VLRU_offline_thresh * 4;
5225 /* compute the gen 0 scan interval */
5226 volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh / 8;
5228 /* compute the gen 0 scan interval (alternate formula) */
5229 volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_interval * 2;
5234 * initialize VLRU subsystem.
5236 * @pre this function has not yet been called
5238 * @post VLRU subsystem is initialized and VLRU scanner thread is starting
5242 * @internal volume package internal use only.
/* VLRU subsystem initialization (function header elided in this listing;
 * presumably VInitVLRU -- TODO confirm): sets up the queues, computes
 * timing constants, logs the configuration, and on the fileserver starts
 * the detached scanner thread. */
5248 pthread_attr_t attrs;
/* honor the VLRU_SET_ENABLED tunable: do nothing when disabled */
5251 if (!VLRU_enabled) {
5252 Log("VLRU: disabled\n");
5256 /* initialize each of the VLRU queues */
5257 for (i = 0; i < VLRU_QUEUES; i++) {
5258 queue_Init(&volume_LRU.q[i]);
5259 volume_LRU.q[i].len = 0;
5260 volume_LRU.q[i].busy = 0;
5261 assert(pthread_cond_init(&volume_LRU.q[i].cv, NULL) == 0);
5264 /* setup the timing constants */
5265 VLRU_ComputeConstants();
5267 /* XXX put inside LogLevel check? */
5268 Log("VLRU: starting scanner with the following configuration parameters:\n");
5269 Log("VLRU: offlining volumes after minimum of %d seconds of inactivity\n", VLRU_offline_thresh);
5270 Log("VLRU: running VLRU soft detach pass every %d seconds\n", VLRU_offline_interval);
5271 Log("VLRU: taking up to %d volumes offline per pass\n", VLRU_offline_max);
5272 Log("VLRU: scanning generation 0 for inactive volumes every %d seconds\n", volume_LRU.scan_interval[0]);
5273 Log("VLRU: scanning for promotion/demotion between generations 0 and 1 every %d seconds\n", volume_LRU.promotion_interval[0]);
5274 Log("VLRU: scanning for promotion/demotion between generations 1 and 2 every %d seconds\n", volume_LRU.promotion_interval[1]);
5276 /* start up the VLRU scanner */
5277 volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
/* only the fileserver runs the scanner; it is created detached so no
 * join is ever needed */
5278 if (programType == fileServer) {
5279 assert(pthread_cond_init(&volume_LRU.cv, NULL) == 0);
5280 assert(pthread_attr_init(&attrs) == 0);
5281 assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
5282 assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0);
5287 * initialize the VLRU-related fields of a newly allocated volume object.
5289 * @param[in] vp pointer to volume object
5292 * @arg @c VOL_LOCK is held.
5293 * @arg volume object is not on a VLRU queue.
5295 * @post VLRU fields are initialized to indicate that volume object is not
5296 * currently registered with the VLRU subsystem
5300 * @internal volume package internal use only.
/* initialize the VLRU fields of a fresh volume object: not on any queue,
 * index marked invalid so VLRU_Add_r will place it on the NEW queue */
5303 VLRU_Init_Node_r(volatile Volume * vp)
5308 assert(queue_IsNotOnQueue(&vp->vlru));
5309 vp->vlru.idx = VLRU_QUEUE_INVALID;
5313 * add a volume object to a VLRU queue.
5315 * @param[in] vp pointer to volume object
5318 * @arg @c VOL_LOCK is held.
5319 * @arg caller MUST hold a lightweight ref on @p vp.
5320 * @arg caller MUST NOT hold exclusive ownership of the VLRU queue.
5322 * @post the volume object is added to the appropriate VLRU queue
5324 * @note if @c vp->vlru.idx contains the index of a valid VLRU queue,
5325 * then the volume is added to that queue. Otherwise, the value
5326 * @c VLRU_QUEUE_NEW is stored into @c vp->vlru.idx and the
5327 * volume is added to the NEW generation queue.
5329 * @note @c VOL_LOCK may be dropped internally
5331 * @note Volume state is temporarily set to @c VOL_STATE_VLRU_ADD
5332 * during the add operation, and is restored to the previous
5333 * state prior to return.
5337 * @internal volume package internal use only.
/* Add vp to the VLRU queue recorded in vp->vlru.idx, defaulting to the NEW
 * generation when the index is invalid. Temporarily enters
 * VOL_STATE_VLRU_ADD (VOL_LOCK may drop inside VLRU_Wait_r; see the doc
 * comment above). NOTE(review): listing omits some intervening lines. */
5340 VLRU_Add_r(volatile Volume * vp)
5343 VolState state_save;
/* already queued -- nothing to do */
5348 if (queue_IsOnQueue(&vp->vlru))
5351 state_save = VChangeState_r(vp, VOL_STATE_VLRU_ADD);
5354 if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) {
5355 idx = VLRU_QUEUE_NEW;
/* wait until no other thread holds the target queue exclusively */
5358 VLRU_Wait_r(&volume_LRU.q[idx]);
5360 /* repeat check since VLRU_Wait_r may have dropped
5362 if (queue_IsNotOnQueue(&vp->vlru)) {
5364 queue_Prepend(&volume_LRU.q[idx], &vp->vlru);
5365 volume_LRU.q[idx].len++;
5366 V_attachFlags(vp) |= VOL_ON_VLRU;
5367 vp->stats.last_promote = FT_ApproxTime();
/* restore whatever state the volume was in before the add */
5370 VChangeState_r(vp, state_save);
5374 * delete a volume object from a VLRU queue.
5376 * @param[in] vp pointer to volume object
5379 * @arg @c VOL_LOCK is held.
5380 * @arg caller MUST hold a lightweight ref on @p vp.
5381 * @arg caller MUST NOT hold exclusive ownership of the VLRU queue.
5383 * @post volume object is removed from the VLRU queue
5385 * @note @c VOL_LOCK may be dropped internally
5389 * @todo We should probably set volume state to something exlcusive
5390 * (as @c VLRU_Add_r does) prior to dropping @c VOL_LOCK.
5392 * @internal volume package internal use only.
/* Remove vp from its VLRU queue. Loops because VLRU_Wait_r can drop
 * VOL_LOCK, during which the volume may migrate to another queue; retry
 * until the index is stable. NOTE(review): listing omits some lines. */
5395 VLRU_Delete_r(volatile Volume * vp)
5402 if (queue_IsNotOnQueue(&vp->vlru))
5408 if (idx == VLRU_QUEUE_INVALID)
5410 VLRU_Wait_r(&volume_LRU.q[idx]);
5411 } while (idx != vp->vlru.idx);
5413 /* now remove from the VLRU and update
5414 * the appropriate counter */
5415 queue_Remove(&vp->vlru);
5416 volume_LRU.q[idx].len--;
5417 vp->vlru.idx = VLRU_QUEUE_INVALID;
5418 V_attachFlags(vp) &= ~(VOL_ON_VLRU);
5422 * tell the VLRU subsystem that a volume was just accessed.
5424 * @param[in] vp pointer to volume object
5427 * @arg @c VOL_LOCK is held
5428 * @arg caller MUST hold a lightweight ref on @p vp
5429 * @arg caller MUST NOT hold exclusive ownership of any VLRU queue
5431 * @post volume VLRU access statistics are updated. If the volume was on
5432 * the VLRU soft detach candidate queue, it is moved to the NEW
5435 * @note @c VOL_LOCK may be dropped internally
5439 * @internal volume package internal use only.
5442 VLRU_UpdateAccess_r(volatile Volume * vp)
5444 afs_uint32 live_interval;
5445 Volume * rvp = NULL;
/* not VLRU-managed -- nothing to update */
5450 if (queue_IsNotOnQueue(&vp->vlru))
5453 assert(V_attachFlags(vp) & VOL_ON_VLRU);
5455 /* update the access timestamp */
5456 vp->stats.last_get = FT_ApproxTime();
5459 * if the volume is on the soft detach candidate
5460 * list, we need to safely move it back to a
5461 * regular generation. this has to be done
5462 * carefully so we don't race against the scanner
5466 /* if this volume is on the soft detach candidate queue,
5467 * then grab exclusive access to the necessary queues */
5468 if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
/* NOTE(review): rvp appears to be assigned vp on a line elided from this
 * extract; confirm against the full source before relying on it */
5470 VCreateReservation_r(rvp);
5472 VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
5473 VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
5474 VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
5475 VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
5478 /* make sure multiple threads don't race to update */
5479 if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
5480 VLRU_SwitchQueues(vp, VLRU_QUEUE_NEW, 1);
5484 VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
5485 VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
5486 VCancelReservation_r(rvp);
5491 * switch a volume between two VLRU queues.
5493 * @param[in] vp pointer to volume object
5494 * @param[in] new_idx index of VLRU queue onto which the volume will be moved
5495 * @param[in] append controls whether the volume will be appended or
5496 * prepended to the queue. A nonzero value means it will
5497 * be appended; zero means it will be prepended.
5499 * @pre The new (and old, if applicable) queue(s) must either be owned
5500 * exclusively by the calling thread for asynchronous manipulation,
5501 * or the queue(s) must be quiescent and VOL_LOCK must be held.
5502 * Please see VLRU_BeginExclusive_r, VLRU_EndExclusive_r and VLRU_Wait_r
5503 * for further details of the queue asynchronous processing mechanism.
5505 * @post If the volume object was already on a VLRU queue, it is
5506 * removed from the queue. Depending on the value of the append
5507 * parameter, the volume object is either appended or prepended
5508 * to the VLRU queue referenced by the new_idx parameter.
5512 * @see VLRU_BeginExclusive_r
5513 * @see VLRU_EndExclusive_r
5516 * @internal volume package internal use only.
5519 VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append)
/* NOTE(review): only volumes already on a queue are removed; the
 * early-exit statement for this guard is elided in this extract */
5521 if (queue_IsNotOnQueue(&vp->vlru))
5524 queue_Remove(&vp->vlru);
5525 volume_LRU.q[vp->vlru.idx].len--;
5527 /* put the volume back on the correct generational queue */
5529 queue_Append(&volume_LRU.q[new_idx], &vp->vlru);
5531 queue_Prepend(&volume_LRU.q[new_idx], &vp->vlru);
/* keep per-queue length counter and the volume's queue index in sync */
5534 volume_LRU.q[new_idx].len++;
5535 vp->vlru.idx = new_idx;
5539 * VLRU background thread.
5541 * The VLRU Scanner Thread is responsible for periodically scanning through
5542 * each VLRU queue looking for volumes which should be moved to another
5543 * queue, or soft detached.
5545 * @param[in] args unused thread arguments parameter
5547 * @return unused thread return value
5548 * @retval NULL always
5550 * @internal volume package internal use only.
5553 VLRU_ScannerThread(void * args)
5555 afs_uint32 now, min_delay, delay;
5556 afs_uint32 next_scan[VLRU_GENERATIONS];
5557 afs_uint32 next_promotion[VLRU_GENERATIONS];
5558 int i, min_idx, min_op, overdue, state;
5560 /* set t=0 for promotion cycle to be
5561 * fileserver startup */
5562 now = FT_ApproxTime();
5563 for (i=0; i < VLRU_GENERATIONS-1; i++) {
5564 volume_LRU.last_promotion[i] = now;
5567 /* don't start the scanner until VLRU_offline_thresh
5568 * plus a small delay for VInitVolumePackage to finish
5571 sleep(VLRU_offline_thresh + 60);
5573 /* set t=0 for scan cycle to be now */
5574 now = FT_ApproxTime();
/* NOTE(review): loop bound is VLRU_GENERATIONS+1 here vs. VLRU_GENERATIONS-1
 * above -- last_scan[] includes the candidate queue; confirm array sizing */
5575 for (i=0; i < VLRU_GENERATIONS+1; i++) {
5576 volume_LRU.last_scan[i] = now;
5580 if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_OFFLINE) {
5581 volume_LRU.scanner_state = VLRU_SCANNER_STATE_ONLINE;
/* main scanner loop: runs until asked to shut down */
5584 while ((state = volume_LRU.scanner_state) != VLRU_SCANNER_STATE_SHUTTING_DOWN) {
5585 /* check to see if we've been asked to pause */
5586 if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) {
5587 volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED;
5588 assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
5590 VOL_CV_WAIT(&volume_LRU.cv);
5591 } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED);
5594 /* scheduling can happen outside the glock */
5597 /* figure out what is next on the schedule */
5599 /* figure out a potential schedule for the new generation first */
5601 min_delay = volume_LRU.scan_interval[0] + volume_LRU.last_scan[0] - now;
/* delays are computed in unsigned arithmetic; a "negative" delay wraps
 * around and shows up as delay > interval, which flags the task overdue */
5604 if (min_delay > volume_LRU.scan_interval[0]) {
5605 /* unsigned overflow -- we're overdue to run this scan */
5610 /* if we're not overdue for gen 0, figure out schedule for candidate gen */
5612 i = VLRU_QUEUE_CANDIDATE;
5613 delay = volume_LRU.scan_interval[i] + volume_LRU.last_scan[i] - now;
5614 if (delay < min_delay) {
5618 if (delay > volume_LRU.scan_interval[i]) {
5619 /* unsigned overflow -- we're overdue to run this scan */
5627 /* if we're still not overdue for something, figure out schedules for promotions */
5628 for (i=0; !overdue && i < VLRU_GENERATIONS-1; i++) {
5629 delay = volume_LRU.promotion_interval[i] + volume_LRU.last_promotion[i] - now;
5630 if (delay < min_delay) {
5635 if (delay > volume_LRU.promotion_interval[i]) {
5636 /* unsigned overflow -- we're overdue to run this promotion */
5645 /* sleep as needed */
5650 /* do whatever is next */
/* a promotion into generation N implies running the matching demotion
 * pass out of generation N+1 */
5653 VLRU_Promote_r(min_idx);
5654 VLRU_Demote_r(min_idx+1);
5656 VLRU_Scan_r(min_idx);
5658 now = FT_ApproxTime();
5661 Log("VLRU scanner asked to go offline (scanner_state=%d)\n", state);
5663 /* signal that scanner is down */
5664 volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
5665 assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
5671 * promote volumes from one VLRU generation to the next.
5673 * This routine scans a VLRU generation looking for volumes which are
5674 * eligible to be promoted to the next generation. All volumes which
5675 * meet the eligibility requirement are promoted.
5677 * Promotion eligibility is based upon meeting both of the following
5680 * @arg The volume has been accessed since the last promotion:
5681 * @c (vp->stats.last_get >= vp->stats.last_promote)
5682 * @arg The last promotion occurred at least
5683 * @c volume_LRU.promotion_interval[idx] seconds ago
5685 * As a performance optimization, promotions are "globbed". In other
5686 * words, we promote arbitrarily large contiguous sublists of elements
5689 * @param[in] idx VLRU queue index to scan
5693 * @internal VLRU internal use only.
5696 VLRU_Promote_r(int idx)
5698 int len, chaining, promote;
5699 afs_uint32 now, thresh;
5700 struct rx_queue *qp, *nqp;
5701 Volume * vp, *start, *end;
5703 /* get exclusive access to two chains, and drop the glock */
5704 VLRU_Wait_r(&volume_LRU.q[idx]);
5705 VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
5706 VLRU_Wait_r(&volume_LRU.q[idx+1]);
5707 VLRU_BeginExclusive_r(&volume_LRU.q[idx+1]);
5710 thresh = volume_LRU.promotion_interval[idx];
5711 now = FT_ApproxTime();
/* walk oldest-to-newest; recover the Volume from its embedded vlru node */
5714 for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
5715 vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
5716 promote = (((vp->stats.last_promote + thresh) <= now) &&
5717 (vp->stats.last_get >= vp->stats.last_promote));
/* NOTE(review): the chain-globbing state machine (start/end/len updates)
 * is largely elided in this extract; the two MoveChainAfter calls below
 * splice an entire eligible sublist into the next generation at once */
5725 /* promote and prepend chain */
5726 queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
5740 /* promote and prepend */
5741 queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
5745 volume_LRU.q[idx].len -= len;
5746 volume_LRU.q[idx+1].len += len;
5749 /* release exclusive access to the two chains */
5751 volume_LRU.last_promotion[idx] = now;
5752 VLRU_EndExclusive_r(&volume_LRU.q[idx+1]);
5753 VLRU_EndExclusive_r(&volume_LRU.q[idx]);
5756 /* run the demotions */
5758 VLRU_Demote_r(int idx)
5761 int len, chaining, demote;
5762 afs_uint32 now, thresh;
5763 struct rx_queue *qp, *nqp;
5764 Volume * vp, *start, *end;
5765 Volume ** salv_flag_vec = NULL;
5766 int salv_vec_offset = 0;
/* demotion only applies to the MID and OLD generations */
5768 assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD);
5770 /* get exclusive access to two chains, and drop the glock */
5771 VLRU_Wait_r(&volume_LRU.q[idx-1]);
5772 VLRU_BeginExclusive_r(&volume_LRU.q[idx-1]);
5773 VLRU_Wait_r(&volume_LRU.q[idx]);
5774 VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
5777 /* no big deal if this allocation fails */
5778 if (volume_LRU.q[idx].len) {
5779 salv_flag_vec = (Volume **) malloc(volume_LRU.q[idx].len * sizeof(Volume *));
5782 now = FT_ApproxTime();
5783 thresh = volume_LRU.promotion_interval[idx-1];
5786 for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
5787 vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
5788 demote = (((vp->stats.last_promote + thresh) <= now) &&
5789 (vp->stats.last_get < (now - thresh)));
5791 /* we now do volume update list DONT_SALVAGE flag setting during
5792 * demotion passes */
/* remember candidates now; the flag is actually written below, after
 * the exclusive queue ownership has been released */
5793 if (salv_flag_vec &&
5794 !(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
5796 (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
5797 (V_attachState(vp) == VOL_STATE_ATTACHED)) {
5798 salv_flag_vec[salv_vec_offset++] = vp;
5799 VCreateReservation_r(vp);
5808 /* demote and append chain */
5809 queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
5823 queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
5827 volume_LRU.q[idx].len -= len;
5828 volume_LRU.q[idx-1].len += len;
5831 /* release exclusive access to the two chains */
5833 VLRU_EndExclusive_r(&volume_LRU.q[idx]);
5834 VLRU_EndExclusive_r(&volume_LRU.q[idx-1]);
5836 /* now go back and set the DONT_SALVAGE flags as appropriate */
5837 if (salv_flag_vec) {
5839 for (i = 0; i < salv_vec_offset; i++) {
5840 vp = salv_flag_vec[i];
/* re-check eligibility: volume state may have changed since the scan */
5841 if (!(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
5842 (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
5843 (V_attachState(vp) == VOL_STATE_ATTACHED)) {
5846 V_attachFlags(vp) |= VOL_HDR_DONTSALV;
5847 V_dontSalvage(vp) = DONT_SALVAGE;
5848 VUpdateVolume_r(&ec, vp, 0);
5852 VCancelReservation_r(vp);
5854 free(salv_flag_vec);
5858 /* run a pass of the VLRU GC scanner
 *
 * idx == VLRU_QUEUE_NEW:       look for volumes to become soft-detach candidates
 * idx == VLRU_QUEUE_CANDIDATE: soft detach eligible candidate volumes */
5860 VLRU_Scan_r(int idx)
5862 afs_uint32 now, thresh;
5863 struct rx_queue *qp, *nqp;
5864 volatile Volume * vp;
5867 assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE);
5869 /* gain exclusive access to the idx VLRU */
5870 VLRU_Wait_r(&volume_LRU.q[idx]);
5871 VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
5873 if (idx != VLRU_QUEUE_CANDIDATE) {
5874 /* gain exclusive access to the candidate VLRU */
5875 VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
5876 VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
5879 now = FT_ApproxTime();
/* volumes not accessed since 'thresh' are eligible for (candidate) detach */
5880 thresh = now - VLRU_offline_thresh;
5882 /* perform candidate selection and soft detaching */
5883 if (idx == VLRU_QUEUE_CANDIDATE) {
5884 /* soft detach some volumes from the candidate pool */
5888 for (i=0,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
5889 vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
/* cap the number of soft detaches performed per scan pass */
5890 if (i >= VLRU_offline_max) {
5893 /* check timestamp to see if it's a candidate for soft detaching */
5894 if (vp->stats.last_get <= thresh) {
5896 if (VCheckSoftDetach(vp, thresh))
5902 /* scan for volumes to become soft detach candidates */
5903 for (i=1,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue),i++) {
5904 vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
5906 /* check timestamp to see if it's a candidate for soft detaching */
5907 if (vp->stats.last_get <= thresh) {
5908 VCheckSoftDetachCandidate(vp, thresh);
5911 if (!(i&0x7f)) { /* lock coarsening optimization */
5919 /* relinquish exclusive access to the VLRU chains */
5923 volume_LRU.last_scan[idx] = now;
5924 if (idx != VLRU_QUEUE_CANDIDATE) {
5925 VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
5927 VLRU_EndExclusive_r(&volume_LRU.q[idx]);
5930 /* check whether volume is safe to soft detach
5931 * caller MUST NOT hold a ref count on vp
 *
 * returns the result of VSoftDetachVolume_r when the volume is idle
 * (no users, no waiters) and stale enough; otherwise leaves it alone */
5933 VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh)
/* in-use or awaited volumes are never soft detached */
5937 if (vp->nUsers || vp->nWaiters)
5940 if (vp->stats.last_get <= thresh) {
5941 ret = VSoftDetachVolume_r(vp, thresh);
5947 /* check whether volume should be made a
5948 * soft detach candidate
 *
 * moves an idle, stale volume from the NEW generation onto the
 * CANDIDATE queue so a later scan pass can soft detach it */
5950 VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh)
/* busy volumes stay where they are */
5953 if (vp->nUsers || vp->nWaiters)
5958 assert(idx == VLRU_QUEUE_NEW);
5960 if (vp->stats.last_get <= thresh) {
5961 /* move to candidate pool */
5962 queue_Remove(&vp->vlru);
5963 volume_LRU.q[VLRU_QUEUE_NEW].len--;
5964 queue_Prepend(&volume_LRU.q[VLRU_QUEUE_CANDIDATE], &vp->vlru);
5965 vp->vlru.idx = VLRU_QUEUE_CANDIDATE;
5966 volume_LRU.q[VLRU_QUEUE_CANDIDATE].len++;
5974 /* begin exclusive access on VLRU
 * caller must have ensured the queue is quiescent (q->busy == 0) */
5976 VLRU_BeginExclusive_r(struct VLRU_q * q)
5978 assert(q->busy == 0);
5982 /* end exclusive access on VLRU
 * clears the busy flag (elided here) and wakes all waiters */
5984 VLRU_EndExclusive_r(struct VLRU_q * q)
5988 assert(pthread_cond_broadcast(&q->cv) == 0);
5991 /* wait for another thread to end exclusive access on VLRU
 * note: VOL_CV_WAIT drops VOL_LOCK while blocked */
5993 VLRU_Wait_r(struct VLRU_q * q)
5996 VOL_CV_WAIT(&q->cv);
6001 * volume soft detach
6003 * caller MUST NOT hold a ref count on vp */
6005 VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
/* only candidate-queue volumes may be soft detached */
6010 assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
/* snapshot the access time so we can detect a racing access later */
6012 ts_save = vp->stats.last_get;
6013 if (ts_save > thresh)
6016 if (vp->nUsers || vp->nWaiters)
/* a volume in an exclusive state is being operated on; leave it be */
6019 if (VIsExclusiveState(V_attachState(vp))) {
6023 switch (V_attachState(vp)) {
6024 case VOL_STATE_UNATTACHED:
6025 case VOL_STATE_PREATTACHED:
6026 case VOL_STATE_ERROR:
6027 case VOL_STATE_GOING_OFFLINE:
6028 case VOL_STATE_SHUTTING_DOWN:
6029 case VOL_STATE_SALVAGING:
/* these states need no offlining -- just drop the volume from the VLRU */
6030 volume_LRU.q[vp->vlru.idx].len--;
6032 /* create and cancel a reservation to
6033 * give the volume an opportunity to
6035 VCreateReservation_r(vp);
6036 queue_Remove(&vp->vlru);
6037 vp->vlru.idx = VLRU_QUEUE_INVALID;
6038 V_attachFlags(vp) &= ~(VOL_ON_VLRU);
6039 VCancelReservation_r(vp);
6043 /* hold the volume and take it offline.
6044 * no need for reservations, as VHold_r
6045 * takes care of that internally. */
6046 if (VHold_r(vp) == 0) {
6047 /* vhold drops the glock, so now we should
6048 * check to make sure we aren't racing against
6049 * other threads. if we are racing, offlining vp
6050 * would be wasteful, and block the scanner for a while
6054 (vp->shuttingDown) ||
6055 (vp->goingOffline) ||
6056 (vp->stats.last_get != ts_save)) {
6057 /* looks like we're racing someone else. bail */
6061 /* pull it off the VLRU */
6062 assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
6063 volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--;
6064 queue_Remove(&vp->vlru);
6065 vp->vlru.idx = VLRU_QUEUE_INVALID;
6066 V_attachFlags(vp) &= ~(VOL_ON_VLRU);
6068 /* take it offline */
6069 VOffline_r(vp, "volume has been soft detached");
6071 /* invalidate the volume header cache */
6072 FreeVolumeHeader(vp);
6075 IncUInt64(&VStats.soft_detaches);
6076 vp->stats.soft_detaches++;
6078 /* put in pre-attached state so demand
6079 * attacher can work on it */
6080 VChangeState_r(vp, VOL_STATE_PREATTACHED);
6089 /***************************************************/
6090 /* Volume Header Cache routines */
6091 /***************************************************/
6094 * volume header cache.
/* global LRU of volume header cache entries (fileserver only) */
6096 struct volume_hdr_LRU_t volume_hdr_LRU;
6099 * initialize the volume header cache.
6101 * @param[in] howMany number of header cache entries to preallocate
6103 * @pre VOL_LOCK held. Function has never been called before.
6105 * @post howMany cache entries are allocated, initialized, and added
6106 * to the LRU list. Header cache statistics are initialized.
6108 * @note only applicable to fileServer program type. Should only be
6109 * called once during volume package initialization.
6111 * @internal volume package internal use only.
6114 VInitVolumeHeaderCache(afs_uint32 howMany)
6116 register struct volHeader *hp;
/* header cache is a fileserver-only facility */
6117 if (programType != fileServer)
6119 queue_Init(&volume_hdr_LRU);
/* stats start as all-used; ReleaseVolumeHeader below moves each entry
 * onto the LRU, which adjusts free/used accordingly */
6120 volume_hdr_LRU.stats.free = 0;
6121 volume_hdr_LRU.stats.used = howMany;
6122 volume_hdr_LRU.stats.attached = 0;
/* NOTE(review): calloc return is used unchecked in the elided loop;
 * an OOM here would dereference NULL -- confirm against full source */
6123 hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));
6125 ReleaseVolumeHeader(hp++);
6129 * get a volume header and attach it to the volume object.
6131 * @param[in] vp pointer to volume object
6133 * @return cache entry status
6134 * @retval 0 volume header was newly attached; cache data is invalid
6135 * @retval 1 volume header was previously attached; cache data is valid
6137 * @pre VOL_LOCK held. For DAFS, lightweight ref must be held on volume object.
6139 * @post volume header attached to volume object. if necessary, header cache
6140 * entry on LRU is synchronized to disk. Header is removed from LRU list.
6142 * @note VOL_LOCK may be dropped
6144 * @warning this interface does not load header data from disk. it merely
6145 * attaches a header object to the volume object, and may sync the old
6146 * header cache data out to disk in the process.
6148 * @internal volume package internal use only.
6151 GetVolumeHeader(register Volume * vp)
6154 register struct volHeader *hd;
6156 static int everLogged = 0;
6158 #ifdef AFS_DEMAND_ATTACH_FS
6159 VolState vp_save, back_save;
6161 /* XXX debug 9/19/05 we've apparently got
6162 * a ref counting bug somewhere that's
6163 * breaking the nUsers == 0 => header on LRU
6165 if (vp->header && queue_IsNotOnQueue(vp->header)) {
6166 Log("nUsers == 0, but header not on LRU\n");
6171 old = (vp->header != NULL); /* old == volume already has a header */
6173 if (programType != fileServer) {
6174 /* for volume utilities, we allocate volHeaders as needed */
6176 hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
6180 #ifdef AFS_DEMAND_ATTACH_FS
6181 V_attachFlags(vp) |= VOL_HDR_ATTACHED;
6185 /* for the fileserver, we keep a volume header cache */
6187 /* the header we previously dropped in the lru is
6188 * still available. pull it off the lru and return */
6191 assert(hd->back == vp);
6193 /* we need to grab a new element off the LRU */
6194 if (queue_IsNotEmpty(&volume_hdr_LRU)) {
6195 /* grab an element and pull off of LRU */
6196 hd = queue_First(&volume_hdr_LRU, volHeader);
6199 /* LRU is empty, so allocate a new volHeader
6200 * this is probably indicative of a leak, so let the user know */
6201 hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
/* only log the leak warning once per process (everLogged guard elided) */
6204 Log("****Allocated more volume headers, probably leak****\n");
6207 volume_hdr_LRU.stats.free++;
6210 /* this header used to belong to someone else.
6211 * we'll need to check if the header needs to
6212 * be sync'd out to disk */
6214 #ifdef AFS_DEMAND_ATTACH_FS
6215 /* if hd->back were in an exclusive state, then
6216 * its volHeader would not be on the LRU... */
6217 assert(!VIsExclusiveState(V_attachState(hd->back)));
6220 if (hd->diskstuff.inUse) {
6221 /* volume was in use, so we'll need to sync
6222 * its header to disk */
6224 #ifdef AFS_DEMAND_ATTACH_FS
/* mark both volumes busy across the write, which may drop VOL_LOCK */
6225 back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING);
6226 vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING);
6227 VCreateReservation_r(hd->back);
6231 WriteVolumeHeader_r(&error, hd->back);
6232 /* Ignore errors; catch them later */
6234 #ifdef AFS_DEMAND_ATTACH_FS
/* detach the header from its previous owner */
6239 hd->back->header = NULL;
6240 #ifdef AFS_DEMAND_ATTACH_FS
6241 V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU);
6243 if (hd->diskstuff.inUse) {
/* restore the pre-sync states now that the write has completed */
6244 VChangeState_r(hd->back, back_save);
6245 VCancelReservation_r(hd->back);
6246 VChangeState_r(vp, vp_save);
6250 volume_hdr_LRU.stats.attached++;
6254 #ifdef AFS_DEMAND_ATTACH_FS
6255 V_attachFlags(vp) |= VOL_HDR_ATTACHED;
6258 volume_hdr_LRU.stats.free--;
6259 volume_hdr_LRU.stats.used++;
6261 IncUInt64(&VStats.hdr_gets);
6262 #ifdef AFS_DEMAND_ATTACH_FS
6263 IncUInt64(&vp->stats.hdr_gets);
6264 vp->stats.last_hdr_get = FT_ApproxTime();
6271 * make sure volume header is attached and contains valid cache data.
6273 * @param[out] ec outbound error code
6274 * @param[in] vp pointer to volume object
6276 * @pre VOL_LOCK held. For DAFS, lightweight ref held on vp.
6278 * @post header cache entry attached, and loaded with valid data, or
6279 * *ec is nonzero, and the header is released back into the LRU.
6281 * @internal volume package internal use only.
6284 LoadVolumeHeader(Error * ec, Volume * vp)
6286 #ifdef AFS_DEMAND_ATTACH_FS
6287 VolState state_save;
/* GetVolumeHeader returns 0 when the attached header holds no valid
 * cache data, so only then do we read the header in from disk */
6291 if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
6292 IncUInt64(&VStats.hdr_loads);
/* disk read may drop VOL_LOCK; mark the volume busy for the duration */
6293 state_save = VChangeState_r(vp, VOL_STATE_HDR_LOADING);
6296 ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
6297 sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
6299 IncUInt64(&vp->stats.hdr_loads);
6300 now = FT_ApproxTime();
6304 V_attachFlags(vp) |= VOL_HDR_LOADED;
6305 vp->stats.last_hdr_load = now;
6307 VChangeState_r(vp, state_save);
6309 #else /* AFS_DEMAND_ATTACH_FS */
6311 if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
6312 IncUInt64(&VStats.hdr_loads);
6314 ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
6315 sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
6318 #endif /* AFS_DEMAND_ATTACH_FS */
6320 /* maintain (nUsers==0) => header in LRU invariant */
6321 ReleaseVolumeHeader(vp->header);
6326 * release a header cache entry back into the LRU list.
6328 * @param[in] hd pointer to volume header cache object
6330 * @pre VOL_LOCK held.
6332 * @post header cache object appended onto end of LRU list.
6334 * @note only applicable to fileServer program type.
6336 * @note used to place a header cache entry back into the
6337 * LRU pool without invalidating it as a cache entry.
6339 * @internal volume package internal use only.
6342 ReleaseVolumeHeader(register struct volHeader *hd)
/* volume utilities free headers directly; only the fileserver pools them */
6344 if (programType != fileServer)
6346 if (!hd || queue_IsOnQueue(hd)) /* no header, or header already released */
6348 queue_Append(&volume_hdr_LRU, hd);
6349 #ifdef AFS_DEMAND_ATTACH_FS
6351 V_attachFlags(hd->back) |= VOL_HDR_IN_LRU;
6354 volume_hdr_LRU.stats.free++;
6355 volume_hdr_LRU.stats.used--;
6359 * free/invalidate a volume header cache entry.
6361 * @param[in] vp pointer to volume object
6363 * @pre VOL_LOCK is held.
6365 * @post For fileserver, header cache entry is returned to LRU, and it is
6366 * invalidated as a cache entry. For volume utilities, the header
6367 * cache entry is freed.
6369 * @note For fileserver, this should be utilized instead of ReleaseVolumeHeader
6370 * whenever it is necessary to invalidate the header cache entry.
6372 * @see ReleaseVolumeHeader
6374 * @internal volume package internal use only.
6377 FreeVolumeHeader(register Volume * vp)
6379 register struct volHeader *hd = vp->header;
6382 if (programType == fileServer) {
/* return the entry to the LRU; the invalidation (clearing hd->back /
 * vp->header) happens on lines elided from this extract */
6383 ReleaseVolumeHeader(hd);
6388 #ifdef AFS_DEMAND_ATTACH_FS
6389 V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED);
6391 volume_hdr_LRU.stats.attached--;
6396 /***************************************************/
6397 /* Volume Hash Table routines */
6398 /***************************************************/
6401 * set size of volume object hash table.
6403 * @param[in] logsize log(2) of desired hash table size
6405 * @return operation status
6407 * @retval -1 failure
6409 * @pre MUST be called prior to VInitVolumePackage
6411 * @post Volume Hash Table will have 2^logsize buckets
6414 VSetVolHashSize(int logsize)
6416 /* 64 to 16384 hash buckets seems like a reasonable range */
6417 if ((logsize < 6 ) || (logsize > 14)) {
6422 VolumeHashTable.Size = 1 << logsize;
/* Size is a power of two, so Mask = Size-1 gives a cheap modulo */
6423 VolumeHashTable.Mask = VolumeHashTable.Size - 1;
6425 /* we can't yet support runtime modification of this
6426 * parameter. we'll need a configuration rwlock to
6427 * make runtime modification feasible.... */
6434 * initialize dynamic data structures for volume hash table.
6436 * @post hash table is allocated, and fields are initialized.
6438 * @internal volume package internal use only.
6441 VInitVolumeHash(void)
6445 VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size,
6446 sizeof(VolumeHashChainHead));
6447 assert(VolumeHashTable.Table != NULL);
/* each bucket gets an empty chain; DAFS buckets also get a condvar
 * used for the per-chain exclusive-access protocol (VHash*Exclusive_r) */
6449 for (i=0; i < VolumeHashTable.Size; i++) {
6450 queue_Init(&VolumeHashTable.Table[i]);
6451 #ifdef AFS_DEMAND_ATTACH_FS
6452 assert(pthread_cond_init(&VolumeHashTable.Table[i].chain_busy_cv, NULL) == 0);
6453 #endif /* AFS_DEMAND_ATTACH_FS */
6458 * add a volume object to the hash table.
6460 * @param[in] vp pointer to volume object
6461 * @param[in] hashid hash of volume id
6463 * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight
6466 * @post volume is added to hash chain.
6468 * @internal volume package internal use only.
6470 * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
6471 * asynchronous hash chain reordering to finish.
6474 AddVolumeToHashTable(register Volume * vp, int hashid)
6476 VolumeHashChainHead * head;
/* already hashed -- nothing to do */
6478 if (queue_IsOnQueue(vp))
6481 head = &VolumeHashTable.Table[VOLUME_HASH(hashid)];
6483 #ifdef AFS_DEMAND_ATTACH_FS
6484 /* wait for the hash chain to become available */
/* bumping cacheCheck invalidates any stale lookup short-circuit hints */
6487 V_attachFlags(vp) |= VOL_IN_HASH;
6488 vp->chainCacheCheck = ++head->cacheCheck;
6489 #endif /* AFS_DEMAND_ATTACH_FS */
6492 vp->hashid = hashid;
6493 queue_Append(head, vp);
6494 vp->vnodeHashOffset = VolumeHashOffset_r();
6498 * delete a volume object from the hash table.
6500 * @param[in] vp pointer to volume object
6502 * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight
6505 * @post volume is removed from hash chain.
6507 * @internal volume package internal use only.
6509 * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
6510 * asynchronous hash chain reordering to finish.
6513 DeleteVolumeFromHashTable(register Volume * vp)
6515 VolumeHashChainHead * head;
/* not hashed -- nothing to remove */
6517 if (!queue_IsOnQueue(vp))
6520 head = &VolumeHashTable.Table[VOLUME_HASH(vp->hashid)];
6522 #ifdef AFS_DEMAND_ATTACH_FS
6523 /* wait for the hash chain to become available */
6526 V_attachFlags(vp) &= ~(VOL_IN_HASH);
6528 #endif /* AFS_DEMAND_ATTACH_FS */
6532 /* do NOT reset hashid to zero, as the online
6533 * salvager package may need to know the volume id
6534 * after the volume is removed from the hash */
6538 * lookup a volume object in the hash table given a volume id.
6540 * @param[out] ec error code return
6541 * @param[in] volumeId volume id
6542 * @param[in] hint volume object which we believe could be the correct
6545 * @return volume object pointer
6546 * @retval NULL no such volume id is registered with the hash table.
6548 * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight
6551 * @post volume object with the given id is returned. volume object and
6552 * hash chain access statistics are updated. hash chain may have
6555 * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
6556 * asynchronous hash chain reordering operation to finish, or
6557 * in order for us to perform an asynchronous chain reordering.
6559 * @note Hash chain reorderings occur when the access count for the
6560 * volume object being looked up exceeds the sum of the previous
6561 * node's (the node ahead of it in the hash chain linked list)
6562 * access count plus the constant VOLUME_HASH_REORDER_THRESHOLD.
6564 * @note For DAFS, the hint parameter allows us to short-circuit if the
6565 * cacheCheck fields match between the hash chain head and the
6566 * hint volume object.
6569 VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
6571 register int looks = 0;
6572 Volume * vp, *np, *pp;
6573 VolumeHashChainHead * head;
6576 head = &VolumeHashTable.Table[VOLUME_HASH(volumeId)];
6578 #ifdef AFS_DEMAND_ATTACH_FS
6579 /* wait for the hash chain to become available */
6582 /* check to see if we can short circuit without walking the hash chain */
6583 if (hint && (hint->chainCacheCheck == head->cacheCheck)) {
6584 IncUInt64(&hint->stats.hash_short_circuits);
6587 #endif /* AFS_DEMAND_ATTACH_FS */
6589 /* someday we need to either do per-chain locks, RWlocks,
6590 * or both for volhash access.
6591 * (and move to a data structure with better cache locality) */
6593 /* search the chain for this volume id */
6594 for(queue_Scan(head, vp, np, Volume)) {
6596 if ((vp->hashid == volumeId)) {
/* queue_IsEnd means we walked off the chain without a match */
6601 if (queue_IsEnd(head, vp)) {
6605 #ifdef AFS_DEMAND_ATTACH_FS
6606 /* update hash chain statistics */
6609 FillInt64(lks, 0, looks);
6610 AddUInt64(head->looks, lks, &head->looks);
6611 AddUInt64(VStats.hash_looks, lks, &VStats.hash_looks);
6612 IncUInt64(&head->gets);
6617 IncUInt64(&vp->stats.hash_lookups);
6619 /* for demand attach fileserver, we permit occasional hash chain reordering
6620 * so that frequently looked up volumes move towards the head of the chain */
6621 pp = queue_Prev(vp, Volume);
6622 if (!queue_IsEnd(head, pp)) {
/* reorder when vp's lookup count exceeds its predecessor's by the threshold */
6623 FillInt64(thresh, 0, VOLUME_HASH_REORDER_THRESHOLD);
6624 AddUInt64(thresh, pp->stats.hash_lookups, &thresh);
6625 if (GEInt64(vp->stats.hash_lookups, thresh)) {
6626 VReorderHash_r(head, pp, vp);
6630 /* update the short-circuit cache check */
6631 vp->chainCacheCheck = head->cacheCheck;
6633 #endif /* AFS_DEMAND_ATTACH_FS */
6638 #ifdef AFS_DEMAND_ATTACH_FS
6639 /* perform volume hash chain reordering.
6641 * advance a subchain beginning at vp ahead of
6642 * the adjacent subchain ending at pp */
6644 VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp)
6646 Volume *tp, *np, *lp;
6647 afs_uint64 move_thresh;
6649 /* this should never be called if the chain is already busy, so
6650 * no need to wait for other exclusive chain ops to finish */
6652 /* this is a rather heavy set of operations,
6653 * so let's set the chain busy flag and drop
6655 VHashBeginExclusive_r(head);
6658 /* scan forward in the chain from vp looking for the last element
6659 * in the chain we want to advance */
6660 FillInt64(move_thresh, 0, VOLUME_HASH_REORDER_CHAIN_THRESH);
6661 AddUInt64(move_thresh, pp->stats.hash_lookups, &move_thresh);
6662 for(queue_ScanFrom(head, vp, tp, np, Volume)) {
6663 if (LTInt64(tp->stats.hash_lookups, move_thresh)) {
/* lp = last element of the hot subchain being advanced */
6667 lp = queue_Prev(tp, Volume);
6669 /* scan backwards from pp to determine where to splice and
6670 * insert the subchain we're advancing */
6671 for(queue_ScanBackwardsFrom(head, pp, tp, np, Volume)) {
6672 if (GTInt64(tp->stats.hash_lookups, move_thresh)) {
/* tp = first element of the cold subchain we are advancing past */
6676 tp = queue_Next(tp, Volume);
6678 /* rebalance chain(vp,...,lp) ahead of chain(tp,...,pp) */
6679 queue_MoveChainBefore(tp,vp,lp);
6682 IncUInt64(&VStats.hash_reorders);
6684 IncUInt64(&head->reorders);
6686 /* wake up any threads waiting for the hash chain */
6687 VHashEndExclusive_r(head);
6691 /* demand-attach fs volume hash
6692 * asynchronous exclusive operations */
6695 * begin an asynchronous exclusive operation on a volume hash chain.
6697 * @param[in] head pointer to volume hash chain head object
6699 * @pre VOL_LOCK held. hash chain is quiescent.
6701 * @post hash chain marked busy.
6703 * @note this interface is used in conjunction with VHashEndExclusive_r and
6704 * VHashWait_r to perform asynchronous (wrt VOL_LOCK) operations on a
6705 * volume hash chain. Its main use case is hash chain reordering, which
6706 * has the potential to be a highly latent operation.
6708 * @see VHashEndExclusive_r
6713 * @internal volume package internal use only.
6716 VHashBeginExclusive_r(VolumeHashChainHead * head)
6718 assert(head->busy == 0);
6723 * relinquish exclusive ownership of a volume hash chain.
6725 * @param[in] head pointer to volume hash chain head object
6727 * @pre VOL_LOCK held. thread owns the hash chain exclusively.
6729 * @post hash chain is marked quiescent. threads awaiting use of
6730 * chain are awakened.
6732 * @see VHashBeginExclusive_r
6737 * @internal volume package internal use only.
6740 VHashEndExclusive_r(VolumeHashChainHead * head)
6744 assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0);
6748 * wait for all asynchronous operations on a hash chain to complete.
6750 * @param[in] head pointer to volume hash chain head object
6752 * @pre VOL_LOCK held.
6754 * @post hash chain object is quiescent.
6756 * @see VHashBeginExclusive_r
6757 * @see VHashEndExclusive_r
6761 * @note This interface should be called before any attempt to
6762 * traverse the hash chain. It is permissible for a thread
6763 * to gain exclusive access to the chain, and then perform
6764 * latent operations on the chain asynchronously wrt the
6767 * @warning if waiting is necessary, VOL_LOCK is dropped
6769 * @internal volume package internal use only.
6772 VHashWait_r(VolumeHashChainHead * head)
6774 while (head->busy) {
6775 VOL_CV_WAIT(&head->chain_busy_cv);
6778 #endif /* AFS_DEMAND_ATTACH_FS */
6781 /***************************************************/
6782 /* Volume by Partition List routines */
6783 /***************************************************/
6786 * demand attach fileserver adds a
6787 * linked list of volumes to each
6788 * partition object, thus allowing
6789 * for quick enumeration of all
6790 * volumes on a partition
6793 #ifdef AFS_DEMAND_ATTACH_FS
6795 * add a volume to its disk partition VByPList.
6797 * @param[in] vp pointer to volume object
6799 * @pre either the disk partition VByPList is owned exclusively
6800 * by the calling thread, or the list is quiescent and
6803 * @post volume is added to disk partition VByPList
6807 * @warning it is the caller's responsibility to ensure list
6810 * @see VVByPListWait_r
6811 * @see VVByPListBeginExclusive_r
6812 * @see VVByPListEndExclusive_r
6814 * @internal volume package internal use only.
/* Add vp to its disk partition's VByP (volumes-by-partition) list, if it
 * is not already on it, and account for it in the list length.
 * Caller must hold the list exclusively or know it is quiescent. */
6817 AddVolumeToVByPList_r(Volume * vp)
6819 if (queue_IsNotOnQueue(&vp->vol_list)) {
6820 queue_Append(&vp->partition->vol_list, &vp->vol_list);
/* record list membership in the volume's attach flags */
6821 V_attachFlags(vp) |= VOL_ON_VBYP_LIST;
6822 vp->partition->vol_list.len++;
6827 * delete a volume from its disk partition VByPList.
6829 * @param[in] vp pointer to volume object
6831 * @pre either the disk partition VByPList is owned exclusively
6832 * by the calling thread, or the list is quiescent and
6835 * @post volume is removed from the disk partition VByPList
6839 * @warning it is the caller's responsibility to ensure list
6842 * @see VVByPListWait_r
6843 * @see VVByPListBeginExclusive_r
6844 * @see VVByPListEndExclusive_r
6846 * @internal volume package internal use only.
/* Remove vp from its disk partition's VByP list, if it is on it, and
 * account for the removal in the list length.
 * Caller must hold the list exclusively or know it is quiescent. */
6849 DeleteVolumeFromVByPList_r(Volume * vp)
6851 if (queue_IsOnQueue(&vp->vol_list)) {
6852 queue_Remove(&vp->vol_list);
/* clear the membership flag set by AddVolumeToVByPList_r */
6853 V_attachFlags(vp) &= ~(VOL_ON_VBYP_LIST);
6854 vp->partition->vol_list.len--;
6859 * begin an asynchronous exclusive operation on a VByPList.
6861 * @param[in] dp pointer to disk partition object
6863 * @pre VOL_LOCK held. VByPList is quiescent.
6865 * @post VByPList marked busy.
6867 * @note this interface is used in conjunction with VVByPListEndExclusive_r and
6868 * VVByPListWait_r to perform asynchronous (wrt VOL_LOCK) operations on a
6871 * @see VVByPListEndExclusive_r
6872 * @see VVByPListWait_r
6876 * @internal volume package internal use only.
6878 /* take exclusive control over the list */
6880 VVByPListBeginExclusive_r(struct DiskPartition64 * dp)
/* precondition: list is quiescent (callers use VVByPListWait_r first) */
6882 assert(dp->vol_list.busy == 0);
/* mark busy so concurrent users block in VVByPListWait_r */
6883 dp->vol_list.busy = 1;
6887 * relinquish exclusive ownership of a VByPList.
6889 * @param[in] dp pointer to disk partition object
6891 * @pre VOL_LOCK held. thread owns the VByPList exclusively.
6893 * @post VByPList is marked quiescent. threads awaiting use of
6894 * the list are awakened.
6896 * @see VVByPListBeginExclusive_r
6897 * @see VVByPListWait_r
6901 * @internal volume package internal use only.
6904 VVByPListEndExclusive_r(struct DiskPartition64 * dp)
/* we must currently own the list exclusively */
6906 assert(dp->vol_list.busy);
6907 dp->vol_list.busy = 0;
/* wake every thread blocked in VVByPListWait_r */
6908 assert(pthread_cond_broadcast(&dp->vol_list.cv) == 0);
6912 * wait for all asynchronous operations on a VByPList to complete.
6914 * @param[in] dp pointer to disk partition object
6916 * @pre VOL_LOCK is held.
6918 * @post disk partition's VByP list is quiescent
6922 * @note This interface should be called before any attempt to
6923 * traverse the VByPList. It is permissible for a thread
6924 * to gain exclusive access to the list, and then perform
6925 * latent operations on the list asynchronously wrt the
6928 * @warning if waiting is necessary, VOL_LOCK is dropped
6930 * @see VVByPListEndExclusive_r
6931 * @see VVByPListBeginExclusive_r
6933 * @internal volume package internal use only.
6936 VVByPListWait_r(struct DiskPartition64 * dp)
/* VOL_CV_WAIT drops VOL_LOCK while blocked, so list state may have
 * changed arbitrarily by the time this loop exits */
6938 while (dp->vol_list.busy) {
6939 VOL_CV_WAIT(&dp->vol_list.cv);
6942 #endif /* AFS_DEMAND_ATTACH_FS */
6944 /***************************************************/
6945 /* Volume Cache Statistics routines */
6946 /***************************************************/
6949 VPrintCacheStats_r(void)
6951 afs_uint32 get_hi, get_lo, load_hi, load_lo;
6952 register struct VnodeClassInfo *vcp;
6953 vcp = &VnodeClassInfo[vLarge];
6954 Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
6955 vcp = &VnodeClassInfo[vSmall];
6956 Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
6957 SplitInt64(VStats.hdr_gets, get_hi, get_lo);
6958 SplitInt64(VStats.hdr_loads, load_hi, load_lo);
6959 Log("Volume header cache, %d entries, %d gets, %d replacements\n",
6960 VStats.hdr_cache_size, get_lo, load_lo);
/* Locking wrapper around VPrintCacheStats_r.
 * NOTE(review): the lock/unlock lines are elided in this extract --
 * presumably VOL_LOCK/VOL_UNLOCK bracket the call, per the _r naming
 * convention used throughout this file; confirm against full source. */
6964 VPrintCacheStats(void)
6967 VPrintCacheStats_r();
6971 #ifdef AFS_DEMAND_ATTACH_FS
6973 UInt64ToDouble(afs_uint64 * x)
6975 static double c32 = 4.0 * 1.073741824 * 1000000000.0;
6977 SplitInt64(*x, h, l);
6978 return (((double)h) * c32) + ((double) l);
6982 DoubleToPrintable(double x, char * buf, int len)
6984 static double billion = 1000000000.0;
6987 y[0] = (afs_uint32) (x / (billion * billion));
6988 y[1] = (afs_uint32) ((x - (((double)y[0]) * billion * billion)) / billion);
6989 y[2] = (afs_uint32) (x - ((((double)y[0]) * billion * billion) + (((double)y[1]) * billion)));
6992 snprintf(buf, len, "%d%09d%09d", y[0], y[1], y[2]);
6994 snprintf(buf, len, "%d%09d", y[1], y[2]);
6996 snprintf(buf, len, "%d", y[2]);
/* Per-volume snapshot entry for extended VLRU statistics.
 * NOTE(review): member declarations are elided in this extract (a volid
 * member is referenced by VVLRUExtStats_r below). */
7002 struct VLRUExtStatsEntry {
/* Aggregate VLRU snapshot: per-generation-queue start/len info plus a
 * flat vector (vec) of entries describing every volume on the VLRU.
 * NOTE(review): the queue_info member fields are elided in this extract. */
7006 struct VLRUExtStats {
7012 } queue_info[VLRU_QUEUE_INVALID];
7013 struct VLRUExtStatsEntry * vec;
7017 * add a 256-entry fudge factor onto the vector in case state changes
7018 * out from under us.
7020 #define VLRU_EXT_STATS_VEC_LEN_FUDGE 256
7023 * collect extended statistics for the VLRU subsystem.
7025 * @param[out] stats pointer to stats structure to be populated
7026 * @param[in] nvols number of volumes currently known to exist
7028 * @pre VOL_LOCK held
7030 * @post stats->vec allocated and populated
7032 * @return operation status
/* Collect extended VLRU statistics into *stats.  Allocates stats->vec
 * (ownership passes to the caller, who frees it) sized for nvols
 * volumes plus a fudge factor, then walks each VLRU generation queue
 * under exclusive access, recording volume ids and per-queue extents.
 * Per the doc comment above, returns 0 on success.
 * NOTE(review): several lines (calloc failure check, cur counters,
 * function braces) are elided in this extract. */
7037 VVLRUExtStats_r(struct VLRUExtStats * stats, afs_uint32 nvols)
7039 afs_uint32 cur, idx, len;
7040 struct rx_queue * qp, * nqp;
7042 struct VLRUExtStatsEntry * vec;
/* oversize the vector in case volumes are created while we run */
7044 len = nvols + VLRU_EXT_STATS_VEC_LEN_FUDGE;
7045 vec = stats->vec = calloc(len,
7046 sizeof(struct VLRUExtStatsEntry));
7052 for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
/* take the generation queue exclusively so the scan is stable */
7053 VLRU_Wait_r(&volume_LRU.q[idx]);
7054 VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
7057 stats->queue_info[idx].start = cur;
7059 for (queue_Scan(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
7061 /* out of space in vec */
/* recover the Volume from its embedded vlru queue node */
7064 vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
7065 vec[cur].volid = vp->hashid;
7069 stats->queue_info[idx].len = cur - stats->queue_info[idx].start;
7072 VLRU_EndExclusive_r(&volume_LRU.q[idx]);
/* Stringify an enum identifier. */
7080 #define ENUMTOSTRING(en) #en
/* Switch-case helper used by vlru_idx_to_string: expands to a case
 * label that returns the enum's name as a string.
 * NOTE(review): the "case en:" continuation line of this macro is
 * elided in this extract. */
7081 #define ENUMCASE(en) \
7083 return ENUMTOSTRING(en); \
7087 vlru_idx_to_string(int idx)
7090 ENUMCASE(VLRU_QUEUE_NEW);
7091 ENUMCASE(VLRU_QUEUE_MID);
7092 ENUMCASE(VLRU_QUEUE_OLD);
7093 ENUMCASE(VLRU_QUEUE_CANDIDATE);
7094 ENUMCASE(VLRU_QUEUE_HELD);
7095 ENUMCASE(VLRU_QUEUE_INVALID);
7097 return "**UNKNOWN**";
/* VPrintExtendedCacheStats_r: log extended volume-package statistics.
 * Walks every hash chain under exclusive chain access, accumulating
 * min/max/avg/sum for chain length, lookups, gets and reorders; with
 * VOL_STATS_PER_CHAIN or VOL_STATS_PER_CHAIN2 it also logs per-chain
 * data.  Afterwards it dumps per-partition online-volume counts and a
 * VLRU state dump.
 * NOTE(review): a number of original lines (local declarations for i,
 * vp, np, pr_buf, line; braces; some flag checks) are elided in this
 * extract, and the leading numbers are leaked original line numbers. */
7102 VPrintExtendedCacheStats_r(int flags)
7105 afs_uint32 vol_sum = 0;
7112 struct stats looks, gets, reorders, len;
7113 struct stats ch_looks, ch_gets, ch_reorders;
7115 VolumeHashChainHead *head;
7117 struct VLRUExtStats vlru_stats;
7119 /* zero out stats */
7120 memset(&looks, 0, sizeof(struct stats));
7121 memset(&gets, 0, sizeof(struct stats));
7122 memset(&reorders, 0, sizeof(struct stats));
7123 memset(&len, 0, sizeof(struct stats));
7124 memset(&ch_looks, 0, sizeof(struct stats));
7125 memset(&ch_gets, 0, sizeof(struct stats));
7126 memset(&ch_reorders, 0, sizeof(struct stats));
/* walk every hash bucket, taking each chain exclusively while we read
 * its counters */
7128 for (i = 0; i < VolumeHashTable.Size; i++) {
7129 head = &VolumeHashTable.Table[i];
7132 VHashBeginExclusive_r(head);
/* per-chain totals, converted to double for min/max/avg arithmetic */
7135 ch_looks.sum = UInt64ToDouble(&head->looks);
7136 ch_gets.sum = UInt64ToDouble(&head->gets);
7137 ch_reorders.sum = UInt64ToDouble(&head->reorders);
7139 /* update global statistics */
7141 looks.sum += ch_looks.sum;
7142 gets.sum += ch_gets.sum;
7143 reorders.sum += ch_reorders.sum;
7144 len.sum += (double)head->len;
7145 vol_sum += head->len;
/* NOTE(review): the branch condition seeding min/max (presumably the
 * first iteration, i == 0) is elided in this extract */
7148 len.min = (double) head->len;
7149 len.max = (double) head->len;
7150 looks.min = ch_looks.sum;
7151 looks.max = ch_looks.sum;
7152 gets.min = ch_gets.sum;
7153 gets.max = ch_gets.sum;
7154 reorders.min = ch_reorders.sum;
7155 reorders.max = ch_reorders.sum;
/* subsequent chains: fold this chain into the running min/max */
7157 if (((double)head->len) < len.min)
7158 len.min = (double) head->len;
7159 if (((double)head->len) > len.max)
7160 len.max = (double) head->len;
7161 if (ch_looks.sum < looks.min)
7162 looks.min = ch_looks.sum;
7163 else if (ch_looks.sum > looks.max)
7164 looks.max = ch_looks.sum;
7165 if (ch_gets.sum < gets.min)
7166 gets.min = ch_gets.sum;
7167 else if (ch_gets.sum > gets.max)
7168 gets.max = ch_gets.sum;
7169 if (ch_reorders.sum < reorders.min)
7170 reorders.min = ch_reorders.sum;
7171 else if (ch_reorders.sum > reorders.max)
7172 reorders.max = ch_reorders.sum;
7176 if ((flags & VOL_STATS_PER_CHAIN2) && queue_IsNotEmpty(head)) {
7177 /* compute detailed per-chain stats */
7178 struct stats hdr_loads, hdr_gets;
7179 double v_looks, v_loads, v_gets;
7181 /* initialize stats with data from first element in chain */
7182 vp = queue_First(head, Volume);
7183 v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
7184 v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
7185 v_gets = UInt64ToDouble(&vp->stats.hdr_gets);
/* note: per-chain "Volume gets" min/max (ch_gets) are seeded and
 * updated from hash_lookups counts, not hdr_gets */
7186 ch_gets.min = ch_gets.max = v_looks;
7187 hdr_loads.min = hdr_loads.max = v_loads;
7188 hdr_gets.min = hdr_gets.max = v_gets;
7189 hdr_loads.sum = hdr_gets.sum = 0;
7191 vp = queue_Next(vp, Volume);
7193 /* pull in stats from remaining elements in chain */
7194 for (queue_ScanFrom(head, vp, vp, np, Volume)) {
7195 v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
7196 v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
7197 v_gets = UInt64ToDouble(&vp->stats.hdr_gets);
7199 hdr_loads.sum += v_loads;
7200 hdr_gets.sum += v_gets;
7202 if (v_looks < ch_gets.min)
7203 ch_gets.min = v_looks;
7204 else if (v_looks > ch_gets.max)
7205 ch_gets.max = v_looks;
7207 if (v_loads < hdr_loads.min)
7208 hdr_loads.min = v_loads;
7209 else if (v_loads > hdr_loads.max)
7210 hdr_loads.max = v_loads;
7212 if (v_gets < hdr_gets.min)
7213 hdr_gets.min = v_gets;
7214 else if (v_gets > hdr_gets.max)
7215 hdr_gets.max = v_gets;
7218 /* compute per-chain averages */
7219 ch_gets.avg = ch_gets.sum / ((double)head->len);
7220 hdr_loads.avg = hdr_loads.sum / ((double)head->len);
7221 hdr_gets.avg = hdr_gets.sum / ((double)head->len);
7223 /* dump per-chain stats */
7224 Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n",
7226 DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
7227 DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1])));
7228 Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n",
7229 DoubleToPrintable(ch_gets.min, pr_buf[0], sizeof(pr_buf[0])),
7230 DoubleToPrintable(ch_gets.max, pr_buf[1], sizeof(pr_buf[1])),
7231 DoubleToPrintable(ch_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
7232 DoubleToPrintable(ch_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
7233 Log("\tHDR gets : min=%s, max=%s, avg=%s, total=%s\n",
7234 DoubleToPrintable(hdr_gets.min, pr_buf[0], sizeof(pr_buf[0])),
7235 DoubleToPrintable(hdr_gets.max, pr_buf[1], sizeof(pr_buf[1])),
7236 DoubleToPrintable(hdr_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
7237 DoubleToPrintable(hdr_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
7238 Log("\tHDR loads : min=%s, max=%s, avg=%s, total=%s\n",
7239 DoubleToPrintable(hdr_loads.min, pr_buf[0], sizeof(pr_buf[0])),
7240 DoubleToPrintable(hdr_loads.max, pr_buf[1], sizeof(pr_buf[1])),
7241 DoubleToPrintable(hdr_loads.avg, pr_buf[2], sizeof(pr_buf[2])),
7242 DoubleToPrintable(hdr_loads.sum, pr_buf[3], sizeof(pr_buf[3])));
7243 } else if (flags & VOL_STATS_PER_CHAIN) {
7244 /* dump simple per-chain stats */
7245 Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n",
7247 DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
7248 DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])),
7249 DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2])));
/* release the chain and wake any waiters before the next bucket */
7253 VHashEndExclusive_r(head);
7258 /* compute global averages */
7259 len.avg = len.sum / ((double)VolumeHashTable.Size);
7260 looks.avg = looks.sum / ((double)VolumeHashTable.Size);
7261 gets.avg = gets.sum / ((double)VolumeHashTable.Size);
7262 reorders.avg = reorders.sum / ((double)VolumeHashTable.Size);
7264 /* dump global stats */
7265 Log("Volume hash summary: %d buckets\n", VolumeHashTable.Size);
7266 Log(" chain length : min=%s, max=%s, avg=%s, total=%s\n",
7267 DoubleToPrintable(len.min, pr_buf[0], sizeof(pr_buf[0])),
7268 DoubleToPrintable(len.max, pr_buf[1], sizeof(pr_buf[1])),
7269 DoubleToPrintable(len.avg, pr_buf[2], sizeof(pr_buf[2])),
7270 DoubleToPrintable(len.sum, pr_buf[3], sizeof(pr_buf[3])));
7271 Log(" looks : min=%s, max=%s, avg=%s, total=%s\n",
7272 DoubleToPrintable(looks.min, pr_buf[0], sizeof(pr_buf[0])),
7273 DoubleToPrintable(looks.max, pr_buf[1], sizeof(pr_buf[1])),
7274 DoubleToPrintable(looks.avg, pr_buf[2], sizeof(pr_buf[2])),
7275 DoubleToPrintable(looks.sum, pr_buf[3], sizeof(pr_buf[3])));
7276 Log(" gets : min=%s, max=%s, avg=%s, total=%s\n",
7277 DoubleToPrintable(gets.min, pr_buf[0], sizeof(pr_buf[0])),
7278 DoubleToPrintable(gets.max, pr_buf[1], sizeof(pr_buf[1])),
7279 DoubleToPrintable(gets.avg, pr_buf[2], sizeof(pr_buf[2])),
7280 DoubleToPrintable(gets.sum, pr_buf[3], sizeof(pr_buf[3])));
7281 Log(" reorders : min=%s, max=%s, avg=%s, total=%s\n",
7282 DoubleToPrintable(reorders.min, pr_buf[0], sizeof(pr_buf[0])),
7283 DoubleToPrintable(reorders.max, pr_buf[1], sizeof(pr_buf[1])),
7284 DoubleToPrintable(reorders.avg, pr_buf[2], sizeof(pr_buf[2])),
7285 DoubleToPrintable(reorders.sum, pr_buf[3], sizeof(pr_buf[3])));
7287 /* print extended disk related statistics */
7289 struct DiskPartition64 * diskP;
7290 afs_uint32 vol_count[VOLMAXPARTS+1];
7291 byte part_exists[VOLMAXPARTS+1];
7295 memset(vol_count, 0, sizeof(vol_count));
7296 memset(part_exists, 0, sizeof(part_exists));
/* first pass: note which partition ids exist and their list lengths */
7300 for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
7302 vol_count[id] = diskP->vol_list.len;
7303 part_exists[id] = 1;
7307 for (i = 0; i <= VOLMAXPARTS; i++) {
7308 if (part_exists[i]) {
7309 /* XXX while this is currently safe, it is a violation
7310 * of the VGetPartitionById_r interface contract. */
7311 diskP = VGetPartitionById_r(i, 0);
7313 Log("Partition %s has %d online volumes\n",
7314 VPartitionPath(diskP), diskP->vol_list.len);
7321 /* print extended VLRU statistics */
7322 if (VVLRUExtStats_r(&vlru_stats, vol_sum) == 0) {
7323 afs_uint32 idx, cur, lpos;
7327 Log("VLRU State Dump:\n\n");
7329 for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
7330 Log("\t%s:\n", vlru_idx_to_string(idx));
/* gather volume ids for this queue, five per log line */
7333 for (cur = vlru_stats.queue_info[idx].start;
7334 cur < vlru_stats.queue_info[idx].len;
7336 line[lpos++] = vlru_stats.vec[cur].volid;
7338 Log("\t\t%u, %u, %u, %u, %u,\n",
7339 line[0], line[1], line[2], line[3], line[4]);
/* flush any final partially-filled line */
7348 Log("\t\t%u, %u, %u, %u, %u\n",
7349 line[0], line[1], line[2], line[3], line[4]);
/* VVLRUExtStats_r transferred vec ownership to us; release it */
7354 free(vlru_stats.vec);
/* Locking wrapper around VPrintExtendedCacheStats_r.
 * NOTE(review): the lock/unlock lines are elided in this extract --
 * presumably VOL_LOCK/VOL_UNLOCK bracket the call, per the _r naming
 * convention used throughout this file; confirm against full source. */
7361 VPrintExtendedCacheStats(int flags)
7364 VPrintExtendedCacheStats_r(flags);
7367 #endif /* AFS_DEMAND_ATTACH_FS */