/*
 * Copyright 2000, International Business Machines Corporation and others.
 * All Rights Reserved.
 *
 * This software has been released under the terms of the IBM Public
 * License.  For details, see the LICENSE file in the top-level source
 * directory or online at http://www.openafs.org/dl/license10.html
 *
 * Portions Copyright (c) 2005-2008 Sine Nomine Associates
 */

/* 1/1/89: NB: this stuff is all going to be replaced.  Don't take it too seriously */

/*
 * System:      VICE-TWO
 * Module:      volume.c
 * Institution: The Information Technology Center, Carnegie-Mellon University
 */

#include
#include
#include
#include
#include
#include
#ifdef HAVE_SYS_FILE_H
#include
#endif
#include
#include
#ifndef AFS_NT40_ENV
#if !defined(AFS_SGI_ENV)
#ifdef AFS_OSF_ENV
#include
#else /* AFS_OSF_ENV */
#ifdef AFS_VFSINCL_ENV
#define VFS
#ifdef AFS_SUN5_ENV
#include
#else
#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV)
#include
#include
#else
#include
#endif
#endif
#else /* AFS_VFSINCL_ENV */
#if !defined(AFS_AIX_ENV) && !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV) && !defined(AFS_DARWIN_ENV)
#include
#endif
#endif /* AFS_VFSINCL_ENV */
#endif /* AFS_OSF_ENV */
#endif /* AFS_SGI_ENV */
#endif /* !AFS_NT40_ENV */
#ifdef AFS_AIX_ENV
#include
#else
#ifdef AFS_HPUX_ENV
#include
#else
#if defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV)
#ifdef AFS_SUN5_ENV
#include
#include
#else
#include
#endif
#else
#ifndef AFS_NT40_ENV
#if defined(AFS_SGI_ENV)
#include
#else
#ifndef AFS_LINUX20_ENV
#include    /* Need to find in libc 5, present in libc 6 */
#endif
#endif
#endif /* AFS_SGI_ENV */
#endif
#endif /* AFS_HPUX_ENV */
#endif

#include "nfs.h"
#include
#include "lock.h"
#include "lwp.h"
#include
#include "ihandle.h"
#include
#include "daemon_com.h"
#include "fssync.h"
#include "salvsync.h"
#include "vnode.h"
#include "volume.h"
#include "partition.h"
#include "volume_inline.h"
#include "common.h"
#include "vutils.h"
#include

#ifdef AFS_PTHREAD_ENV
pthread_mutex_t vol_glock_mutex;
pthread_mutex_t vol_trans_mutex;
pthread_cond_t vol_put_volume_cond;
pthread_cond_t vol_sleep_cond;
pthread_cond_t vol_init_attach_cond;
pthread_cond_t vol_vinit_cond;
int vol_attach_threads = 1;
#endif /* AFS_PTHREAD_ENV */

/* start-time configurable I/O parameters */
ih_init_params vol_io_params;

#ifdef AFS_DEMAND_ATTACH_FS
pthread_mutex_t vol_salvsync_mutex;

/*
 * Set this to 1 to disallow SALVSYNC communication in all threads; used
 * during shutdown, since the salvageserver may have gone away.
 */
static volatile sig_atomic_t vol_disallow_salvsync = 0;
#endif /* AFS_DEMAND_ATTACH_FS */

/**
 * has VShutdown_r been called / is VShutdown_r running?
*/ static int vol_shutting_down = 0; #ifdef AFS_OSF_ENV extern void *calloc(), *realloc(); #endif /* Forward declarations */ static Volume *attach2(Error * ec, VolId volumeId, char *path, struct DiskPartition64 *partp, Volume * vp, int isbusy, int mode, int *acheckedOut); static void ReallyFreeVolume(Volume * vp); #ifdef AFS_DEMAND_ATTACH_FS static void FreeVolume(Volume * vp); #else /* !AFS_DEMAND_ATTACH_FS */ #define FreeVolume(vp) ReallyFreeVolume(vp) static void VScanUpdateList(void); #endif /* !AFS_DEMAND_ATTACH_FS */ static void VInitVolumeHeaderCache(afs_uint32 howMany); static int GetVolumeHeader(Volume * vp); static void ReleaseVolumeHeader(struct volHeader *hd); static void FreeVolumeHeader(Volume * vp); static void AddVolumeToHashTable(Volume * vp, int hashid); static void DeleteVolumeFromHashTable(Volume * vp); #if 0 static int VHold(Volume * vp); #endif static int VHold_r(Volume * vp); static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class); static void VReleaseVolumeHandles_r(Volume * vp); static void VCloseVolumeHandles_r(Volume * vp); static void LoadVolumeHeader(Error * ec, Volume * vp); static int VCheckOffline(Volume * vp); static int VCheckDetach(Volume * vp); static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, const struct timespec *ts); int LogLevel; /* Vice loglevel--not defined as extern so that it will be * defined when not linked with vice, XXXX */ ProgramType programType; /* The type of program using the package */ static VolumePackageOptions vol_opts; /* extended volume package statistics */ VolPkgStats VStats; #ifdef VOL_LOCK_DEBUG pthread_t vol_glock_holder = 0; #endif #define VOLUME_BITMAP_GROWSIZE 16 /* bytes, => 128vnodes */ /* Must be a multiple of 4 (1 word) !! */ /* this parameter needs to be tunable at runtime. * 128 was really inadequate for largish servers -- at 16384 volumes this * puts average chain length at 128, thus an average 65 deref's to find a volptr. * talk about bad spatial locality... * * an AVL or splay tree might work a lot better, but we'll just increase * the default hash table size for now */ #define DEFAULT_VOLUME_HASH_SIZE 256 /* Must be a power of 2!! */ #define DEFAULT_VOLUME_HASH_MASK (DEFAULT_VOLUME_HASH_SIZE-1) #define VOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask)) /* * turn volume hash chains into partially ordered lists. * when the threshold is exceeded between two adjacent elements, * perform a chain rebalancing operation. * * keep the threshold high in order to keep cache line invalidates * low "enough" on SMPs */ #define VOLUME_HASH_REORDER_THRESHOLD 200 /* * when possible, don't just reorder single elements, but reorder * entire chains of elements at once. a chain of elements that * exceed the element previous to the pivot by at least CHAIN_THRESH * accesses are moved in front of the chain whose elements have at * least CHAIN_THRESH less accesses than the pivot element */ #define VOLUME_HASH_REORDER_CHAIN_THRESH (VOLUME_HASH_REORDER_THRESHOLD / 2) #include "rx/rx_queue.h" VolumeHashTable_t VolumeHashTable = { DEFAULT_VOLUME_HASH_SIZE, DEFAULT_VOLUME_HASH_MASK, NULL }; static void VInitVolumeHash(void); #ifndef AFS_HAVE_FFS /* This macro is used where an ffs() call does not exist. 
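 * For example (behavior of the fallback below, stated for illustration):
 * ffs(0x8) returns 4, since bit positions are counted from 1 at the
 * least-significant end, and ffs(0) returns -1.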
Was in util/ffs.c */ ffs(x) { afs_int32 ffs_i; afs_int32 ffs_tmp = x; if (ffs_tmp == 0) return (-1); else for (ffs_i = 1;; ffs_i++) { if (ffs_tmp & 1) return (ffs_i); else ffs_tmp >>= 1; } } #endif /* !AFS_HAVE_FFS */ #ifdef AFS_PTHREAD_ENV /** * disk partition queue element */ typedef struct diskpartition_queue_t { struct rx_queue queue; /**< queue header */ struct DiskPartition64 *diskP; /**< disk partition table entry */ } diskpartition_queue_t; #ifndef AFS_DEMAND_ATTACH_FS typedef struct vinitvolumepackage_thread_t { struct rx_queue queue; pthread_cond_t thread_done_cv; int n_threads_complete; } vinitvolumepackage_thread_t; static void * VInitVolumePackageThread(void * args); #else /* !AFS_DEMAND_ATTTACH_FS */ #define VINIT_BATCH_MAX_SIZE 512 /** * disk partition work queue */ struct partition_queue { struct rx_queue head; /**< diskpartition_queue_t queue */ pthread_mutex_t mutex; pthread_cond_t cv; }; /** * volumes parameters for preattach */ struct volume_init_batch { struct rx_queue queue; /**< queue header */ int thread; /**< posting worker thread */ int last; /**< indicates thread is done */ int size; /**< number of volume ids in batch */ Volume *batch[VINIT_BATCH_MAX_SIZE]; /**< volumes ids to preattach */ }; /** * volume parameters work queue */ struct volume_init_queue { struct rx_queue head; /**< volume_init_batch queue */ pthread_mutex_t mutex; pthread_cond_t cv; }; /** * volume init worker thread parameters */ struct vinitvolumepackage_thread_param { int nthreads; /**< total number of worker threads */ int thread; /**< thread number for this worker thread */ struct partition_queue *pq; /**< queue partitions to scan */ struct volume_init_queue *vq; /**< queue of volume to preattach */ }; static void *VInitVolumePackageThread(void *args); static struct DiskPartition64 *VInitNextPartition(struct partition_queue *pq); static VolId VInitNextVolumeId(DIR *dirp); static int VInitPreAttachVolumes(int nthreads, struct volume_init_queue *vq); #endif /* !AFS_DEMAND_ATTACH_FS */ #endif /* AFS_PTHREAD_ENV */ #ifndef AFS_DEMAND_ATTACH_FS static int VAttachVolumesByPartition(struct DiskPartition64 *diskP, int * nAttached, int * nUnattached); #endif /* AFS_DEMAND_ATTACH_FS */ #ifdef AFS_DEMAND_ATTACH_FS /* demand attach fileserver extensions */ /* XXX * in the future we will support serialization of VLRU state into the fs_state * disk dumps * * these structures are the beginning of that effort */ struct VLRU_DiskHeader { struct versionStamp stamp; /* magic and structure version number */ afs_uint32 mtime; /* time of dump to disk */ afs_uint32 num_records; /* number of VLRU_DiskEntry records */ }; struct VLRU_DiskEntry { afs_uint32 vid; /* volume ID */ afs_uint32 idx; /* generation */ afs_uint32 last_get; /* timestamp of last get */ }; struct VLRU_StartupQueue { struct VLRU_DiskEntry * entry; int num_entries; int next_idx; }; typedef struct vshutdown_thread_t { struct rx_queue q; pthread_mutex_t lock; pthread_cond_t cv; pthread_cond_t master_cv; int n_threads; int n_threads_complete; int vol_remaining; int schedule_version; int pass; byte n_parts; byte n_parts_done_pass; byte part_thread_target[VOLMAXPARTS+1]; byte part_done_pass[VOLMAXPARTS+1]; struct rx_queue * part_pass_head[VOLMAXPARTS+1]; int stats[4][VOLMAXPARTS+1]; } vshutdown_thread_t; static void * VShutdownThread(void * args); static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode); static int VCheckFree(Volume * vp); /* VByP List */ static void AddVolumeToVByPList_r(Volume * vp); static void 
DeleteVolumeFromVByPList_r(Volume * vp); static void VVByPListBeginExclusive_r(struct DiskPartition64 * dp); static void VVByPListEndExclusive_r(struct DiskPartition64 * dp); static void VVByPListWait_r(struct DiskPartition64 * dp); /* online salvager */ static int VCheckSalvage(Volume * vp); #if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT) static int VScheduleSalvage_r(Volume * vp); #endif /* Volume hash table */ static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp); static void VHashBeginExclusive_r(VolumeHashChainHead * head); static void VHashEndExclusive_r(VolumeHashChainHead * head); static void VHashWait_r(VolumeHashChainHead * head); /* shutdown */ static int ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass); static int ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass, struct rx_queue ** idx); static void ShutdownController(vshutdown_thread_t * params); static void ShutdownCreateSchedule(vshutdown_thread_t * params); /* VLRU */ static void VLRU_ComputeConstants(void); static void VInitVLRU(void); static void VLRU_Init_Node_r(Volume * vp); static void VLRU_Add_r(Volume * vp); static void VLRU_Delete_r(Volume * vp); static void VLRU_UpdateAccess_r(Volume * vp); static void * VLRU_ScannerThread(void * args); static void VLRU_Scan_r(int idx); static void VLRU_Promote_r(int idx); static void VLRU_Demote_r(int idx); static void VLRU_SwitchQueues(Volume * vp, int new_idx, int append); /* soft detach */ static int VCheckSoftDetach(Volume * vp, afs_uint32 thresh); static int VCheckSoftDetachCandidate(Volume * vp, afs_uint32 thresh); static int VSoftDetachVolume_r(Volume * vp, afs_uint32 thresh); pthread_key_t VThread_key; VThreadOptions_t VThread_defaults = { 0 /**< allow salvsync */ }; #endif /* AFS_DEMAND_ATTACH_FS */ struct Lock vol_listLock; /* Lock obtained when listing volumes: * prevents a volume from being missed * if the volume is attached during a * list volumes */ /* Common message used when the volume goes off line */ char *VSalvageMessage = "Files in this volume are currently unavailable; call operations"; int VInit; /* 0 - uninitialized, * 1 - initialized but not all volumes have been attached, * 2 - initialized and all volumes have been attached, * 3 - initialized, all volumes have been attached, and * VConnectFS() has completed. */ static int vinit_attach_abort = 0; bit32 VolumeCacheCheck; /* Incremented everytime a volume goes on line-- * used to stamp volume headers and in-core * vnodes. When the volume goes on-line the * vnode will be invalidated * access only with VOL_LOCK held */ /***************************************************/ /* Startup routines */ /***************************************************/ #if defined(FAST_RESTART) && defined(AFS_DEMAND_ATTACH_FS) # error FAST_RESTART and DAFS are incompatible. For the DAFS equivalent \ of FAST_RESTART, use the -unsafe-nosalvage fileserver argument #endif /** * assign default values to a VolumePackageOptions struct. * * Always call this on a VolumePackageOptions struct first, then set any * specific options you want, then call VInitVolumePackage2. 
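 *
 * For example, a caller might do (an illustrative sketch only; the option
 * override shown is hypothetical):
 *
 *     VolumePackageOptions opts;
 *     VOptDefaults(fileServer, &opts);
 *     opts.nLargeVnodes = 1024;    (hypothetical override)
 *     if (VInitVolumePackage2(fileServer, &opts) != 0)
 *         (handle volume package initialization failure)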
* * @param[in] pt caller's program type * @param[out] opts volume package options */ void VOptDefaults(ProgramType pt, VolumePackageOptions *opts) { opts->nLargeVnodes = opts->nSmallVnodes = 5; opts->volcache = 0; opts->canScheduleSalvage = 0; opts->canUseFSSYNC = 0; opts->canUseSALVSYNC = 0; opts->interrupt_rxcall = NULL; opts->offline_timeout = -1; opts->offline_shutdown_timeout = -1; opts->usage_threshold = 128; opts->usage_rate_limit = 5; #ifdef FAST_RESTART opts->unsafe_attach = 1; #else /* !FAST_RESTART */ opts->unsafe_attach = 0; #endif /* !FAST_RESTART */ switch (pt) { case fileServer: opts->canScheduleSalvage = 1; opts->canUseSALVSYNC = 1; break; case salvageServer: opts->canUseFSSYNC = 1; break; case volumeServer: opts->nLargeVnodes = 0; opts->nSmallVnodes = 0; opts->canScheduleSalvage = 1; opts->canUseFSSYNC = 1; break; default: /* noop */ break; } } /** * Set VInit to a certain value, and signal waiters. * * @param[in] value the value to set VInit to * * @pre VOL_LOCK held */ static void VSetVInit_r(int value) { VInit = value; CV_BROADCAST(&vol_vinit_cond); } static_inline void VLogOfflineTimeout(const char *type, afs_int32 timeout) { if (timeout < 0) { return; } if (timeout == 0) { Log("VInitVolumePackage: Interrupting clients accessing %s " "immediately\n", type); } else { Log("VInitVolumePackage: Interrupting clients accessing %s " "after %ld second%s\n", type, (long)timeout, timeout==1?"":"s"); } } int VInitVolumePackage2(ProgramType pt, VolumePackageOptions * opts) { int errors = 0; /* Number of errors while finding vice partitions. */ programType = pt; vol_opts = *opts; #ifndef AFS_PTHREAD_ENV if (opts->offline_timeout != -1 || opts->offline_shutdown_timeout != -1) { Log("VInitVolumePackage: offline_timeout and/or " "offline_shutdown_timeout was specified, but the volume package " "does not support these for LWP builds\n"); return -1; } #endif VLogOfflineTimeout("volumes going offline", opts->offline_timeout); VLogOfflineTimeout("volumes going offline during shutdown", opts->offline_shutdown_timeout); memset(&VStats, 0, sizeof(VStats)); VStats.hdr_cache_size = 200; VInitPartitionPackage(); VInitVolumeHash(); #ifdef AFS_DEMAND_ATTACH_FS if (programType == fileServer) { VInitVLRU(); } else { VLRU_SetOptions(VLRU_SET_ENABLED, 0); } osi_Assert(pthread_key_create(&VThread_key, NULL) == 0); #endif MUTEX_INIT(&vol_glock_mutex, "vol glock", MUTEX_DEFAULT, 0); MUTEX_INIT(&vol_trans_mutex, "vol trans", MUTEX_DEFAULT, 0); CV_INIT(&vol_put_volume_cond, "vol put", CV_DEFAULT, 0); CV_INIT(&vol_sleep_cond, "vol sleep", CV_DEFAULT, 0); CV_INIT(&vol_init_attach_cond, "vol init attach", CV_DEFAULT, 0); CV_INIT(&vol_vinit_cond, "vol init", CV_DEFAULT, 0); #ifndef AFS_PTHREAD_ENV IOMGR_Initialize(); #endif /* AFS_PTHREAD_ENV */ Lock_Init(&vol_listLock); srandom(time(0)); /* For VGetVolumeInfo */ #ifdef AFS_DEMAND_ATTACH_FS MUTEX_INIT(&vol_salvsync_mutex, "salvsync", MUTEX_DEFAULT, 0); #endif /* AFS_DEMAND_ATTACH_FS */ /* Ok, we have done enough initialization that fileserver can * start accepting calls, even though the volumes may not be * available just yet. 
*/ VInit = 1; #if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_SERVER) if (programType == salvageServer) { SALVSYNC_salvInit(); } #endif /* AFS_DEMAND_ATTACH_FS */ #ifdef FSSYNC_BUILD_SERVER if (programType == fileServer) { FSYNC_fsInit(); } #endif #if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT) if (VCanUseSALVSYNC()) { /* establish a connection to the salvager at this point */ osi_Assert(VConnectSALV() != 0); } #endif /* AFS_DEMAND_ATTACH_FS */ if (opts->volcache > VStats.hdr_cache_size) VStats.hdr_cache_size = opts->volcache; VInitVolumeHeaderCache(VStats.hdr_cache_size); VInitVnodes(vLarge, opts->nLargeVnodes); VInitVnodes(vSmall, opts->nSmallVnodes); errors = VAttachPartitions(); if (errors) return -1; if (programType != fileServer) { errors = VInitAttachVolumes(programType); if (errors) { return -1; } } #ifdef FSSYNC_BUILD_CLIENT if (VCanUseFSSYNC()) { if (!VConnectFS()) { #ifdef AFS_DEMAND_ATTACH_FS if (programType == salvageServer) { Log("Unable to connect to file server; aborted\n"); exit(1); } #endif /* AFS_DEMAND_ATTACH_FS */ Log("Unable to connect to file server; will retry at need\n"); } } #endif /* FSSYNC_BUILD_CLIENT */ return 0; } #if !defined(AFS_PTHREAD_ENV) /** * Attach volumes in vice partitions * * @param[in] pt calling program type * * @return 0 * @note This is the original, non-threaded version of attach parititions. * * @post VInit state is 2 */ int VInitAttachVolumes(ProgramType pt) { osi_Assert(VInit==1); if (pt == fileServer) { struct DiskPartition64 *diskP; /* Attach all the volumes in this partition */ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { int nAttached = 0, nUnattached = 0; osi_Assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0); } } VOL_LOCK; VSetVInit_r(2); /* Initialized, and all volumes have been attached */ LWP_NoYieldSignal(VInitAttachVolumes); VOL_UNLOCK; return 0; } #endif /* !AFS_PTHREAD_ENV */ #if defined(AFS_PTHREAD_ENV) && !defined(AFS_DEMAND_ATTACH_FS) /** * Attach volumes in vice partitions * * @param[in] pt calling program type * * @return 0 * @note Threaded version of attach parititions. 
 *
 * @post VInit state is 2
 */
int
VInitAttachVolumes(ProgramType pt)
{
    osi_Assert(VInit==1);
    if (pt == fileServer) {
        struct DiskPartition64 *diskP;
        struct vinitvolumepackage_thread_t params;
        struct diskpartition_queue_t * dpq;
        int i, threads, parts;
        pthread_t tid;
        pthread_attr_t attrs;

        CV_INIT(&params.thread_done_cv, "thread done", CV_DEFAULT, 0);
        queue_Init(&params);
        params.n_threads_complete = 0;

        /* create partition work queue */
        for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
            dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
            osi_Assert(dpq != NULL);
            dpq->diskP = diskP;
            queue_Append(&params,dpq);
        }

        threads = MIN(parts, vol_attach_threads);

        if (threads > 1) {
            /* spawn off a bunch of initialization threads */
            osi_Assert(pthread_attr_init(&attrs) == 0);
            osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);

            Log("VInitVolumePackage: beginning parallel fileserver startup\n");
            Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n",
                threads, parts);

            VOL_LOCK;
            for (i=0; i < threads; i++) {
                AFS_SIGSET_DECL;
                AFS_SIGSET_CLEAR();
                osi_Assert(pthread_create
                           (&tid, &attrs, &VInitVolumePackageThread,
                            &params) == 0);
                AFS_SIGSET_RESTORE();
            }

            while(params.n_threads_complete < threads) {
                VOL_CV_WAIT(&params.thread_done_cv);
            }
            VOL_UNLOCK;

            osi_Assert(pthread_attr_destroy(&attrs) == 0);
        } else {
            /* if we're only going to run one init thread, don't bother creating
             * another LWP */
            Log("VInitVolumePackage: beginning single-threaded fileserver startup\n");
            Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n",
                parts);

            VInitVolumePackageThread(&params);
        }

        CV_DESTROY(&params.thread_done_cv);
    }

    VOL_LOCK;
    VSetVInit_r(2);    /* Initialized, and all volumes have been attached */
    CV_BROADCAST(&vol_init_attach_cond);
    VOL_UNLOCK;
    return 0;
}

static void *
VInitVolumePackageThread(void * args) {

    struct DiskPartition64 *diskP;
    struct vinitvolumepackage_thread_t * params;
    struct diskpartition_queue_t * dpq;

    params = (vinitvolumepackage_thread_t *) args;

    VOL_LOCK;
    /* Attach all the volumes in this partition */
    while (queue_IsNotEmpty(params)) {
        int nAttached = 0, nUnattached = 0;

        if (vinit_attach_abort) {
            Log("Aborting initialization\n");
            goto done;
        }

        dpq = queue_First(params,diskpartition_queue_t);
        queue_Remove(dpq);
        VOL_UNLOCK;
        diskP = dpq->diskP;
        free(dpq);

        osi_Assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);

        VOL_LOCK;
    }

done:
    params->n_threads_complete++;
    CV_SIGNAL(&params->thread_done_cv);
    VOL_UNLOCK;
    return NULL;
}
#endif /* AFS_PTHREAD_ENV && !AFS_DEMAND_ATTACH_FS */

#if defined(AFS_DEMAND_ATTACH_FS)
/**
 * Attach volumes in vice partitions
 *
 * @param[in]  pt         calling program type
 *
 * @return 0
 * @note Threaded version of attach partitions.
* * @post VInit state is 2 */ int VInitAttachVolumes(ProgramType pt) { osi_Assert(VInit==1); if (pt == fileServer) { struct DiskPartition64 *diskP; struct partition_queue pq; struct volume_init_queue vq; int i, threads, parts; pthread_t tid; pthread_attr_t attrs; /* create partition work queue */ queue_Init(&pq); CV_INIT(&(pq.cv), "partq", CV_DEFAULT, 0); MUTEX_INIT(&(pq.mutex), "partq", MUTEX_DEFAULT, 0); for (parts = 0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) { struct diskpartition_queue_t *dp; dp = (struct diskpartition_queue_t*)malloc(sizeof(struct diskpartition_queue_t)); osi_Assert(dp != NULL); dp->diskP = diskP; queue_Append(&pq, dp); } /* number of worker threads; at least one, not to exceed the number of partitions */ threads = MIN(parts, vol_attach_threads); /* create volume work queue */ queue_Init(&vq); CV_INIT(&(vq.cv), "volq", CV_DEFAULT, 0); MUTEX_INIT(&(vq.mutex), "volq", MUTEX_DEFAULT, 0); osi_Assert(pthread_attr_init(&attrs) == 0); osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); Log("VInitVolumePackage: beginning parallel fileserver startup\n"); Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n", threads, parts); /* create threads to scan disk partitions. */ for (i=0; i < threads; i++) { struct vinitvolumepackage_thread_param *params; AFS_SIGSET_DECL; params = (struct vinitvolumepackage_thread_param *)malloc(sizeof(struct vinitvolumepackage_thread_param)); osi_Assert(params); params->pq = &pq; params->vq = &vq; params->nthreads = threads; params->thread = i+1; AFS_SIGSET_CLEAR(); osi_Assert(pthread_create (&tid, &attrs, &VInitVolumePackageThread, (void*)params) == 0); AFS_SIGSET_RESTORE(); } VInitPreAttachVolumes(threads, &vq); osi_Assert(pthread_attr_destroy(&attrs) == 0); CV_DESTROY(&pq.cv); MUTEX_DESTROY(&pq.mutex); CV_DESTROY(&vq.cv); MUTEX_DESTROY(&vq.mutex); } VOL_LOCK; VSetVInit_r(2); /* Initialized, and all volumes have been attached */ CV_BROADCAST(&vol_init_attach_cond); VOL_UNLOCK; return 0; } /** * Volume package initialization worker thread. Scan partitions for volume * header files. Gather batches of volume ids and dispatch them to * the main thread to be preattached. The volume preattachement is done * in the main thread to avoid global volume lock contention. 
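 *
 * Each batch posted to the queue holds at most VINIT_BATCH_MAX_SIZE volume
 * pointers; the final (possibly partial) batch from a scanner thread is
 * posted with its "last" flag set so the consumer knows that thread is done.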
*/ static void * VInitVolumePackageThread(void *args) { struct vinitvolumepackage_thread_param *params; struct DiskPartition64 *partition; struct partition_queue *pq; struct volume_init_queue *vq; struct volume_init_batch *vb; osi_Assert(args); params = (struct vinitvolumepackage_thread_param *)args; pq = params->pq; vq = params->vq; osi_Assert(pq); osi_Assert(vq); vb = (struct volume_init_batch*)malloc(sizeof(struct volume_init_batch)); osi_Assert(vb); vb->thread = params->thread; vb->last = 0; vb->size = 0; Log("Scanning partitions on thread %d of %d\n", params->thread, params->nthreads); while((partition = VInitNextPartition(pq))) { DIR *dirp; VolId vid; Log("Partition %s: pre-attaching volumes\n", partition->name); dirp = opendir(VPartitionPath(partition)); if (!dirp) { Log("opendir on Partition %s failed, errno=%d!\n", partition->name, errno); continue; } while ((vid = VInitNextVolumeId(dirp))) { Volume *vp = (Volume*)malloc(sizeof(Volume)); osi_Assert(vp); memset(vp, 0, sizeof(Volume)); vp->device = partition->device; vp->partition = partition; vp->hashid = vid; queue_Init(&vp->vnode_list); queue_Init(&vp->rx_call_list); CV_INIT(&V_attachCV(vp), "partattach", CV_DEFAULT, 0); vb->batch[vb->size++] = vp; if (vb->size == VINIT_BATCH_MAX_SIZE) { MUTEX_ENTER(&vq->mutex); queue_Append(vq, vb); CV_BROADCAST(&vq->cv); MUTEX_EXIT(&vq->mutex); vb = (struct volume_init_batch*)malloc(sizeof(struct volume_init_batch)); osi_Assert(vb); vb->thread = params->thread; vb->size = 0; vb->last = 0; } } closedir(dirp); } vb->last = 1; MUTEX_ENTER(&vq->mutex); queue_Append(vq, vb); CV_BROADCAST(&vq->cv); MUTEX_EXIT(&vq->mutex); Log("Partition scan thread %d of %d ended\n", params->thread, params->nthreads); free(params); return NULL; } /** * Read next element from the pre-populated partition list. */ static struct DiskPartition64* VInitNextPartition(struct partition_queue *pq) { struct DiskPartition64 *partition; struct diskpartition_queue_t *dp; /* queue element */ if (vinit_attach_abort) { Log("Aborting volume preattach thread.\n"); return NULL; } /* get next partition to scan */ MUTEX_ENTER(&pq->mutex); if (queue_IsEmpty(pq)) { MUTEX_EXIT(&pq->mutex); return NULL; } dp = queue_First(pq, diskpartition_queue_t); queue_Remove(dp); MUTEX_EXIT(&pq->mutex); osi_Assert(dp); osi_Assert(dp->diskP); partition = dp->diskP; free(dp); return partition; } /** * Find next volume id on the partition. */ static VolId VInitNextVolumeId(DIR *dirp) { struct dirent *d; VolId vid = 0; char *ext; while((d = readdir(dirp))) { if (vinit_attach_abort) { Log("Aborting volume preattach thread.\n"); break; } ext = strrchr(d->d_name, '.'); if (d->d_name[0] == 'V' && ext && strcmp(ext, VHDREXT) == 0) { vid = VolumeNumber(d->d_name); if (vid) { break; } Log("Warning: bogus volume header file: %s\n", d->d_name); } } return vid; } /** * Preattach volumes in batches to avoid lock contention. 
 */
static int
VInitPreAttachVolumes(int nthreads, struct volume_init_queue *vq)
{
    struct volume_init_batch *vb;
    int i;

    while (nthreads) {
        /* dequeue next volume */
        MUTEX_ENTER(&vq->mutex);
        if (queue_IsEmpty(vq)) {
            CV_WAIT(&vq->cv, &vq->mutex);
        }
        vb = queue_First(vq, volume_init_batch);
        queue_Remove(vb);
        MUTEX_EXIT(&vq->mutex);

        if (vb->size) {
            VOL_LOCK;
            for (i = 0; i < vb->size; i++) {
                Volume *vp;
                Volume *dup;
                Error ec = 0;

                vp = vb->batch[i];
                dup = VLookupVolume_r(&ec, vp->hashid, NULL);
                if (ec) {
                    Log("Error looking up volume, code=%d\n", ec);
                } else if (dup) {
                    Log("Warning: Duplicate volume id %d detected.\n", vp->hashid);
                } else {
                    /* put pre-attached volume onto the hash table
                     * and bring it up to the pre-attached state */
                    AddVolumeToHashTable(vp, vp->hashid);
                    AddVolumeToVByPList_r(vp);
                    VLRU_Init_Node_r(vp);
                    VChangeState_r(vp, VOL_STATE_PREATTACHED);
                }
            }
            VOL_UNLOCK;
        }

        if (vb->last) {
            nthreads--;
        }
        free(vb);
    }
    return 0;
}
#endif /* AFS_DEMAND_ATTACH_FS */

#if !defined(AFS_DEMAND_ATTACH_FS)
/*
 * attach all volumes on a given disk partition
 */
static int
VAttachVolumesByPartition(struct DiskPartition64 *diskP, int * nAttached, int * nUnattached)
{
    DIR * dirp;
    struct dirent * dp;
    int ret = 0;

    Log("Partition %s: attaching volumes\n", diskP->name);
    dirp = opendir(VPartitionPath(diskP));
    if (!dirp) {
        Log("opendir on Partition %s failed!\n", diskP->name);
        return 1;
    }

    while ((dp = readdir(dirp))) {
        char *p;
        p = strrchr(dp->d_name, '.');

        if (vinit_attach_abort) {
            Log("Partition %s: abort attach volumes\n", diskP->name);
            goto done;
        }

        if (p != NULL && strcmp(p, VHDREXT) == 0) {
            Error error;
            Volume *vp;
            vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
                                     V_VOLUPD);
            (*(vp ? nAttached : nUnattached))++;
            if (error == VOFFLINE)
                Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
            else if (LogLevel >= 5) {
                Log("Partition %s: attached volume %d (%s)\n",
                    diskP->name, VolumeNumber(dp->d_name),
                    dp->d_name);
            }
            if (vp) {
                VPutVolume(vp);
            }
        }
    }

    Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached);
  done:
    closedir(dirp);
    return ret;
}
#endif /* !AFS_DEMAND_ATTACH_FS */

/***************************************************/
/* Shutdown routines                               */
/***************************************************/

/*
 * demand attach fs
 * highly multithreaded volume package shutdown
 *
 * with the demand attach fileserver extensions,
 * VShutdown has been modified to be multithreaded.
 * In order to achieve optimal use of many threads,
 * the shutdown code involves one control thread and
 * n shutdown worker threads.  The control thread
 * periodically examines the number of volumes available
 * for shutdown on each partition, and produces a worker
 * thread allocation schedule.  The idea is to eliminate
 * redundant scheduling computation on the workers by
 * having a single master scheduler.
 *
 * The scheduler's objectives are:
 * (1) fairness
 *     each partition with volumes remaining gets allocated
 *     at least 1 thread (assuming sufficient threads)
 * (2) performance
 *     threads are allocated proportional to the number of
 *     volumes remaining to be offlined.
 *     This ensures that the OS I/O scheduler has many requests
 *     to elevator seek on partitions that will (presumably) take the
 *     longest amount of time (from now) to finish shutdown
 * (3) keep threads busy
 *     when there are extra threads, they are assigned to
 *     partitions using a simple round-robin algorithm
 *
 * In the future, we may wish to add the ability to adapt
 * to the relative performance patterns of each disk
 * partition.
 *
 *
 * demand attach fs
 * multi-step shutdown process
 *
 * demand attach shutdown is a four-step process.  Each
 * shutdown "pass" shuts down increasingly more difficult
 * volumes.  The main purpose is to achieve better cache
 * utilization during shutdown.
 *
 * pass 0
 *   shutdown volumes in the unattached, pre-attached
 *   and error states
 * pass 1
 *   shutdown attached volumes with cached volume headers
 * pass 2
 *   shutdown all volumes in non-exclusive states
 * pass 3
 *   shutdown all remaining volumes
 */

#ifdef AFS_DEMAND_ATTACH_FS

void
VShutdown_r(void)
{
    int i;
    struct DiskPartition64 * diskP;
    struct diskpartition_queue_t * dpq;
    vshutdown_thread_t params;
    pthread_t tid;
    pthread_attr_t attrs;

    memset(&params, 0, sizeof(vshutdown_thread_t));

    if (VInit < 2) {
        Log("VShutdown: aborting attach volumes\n");
        vinit_attach_abort = 1;
        VOL_CV_WAIT(&vol_init_attach_cond);
    }

    for (params.n_parts=0, diskP = DiskPartitionList;
         diskP; diskP = diskP->next, params.n_parts++);

    Log("VShutdown: shutting down on-line volumes on %d partition%s...\n",
        params.n_parts, params.n_parts > 1 ? "s" : "");

    vol_shutting_down = 1;

    if (vol_attach_threads > 1) {
        /* prepare for parallel shutdown */
        params.n_threads = vol_attach_threads;
        MUTEX_INIT(&params.lock, "params", MUTEX_DEFAULT, 0);
        CV_INIT(&params.cv, "params", CV_DEFAULT, 0);
        CV_INIT(&params.master_cv, "params master", CV_DEFAULT, 0);
        osi_Assert(pthread_attr_init(&attrs) == 0);
        osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
        queue_Init(&params);

        /* setup the basic partition information structures for
         * parallel shutdown */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
            /* XXX debug */
            struct rx_queue * qp, * nqp;
            Volume * vp;
            int count = 0;

            VVByPListWait_r(diskP);
            VVByPListBeginExclusive_r(diskP);

            /* XXX debug */
            for (queue_Scan(&diskP->vol_list, qp, nqp, rx_queue)) {
                vp = (Volume *)((char *)qp - offsetof(Volume, vol_list));
                if (vp->header)
                    count++;
            }
            Log("VShutdown: partition %s has %d volumes with attached headers\n",
                VPartitionPath(diskP), count);

            /* build up the pass 0 shutdown work queue */
            dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
            osi_Assert(dpq != NULL);
            dpq->diskP = diskP;
            queue_Prepend(&params, dpq);

            params.part_pass_head[diskP->index] = queue_First(&diskP->vol_list, rx_queue);
        }

        Log("VShutdown: beginning parallel fileserver shutdown\n");
        Log("VShutdown: using %d threads to offline volumes on %d partition%s\n",
            vol_attach_threads, params.n_parts, params.n_parts > 1 ? "s" : "");

        /* do pass 0 shutdown */
        MUTEX_ENTER(&params.lock);
        for (i=0; i < params.n_threads; i++) {
            osi_Assert(pthread_create
                       (&tid, &attrs, &VShutdownThread,
                        &params) == 0);
        }

        /* wait for all the pass 0 shutdowns to complete */
        while (params.n_threads_complete < params.n_threads) {
            CV_WAIT(&params.master_cv, &params.lock);
        }
        params.n_threads_complete = 0;
        params.pass = 1;
        CV_BROADCAST(&params.cv);
        MUTEX_EXIT(&params.lock);

        Log("VShutdown: pass 0 completed using the 1 thread per partition algorithm\n");
        Log("VShutdown: starting passes 1 through 3 using finely-granular mp-fast algorithm\n");

        /* run the parallel shutdown scheduler.  it will drop the glock internally */
        ShutdownController(&params);

        /* wait for all the workers to finish pass 3 and terminate */
        while (params.pass < 4) {
            VOL_CV_WAIT(&params.cv);
        }

        osi_Assert(pthread_attr_destroy(&attrs) == 0);
        CV_DESTROY(&params.cv);
        CV_DESTROY(&params.master_cv);
        MUTEX_DESTROY(&params.lock);

        /* drop the VByPList exclusive reservations */
        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
            VVByPListEndExclusive_r(diskP);
            Log("VShutdown: %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
                VPartitionPath(diskP),
                params.stats[0][diskP->index],
                params.stats[1][diskP->index],
                params.stats[2][diskP->index],
                params.stats[3][diskP->index]);
        }

        Log("VShutdown: shutdown finished using %d threads\n", params.n_threads);
    } else {
        /* if we're only going to run one shutdown thread, don't bother creating
         * another LWP */
        Log("VShutdown: beginning single-threaded fileserver shutdown\n");

        for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
            VShutdownByPartition_r(diskP);
        }
    }

    Log("VShutdown: complete.\n");
}

#else /* AFS_DEMAND_ATTACH_FS */

void
VShutdown_r(void)
{
    int i;
    Volume *vp, *np;
    afs_int32 code;

    if (VInit < 2) {
        Log("VShutdown: aborting attach volumes\n");
        vinit_attach_abort = 1;
#ifdef AFS_PTHREAD_ENV
        VOL_CV_WAIT(&vol_init_attach_cond);
#else
        LWP_WaitProcess(VInitAttachVolumes);
#endif /* AFS_PTHREAD_ENV */
    }

    Log("VShutdown: shutting down on-line volumes...\n");
    vol_shutting_down = 1;
    for (i = 0; i < VolumeHashTable.Size; i++) {
        /* try to hold first volume in the hash table */
        for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
            code = VHold_r(vp);
            if (code == 0) {
                if (LogLevel >= 5)
                    Log("VShutdown: Attempting to take volume %u offline.\n",
                        vp->hashid);

                /* next, take the volume offline (drops reference count) */
                VOffline_r(vp, "File server was shut down");
            }
        }
    }
    Log("VShutdown: complete.\n");
}
#endif /* AFS_DEMAND_ATTACH_FS */


void
VShutdown(void)
{
    osi_Assert(VInit>0);
    VOL_LOCK;
    VShutdown_r();
    VOL_UNLOCK;
}

/**
 * stop new activity (e.g. SALVSYNC) from occurring
 *
 * Use this to make the volume package less busy; for example, during
 * shutdown.  This doesn't actually shutdown/detach anything in the
 * volume package, but prevents certain processes from occurring.  For
 * example, preventing new SALVSYNC communication in DAFS.  In theory, we
 * could also use this to prevent new volume attachment, or prevent
 * other programs from checking out volumes, etc.
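 *
 * For example, a shutdown path might call (an illustrative sketch only):
 *
 *     VSetTranquil();
 *     VShutdown();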
*/ void VSetTranquil(void) { #ifdef AFS_DEMAND_ATTACH_FS /* make sure we don't try to contact the salvageserver, since it may * not be around anymore */ vol_disallow_salvsync = 1; #endif } #ifdef AFS_DEMAND_ATTACH_FS /* * demand attach fs * shutdown control thread */ static void ShutdownController(vshutdown_thread_t * params) { /* XXX debug */ struct DiskPartition64 * diskP; Device id; vshutdown_thread_t shadow; ShutdownCreateSchedule(params); while ((params->pass < 4) && (params->n_threads_complete < params->n_threads)) { /* recompute schedule once per second */ memcpy(&shadow, params, sizeof(vshutdown_thread_t)); VOL_UNLOCK; /* XXX debug */ Log("ShutdownController: schedule version=%d, vol_remaining=%d, pass=%d\n", shadow.schedule_version, shadow.vol_remaining, shadow.pass); Log("ShutdownController: n_threads_complete=%d, n_parts_done_pass=%d\n", shadow.n_threads_complete, shadow.n_parts_done_pass); for (diskP = DiskPartitionList; diskP; diskP=diskP->next) { id = diskP->index; Log("ShutdownController: part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n", id, diskP->vol_list.len, shadow.part_thread_target[id], shadow.part_done_pass[id], shadow.part_pass_head[id]); } sleep(1); VOL_LOCK; ShutdownCreateSchedule(params); } } /* create the shutdown thread work schedule. * this scheduler tries to implement fairness * by allocating at least 1 thread to each * partition with volumes to be shutdown, * and then it attempts to allocate remaining * threads based upon the amount of work left */ static void ShutdownCreateSchedule(vshutdown_thread_t * params) { struct DiskPartition64 * diskP; int sum, thr_workload, thr_left; int part_residue[VOLMAXPARTS+1]; Device id; /* compute the total number of outstanding volumes */ sum = 0; for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { sum += diskP->vol_list.len; } params->schedule_version++; params->vol_remaining = sum; if (!sum) return; /* compute average per-thread workload */ thr_workload = sum / params->n_threads; if (sum % params->n_threads) thr_workload++; thr_left = params->n_threads; memset(&part_residue, 0, sizeof(part_residue)); /* for fairness, give every partition with volumes remaining * at least one thread */ for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) { id = diskP->index; if (diskP->vol_list.len) { params->part_thread_target[id] = 1; thr_left--; } else { params->part_thread_target[id] = 0; } } if (thr_left && thr_workload) { /* compute length-weighted workloads */ int delta; for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) { id = diskP->index; delta = (diskP->vol_list.len / thr_workload) - params->part_thread_target[id]; if (delta < 0) { continue; } if (delta < thr_left) { params->part_thread_target[id] += delta; thr_left -= delta; } else { params->part_thread_target[id] += thr_left; thr_left = 0; break; } } } if (thr_left) { /* try to assign any leftover threads to partitions that * had volume lengths closer to needing thread_target+1 */ int max_residue, max_id = 0; /* compute the residues */ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; part_residue[id] = diskP->vol_list.len - (params->part_thread_target[id] * thr_workload); } /* now try to allocate remaining threads to partitions with the * highest residues */ while (thr_left) { max_residue = 0; for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; if (part_residue[id] > max_residue) { max_residue = part_residue[id]; max_id = id; } } if (!max_residue) { 
break; } params->part_thread_target[max_id]++; thr_left--; part_residue[max_id] = 0; } } if (thr_left) { /* punt and give any remaining threads equally to each partition */ int alloc; if (thr_left >= params->n_parts) { alloc = thr_left / params->n_parts; for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; params->part_thread_target[id] += alloc; thr_left -= alloc; } } /* finish off the last of the threads */ for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) { id = diskP->index; params->part_thread_target[id]++; thr_left--; } } } /* worker thread for parallel shutdown */ static void * VShutdownThread(void * args) { vshutdown_thread_t * params; int found, pass, schedule_version_save, count; struct DiskPartition64 *diskP; struct diskpartition_queue_t * dpq; Device id; params = (vshutdown_thread_t *) args; /* acquire the shutdown pass 0 lock */ MUTEX_ENTER(¶ms->lock); /* if there's still pass 0 work to be done, * get a work entry, and do a pass 0 shutdown */ if (queue_IsNotEmpty(params)) { dpq = queue_First(params, diskpartition_queue_t); queue_Remove(dpq); MUTEX_EXIT(¶ms->lock); diskP = dpq->diskP; free(dpq); id = diskP->index; count = 0; while (ShutdownVolumeWalk_r(diskP, 0, ¶ms->part_pass_head[id])) count++; params->stats[0][diskP->index] = count; MUTEX_ENTER(¶ms->lock); } params->n_threads_complete++; if (params->n_threads_complete == params->n_threads) { /* notify control thread that all workers have completed pass 0 */ CV_SIGNAL(¶ms->master_cv); } while (params->pass == 0) { CV_WAIT(¶ms->cv, ¶ms->lock); } /* switch locks */ MUTEX_EXIT(¶ms->lock); VOL_LOCK; pass = params->pass; osi_Assert(pass > 0); /* now escalate through the more complicated shutdowns */ while (pass <= 3) { schedule_version_save = params->schedule_version; found = 0; /* find a disk partition to work on */ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; if (params->part_thread_target[id] && !params->part_done_pass[id]) { params->part_thread_target[id]--; found = 1; break; } } if (!found) { /* hmm. for some reason the controller thread couldn't find anything for * us to do. 
let's see if there's anything we can do */ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; if (diskP->vol_list.len && !params->part_done_pass[id]) { found = 1; break; } else if (!params->part_done_pass[id]) { params->part_done_pass[id] = 1; params->n_parts_done_pass++; if (pass == 3) { Log("VShutdown: done shutting down volumes on partition %s.\n", VPartitionPath(diskP)); } } } } /* do work on this partition until either the controller * creates a new schedule, or we run out of things to do * on this partition */ if (found) { count = 0; while (!params->part_done_pass[id] && (schedule_version_save == params->schedule_version)) { /* ShutdownVolumeWalk_r will drop the glock internally */ if (!ShutdownVolumeWalk_r(diskP, pass, ¶ms->part_pass_head[id])) { if (!params->part_done_pass[id]) { params->part_done_pass[id] = 1; params->n_parts_done_pass++; if (pass == 3) { Log("VShutdown: done shutting down volumes on partition %s.\n", VPartitionPath(diskP)); } } break; } count++; } params->stats[pass][id] += count; } else { /* ok, everyone is done this pass, proceed */ /* barrier lock */ params->n_threads_complete++; while (params->pass == pass) { if (params->n_threads_complete == params->n_threads) { /* we are the last thread to complete, so we will * reinitialize worker pool state for the next pass */ params->n_threads_complete = 0; params->n_parts_done_pass = 0; params->pass++; for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; params->part_done_pass[id] = 0; params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue); } /* compute a new thread schedule before releasing all the workers */ ShutdownCreateSchedule(params); /* wake up all the workers */ CV_BROADCAST(¶ms->cv); VOL_UNLOCK; Log("VShutdown: pass %d completed using %d threads on %d partitions\n", pass, params->n_threads, params->n_parts); VOL_LOCK; } else { VOL_CV_WAIT(¶ms->cv); } } pass = params->pass; } /* for fairness */ VOL_UNLOCK; pthread_yield(); VOL_LOCK; } VOL_UNLOCK; return NULL; } /* shut down all volumes on a given disk partition * * note that this function will not allow mp-fast * shutdown of a partition */ int VShutdownByPartition_r(struct DiskPartition64 * dp) { int pass; int pass_stats[4]; int total; /* wait for other exclusive ops to finish */ VVByPListWait_r(dp); /* begin exclusive access */ VVByPListBeginExclusive_r(dp); /* pick the low-hanging fruit first, * then do the complicated ones last * (has the advantage of keeping * in-use volumes up until the bitter end) */ for (pass = 0, total=0; pass < 4; pass++) { pass_stats[pass] = ShutdownVByPForPass_r(dp, pass); total += pass_stats[pass]; } /* end exclusive access */ VVByPListEndExclusive_r(dp); Log("VShutdownByPartition: shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n", total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]); return 0; } /* internal shutdown functionality * * for multi-pass shutdown: * 0 to only "shutdown" {pre,un}attached and error state volumes * 1 to also shutdown attached volumes w/ volume header loaded * 2 to also shutdown attached volumes w/o volume header loaded * 3 to also shutdown exclusive state volumes * * caller MUST hold exclusive access on the hash chain * because we drop vol_glock_mutex internally * * this function is reentrant for passes 1--3 * (e.g. 
multiple threads can cooperate to * shutdown a partition mp-fast) * * pass 0 is not scaleable because the volume state data is * synchronized by vol_glock mutex, and the locking overhead * is too high to drop the lock long enough to do linked list * traversal */ static int ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass) { struct rx_queue * q = queue_First(&dp->vol_list, rx_queue); int i = 0; while (ShutdownVolumeWalk_r(dp, pass, &q)) i++; return i; } /* conditionally shutdown one volume on partition dp * returns 1 if a volume was shutdown in this pass, * 0 otherwise */ static int ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass, struct rx_queue ** idx) { struct rx_queue *qp, *nqp; Volume * vp; qp = *idx; for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) { vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list)); switch (pass) { case 0: if ((V_attachState(vp) != VOL_STATE_UNATTACHED) && (V_attachState(vp) != VOL_STATE_ERROR) && (V_attachState(vp) != VOL_STATE_DELETED) && (V_attachState(vp) != VOL_STATE_PREATTACHED)) { break; } case 1: if ((V_attachState(vp) == VOL_STATE_ATTACHED) && (vp->header == NULL)) { break; } case 2: if (VIsExclusiveState(V_attachState(vp))) { break; } case 3: *idx = nqp; DeleteVolumeFromVByPList_r(vp); VShutdownVolume_r(vp); vp = NULL; return 1; } } return 0; } /* * shutdown a specific volume */ /* caller MUST NOT hold a heavyweight ref on vp */ int VShutdownVolume_r(Volume * vp) { int code; VCreateReservation_r(vp); if (LogLevel >= 5) { Log("VShutdownVolume_r: vid=%u, device=%d, state=%hu\n", vp->hashid, vp->partition->device, V_attachState(vp)); } /* wait for other blocking ops to finish */ VWaitExclusiveState_r(vp); osi_Assert(VIsValidState(V_attachState(vp))); switch(V_attachState(vp)) { case VOL_STATE_SALVAGING: /* Leave salvaging volumes alone. Any in-progress salvages will * continue working after viced shuts down. This is intentional. 
*/ case VOL_STATE_PREATTACHED: case VOL_STATE_ERROR: VChangeState_r(vp, VOL_STATE_UNATTACHED); case VOL_STATE_UNATTACHED: case VOL_STATE_DELETED: break; case VOL_STATE_GOING_OFFLINE: case VOL_STATE_SHUTTING_DOWN: case VOL_STATE_ATTACHED: code = VHold_r(vp); if (!code) { if (LogLevel >= 5) Log("VShutdown: Attempting to take volume %u offline.\n", vp->hashid); /* take the volume offline (drops reference count) */ VOffline_r(vp, "File server was shut down"); } break; default: break; } VCancelReservation_r(vp); vp = NULL; return 0; } #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* Header I/O routines */ /***************************************************/ static const char * HeaderName(bit32 magic) { switch (magic) { case VOLUMEINFOMAGIC: return "volume info"; case SMALLINDEXMAGIC: return "small index"; case LARGEINDEXMAGIC: return "large index"; case LINKTABLEMAGIC: return "link table"; } return "unknown"; } /* open a descriptor for the inode (h), * read in an on-disk structure into buffer (to) of size (size), * verify versionstamp in structure has magic (magic) and * optionally verify version (version) if (version) is nonzero */ static void ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic, bit32 version) { struct versionStamp *vsn; FdHandle_t *fdP; afs_sfsize_t nbytes; afs_ino_str_t stmp; *ec = 0; if (h == NULL) { Log("ReadHeader: Null inode handle argument for %s header file.\n", HeaderName(magic)); *ec = VSALVAGE; return; } fdP = IH_OPEN(h); if (fdP == NULL) { Log("ReadHeader: Failed to open %s header file " "(volume=%u, inode=%s); errno=%d\n", HeaderName(magic), h->ih_vid, PrintInode(stmp, h->ih_ino), errno); *ec = VSALVAGE; return; } vsn = (struct versionStamp *)to; nbytes = FDH_PREAD(fdP, to, size, 0); if (nbytes < 0) { Log("ReadHeader: Failed to read %s header file " "(volume=%u, inode=%s); errno=%d\n", HeaderName(magic), h->ih_vid, PrintInode(stmp, h->ih_ino), errno); *ec = VSALVAGE; FDH_REALLYCLOSE(fdP); return; } if (nbytes != size) { Log("ReadHeader: Incorrect number of bytes read from %s header file " "(volume=%u, inode=%s); expected=%d, read=%d\n", HeaderName(magic), h->ih_vid, PrintInode(stmp, h->ih_ino), size, (int)nbytes); *ec = VSALVAGE; FDH_REALLYCLOSE(fdP); return; } if (vsn->magic != magic) { Log("ReadHeader: Incorrect magic for %s header file " "(volume=%u, inode=%s); expected=0x%x, read=0x%x\n", HeaderName(magic), h->ih_vid, PrintInode(stmp, h->ih_ino), magic, vsn->magic); *ec = VSALVAGE; FDH_REALLYCLOSE(fdP); return; } FDH_CLOSE(fdP); /* Check is conditional, in case caller wants to inspect version himself */ if (version && vsn->version != version) { Log("ReadHeader: Incorrect version for %s header file " "(volume=%u, inode=%s); expected=%x, read=%x\n", HeaderName(magic), h->ih_vid, PrintInode(stmp, h->ih_ino), version, vsn->version); *ec = VSALVAGE; } } void WriteVolumeHeader_r(Error * ec, Volume * vp) { IHandle_t *h = V_diskDataHandle(vp); FdHandle_t *fdP; *ec = 0; fdP = IH_OPEN(h); if (fdP == NULL) { *ec = VSALVAGE; return; } if (FDH_PWRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)), 0) != sizeof(V_disk(vp))) { *ec = VSALVAGE; FDH_REALLYCLOSE(fdP); return; } FDH_CLOSE(fdP); } /* VolumeHeaderToDisk * Allows for storing 64 bit inode numbers in on-disk volume header * file. 
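 *
 * For example, a 64-bit inode value 0x0000000123456789 is stored as
 * volumeInfo_hi = 0x00000001 and volumeInfo_lo = 0x23456789; the
 * DiskToVolumeHeader() counterpart below reassembles it as (hi << 32) | lo.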
*/ /* convert in-memory representation of a volume header to the * on-disk representation of a volume header */ void VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h) { memset(dh, 0, sizeof(VolumeDiskHeader_t)); dh->stamp = h->stamp; dh->id = h->id; dh->parent = h->parent; #ifdef AFS_64BIT_IOPS_ENV dh->volumeInfo_lo = (afs_int32) h->volumeInfo & 0xffffffff; dh->volumeInfo_hi = (afs_int32) (h->volumeInfo >> 32) & 0xffffffff; dh->smallVnodeIndex_lo = (afs_int32) h->smallVnodeIndex & 0xffffffff; dh->smallVnodeIndex_hi = (afs_int32) (h->smallVnodeIndex >> 32) & 0xffffffff; dh->largeVnodeIndex_lo = (afs_int32) h->largeVnodeIndex & 0xffffffff; dh->largeVnodeIndex_hi = (afs_int32) (h->largeVnodeIndex >> 32) & 0xffffffff; dh->linkTable_lo = (afs_int32) h->linkTable & 0xffffffff; dh->linkTable_hi = (afs_int32) (h->linkTable >> 32) & 0xffffffff; #else dh->volumeInfo_lo = h->volumeInfo; dh->smallVnodeIndex_lo = h->smallVnodeIndex; dh->largeVnodeIndex_lo = h->largeVnodeIndex; dh->linkTable_lo = h->linkTable; #endif } /* DiskToVolumeHeader * Converts an on-disk representation of a volume header to * the in-memory representation of a volume header. * * Makes the assumption that AFS has *always* * zero'd the volume header file so that high parts of inode * numbers are 0 in older (SGI EFS) volume header files. */ void DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh) { memset(h, 0, sizeof(VolumeHeader_t)); h->stamp = dh->stamp; h->id = dh->id; h->parent = dh->parent; #ifdef AFS_64BIT_IOPS_ENV h->volumeInfo = (Inode) dh->volumeInfo_lo | ((Inode) dh->volumeInfo_hi << 32); h->smallVnodeIndex = (Inode) dh->smallVnodeIndex_lo | ((Inode) dh-> smallVnodeIndex_hi << 32); h->largeVnodeIndex = (Inode) dh->largeVnodeIndex_lo | ((Inode) dh-> largeVnodeIndex_hi << 32); h->linkTable = (Inode) dh->linkTable_lo | ((Inode) dh->linkTable_hi << 32); #else h->volumeInfo = dh->volumeInfo_lo; h->smallVnodeIndex = dh->smallVnodeIndex_lo; h->largeVnodeIndex = dh->largeVnodeIndex_lo; h->linkTable = dh->linkTable_lo; #endif } /***************************************************/ /* Volume Attachment routines */ /***************************************************/ #ifdef AFS_DEMAND_ATTACH_FS /** * pre-attach a volume given its path. * * @param[out] ec outbound error code * @param[in] partition partition path string * @param[in] name volume id string * * @return volume object pointer * * @note A pre-attached volume will only have its partition * and hashid fields initialized. At first call to * VGetVolume, the volume will be fully attached. * */ Volume * VPreAttachVolumeByName(Error * ec, char *partition, char *name) { Volume * vp; VOL_LOCK; vp = VPreAttachVolumeByName_r(ec, partition, name); VOL_UNLOCK; return vp; } /** * pre-attach a volume given its path. * * @param[out] ec outbound error code * @param[in] partition path to vice partition * @param[in] name volume id string * * @return volume object pointer * * @pre VOL_LOCK held * * @internal volume package internal use only. */ Volume * VPreAttachVolumeByName_r(Error * ec, char *partition, char *name) { return VPreAttachVolumeById_r(ec, partition, VolumeNumber(name)); } /** * pre-attach a volume given its path and numeric volume id. * * @param[out] ec error code return * @param[in] partition path to vice partition * @param[in] volumeId numeric volume id * * @return volume object pointer * * @pre VOL_LOCK held * * @internal volume package internal use only. 
*/ Volume * VPreAttachVolumeById_r(Error * ec, char * partition, VolId volumeId) { Volume *vp; struct DiskPartition64 *partp; *ec = 0; osi_Assert(programType == fileServer); if (!(partp = VGetPartition_r(partition, 0))) { *ec = VNOVOL; Log("VPreAttachVolumeById_r: Error getting partition (%s)\n", partition); return NULL; } vp = VLookupVolume_r(ec, volumeId, NULL); if (*ec) { return NULL; } return VPreAttachVolumeByVp_r(ec, partp, vp, volumeId); } /** * preattach a volume. * * @param[out] ec outbound error code * @param[in] partp pointer to partition object * @param[in] vp pointer to volume object * @param[in] vid volume id * * @return volume object pointer * * @pre VOL_LOCK is held. * * @warning Returned volume object pointer does not have to * equal the pointer passed in as argument vp. There * are potential race conditions which can result in * the pointers having different values. It is up to * the caller to make sure that references are handled * properly in this case. * * @note If there is already a volume object registered with * the same volume id, its pointer MUST be passed as * argument vp. Failure to do so will result in a silent * failure to preattach. * * @internal volume package internal use only. */ Volume * VPreAttachVolumeByVp_r(Error * ec, struct DiskPartition64 * partp, Volume * vp, VolId vid) { Volume *nvp = NULL; *ec = 0; /* check to see if pre-attach already happened */ if (vp && (V_attachState(vp) != VOL_STATE_UNATTACHED) && (V_attachState(vp) != VOL_STATE_DELETED) && (V_attachState(vp) != VOL_STATE_PREATTACHED) && !VIsErrorState(V_attachState(vp))) { /* * pre-attach is a no-op in all but the following cases: * * - volume is unattached * - volume is in an error state * - volume is pre-attached */ Log("VPreattachVolumeByVp_r: volume %u not in quiescent state (state %u flags 0x%x)\n", vid, V_attachState(vp), V_attachFlags(vp)); goto done; } else if (vp) { /* we're re-attaching a volume; clear out some old state */ memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage)); if (V_partition(vp) != partp) { /* XXX potential race */ DeleteVolumeFromVByPList_r(vp); } } else { /* if we need to allocate a new Volume struct, * go ahead and drop the vol glock, otherwise * do the basic setup synchronised, as it's * probably not worth dropping the lock */ VOL_UNLOCK; /* allocate the volume structure */ vp = nvp = (Volume *) malloc(sizeof(Volume)); osi_Assert(vp != NULL); memset(vp, 0, sizeof(Volume)); queue_Init(&vp->vnode_list); queue_Init(&vp->rx_call_list); CV_INIT(&V_attachCV(vp), "vp attach", CV_DEFAULT, 0); } /* link the volume with its associated vice partition */ vp->device = partp->device; vp->partition = partp; vp->hashid = vid; vp->specialStatus = 0; /* if we dropped the lock, reacquire the lock, * check for pre-attach races, and then add * the volume to the hash table */ if (nvp) { VOL_LOCK; nvp = VLookupVolume_r(ec, vid, NULL); if (*ec) { free(vp); vp = NULL; goto done; } else if (nvp) { /* race detected */ free(vp); vp = nvp; goto done; } else { /* hack to make up for VChangeState_r() decrementing * the old state counter */ VStats.state_levels[0]++; } } /* put pre-attached volume onto the hash table * and bring it up to the pre-attached state */ AddVolumeToHashTable(vp, vp->hashid); AddVolumeToVByPList_r(vp); VLRU_Init_Node_r(vp); VChangeState_r(vp, VOL_STATE_PREATTACHED); if (LogLevel >= 5) Log("VPreAttachVolumeByVp_r: volume %u pre-attached\n", vp->hashid); done: if (*ec) return NULL; else return vp; } #endif /* AFS_DEMAND_ATTACH_FS */ /* Attach an existing 
volume, given its pathname, and return a pointer to the volume header information. The volume also normally goes online at this time. An offline volume must be reattached to make it go online */ Volume * VAttachVolumeByName(Error * ec, char *partition, char *name, int mode) { Volume *retVal; VOL_LOCK; retVal = VAttachVolumeByName_r(ec, partition, name, mode); VOL_UNLOCK; return retVal; } Volume * VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode) { Volume *vp = NULL; struct DiskPartition64 *partp; char path[64]; int isbusy = 0; VolId volumeId; int checkedOut; #ifdef AFS_DEMAND_ATTACH_FS VolumeStats stats_save; Volume *svp = NULL; #endif /* AFS_DEMAND_ATTACH_FS */ *ec = 0; volumeId = VolumeNumber(name); if (!(partp = VGetPartition_r(partition, 0))) { *ec = VNOVOL; Log("VAttachVolume: Error getting partition (%s)\n", partition); goto done; } if (VRequiresPartLock()) { osi_Assert(VInit == 3); VLockPartition_r(partition); } else if (programType == fileServer) { #ifdef AFS_DEMAND_ATTACH_FS /* lookup the volume in the hash table */ vp = VLookupVolume_r(ec, volumeId, NULL); if (*ec) { return NULL; } if (vp) { /* save any counters that are supposed to * be monotonically increasing over the * lifetime of the fileserver */ memcpy(&stats_save, &vp->stats, sizeof(VolumeStats)); } else { memset(&stats_save, 0, sizeof(VolumeStats)); } /* if there's something in the hash table, and it's not * in the pre-attach state, then we may need to detach * it before proceeding */ if (vp && (V_attachState(vp) != VOL_STATE_PREATTACHED)) { VCreateReservation_r(vp); VWaitExclusiveState_r(vp); /* at this point state must be one of: * - UNATTACHED * - ATTACHED * - SHUTTING_DOWN * - GOING_OFFLINE * - SALVAGING * - ERROR * - DELETED */ if (vp->specialStatus == VBUSY) isbusy = 1; /* if it's already attached, see if we can return it */ if (V_attachState(vp) == VOL_STATE_ATTACHED) { VGetVolumeByVp_r(ec, vp); if (V_inUse(vp) == fileServer) { VCancelReservation_r(vp); return vp; } /* otherwise, we need to detach, and attempt to re-attach */ VDetachVolume_r(ec, vp); if (*ec) { Log("VAttachVolume: Error detaching old volume instance (%s)\n", name); } } else { /* if it isn't fully attached, delete from the hash tables, and let the refcounter handle the rest */ DeleteVolumeFromHashTable(vp); DeleteVolumeFromVByPList_r(vp); } VCancelReservation_r(vp); vp = NULL; } /* pre-attach volume if it hasn't been done yet */ if (!vp || (V_attachState(vp) == VOL_STATE_UNATTACHED) || (V_attachState(vp) == VOL_STATE_DELETED) || (V_attachState(vp) == VOL_STATE_ERROR)) { svp = vp; vp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId); if (*ec) { return NULL; } } osi_Assert(vp != NULL); /* handle pre-attach races * * multiple threads can race to pre-attach a volume, * but we can't let them race beyond that * * our solution is to let the first thread to bring * the volume into an exclusive state win; the other * threads just wait until it finishes bringing the * volume online, and then they do a vgetvolumebyvp */ if (svp && (svp != vp)) { /* wait for other exclusive ops to finish */ VCreateReservation_r(vp); VWaitExclusiveState_r(vp); /* get a heavyweight ref, kill the lightweight ref, and return */ VGetVolumeByVp_r(ec, vp); VCancelReservation_r(vp); return vp; } /* at this point, we are chosen as the thread to do * demand attachment for this volume. 
all other threads * doing a getvolume on vp->hashid will block until we finish */ /* make sure any old header cache entries are invalidated * before proceeding */ FreeVolumeHeader(vp); VChangeState_r(vp, VOL_STATE_ATTACHING); /* restore any saved counters */ memcpy(&vp->stats, &stats_save, sizeof(VolumeStats)); #else /* AFS_DEMAND_ATTACH_FS */ vp = VGetVolume_r(ec, volumeId); if (vp) { if (V_inUse(vp) == fileServer) return vp; if (vp->specialStatus == VBUSY) isbusy = 1; VDetachVolume_r(ec, vp); if (*ec) { Log("VAttachVolume: Error detaching volume (%s)\n", name); } vp = NULL; } #endif /* AFS_DEMAND_ATTACH_FS */ } *ec = 0; strcpy(path, VPartitionPath(partp)); VOL_UNLOCK; strcat(path, OS_DIRSEP); strcat(path, name); if (!vp) { vp = (Volume *) calloc(1, sizeof(Volume)); osi_Assert(vp != NULL); vp->hashid = volumeId; vp->device = partp->device; vp->partition = partp; queue_Init(&vp->vnode_list); queue_Init(&vp->rx_call_list); #ifdef AFS_DEMAND_ATTACH_FS CV_INIT(&V_attachCV(vp), "vp attach", CV_DEFAULT, 0); #endif /* AFS_DEMAND_ATTACH_FS */ } /* attach2 is entered without any locks, and returns * with vol_glock_mutex held */ vp = attach2(ec, volumeId, path, partp, vp, isbusy, mode, &checkedOut); if (VCanUseFSSYNC() && vp) { #ifdef AFS_DEMAND_ATTACH_FS if ((mode == V_VOLUPD) || (VolumeWriteable(vp) && (mode == V_CLONE))) { /* mark volume header as in use so that volser crashes lead to a * salvage attempt */ VUpdateVolume_r(ec, vp, 0); } /* for dafs, we should tell the fileserver, except for V_PEEK * where we know it is not necessary */ if (mode == V_PEEK) { vp->needsPutBack = 0; } else { vp->needsPutBack = VOL_PUTBACK; } #else /* !AFS_DEMAND_ATTACH_FS */ /* duplicate computation in fssync.c about whether the server * takes the volume offline or not. If the volume isn't * offline, we must not return it when we detach the volume, * or the server will abort */ if (mode == V_READONLY || mode == V_PEEK || (!VolumeWriteable(vp) && (mode == V_CLONE || mode == V_DUMP))) vp->needsPutBack = 0; else vp->needsPutBack = VOL_PUTBACK; #endif /* !AFS_DEMAND_ATTACH_FS */ } #ifdef FSSYNC_BUILD_CLIENT /* Only give back the vol to the fileserver if we checked it out; attach2 * will set checkedOut only if we successfully checked it out from the * fileserver. */ if (VCanUseFSSYNC() && vp == NULL && checkedOut) { #ifdef AFS_DEMAND_ATTACH_FS /* If we couldn't attach but we scheduled a salvage, we already * notified the fileserver; don't online it now */ if (*ec != VSALVAGING) #endif /* AFS_DEMAND_ATTACH_FS */ FSYNC_VolOp(volumeId, partition, FSYNC_VOL_ON, 0, NULL); } else #endif if (programType == fileServer && vp) { #ifdef AFS_DEMAND_ATTACH_FS /* * we can get here in cases where we don't "own" * the volume (e.g. volume owned by a utility). * short circuit around potential disk header races. */ if (V_attachState(vp) != VOL_STATE_ATTACHED) { goto done; } #endif VUpdateVolume_r(ec, vp, 0); if (*ec) { Log("VAttachVolume: Error updating volume\n"); if (vp) VPutVolume_r(vp); goto done; } if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) { #ifndef AFS_DEMAND_ATTACH_FS /* This is a hack: by temporarily setting the incore * dontSalvage flag ON, the volume will be put back on the * Update list (with dontSalvage OFF again). It will then * come back in N minutes with DONT_SALVAGE eventually * set. 
This is the way that volumes that have never had * it set get it set; or that volumes that have been * offline without DONT SALVAGE having been set also * eventually get it set */ V_dontSalvage(vp) = DONT_SALVAGE; #endif /* !AFS_DEMAND_ATTACH_FS */ VAddToVolumeUpdateList_r(ec, vp); if (*ec) { Log("VAttachVolume: Error adding volume to update list\n"); if (vp) VPutVolume_r(vp); goto done; } } if (LogLevel) Log("VOnline: volume %u (%s) attached and online\n", V_id(vp), V_name(vp)); } done: if (VRequiresPartLock()) { VUnlockPartition_r(partition); } if (*ec) { #ifdef AFS_DEMAND_ATTACH_FS /* attach failed; make sure we're in error state */ if (vp && !VIsErrorState(V_attachState(vp))) { VChangeState_r(vp, VOL_STATE_ERROR); } #endif /* AFS_DEMAND_ATTACH_FS */ return NULL; } else { return vp; } } #ifdef AFS_DEMAND_ATTACH_FS /* VAttachVolumeByVp_r * * finish attaching a volume that is * in a less than fully attached state */ /* caller MUST hold a ref count on vp */ static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode) { char name[VMAXPATHLEN]; int reserve = 0; struct DiskPartition64 *partp; char path[64]; int isbusy = 0; VolId volumeId; Volume * nvp = NULL; VolumeStats stats_save; int checkedOut; *ec = 0; /* volume utility should never call AttachByVp */ osi_Assert(programType == fileServer); volumeId = vp->hashid; partp = vp->partition; VolumeExternalName_r(volumeId, name, sizeof(name)); /* if another thread is performing a blocking op, wait */ VWaitExclusiveState_r(vp); memcpy(&stats_save, &vp->stats, sizeof(VolumeStats)); /* if it's already attached, see if we can return it */ if (V_attachState(vp) == VOL_STATE_ATTACHED) { VGetVolumeByVp_r(ec, vp); if (V_inUse(vp) == fileServer) { return vp; } else { if (vp->specialStatus == VBUSY) isbusy = 1; VDetachVolume_r(ec, vp); if (*ec) { Log("VAttachVolume: Error detaching volume (%s)\n", name); } vp = NULL; } } /* pre-attach volume if it hasn't been done yet */ if (!vp || (V_attachState(vp) == VOL_STATE_UNATTACHED) || (V_attachState(vp) == VOL_STATE_DELETED) || (V_attachState(vp) == VOL_STATE_ERROR)) { nvp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId); if (*ec) { return NULL; } if (nvp != vp) { reserve = 1; VCreateReservation_r(nvp); vp = nvp; } } osi_Assert(vp != NULL); VChangeState_r(vp, VOL_STATE_ATTACHING); /* restore monotonically increasing stats */ memcpy(&vp->stats, &stats_save, sizeof(VolumeStats)); *ec = 0; /* compute path to disk header */ strcpy(path, VPartitionPath(partp)); VOL_UNLOCK; strcat(path, OS_DIRSEP); strcat(path, name); /* do volume attach * * NOTE: attach2 is entered without any locks, and returns * with vol_glock_mutex held */ vp = attach2(ec, volumeId, path, partp, vp, isbusy, mode, &checkedOut); /* * the event that an error was encountered, or * the volume was not brought to an attached state * for any reason, skip to the end. We cannot * safely call VUpdateVolume unless we "own" it. */ if (*ec || (vp == NULL) || (V_attachState(vp) != VOL_STATE_ATTACHED)) { goto done; } VUpdateVolume_r(ec, vp, 0); if (*ec) { Log("VAttachVolume: Error updating volume %u\n", vp->hashid); VPutVolume_r(vp); goto done; } if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) { #ifndef AFS_DEMAND_ATTACH_FS /* This is a hack: by temporarily setting the incore * dontSalvage flag ON, the volume will be put back on the * Update list (with dontSalvage OFF again). It will then * come back in N minutes with DONT_SALVAGE eventually * set. 
This is the way that volumes that have never had * it set get it set; or that volumes that have been * offline without DONT SALVAGE having been set also * eventually get it set */ V_dontSalvage(vp) = DONT_SALVAGE; #endif /* !AFS_DEMAND_ATTACH_FS */ VAddToVolumeUpdateList_r(ec, vp); if (*ec) { Log("VAttachVolume: Error adding volume %u to update list\n", vp->hashid); if (vp) VPutVolume_r(vp); goto done; } } if (LogLevel) Log("VOnline: volume %u (%s) attached and online\n", V_id(vp), V_name(vp)); done: if (reserve) { VCancelReservation_r(nvp); reserve = 0; } if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) { if (vp && !VIsErrorState(V_attachState(vp))) { VChangeState_r(vp, VOL_STATE_ERROR); } return NULL; } else { return vp; } } /** * lock a volume on disk (non-blocking). * * @param[in] vp The volume to lock * @param[in] locktype READ_LOCK or WRITE_LOCK * * @return operation status * @retval 0 success, lock was obtained * @retval EBUSY a conflicting lock was held by another process * @retval EIO error acquiring lock * * @pre If we're in the fileserver, vp is in an exclusive state * * @pre vp is not already locked */ static int VLockVolumeNB(Volume *vp, int locktype) { int code; osi_Assert(programType != fileServer || VIsExclusiveState(V_attachState(vp))); osi_Assert(!(V_attachFlags(vp) & VOL_LOCKED)); code = VLockVolumeByIdNB(vp->hashid, vp->partition, locktype); if (code == 0) { V_attachFlags(vp) |= VOL_LOCKED; } return code; } /** * unlock a volume on disk that was locked with VLockVolumeNB. * * @param[in] vp volume to unlock * * @pre If we're in the fileserver, vp is in an exclusive state * * @pre vp has already been locked */ static void VUnlockVolume(Volume *vp) { osi_Assert(programType != fileServer || VIsExclusiveState(V_attachState(vp))); osi_Assert((V_attachFlags(vp) & VOL_LOCKED)); VUnlockVolumeById(vp->hashid, vp->partition); V_attachFlags(vp) &= ~VOL_LOCKED; } #endif /* AFS_DEMAND_ATTACH_FS */ /** * read in a vol header, possibly lock the vol header, and possibly check out * the vol header from the fileserver, as part of volume attachment. * * @param[out] ec error code * @param[in] vp volume pointer object * @param[in] partp disk partition object of the attaching partition * @param[in] mode attachment mode such as V_VOLUPD, V_DUMP, etc (see * volume.h) * @param[in] peek 1 to just try to read in the volume header and make sure * we don't try to lock the vol, or check it out from * FSSYNC or anything like that; 0 otherwise, for 'normal' * operation * @param[out] acheckedOut If we successfully checked-out the volume from * the fileserver (if we needed to), this is set * to 1, otherwise it is untouched. * * @note As part of DAFS volume attachment, the volume header may be either * read- or write-locked to ensure mutual exclusion of certain volume * operations. In some cases in order to determine whether we need to * read- or write-lock the header, we need to read in the header to see * if the volume is RW or not. So, if we read in the header under a * read-lock and determine that we actually need a write-lock on the * volume header, this function will drop the read lock, acquire a write * lock, and read the header in again. 
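 *
 * For reference, the two call patterns used within this file (see attach2()
 * and attach_check_vop()) are a normal attach,
 *
 *     attach_volume_header(ec, vp, partp, mode, 0, acheckedOut);
 *
 * and a peek-only read used to classify a pending volume operation, which
 * neither locks the header nor checks the volume out from the fileserver:
 *
 *     attach_volume_header(ec, vp, partp, V_PEEK, 1, acheckedOut);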
*/ static void attach_volume_header(Error *ec, Volume *vp, struct DiskPartition64 *partp, int mode, int peek, int *acheckedOut) { struct VolumeDiskHeader diskHeader; struct VolumeHeader header; int code; int first_try = 1; int lock_tries = 0, checkout_tries = 0; int retry; VolumeId volid = vp->hashid; #ifdef FSSYNC_BUILD_CLIENT int checkout, done_checkout = 0; #endif /* FSSYNC_BUILD_CLIENT */ #ifdef AFS_DEMAND_ATTACH_FS int locktype = 0, use_locktype = -1; #endif /* AFS_DEMAND_ATTACH_FS */ retry: retry = 0; *ec = 0; if (lock_tries > VOL_MAX_CHECKOUT_RETRIES) { Log("VAttachVolume: retried too many times trying to lock header for " "vol %lu part %s; giving up\n", afs_printable_uint32_lu(volid), VPartitionPath(partp)); *ec = VNOVOL; goto done; } if (checkout_tries > VOL_MAX_CHECKOUT_RETRIES) { Log("VAttachVolume: retried too many times trying to checkout " "vol %lu part %s; giving up\n", afs_printable_uint32_lu(volid), VPartitionPath(partp)); *ec = VNOVOL; goto done; } if (VReadVolumeDiskHeader(volid, partp, NULL)) { /* short-circuit the 'volume does not exist' case */ *ec = VNOVOL; goto done; } #ifdef FSSYNC_BUILD_CLIENT checkout = !done_checkout; done_checkout = 1; if (!peek && checkout && VMustCheckoutVolume(mode)) { SYNC_response res; memset(&res, 0, sizeof(res)); if (FSYNC_VolOp(volid, partp->name, FSYNC_VOL_NEEDVOLUME, mode, &res) != SYNC_OK) { if (res.hdr.reason == FSYNC_SALVAGE) { Log("VAttachVolume: file server says volume %lu is salvaging\n", afs_printable_uint32_lu(volid)); *ec = VSALVAGING; } else { Log("VAttachVolume: attach of volume %lu apparently denied by file server\n", afs_printable_uint32_lu(volid)); *ec = VNOVOL; /* XXXX */ } goto done; } *acheckedOut = 1; } #endif #ifdef AFS_DEMAND_ATTACH_FS if (use_locktype < 0) { /* don't know whether vol is RO or RW; assume it's RO and we can retry * if it turns out to be RW */ locktype = VVolLockType(mode, 0); } else { /* a previous try says we should use use_locktype to lock the volume, * so use that */ locktype = use_locktype; } if (!peek && locktype) { code = VLockVolumeNB(vp, locktype); if (code) { if (code == EBUSY) { Log("VAttachVolume: another program has vol %lu locked\n", afs_printable_uint32_lu(volid)); } else { Log("VAttachVolume: error %d trying to lock vol %lu\n", code, afs_printable_uint32_lu(volid)); } *ec = VNOVOL; goto done; } } #endif /* AFS_DEMAND_ATTACH_FS */ code = VReadVolumeDiskHeader(volid, partp, &diskHeader); if (code) { if (code == EIO) { *ec = VSALVAGE; } else { *ec = VNOVOL; } goto done; } DiskToVolumeHeader(&header, &diskHeader); IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header.parent, header.largeVnodeIndex); IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header.parent, header.smallVnodeIndex); IH_INIT(vp->diskDataHandle, partp->device, header.parent, header.volumeInfo); IH_INIT(vp->linkHandle, partp->device, header.parent, header.linkTable); if (first_try) { /* only need to do this once */ VOL_LOCK; GetVolumeHeader(vp); VOL_UNLOCK; } #if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT) /* demand attach changes the V_PEEK mechanism * * we can now suck the current disk data structure over * the fssync interface without going to disk * * (technically, we don't need to restrict this feature * to demand attach fileservers. 
However, I'm trying * to limit the number of common code changes) */ if (VCanUseFSSYNC() && (mode == V_PEEK || peek)) { SYNC_response res; res.payload.len = sizeof(VolumeDiskData); res.payload.buf = &vp->header->diskstuff; if (FSYNC_VolOp(vp->hashid, partp->name, FSYNC_VOL_QUERY_HDR, FSYNC_WHATEVER, &res) == SYNC_OK) { goto disk_header_loaded; } } #endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */ (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION); #ifdef AFS_DEMAND_ATTACH_FS /* update stats */ VOL_LOCK; IncUInt64(&VStats.hdr_loads); IncUInt64(&vp->stats.hdr_loads); VOL_UNLOCK; #endif /* AFS_DEMAND_ATTACH_FS */ if (*ec) { Log("VAttachVolume: Error reading diskDataHandle header for vol %lu; " "error=%u\n", afs_printable_uint32_lu(volid), *ec); goto done; } #ifdef AFS_DEMAND_ATTACH_FS # ifdef FSSYNC_BUILD_CLIENT disk_header_loaded: # endif /* FSSYNC_BUILD_CLIENT */ /* if the lock type we actually used to lock the volume is different than * the lock type we should have used, retry with the lock type we should * use */ use_locktype = VVolLockType(mode, VolumeWriteable(vp)); if (locktype != use_locktype) { retry = 1; lock_tries++; } #endif /* AFS_DEMAND_ATTACH_FS */ *ec = 0; done: #if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT) if (!peek && *ec == 0 && retry == 0 && VMustCheckoutVolume(mode)) { code = FSYNC_VerifyCheckout(volid, partp->name, FSYNC_VOL_NEEDVOLUME, mode); if (code == SYNC_DENIED) { /* must retry checkout; fileserver no longer thinks we have * the volume */ retry = 1; checkout_tries++; done_checkout = 0; } else if (code != SYNC_OK) { *ec = VNOVOL; } } #endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */ if (*ec || retry) { /* either we are going to be called again for a second pass, or we * encountered an error; clean up in either case */ #ifdef AFS_DEMAND_ATTACH_FS if ((V_attachFlags(vp) & VOL_LOCKED)) { VUnlockVolume(vp); } #endif /* AFS_DEMAND_ATTACH_FS */ if (vp->linkHandle) { IH_RELEASE(vp->vnodeIndex[vLarge].handle); IH_RELEASE(vp->vnodeIndex[vSmall].handle); IH_RELEASE(vp->diskDataHandle); IH_RELEASE(vp->linkHandle); } } if (*ec) { return; } if (retry) { first_try = 0; goto retry; } } #ifdef AFS_DEMAND_ATTACH_FS static void attach_check_vop(Error *ec, VolumeId volid, struct DiskPartition64 *partp, Volume *vp, int *acheckedOut) { *ec = 0; if (vp->pending_vol_op) { VOL_LOCK; if (vp->pending_vol_op->vol_op_state == FSSYNC_VolOpRunningUnknown) { int code; code = VVolOpLeaveOnlineNoHeader_r(vp, vp->pending_vol_op); if (code == 1) { vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOnline; } else if (code == 0) { vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOffline; } else { /* we need the vol header to determine if the volume can be * left online for the vop, so... 
get the header */ VOL_UNLOCK; /* attach header with peek=1 to avoid checking out the volume * or locking it; we just want the header info, we're not * messing with the volume itself at all */ attach_volume_header(ec, vp, partp, V_PEEK, 1, acheckedOut); if (*ec) { return; } VOL_LOCK; if (VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) { vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOnline; } else { vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOffline; } /* make sure we grab a new vol header and re-open stuff on * actual attachment; we can't keep the data we grabbed, since * it was not done under a lock and thus not safe */ FreeVolumeHeader(vp); VReleaseVolumeHandles_r(vp); } } /* see if the pending volume op requires exclusive access */ switch (vp->pending_vol_op->vol_op_state) { case FSSYNC_VolOpPending: /* this should never happen */ osi_Assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpPending); break; case FSSYNC_VolOpRunningUnknown: /* this should never happen; we resolved 'unknown' above */ osi_Assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown); break; case FSSYNC_VolOpRunningOffline: /* mark the volume down */ *ec = VOFFLINE; VChangeState_r(vp, VOL_STATE_UNATTACHED); /* do not set V_offlineMessage here; we don't have ownership of * the volume (and probably do not have the header loaded), so we * can't alter the disk header */ /* check to see if we should set the specialStatus flag */ if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) { /* don't overwrite specialStatus if it was already set to * something else (e.g. VMOVED) */ if (!vp->specialStatus) { vp->specialStatus = VBUSY; } } break; default: break; } VOL_UNLOCK; } } #endif /* AFS_DEMAND_ATTACH_FS */ /** * volume attachment helper function. * * @param[out] ec error code * @param[in] volumeId volume ID of the attaching volume * @param[in] path full path to the volume header .vol file * @param[in] partp disk partition object for the attaching partition * @param[in] vp volume object; vp->hashid, vp->device, vp->partition, * vp->vnode_list, vp->rx_call_list, and V_attachCV (for * DAFS) should already be initialized * @param[in] isbusy 1 if vp->specialStatus should be set to VBUSY; that is, * if there is a volume operation running for this volume * that should set the volume to VBUSY during its run. 0 * otherwise. (see VVolOpSetVBusy_r) * @param[in] mode attachment mode such as V_VOLUPD, V_DUMP, etc (see * volume.h) * @param[out] acheckedOut If we successfully checked-out the volume from * the fileserver (if we needed to), this is set * to 1, otherwise it is 0. * * @return pointer to the semi-attached volume pointer * @retval NULL an error occurred (check value of *ec) * @retval vp volume successfully attaching * * @pre no locks held * * @post VOL_LOCK held */ static Volume * attach2(Error * ec, VolId volumeId, char *path, struct DiskPartition64 *partp, Volume * vp, int isbusy, int mode, int *acheckedOut) { /* have we read in the header successfully? */ int read_header = 0; #ifdef AFS_DEMAND_ATTACH_FS /* should we FreeVolume(vp) instead of VCheckFree(vp) in the error * cleanup? */ int forcefree = 0; /* in the case of an error, to what state should the volume be * transitioned? 
*/ VolState error_state = VOL_STATE_ERROR; #endif /* AFS_DEMAND_ATTACH_FS */ *ec = 0; vp->vnodeIndex[vLarge].handle = NULL; vp->vnodeIndex[vSmall].handle = NULL; vp->diskDataHandle = NULL; vp->linkHandle = NULL; *acheckedOut = 0; #ifdef AFS_DEMAND_ATTACH_FS attach_check_vop(ec, volumeId, partp, vp, acheckedOut); if (!*ec) { attach_volume_header(ec, vp, partp, mode, 0, acheckedOut); } #else attach_volume_header(ec, vp, partp, mode, 0, acheckedOut); #endif /* !AFS_DEMAND_ATTACH_FS */ if (*ec == VNOVOL) { /* if the volume doesn't exist, skip straight to 'error' so we don't * request a salvage */ VOL_LOCK; goto error_notbroken; } if (!*ec) { read_header = 1; /* ensure that we don't override specialStatus if it was set to * something else (e.g. VMOVED) */ if (isbusy && !vp->specialStatus) { vp->specialStatus = VBUSY; } vp->shuttingDown = 0; vp->goingOffline = 0; vp->nUsers = 1; #ifdef AFS_DEMAND_ATTACH_FS vp->stats.last_attach = FT_ApproxTime(); vp->stats.attaches++; #endif VOL_LOCK; IncUInt64(&VStats.attaches); vp->cacheCheck = ++VolumeCacheCheck; /* just in case this ever rolls over */ if (!vp->cacheCheck) vp->cacheCheck = ++VolumeCacheCheck; VOL_UNLOCK; #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(vp) |= VOL_HDR_LOADED; vp->stats.last_hdr_load = vp->stats.last_attach; #endif /* AFS_DEMAND_ATTACH_FS */ } if (!*ec) { struct IndexFileHeader iHead; #if OPENAFS_VOL_STATS /* * We just read in the diskstuff part of the header. If the detailed * volume stats area has not yet been initialized, we should bzero the * area and mark it as initialized. */ if (!(V_stat_initialized(vp))) { memset((V_stat_area(vp)), 0, VOL_STATS_BYTES); V_stat_initialized(vp) = 1; } #endif /* OPENAFS_VOL_STATS */ (void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle, (char *)&iHead, sizeof(iHead), SMALLINDEXMAGIC, SMALLINDEXVERSION); if (*ec) { Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec); } } if (!*ec) { struct IndexFileHeader iHead; (void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle, (char *)&iHead, sizeof(iHead), LARGEINDEXMAGIC, LARGEINDEXVERSION); if (*ec) { Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec); } } #ifdef AFS_NAMEI_ENV if (!*ec) { struct versionStamp stamp; (void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp, sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION); if (*ec) { Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec); } } #endif /* AFS_NAMEI_ENV */ #if defined(AFS_DEMAND_ATTACH_FS) if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) { VOL_LOCK; if (!VCanScheduleSalvage()) { Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec); } VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_NO_OFFLINE); vp->nUsers = 0; goto locked_error; } else if (*ec) { /* volume operation in progress */ VOL_LOCK; goto error_notbroken; } #else /* AFS_DEMAND_ATTACH_FS */ if (*ec) { Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec); goto unlocked_error; } #endif /* AFS_DEMAND_ATTACH_FS */ if (V_needsSalvaged(vp)) { if (vp->specialStatus) vp->specialStatus = 0; VOL_LOCK; #if defined(AFS_DEMAND_ATTACH_FS) if (!VCanScheduleSalvage()) { Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path); } VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_NO_OFFLINE); vp->nUsers = 0; #else /* AFS_DEMAND_ATTACH_FS */ *ec = VSALVAGE; #endif /* AFS_DEMAND_ATTACH_FS */ goto locked_error; } VOL_LOCK; 
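    /* Seed the next vnode uniquifier from the on-disk header.  If the header
     * still claims the volume is in use while we are attaching it writeable,
     * the volume apparently was not cleanly detached (e.g. a previous
     * fileserver instance went down with it online), so flag it for salvage
     * below instead of attaching it as-is. */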
vp->nextVnodeUnique = V_uniquifier(vp); if (VShouldCheckInUse(mode) && V_inUse(vp) && VolumeWriteable(vp)) { if (!V_needsSalvaged(vp)) { V_needsSalvaged(vp) = 1; VUpdateVolume_r(ec, vp, 0); } #if defined(AFS_DEMAND_ATTACH_FS) if (!VCanScheduleSalvage()) { Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path); } VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_NO_OFFLINE); vp->nUsers = 0; #else /* AFS_DEMAND_ATTACH_FS */ Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path); *ec = VSALVAGE; #endif /* AFS_DEMAND_ATTACH_FS */ goto locked_error; } if (programType == fileServer && V_destroyMe(vp) == DESTROY_ME) { /* Only check destroyMe if we are the fileserver, since the * volserver et al sometimes need to work with volumes with * destroyMe set. Examples are 'temporary' volumes the * volserver creates, and when we create a volume (destroyMe * is set on creation; sometimes a separate volserver * transaction is created to clear destroyMe). */ #if defined(AFS_DEMAND_ATTACH_FS) /* schedule a salvage so the volume goes away on disk */ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_NO_OFFLINE); VChangeState_r(vp, VOL_STATE_ERROR); vp->nUsers = 0; forcefree = 1; #endif /* AFS_DEMAND_ATTACH_FS */ Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path); *ec = VNOVOL; goto locked_error; } vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL; #ifndef BITMAP_LATER if (programType == fileServer && VolumeWriteable(vp)) { int i; for (i = 0; i < nVNODECLASSES; i++) { VGetBitmap_r(ec, vp, i); if (*ec) { #ifdef AFS_DEMAND_ATTACH_FS VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_NO_OFFLINE); vp->nUsers = 0; #endif /* AFS_DEMAND_ATTACH_FS */ Log("VAttachVolume: error getting bitmap for volume (%s)\n", path); goto locked_error; } } } #endif /* BITMAP_LATER */ if (VInit >= 2 && V_needsCallback(vp)) { if (V_BreakVolumeCallbacks) { Log("VAttachVolume: Volume %lu was changed externally; breaking callbacks\n", afs_printable_uint32_lu(V_id(vp))); V_needsCallback(vp) = 0; VOL_UNLOCK; (*V_BreakVolumeCallbacks) (V_id(vp)); VOL_LOCK; VUpdateVolume_r(ec, vp, 0); } #ifdef FSSYNC_BUILD_CLIENT else if (VCanUseFSSYNC()) { afs_int32 fsync_code; V_needsCallback(vp) = 0; VOL_UNLOCK; fsync_code = FSYNC_VolOp(V_id(vp), NULL, FSYNC_VOL_BREAKCBKS, FSYNC_WHATEVER, NULL); VOL_LOCK; if (fsync_code) { V_needsCallback(vp) = 1; Log("Error trying to tell the fileserver to break callbacks for " "changed volume %lu; error code %ld\n", afs_printable_uint32_lu(V_id(vp)), afs_printable_int32_ld(fsync_code)); } else { VUpdateVolume_r(ec, vp, 0); } } #endif /* FSSYNC_BUILD_CLIENT */ if (*ec) { Log("VAttachVolume: error %d clearing needsCallback on volume " "%lu; needs salvage\n", (int)*ec, afs_printable_uint32_lu(V_id(vp))); #ifdef AFS_DEMAND_ATTACH_FS VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_NO_OFFLINE); vp->nUsers = 0; #else /* !AFS_DEMAND_ATTACH_FS */ *ec = VSALVAGE; #endif /* !AFS_DEMAND_ATTACh_FS */ goto locked_error; } } if (programType == fileServer) { if (vp->specialStatus) vp->specialStatus = 0; if (V_blessed(vp) && V_inService(vp) && !V_needsSalvaged(vp)) { V_inUse(vp) = fileServer; V_offlineMessage(vp)[0] = '\0'; } if (!V_inUse(vp)) { *ec = VNOVOL; #ifdef AFS_DEMAND_ATTACH_FS /* Put the vol into PREATTACHED state, so if someone tries to * access it again, we try to attach, see that we're not blessed, * and give a VNOVOL error again. Putting it into UNATTACHED state * would result in a VOFFLINE error instead. 
*/ error_state = VOL_STATE_PREATTACHED; #endif /* AFS_DEMAND_ATTACH_FS */ /* mimic e.g. GetVolume errors */ if (!V_blessed(vp)) { Log("Volume %lu offline: not blessed\n", afs_printable_uint32_lu(V_id(vp))); FreeVolumeHeader(vp); } else if (!V_inService(vp)) { Log("Volume %lu offline: not in service\n", afs_printable_uint32_lu(V_id(vp))); FreeVolumeHeader(vp); } else { Log("Volume %lu offline: needs salvage\n", afs_printable_uint32_lu(V_id(vp))); *ec = VSALVAGE; #ifdef AFS_DEMAND_ATTACH_FS error_state = VOL_STATE_ERROR; /* see if we can recover */ VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, 0 /*flags*/); #endif } #ifdef AFS_DEMAND_ATTACH_FS vp->nUsers = 0; #endif goto locked_error; } } else { #ifdef AFS_DEMAND_ATTACH_FS if ((mode != V_PEEK) && (mode != V_SECRETLY)) V_inUse(vp) = programType; #endif /* AFS_DEMAND_ATTACH_FS */ V_checkoutMode(vp) = mode; } AddVolumeToHashTable(vp, V_id(vp)); #ifdef AFS_DEMAND_ATTACH_FS if (VCanUnlockAttached() && (V_attachFlags(vp) & VOL_LOCKED)) { VUnlockVolume(vp); } if ((programType != fileServer) || (V_inUse(vp) == fileServer)) { AddVolumeToVByPList_r(vp); VLRU_Add_r(vp); VChangeState_r(vp, VOL_STATE_ATTACHED); } else { VChangeState_r(vp, VOL_STATE_UNATTACHED); } #endif return vp; #ifndef AFS_DEMAND_ATTACH_FS unlocked_error: #endif VOL_LOCK; locked_error: #ifdef AFS_DEMAND_ATTACH_FS if (!VIsErrorState(V_attachState(vp))) { if (VIsErrorState(error_state)) { Log("attach2: forcing vol %u to error state (state %u flags 0x%x ec %d)\n", vp->hashid, V_attachState(vp), V_attachFlags(vp), *ec); } VChangeState_r(vp, error_state); } #endif /* AFS_DEMAND_ATTACH_FS */ if (read_header) { VReleaseVolumeHandles_r(vp); } error_notbroken: #ifdef AFS_DEMAND_ATTACH_FS VCheckSalvage(vp); if (forcefree) { FreeVolume(vp); } else { VCheckFree(vp); } #else /* !AFS_DEMAND_ATTACH_FS */ FreeVolume(vp); #endif /* !AFS_DEMAND_ATTACH_FS */ return NULL; } /* Attach an existing volume. The volume also normally goes online at this time. An offline volume must be reattached to make it go online. */ Volume * VAttachVolume(Error * ec, VolumeId volumeId, int mode) { Volume *retVal; VOL_LOCK; retVal = VAttachVolume_r(ec, volumeId, mode); VOL_UNLOCK; return retVal; } Volume * VAttachVolume_r(Error * ec, VolumeId volumeId, int mode) { char *part, *name; VGetVolumePath(ec, volumeId, &part, &name); if (*ec) { Volume *vp; Error error; vp = VGetVolume_r(&error, volumeId); if (vp) { osi_Assert(V_inUse(vp) == 0); VDetachVolume_r(ec, vp); } return NULL; } return VAttachVolumeByName_r(ec, part, name, mode); } /* Increment a reference count to a volume, sans context swaps. Requires * possibly reading the volume header in from the disk, since there's * an invariant in the volume package that nUsers>0 ==> vp->header is valid. * * N.B. This call can fail if we can't read in the header!! In this case * we still guarantee we won't context swap, but the ref count won't be * incremented (otherwise we'd violate the invariant). 
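 *
 * A minimal sketch of the expected calling pattern (hypothetical caller;
 * per the _r convention, VOL_LOCK is held): a nonzero return is the error
 * from loading the header, and means the reference was NOT taken:
 *
 *     if (VHold_r(vp) == 0) {
 *         ... use vp ...
 *         VPutVolume_r(vp);
 *     }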
*/ /* NOTE: with the demand attach fileserver extensions, the global lock * is dropped within VHold */ #ifdef AFS_DEMAND_ATTACH_FS static int VHold_r(Volume * vp) { Error error; VCreateReservation_r(vp); VWaitExclusiveState_r(vp); LoadVolumeHeader(&error, vp); if (error) { VCancelReservation_r(vp); return error; } vp->nUsers++; VCancelReservation_r(vp); return 0; } #else /* AFS_DEMAND_ATTACH_FS */ static int VHold_r(Volume * vp) { Error error; LoadVolumeHeader(&error, vp); if (error) return error; vp->nUsers++; return 0; } #endif /* AFS_DEMAND_ATTACH_FS */ /**** volume timeout-related stuff ****/ #ifdef AFS_PTHREAD_ENV static struct timespec *shutdown_timeout; static pthread_once_t shutdown_timeout_once = PTHREAD_ONCE_INIT; static_inline int VTimedOut(const struct timespec *ts) { struct timeval tv; int code; if (ts->tv_sec == 0) { /* short-circuit; this will have always timed out */ return 1; } code = gettimeofday(&tv, NULL); if (code) { Log("Error %d from gettimeofday, assuming we have not timed out\n", errno); /* assume no timeout; failure mode is we just wait longer than normal * instead of returning errors when we shouldn't */ return 0; } if (tv.tv_sec < ts->tv_sec || (tv.tv_sec == ts->tv_sec && tv.tv_usec*1000 < ts->tv_nsec)) { return 0; } return 1; } /** * Calculate an absolute timeout. * * @param[out] ts A timeout that is "timeout" seconds from now, if we return * NULL, the memory is not touched * @param[in] timeout How long the timeout should be from now * * @return timeout to use * @retval NULL no timeout; wait forever * @retval non-NULL the given value for "ts" * * @internal */ static struct timespec * VCalcTimeout(struct timespec *ts, afs_int32 timeout) { struct timeval now; int code; if (timeout < 0) { return NULL; } if (timeout == 0) { ts->tv_sec = ts->tv_nsec = 0; return ts; } code = gettimeofday(&now, NULL); if (code) { Log("Error %d from gettimeofday, falling back to 'forever' timeout\n", errno); return NULL; } ts->tv_sec = now.tv_sec + timeout; ts->tv_nsec = now.tv_usec * 1000; return ts; } /** * Initialize the shutdown_timeout global. */ static void VShutdownTimeoutInit(void) { struct timespec *ts; ts = malloc(sizeof(*ts)); shutdown_timeout = VCalcTimeout(ts, vol_opts.offline_shutdown_timeout); if (!shutdown_timeout) { free(ts); } } /** * Figure out the timeout that should be used for waiting for offline volumes. * * @param[out] ats Storage space for a local timeout value if needed * * @return The timeout value that should be used * @retval NULL No timeout; wait forever for offlining volumes * @retval non-NULL A pointer to the absolute time that should be used as * the deadline for waiting for offlining volumes. * * @note If we return non-NULL, the pointer we return may or may not be the * same as "ats" */ static const struct timespec * VOfflineTimeout(struct timespec *ats) { if (vol_shutting_down) { osi_Assert(pthread_once(&shutdown_timeout_once, VShutdownTimeoutInit) == 0); return shutdown_timeout; } else { return VCalcTimeout(ats, vol_opts.offline_timeout); } } #else /* AFS_PTHREAD_ENV */ /* Waiting a certain amount of time for offlining volumes is not supported * for LWP due to a lack of primitives. 
So, we never time out */ # define VTimedOut(x) (0) # define VOfflineTimeout(x) (NULL) #endif /* !AFS_PTHREAD_ENV */ #if 0 static int VHold(Volume * vp) { int retVal; VOL_LOCK; retVal = VHold_r(vp); VOL_UNLOCK; return retVal; } #endif static afs_int32 VIsGoingOffline_r(struct Volume *vp) { afs_int32 code = 0; if (vp->goingOffline) { if (vp->specialStatus) { code = vp->specialStatus; } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) { code = VNOVOL; } else { code = VOFFLINE; } } return code; } /** * Tell the caller if a volume is waiting to go offline. * * @param[in] vp The volume we want to know about * * @return volume status * @retval 0 volume is not waiting to go offline, go ahead and use it * @retval nonzero volume is waiting to offline, and give the returned code * as an error to anyone accessing the volume * * @pre VOL_LOCK is NOT held * @pre caller holds a heavyweight reference on vp */ afs_int32 VIsGoingOffline(struct Volume *vp) { afs_int32 code; VOL_LOCK; code = VIsGoingOffline_r(vp); VOL_UNLOCK; return code; } /** * Register an RX call with a volume. * * @param[inout] ec Error code; if unset when passed in, may be set if * the volume starts going offline * @param[out] client_ec @see GetVolume * @param[in] vp Volume struct * @param[in] cbv VCallByVol struct containing the RX call to register * * @pre VOL_LOCK held * @pre caller holds heavy ref on vp * * @internal */ static void VRegisterCall_r(Error *ec, Error *client_ec, Volume *vp, struct VCallByVol *cbv) { if (vp && cbv) { #ifdef AFS_DEMAND_ATTACH_FS if (!*ec) { /* just in case the volume started going offline after we got the * reference to it... otherwise, if the volume started going * offline right at the end of GetVolume(), we might race with the * RX call scanner, and return success and add our cbv to the * rx_call_list _after_ the scanner has scanned the list. */ *ec = VIsGoingOffline_r(vp); if (client_ec) { *client_ec = *ec; } } while (V_attachState(vp) == VOL_STATE_SCANNING_RXCALLS) { VWaitStateChange_r(vp); } #endif /* AFS_DEMAND_ATTACH_FS */ queue_Prepend(&vp->rx_call_list, cbv); } } /** * Deregister an RX call with a volume. * * @param[in] vp Volume struct * @param[in] cbv VCallByVol struct containing the RX call to deregister * * @pre VOL_LOCK held * @pre caller holds heavy ref on vp * * @internal */ static void VDeregisterCall_r(Volume *vp, struct VCallByVol *cbv) { if (cbv && queue_IsOnQueue(cbv)) { #ifdef AFS_DEMAND_ATTACH_FS while (V_attachState(vp) == VOL_STATE_SCANNING_RXCALLS) { VWaitStateChange_r(vp); } #endif /* AFS_DEMAND_ATTACH_FS */ queue_Remove(cbv); } } /***************************************************/ /* get and put volume routines */ /***************************************************/ /** * put back a heavyweight reference to a volume object. * * @param[in] vp volume object pointer * * @pre VOL_LOCK held * * @post heavyweight volume reference put back. * depending on state, volume may have been taken offline, * detached, salvaged, freed, etc. * * @internal volume package internal use only */ void VPutVolume_r(Volume * vp) { osi_Assert(--vp->nUsers >= 0); if (vp->nUsers == 0) { VCheckOffline(vp); ReleaseVolumeHeader(vp->header); #ifdef AFS_DEMAND_ATTACH_FS if (!VCheckDetach(vp)) { VCheckSalvage(vp); VCheckFree(vp); } #else /* AFS_DEMAND_ATTACH_FS */ VCheckDetach(vp); #endif /* AFS_DEMAND_ATTACH_FS */ } } void VPutVolume(Volume * vp) { VOL_LOCK; VPutVolume_r(vp); VOL_UNLOCK; } /** * Puts a volume reference obtained with VGetVolumeWithCall. 
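 *
 * A minimal pairing sketch (illustrative only; everything other than the two
 * volume-package calls is hypothetical):
 *
 *     struct VCallByVol cbv;
 *     cbv.call = rxcall;             (the RX call being served)
 *     vp = VGetVolumeWithCall(&ec, &cec, volumeId, NULL, &cbv);
 *     if (vp) {
 *         ... serve the request ...
 *         VPutVolumeWithCall(vp, &cbv);
 *     }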
* * @param[in] vp Volume struct * @param[in] cbv VCallByVol struct given to VGetVolumeWithCall, or NULL if none * * @pre VOL_LOCK is NOT held */ void VPutVolumeWithCall(Volume *vp, struct VCallByVol *cbv) { VOL_LOCK; VDeregisterCall_r(vp, cbv); VPutVolume_r(vp); VOL_UNLOCK; } /* Get a pointer to an attached volume. The pointer is returned regardless of whether or not the volume is in service or on/off line. An error code, however, is returned with an indication of the volume's status */ Volume * VGetVolume(Error * ec, Error * client_ec, VolId volumeId) { Volume *retVal; VOL_LOCK; retVal = GetVolume(ec, client_ec, volumeId, NULL, 0); VOL_UNLOCK; return retVal; } /** * Get a volume reference associated with an RX call. * * @param[out] ec @see GetVolume * @param[out] client_ec @see GetVolume * @param[in] volumeId @see GetVolume * @param[in] ts How long to wait for going-offline volumes (absolute time). * If NULL, wait forever. If ts->tv_sec == 0, return immediately * with an error if the volume is going offline. * @param[in] cbv Contains an RX call to be associated with this volume * reference. This call may be interrupted if the volume is * requested to go offline while we hold a ref on it. Give NULL * to not associate an RX call with this reference. * * @return @see GetVolume * * @note for LWP builds, ts must be NULL * * @note A reference obtained with this function MUST be put back with * VPutVolumeWithCall */ Volume * VGetVolumeWithCall(Error * ec, Error * client_ec, VolId volumeId, const struct timespec *ts, struct VCallByVol *cbv) { Volume *retVal; VOL_LOCK; retVal = GetVolume(ec, client_ec, volumeId, NULL, ts); VRegisterCall_r(ec, client_ec, retVal, cbv); VOL_UNLOCK; return retVal; } Volume * VGetVolume_r(Error * ec, VolId volumeId) { return GetVolume(ec, NULL, volumeId, NULL, NULL); } /* try to get a volume we've previously looked up */ /* for demand attach fs, caller MUST NOT hold a ref count on vp */ Volume * VGetVolumeByVp_r(Error * ec, Volume * vp) { return GetVolume(ec, NULL, vp->hashid, vp, NULL); } /** * private interface for getting a volume handle * * @param[out] ec error code (0 if no error) * @param[out] client_ec wire error code to be given to clients * @param[in] volumeId ID of the volume we want * @param[in] hint optional hint for hash lookups, or NULL * @param[in] timeout absolute deadline for waiting for the volume to go * offline, if it is going offline. NULL to wait forever. * * @return a volume handle for the specified volume * @retval NULL an error occurred, or the volume is in such a state that * we cannot load a header or return any volume struct * * @note for DAFS, caller must NOT hold a ref count on 'hint' * * @note 'timeout' is only checked if the volume is actually going offline; so * if you pass timeout->tv_sec = 0, this will exhibit typical * nonblocking behavior. 
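 *
 * @note As an illustration of the previous note (internal sketch only), a
 *       caller that must not block on an offlining volume can pass a zeroed
 *       timespec,
 *
 *           struct timespec ts;
 *           ts.tv_sec = ts.tv_nsec = 0;
 *           vp = GetVolume(&ec, NULL, volumeId, NULL, &ts);
 *
 *       which returns immediately with *ec set (typically VOFFLINE) rather
 *       than waiting for the volume to finish going offline.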
* * @note for LWP builds, 'timeout' must be NULL */ static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, const struct timespec *timeout) { Volume *vp = hint; /* pull this profiling/debugging code out of regular builds */ #ifdef notdef #define VGET_CTR_INC(x) x++ unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V5 = 0, V6 = 0, V7 = 0, V8 = 0, V9 = 0; unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0; #else #define VGET_CTR_INC(x) #endif #ifdef AFS_DEMAND_ATTACH_FS Volume *avp, * rvp = hint; #endif /* * if VInit is zero, the volume package dynamic * data structures have not been initialized yet, * and we must immediately return an error */ if (VInit == 0) { vp = NULL; *ec = VOFFLINE; if (client_ec) { *client_ec = VOFFLINE; } goto not_inited; } #ifdef AFS_DEMAND_ATTACH_FS if (rvp) { VCreateReservation_r(rvp); } #endif /* AFS_DEMAND_ATTACH_FS */ for (;;) { *ec = 0; if (client_ec) *client_ec = 0; VGET_CTR_INC(V0); vp = VLookupVolume_r(ec, volumeId, vp); if (*ec) { vp = NULL; break; } #ifdef AFS_DEMAND_ATTACH_FS if (rvp && (rvp != vp)) { /* break reservation on old vp */ VCancelReservation_r(rvp); rvp = NULL; } #endif /* AFS_DEMAND_ATTACH_FS */ if (!vp) { VGET_CTR_INC(V1); if (VInit < 2) { VGET_CTR_INC(V2); /* Until we have reached an initialization level of 2 * we don't know whether this volume exists or not. * We can't sleep and retry later because before a volume * is attached, the caller tries to get it first. Just * return VOFFLINE and the caller can choose whether to * retry the command or not. */ *ec = VOFFLINE; break; } *ec = VNOVOL; break; } VGET_CTR_INC(V3); IncUInt64(&VStats.hdr_gets); #ifdef AFS_DEMAND_ATTACH_FS /* block if someone else is performing an exclusive op on this volume */ if (rvp != vp) { rvp = vp; VCreateReservation_r(rvp); } VWaitExclusiveState_r(vp); /* short circuit with VNOVOL in the following circumstances: * * - VOL_STATE_ERROR * - VOL_STATE_SHUTTING_DOWN */ if ((V_attachState(vp) == VOL_STATE_ERROR) || (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN) || (V_attachState(vp) == VOL_STATE_GOING_OFFLINE)) { *ec = VNOVOL; vp = NULL; break; } /* * short circuit with VOFFLINE for VOL_STATE_UNATTACHED and * VNOVOL for VOL_STATE_DELETED */ if ((V_attachState(vp) == VOL_STATE_UNATTACHED) || (V_attachState(vp) == VOL_STATE_DELETED)) { if (vp->specialStatus) { *ec = vp->specialStatus; } else if (V_attachState(vp) == VOL_STATE_DELETED) { *ec = VNOVOL; } else { *ec = VOFFLINE; } vp = NULL; break; } /* allowable states: * - PREATTACHED * - ATTACHED * - SALVAGING * - SALVAGE_REQ */ if (vp->salvage.requested) { VUpdateSalvagePriority_r(vp); } if (V_attachState(vp) == VOL_STATE_PREATTACHED) { if (vp->specialStatus) { *ec = vp->specialStatus; vp = NULL; break; } avp = VAttachVolumeByVp_r(ec, vp, 0); if (avp) { if (vp != avp) { /* VAttachVolumeByVp_r can return a pointer * != the vp passed to it under certain * conditions; make sure we don't leak * reservations if that happens */ vp = avp; VCancelReservation_r(rvp); rvp = avp; VCreateReservation_r(rvp); } VPutVolume_r(avp); } if (*ec) { int endloop = 0; switch (*ec) { case VSALVAGING: break; case VOFFLINE: if (!vp->pending_vol_op) { endloop = 1; } break; default: *ec = VNOVOL; endloop = 1; } if (endloop) { vp = NULL; break; } } } if (VIsSalvaging(vp) || (*ec == VSALVAGING)) { if (client_ec) { /* see CheckVnode() in afsfileprocs.c for an explanation * of this error code logic */ afs_uint32 now = FT_ApproxTime(); if ((vp->stats.last_salvage + (10 * 60)) >= now) { *client_ec = VBUSY; } 
else { *client_ec = VRESTARTING; } } *ec = VSALVAGING; vp = NULL; break; } if (VIsErrorState(V_attachState(vp))) { /* make sure we don't take a vp in VOL_STATE_ERROR state and use * it, or transition it out of that state */ if (!*ec) { *ec = VNOVOL; } vp = NULL; break; } /* * this test MUST happen after VAttachVolymeByVp, so vol_op_state is * not VolOpRunningUnknown (attach2 would have converted it to Online * or Offline) */ /* only valid before/during demand attachment */ osi_Assert(!vp->pending_vol_op || vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown); /* deny getvolume due to running mutually exclusive vol op */ if (vp->pending_vol_op && vp->pending_vol_op->vol_op_state==FSSYNC_VolOpRunningOffline) { /* * volume cannot remain online during this volume operation. * notify client. */ if (vp->specialStatus) { /* * special status codes outrank normal VOFFLINE code */ *ec = vp->specialStatus; if (client_ec) { *client_ec = vp->specialStatus; } } else { if (client_ec) { /* see CheckVnode() in afsfileprocs.c for an explanation * of this error code logic */ afs_uint32 now = FT_ApproxTime(); if ((vp->stats.last_vol_op + (10 * 60)) >= now) { *client_ec = VBUSY; } else { *client_ec = VRESTARTING; } } *ec = VOFFLINE; } VChangeState_r(vp, VOL_STATE_UNATTACHED); FreeVolumeHeader(vp); vp = NULL; break; } #endif /* AFS_DEMAND_ATTACH_FS */ LoadVolumeHeader(ec, vp); if (*ec) { VGET_CTR_INC(V6); /* Only log the error if it was a totally unexpected error. Simply * a missing inode is likely to be caused by the volume being deleted */ if (errno != ENXIO || LogLevel) Log("Volume %u: couldn't reread volume header\n", vp->hashid); #ifdef AFS_DEMAND_ATTACH_FS if (VCanScheduleSalvage()) { VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0 /*flags*/); } else { FreeVolume(vp); vp = NULL; } #else /* AFS_DEMAND_ATTACH_FS */ FreeVolume(vp); vp = NULL; #endif /* AFS_DEMAND_ATTACH_FS */ break; } VGET_CTR_INC(V7); if (vp->shuttingDown) { VGET_CTR_INC(V8); *ec = VNOVOL; vp = NULL; break; } if (programType == fileServer) { VGET_CTR_INC(V9); if (vp->goingOffline) { if (timeout && VTimedOut(timeout)) { /* we've timed out; don't wait for the vol */ } else { VGET_CTR_INC(V10); #ifdef AFS_DEMAND_ATTACH_FS /* wait for the volume to go offline */ if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) { VTimedWaitStateChange_r(vp, timeout, NULL); } #elif defined(AFS_PTHREAD_ENV) VOL_CV_TIMEDWAIT(&vol_put_volume_cond, timeout, NULL); #else /* AFS_PTHREAD_ENV */ /* LWP has no timed wait, so the caller better not be * expecting one */ osi_Assert(!timeout); LWP_WaitProcess(VPutVolume); #endif /* AFS_PTHREAD_ENV */ continue; } } if (vp->specialStatus) { VGET_CTR_INC(V11); *ec = vp->specialStatus; } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) { VGET_CTR_INC(V12); *ec = VNOVOL; } else if (V_inUse(vp) == 0 || vp->goingOffline) { VGET_CTR_INC(V13); *ec = VOFFLINE; } else { VGET_CTR_INC(V14); } } break; } VGET_CTR_INC(V15); #ifdef AFS_DEMAND_ATTACH_FS /* if no error, bump nUsers */ if (vp) { vp->nUsers++; VLRU_UpdateAccess_r(vp); } if (rvp) { VCancelReservation_r(rvp); rvp = NULL; } if (client_ec && !*client_ec) { *client_ec = *ec; } #else /* AFS_DEMAND_ATTACH_FS */ /* if no error, bump nUsers */ if (vp) { vp->nUsers++; } if (client_ec) { *client_ec = *ec; } #endif /* AFS_DEMAND_ATTACH_FS */ not_inited: osi_Assert(vp || *ec); return vp; } /***************************************************/ /* Volume offline/detach routines */ /***************************************************/ /* caller MUST hold a heavyweight ref on vp */ 
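/*
 * A minimal calling sketch for VTakeOffline (hypothetical fileserver-side
 * caller): the heavyweight reference required above is obtained with
 * VGetVolume and put back afterwards; the volume is marked as needing
 * salvage and begins going offline.
 *
 *     vp = VGetVolume(&ec, NULL, volumeId);
 *     if (vp && !ec) {
 *         VTakeOffline(vp);
 *     }
 *     if (vp) {
 *         VPutVolume(vp);
 *     }
 */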
#ifdef AFS_DEMAND_ATTACH_FS void VTakeOffline_r(Volume * vp) { Error error; osi_Assert(vp->nUsers > 0); osi_Assert(programType == fileServer); VCreateReservation_r(vp); VWaitExclusiveState_r(vp); vp->goingOffline = 1; V_needsSalvaged(vp) = 1; VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0); VCancelReservation_r(vp); } #else /* AFS_DEMAND_ATTACH_FS */ void VTakeOffline_r(Volume * vp) { osi_Assert(vp->nUsers > 0); osi_Assert(programType == fileServer); vp->goingOffline = 1; V_needsSalvaged(vp) = 1; } #endif /* AFS_DEMAND_ATTACH_FS */ void VTakeOffline(Volume * vp) { VOL_LOCK; VTakeOffline_r(vp); VOL_UNLOCK; } /** * force a volume offline. * * @param[in] vp volume object pointer * @param[in] flags flags (see note below) * * @note the flag VOL_FORCEOFF_NOUPDATE is a recursion control flag * used when VUpdateVolume_r needs to call VForceOffline_r * (which in turn would normally call VUpdateVolume_r) * * @see VUpdateVolume_r * * @pre VOL_LOCK must be held. * for DAFS, caller must hold ref. * * @note for DAFS, it _is safe_ to call this function from an * exclusive state * * @post needsSalvaged flag is set. * for DAFS, salvage is requested. * no further references to the volume through the volume * package will be honored. * all file descriptor and vnode caches are invalidated. * * @warning this is a heavy-handed interface. it results in * a volume going offline regardless of the current * reference count state. * * @internal volume package internal use only */ void VForceOffline_r(Volume * vp, int flags) { Error error; if (!V_inUse(vp)) { #ifdef AFS_DEMAND_ATTACH_FS VChangeState_r(vp, VOL_STATE_ERROR); #endif return; } strcpy(V_offlineMessage(vp), "Forced offline due to internal error: volume needs to be salvaged"); Log("Volume %u forced offline: it needs salvaging!\n", V_id(vp)); V_inUse(vp) = 0; vp->goingOffline = 0; V_needsSalvaged(vp) = 1; if (!(flags & VOL_FORCEOFF_NOUPDATE)) { VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF); } #ifdef AFS_DEMAND_ATTACH_FS VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0 /*flags*/); #endif /* AFS_DEMAND_ATTACH_FS */ #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&vol_put_volume_cond); #else /* AFS_PTHREAD_ENV */ LWP_NoYieldSignal(VPutVolume); #endif /* AFS_PTHREAD_ENV */ VReleaseVolumeHandles_r(vp); } /** * force a volume offline. * * @param[in] vp volume object pointer * * @see VForceOffline_r */ void VForceOffline(Volume * vp) { VOL_LOCK; VForceOffline_r(vp, 0); VOL_UNLOCK; } /** * Iterate over the RX calls associated with a volume, and interrupt them. 
* * @param[in] vp The volume whose RX calls we want to scan * * @pre VOL_LOCK held */ static void VScanCalls_r(struct Volume *vp) { struct VCallByVol *cbv, *ncbv; afs_int32 err; #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; #endif if (queue_IsEmpty(&vp->rx_call_list)) return; /* no calls to interrupt */ if (!vol_opts.interrupt_rxcall) return; /* we have no function with which to interrupt calls */ err = VIsGoingOffline_r(vp); if (!err) return; /* we're not going offline anymore */ #ifdef AFS_DEMAND_ATTACH_FS VWaitExclusiveState_r(vp); state_save = VChangeState_r(vp, VOL_STATE_SCANNING_RXCALLS); VOL_UNLOCK; #endif /* AFS_DEMAND_ATTACH_FS */ for(queue_Scan(&vp->rx_call_list, cbv, ncbv, VCallByVol)) { if (LogLevel > 0) { struct rx_peer *peer; char hoststr[16]; peer = rx_PeerOf(rx_ConnectionOf(cbv->call)); Log("Offlining volume %lu while client %s:%u is trying to read " "from it; kicking client off with error %ld\n", (long unsigned) vp->hashid, afs_inet_ntoa_r(rx_HostOf(peer), hoststr), (unsigned) ntohs(rx_PortOf(peer)), (long) err); } (*vol_opts.interrupt_rxcall) (cbv->call, err); } #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; VChangeState_r(vp, state_save); #endif /* AFS_DEMAND_ATTACH_FS */ } #ifdef AFS_DEMAND_ATTACH_FS /** * Wait for a vp to go offline. * * @param[out] ec 1 if a salvage on the volume has been requested and * salvok == 0, 0 otherwise * @param[in] vp The volume to wait for * @param[in] salvok If 0, we return immediately with *ec = 1 if the volume * has been requested to salvage. Otherwise we keep waiting * until the volume has gone offline. * * @pre VOL_LOCK held * @pre caller holds a lightweight ref on vp * * @note DAFS only */ static void VWaitForOfflineByVp_r(Error *ec, struct Volume *vp, int salvok) { struct timespec timeout_ts; const struct timespec *ts; int timedout = 0; ts = VOfflineTimeout(&timeout_ts); *ec = 0; while (!VIsOfflineState(V_attachState(vp)) && !timedout) { if (!salvok && vp->salvage.requested) { *ec = 1; return; } VTimedWaitStateChange_r(vp, ts, &timedout); } if (!timedout) { /* we didn't time out, so the volume must be offline, so we're done */ return; } /* If we got here, we timed out waiting for the volume to go offline. * Kick off the accessing RX calls and wait again */ VScanCalls_r(vp); while (!VIsOfflineState(V_attachState(vp))) { if (!salvok && vp->salvage.requested) { *ec = 1; return; } VWaitStateChange_r(vp); } } #else /* AFS_DEMAND_ATTACH_FS */ /** * Wait for a volume to go offline. * * @pre VOL_LOCK held * * @note non-DAFS only (for DAFS, use @see WaitForOfflineByVp_r) */ static void VWaitForOffline_r(Error *ec, VolumeId volid) { struct Volume *vp; const struct timespec *ts; #ifdef AFS_PTHREAD_ENV struct timespec timeout_ts; #endif ts = VOfflineTimeout(&timeout_ts); vp = GetVolume(ec, NULL, volid, NULL, ts); if (!vp) { /* error occurred so bad that we can't even get a vp; we have no * information on the vol so we don't know whether to wait, so just * return */ return; } if (!VIsGoingOffline_r(vp)) { /* volume is no longer going offline, so we're done */ VPutVolume_r(vp); return; } /* If we got here, we timed out waiting for the volume to go offline. * Kick off the accessing RX calls and wait again */ VScanCalls_r(vp); VPutVolume_r(vp); vp = NULL; vp = VGetVolume_r(ec, volid); if (vp) { /* In case it was reattached... */ VPutVolume_r(vp); } } #endif /* !AFS_DEMAND_ATTACH_FS */ /* The opposite of VAttachVolume. The volume header is written to disk, with the inUse bit turned off. 
A copy of the header is maintained in memory, however (which is why this is VOffline, not VDetach). */ void VOffline_r(Volume * vp, char *message) { Error error; #ifndef AFS_DEMAND_ATTACH_FS VolumeId vid = V_id(vp); #endif osi_Assert(programType != volumeUtility && programType != volumeServer); if (!V_inUse(vp)) { VPutVolume_r(vp); return; } if (V_offlineMessage(vp)[0] == '\0') strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp))); V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0'; vp->goingOffline = 1; #ifdef AFS_DEMAND_ATTACH_FS VChangeState_r(vp, VOL_STATE_GOING_OFFLINE); VCreateReservation_r(vp); VPutVolume_r(vp); VWaitForOfflineByVp_r(&error, vp, 1); VCancelReservation_r(vp); #else /* AFS_DEMAND_ATTACH_FS */ VPutVolume_r(vp); VWaitForOffline_r(&error, vid); #endif /* AFS_DEMAND_ATTACH_FS */ } #ifdef AFS_DEMAND_ATTACH_FS /** * Take a volume offline in order to perform a volume operation. * * @param[inout] ec address in which to store error code * @param[in] vp volume object pointer * @param[in] message volume offline status message * * @pre * - VOL_LOCK is held * - caller MUST hold a heavyweight ref on vp * * @post * - volume is taken offline * - if possible, volume operation is promoted to running state * - on failure, *ec is set to nonzero * * @note Although this function does not return any value, it may * still fail to promote our pending volume operation to * a running state. Any caller MUST check the value of *ec, * and MUST NOT blindly assume success. * * @warning if the caller does not hold a lightweight ref on vp, * then it MUST NOT reference vp after this function * returns to the caller. * * @internal volume package internal use only */ void VOfflineForVolOp_r(Error *ec, Volume *vp, char *message) { int salvok = 1; osi_Assert(vp->pending_vol_op); if (!V_inUse(vp)) { VPutVolume_r(vp); *ec = 1; return; } if (V_offlineMessage(vp)[0] == '\0') strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp))); V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0'; vp->goingOffline = 1; VChangeState_r(vp, VOL_STATE_GOING_OFFLINE); VCreateReservation_r(vp); VPutVolume_r(vp); if (vp->pending_vol_op->com.programType != salvageServer) { /* do not give corrupted volumes to the volserver */ salvok = 0; } *ec = 0; VWaitForOfflineByVp_r(ec, vp, salvok); VCancelReservation_r(vp); } #endif /* AFS_DEMAND_ATTACH_FS */ void VOffline(Volume * vp, char *message) { VOL_LOCK; VOffline_r(vp, message); VOL_UNLOCK; } /* This gets used for the most part by utility routines that don't want * to keep all the volume headers around. Generally, the file server won't * call this routine, because then the offline message in the volume header * (or other information) won't be available to clients. For NAMEI, also * close the file handles. However, the fileserver does call this during * an attach following a volume operation. 
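 *
 * A minimal sketch of the utility-side pattern (hypothetical caller and
 * names; error handling elided):
 *
 *     vp = VAttachVolumeByName(&ec, partName, volName, V_VOLUPD);
 *     if (vp) {
 *         ... operate on the volume ...
 *         VDetachVolume(&ec, vp);
 *     }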
*/ void VDetachVolume_r(Error * ec, Volume * vp) { #ifdef FSSYNC_BUILD_CLIENT VolumeId volume; struct DiskPartition64 *tpartp; int notifyServer = 0; int useDone = FSYNC_VOL_ON; if (VCanUseFSSYNC()) { notifyServer = vp->needsPutBack; if (V_destroyMe(vp) == DESTROY_ME) useDone = FSYNC_VOL_LEAVE_OFF; # ifdef AFS_DEMAND_ATTACH_FS else if (!V_blessed(vp) || !V_inService(vp)) useDone = FSYNC_VOL_LEAVE_OFF; # endif } # ifdef AFS_DEMAND_ATTACH_FS if (V_needsSalvaged(vp)) { notifyServer = 0; VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, 0); } # endif tpartp = vp->partition; volume = V_id(vp); #endif /* FSSYNC_BUILD_CLIENT */ *ec = 0; /* always "succeeds" */ DeleteVolumeFromHashTable(vp); vp->shuttingDown = 1; #ifdef AFS_DEMAND_ATTACH_FS DeleteVolumeFromVByPList_r(vp); VLRU_Delete_r(vp); VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN); #else if (programType != fileServer) V_inUse(vp) = 0; #endif /* AFS_DEMAND_ATTACH_FS */ VPutVolume_r(vp); /* Will be detached sometime in the future--this is OK since volume is offline */ /* XXX the following code should really be moved to VCheckDetach() since the volume * is not technically detached until the refcounts reach zero */ #ifdef FSSYNC_BUILD_CLIENT if (VCanUseFSSYNC() && notifyServer) { if (notifyServer == VOL_PUTBACK_DELETE) { /* Only send FSYNC_VOL_DONE if the volume was actually deleted. * volserver code will set needsPutBack to VOL_PUTBACK_DELETE * to signify a deleted volume. */ useDone = FSYNC_VOL_DONE; } /* * Note: The server is not notified in the case of a bogus volume * explicitly to make it possible to create a volume, do a partial * restore, then abort the operation without ever putting the volume * online. This is essential in the case of a volume move operation * between two partitions on the same server. In that case, there * would be two instances of the same volume, one of them bogus, * which the file server would attempt to put on line */ FSYNC_VolOp(volume, tpartp->name, useDone, 0, NULL); /* XXX this code path is only hit by volume utilities, thus * V_BreakVolumeCallbacks will always be NULL. if we really * want to break callbacks in this path we need to use FSYNC_VolOp() */ #ifdef notdef /* Dettaching it so break all callbacks on it */ if (V_BreakVolumeCallbacks) { Log("volume %u detached; breaking all call backs\n", volume); (*V_BreakVolumeCallbacks) (volume); } #endif } #endif /* FSSYNC_BUILD_CLIENT */ } void VDetachVolume(Error * ec, Volume * vp) { VOL_LOCK; VDetachVolume_r(ec, vp); VOL_UNLOCK; } /***************************************************/ /* Volume fd/inode handle closing routines */ /***************************************************/ /* For VDetachVolume, we close all cached file descriptors, but keep * the Inode handles in case we need to read from a busy volume. 
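 * In terms of the ihandle package, VCloseVolumeHandles_r uses IH_REALLYCLOSE
 * (cached descriptors closed, handles kept), while VReleaseVolumeHandles_r
 * below uses IH_RELEASE (the handles are given up as well).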
*/ /* for demand attach, caller MUST hold ref count on vp */ static void VCloseVolumeHandles_r(Volume * vp) { #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; state_save = VChangeState_r(vp, VOL_STATE_OFFLINING); VOL_UNLOCK; #endif DFlushVolume(vp->hashid); #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; #endif /* DAFS: VCloseVnodeFiles_r drops the glock internally */ VCloseVnodeFiles_r(vp); #ifdef AFS_DEMAND_ATTACH_FS VOL_UNLOCK; #endif /* Too time consuming and unnecessary for the volserver */ if (programType == fileServer) { IH_CONDSYNC(vp->vnodeIndex[vLarge].handle); IH_CONDSYNC(vp->vnodeIndex[vSmall].handle); IH_CONDSYNC(vp->diskDataHandle); #ifdef AFS_NT40_ENV IH_CONDSYNC(vp->linkHandle); #endif /* AFS_NT40_ENV */ } IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle); IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle); IH_REALLYCLOSE(vp->diskDataHandle); IH_REALLYCLOSE(vp->linkHandle); #ifdef AFS_DEMAND_ATTACH_FS if ((V_attachFlags(vp) & VOL_LOCKED)) { VUnlockVolume(vp); } VOL_LOCK; VChangeState_r(vp, state_save); #endif } /* For both VForceOffline and VOffline, we close all relevant handles. * For VOffline, if we re-attach the volume, the files may possible be * different than before. */ /* for demand attach, caller MUST hold a ref count on vp */ static void VReleaseVolumeHandles_r(Volume * vp) { #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; state_save = VChangeState_r(vp, VOL_STATE_DETACHING); VOL_UNLOCK; #endif DFlushVolume(vp->hashid); #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; #endif VReleaseVnodeFiles_r(vp); /* DAFS: releases the glock internally */ #ifdef AFS_DEMAND_ATTACH_FS VOL_UNLOCK; #endif /* Too time consuming and unnecessary for the volserver */ if (programType == fileServer) { IH_CONDSYNC(vp->vnodeIndex[vLarge].handle); IH_CONDSYNC(vp->vnodeIndex[vSmall].handle); IH_CONDSYNC(vp->diskDataHandle); #ifdef AFS_NT40_ENV IH_CONDSYNC(vp->linkHandle); #endif /* AFS_NT40_ENV */ } IH_RELEASE(vp->vnodeIndex[vLarge].handle); IH_RELEASE(vp->vnodeIndex[vSmall].handle); IH_RELEASE(vp->diskDataHandle); IH_RELEASE(vp->linkHandle); #ifdef AFS_DEMAND_ATTACH_FS if ((V_attachFlags(vp) & VOL_LOCKED)) { VUnlockVolume(vp); } VOL_LOCK; VChangeState_r(vp, state_save); #endif } /***************************************************/ /* Volume write and fsync routines */ /***************************************************/ void VUpdateVolume_r(Error * ec, Volume * vp, int flags) { #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; if (flags & VOL_UPDATE_WAIT) { VCreateReservation_r(vp); VWaitExclusiveState_r(vp); } #endif *ec = 0; if (programType == fileServer) V_uniquifier(vp) = (V_inUse(vp) ? 
V_nextVnodeUnique(vp) + 200 : V_nextVnodeUnique(vp)); #ifdef AFS_DEMAND_ATTACH_FS state_save = VChangeState_r(vp, VOL_STATE_UPDATING); VOL_UNLOCK; #endif WriteVolumeHeader_r(ec, vp); #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; VChangeState_r(vp, state_save); if (flags & VOL_UPDATE_WAIT) { VCancelReservation_r(vp); } #endif if (*ec) { Log("VUpdateVolume: error updating volume header, volume %u (%s)\n", V_id(vp), V_name(vp)); /* try to update on-disk header, * while preventing infinite recursion */ if (!(flags & VOL_UPDATE_NOFORCEOFF)) { VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE); } } } void VUpdateVolume(Error * ec, Volume * vp) { VOL_LOCK; VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT); VOL_UNLOCK; } void VSyncVolume_r(Error * ec, Volume * vp, int flags) { FdHandle_t *fdP; int code; #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; #endif if (flags & VOL_SYNC_WAIT) { VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT); } else { VUpdateVolume_r(ec, vp, 0); } if (!*ec) { #ifdef AFS_DEMAND_ATTACH_FS state_save = VChangeState_r(vp, VOL_STATE_UPDATING); VOL_UNLOCK; #endif fdP = IH_OPEN(V_diskDataHandle(vp)); osi_Assert(fdP != NULL); code = FDH_SYNC(fdP); osi_Assert(code == 0); FDH_CLOSE(fdP); #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; VChangeState_r(vp, state_save); #endif } } void VSyncVolume(Error * ec, Volume * vp) { VOL_LOCK; VSyncVolume_r(ec, vp, VOL_SYNC_WAIT); VOL_UNLOCK; } /***************************************************/ /* Volume dealloaction routines */ /***************************************************/ #ifdef AFS_DEMAND_ATTACH_FS static void FreeVolume(Volume * vp) { /* free the heap space, iff it's safe. * otherwise, pull it out of the hash table, so it * will get deallocated when all refs to it go away */ if (!VCheckFree(vp)) { DeleteVolumeFromHashTable(vp); DeleteVolumeFromVByPList_r(vp); /* make sure we invalidate the header cache entry */ FreeVolumeHeader(vp); } } #endif /* AFS_DEMAND_ATTACH_FS */ static void ReallyFreeVolume(Volume * vp) { int i; if (!vp) return; #ifdef AFS_DEMAND_ATTACH_FS /* debug */ VChangeState_r(vp, VOL_STATE_FREED); if (vp->pending_vol_op) free(vp->pending_vol_op); #endif /* AFS_DEMAND_ATTACH_FS */ for (i = 0; i < nVNODECLASSES; i++) if (vp->vnodeIndex[i].bitmap) free(vp->vnodeIndex[i].bitmap); FreeVolumeHeader(vp); #ifndef AFS_DEMAND_ATTACH_FS DeleteVolumeFromHashTable(vp); #endif /* AFS_DEMAND_ATTACH_FS */ free(vp); } /* check to see if we should shutdown this volume * returns 1 if volume was freed, 0 otherwise */ #ifdef AFS_DEMAND_ATTACH_FS static int VCheckDetach(Volume * vp) { int ret = 0; Error ec = 0; if (vp->nUsers || vp->nWaiters) return ret; if (vp->shuttingDown) { ret = 1; if ((programType != fileServer) && (V_inUse(vp) == programType) && ((V_checkoutMode(vp) == V_VOLUPD) || (V_checkoutMode(vp) == V_SECRETLY) || ((V_checkoutMode(vp) == V_CLONE) && (VolumeWriteable(vp))))) { V_inUse(vp) = 0; VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF); if (ec) { Log("VCheckDetach: volume header update for volume %u " "failed with errno %d\n", vp->hashid, errno); } } VReleaseVolumeHandles_r(vp); VCheckSalvage(vp); ReallyFreeVolume(vp); if (programType == fileServer) { CV_BROADCAST(&vol_put_volume_cond); } } return ret; } #else /* AFS_DEMAND_ATTACH_FS */ static int VCheckDetach(Volume * vp) { int ret = 0; Error ec = 0; if (vp->nUsers) return ret; if (vp->shuttingDown) { ret = 1; if ((programType != fileServer) && (V_inUse(vp) == programType) && ((V_checkoutMode(vp) == V_VOLUPD) || (V_checkoutMode(vp) == V_SECRETLY) || ((V_checkoutMode(vp) == V_CLONE) && (VolumeWriteable(vp))))) { 
V_inUse(vp) = 0; VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF); if (ec) { Log("VCheckDetach: volume header update for volume %u failed with errno %d\n", vp->hashid, errno); } } VReleaseVolumeHandles_r(vp); ReallyFreeVolume(vp); if (programType == fileServer) { #if defined(AFS_PTHREAD_ENV) CV_BROADCAST(&vol_put_volume_cond); #else /* AFS_PTHREAD_ENV */ LWP_NoYieldSignal(VPutVolume); #endif /* AFS_PTHREAD_ENV */ } } return ret; } #endif /* AFS_DEMAND_ATTACH_FS */ /* check to see if we should offline this volume * return 1 if volume went offline, 0 otherwise */ #ifdef AFS_DEMAND_ATTACH_FS static int VCheckOffline(Volume * vp) { int ret = 0; if (vp->goingOffline && !vp->nUsers) { Error error; osi_Assert(programType == fileServer); osi_Assert((V_attachState(vp) != VOL_STATE_ATTACHED) && (V_attachState(vp) != VOL_STATE_FREED) && (V_attachState(vp) != VOL_STATE_PREATTACHED) && (V_attachState(vp) != VOL_STATE_UNATTACHED) && (V_attachState(vp) != VOL_STATE_DELETED)); /* valid states: * * VOL_STATE_GOING_OFFLINE * VOL_STATE_SHUTTING_DOWN * VIsErrorState(V_attachState(vp)) * VIsExclusiveState(V_attachState(vp)) */ VCreateReservation_r(vp); VChangeState_r(vp, VOL_STATE_OFFLINING); ret = 1; /* must clear the goingOffline flag before we drop the glock */ vp->goingOffline = 0; V_inUse(vp) = 0; VLRU_Delete_r(vp); /* perform async operations */ VUpdateVolume_r(&error, vp, 0); VCloseVolumeHandles_r(vp); if (LogLevel) { if (V_offlineMessage(vp)[0]) { Log("VOffline: Volume %lu (%s) is now offline (%s)\n", afs_printable_uint32_lu(V_id(vp)), V_name(vp), V_offlineMessage(vp)); } else { Log("VOffline: Volume %lu (%s) is now offline\n", afs_printable_uint32_lu(V_id(vp)), V_name(vp)); } } /* invalidate the volume header cache entry */ FreeVolumeHeader(vp); /* if nothing changed state to error or salvaging, * drop state to unattached */ if (!VIsErrorState(V_attachState(vp))) { VChangeState_r(vp, VOL_STATE_UNATTACHED); } VCancelReservation_r(vp); /* no usage of vp is safe beyond this point */ } return ret; } #else /* AFS_DEMAND_ATTACH_FS */ static int VCheckOffline(Volume * vp) { int ret = 0; if (vp->goingOffline && !vp->nUsers) { Error error; osi_Assert(programType == fileServer); ret = 1; vp->goingOffline = 0; V_inUse(vp) = 0; VUpdateVolume_r(&error, vp, 0); VCloseVolumeHandles_r(vp); if (LogLevel) { if (V_offlineMessage(vp)[0]) { Log("VOffline: Volume %lu (%s) is now offline (%s)\n", afs_printable_uint32_lu(V_id(vp)), V_name(vp), V_offlineMessage(vp)); } else { Log("VOffline: Volume %lu (%s) is now offline\n", afs_printable_uint32_lu(V_id(vp)), V_name(vp)); } } FreeVolumeHeader(vp); #ifdef AFS_PTHREAD_ENV CV_BROADCAST(&vol_put_volume_cond); #else /* AFS_PTHREAD_ENV */ LWP_NoYieldSignal(VPutVolume); #endif /* AFS_PTHREAD_ENV */ } return ret; } #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* demand attach fs ref counting routines */ /***************************************************/ #ifdef AFS_DEMAND_ATTACH_FS /* the following two functions handle reference counting for * asynchronous operations on volume structs. * * their purpose is to prevent a VDetachVolume or VShutdown * from free()ing the Volume struct during an async i/o op */ /* register with the async volume op ref counter */ /* VCreateReservation_r moved into inline code header because it * is now needed in vnode.c -- tkeiser 11/20/2007 */ /** * decrement volume-package internal refcount. 
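 *
 * @note typical usage in this file (sketch only) brackets an operation that
 *       may drop VOL_LOCK with a reservation:
 *
 *           VCreateReservation_r(vp);
 *           VWaitExclusiveState_r(vp);
 *           ...operation that may drop and reacquire VOL_LOCK...
 *           VCancelReservation_r(vp);
 *
 *       once the reservation has been cancelled, vp must not be touched
 *       again unless the caller holds some other reference on it.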
* * @param vp volume object pointer * * @internal volume package internal use only * * @pre * @arg VOL_LOCK is held * @arg lightweight refcount held * * @post volume waiters refcount is decremented; volume may * have been deallocated/shutdown/offlined/salvaged/ * whatever during the process * * @warning once you have tossed your last reference (you can acquire * lightweight refs recursively) it is NOT SAFE to reference * a volume object pointer ever again * * @see VCreateReservation_r * * @note DEMAND_ATTACH_FS only */ void VCancelReservation_r(Volume * vp) { osi_Assert(--vp->nWaiters >= 0); if (vp->nWaiters == 0) { VCheckOffline(vp); if (!VCheckDetach(vp)) { VCheckSalvage(vp); VCheckFree(vp); } } } /* check to see if we should free this volume now * return 1 if volume was freed, 0 otherwise */ static int VCheckFree(Volume * vp) { int ret = 0; if ((vp->nUsers == 0) && (vp->nWaiters == 0) && !(V_attachFlags(vp) & (VOL_IN_HASH | VOL_ON_VBYP_LIST | VOL_IS_BUSY | VOL_ON_VLRU))) { ReallyFreeVolume(vp); ret = 1; } return ret; } #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* online volume operations routines */ /***************************************************/ #ifdef AFS_DEMAND_ATTACH_FS /** * register a volume operation on a given volume. * * @param[in] vp volume object * @param[in] vopinfo volume operation info object * * @pre VOL_LOCK is held * * @post volume operation info object attached to volume object. * volume operation statistics updated. * * @note by "attached" we mean a copy of the passed in object is made * * @internal volume package internal use only */ int VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) { FSSYNC_VolOp_info * info; /* attach a vol op info node to the volume struct */ info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info)); osi_Assert(info != NULL); memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info)); vp->pending_vol_op = info; /* update stats */ vp->stats.last_vol_op = FT_ApproxTime(); vp->stats.vol_ops++; IncUInt64(&VStats.vol_ops); return 0; } /** * deregister the volume operation attached to this volume. * * @param[in] vp volume object pointer * * @pre VOL_LOCK is held * * @post the volume operation info object is detached from the volume object * * @internal volume package internal use only */ int VDeregisterVolOp_r(Volume * vp) { if (vp->pending_vol_op) { free(vp->pending_vol_op); vp->pending_vol_op = NULL; } return 0; } #endif /* AFS_DEMAND_ATTACH_FS */ /** * determine whether it is safe to leave a volume online during * the volume operation described by the vopinfo object. * * @param[in] vp volume object * @param[in] vopinfo volume operation info object * * @return whether it is safe to leave volume online * @retval 0 it is NOT SAFE to leave the volume online * @retval 1 it is safe to leave the volume online during the operation * * @pre * @arg VOL_LOCK is held * @arg disk header attached to vp (heavyweight ref on vp will guarantee * this condition is met) * * @internal volume package internal use only */ int VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) { return (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline || (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME && (vopinfo->com.reason == V_READONLY || (!VolumeWriteable(vp) && (vopinfo->com.reason == V_CLONE || vopinfo->com.reason == V_DUMP))))); } /** * same as VVolOpLeaveOnline_r, but does not require a volume with an attached * header. 
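 *
 * @note intended calling pattern (illustrative sketch, not a verbatim caller
 *       from this file): consult the headerless variant first, and fall back
 *       to VVolOpLeaveOnline_r once the disk header is available if the
 *       answer comes back as -1:
 *
 *           int online = VVolOpLeaveOnlineNoHeader_r(vp, vopinfo);
 *           if (online == -1) {
 *               ...attach or load the disk header, then...
 *               online = VVolOpLeaveOnline_r(vp, vopinfo);
 *           }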
* * @param[in] vp volume object * @param[in] vopinfo volume operation info object * * @return whether it is safe to leave volume online * @retval 0 it is NOT SAFE to leave the volume online * @retval 1 it is safe to leave the volume online during the operation * @retval -1 unsure; volume header is required in order to know whether or * not is is safe to leave the volume online * * @pre VOL_LOCK is held * * @internal volume package internal use only */ int VVolOpLeaveOnlineNoHeader_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) { /* follow the logic in VVolOpLeaveOnline_r; this is the same, except * assume that we don't know VolumeWriteable; return -1 if the answer * depends on VolumeWriteable */ if (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline) { return 1; } if (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME && vopinfo->com.reason == V_READONLY) { return 1; } if (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME && (vopinfo->com.reason == V_CLONE || vopinfo->com.reason == V_DUMP)) { /* must know VolumeWriteable */ return -1; } return 0; } /** * determine whether VBUSY should be set during this volume operation. * * @param[in] vp volume object * @param[in] vopinfo volume operation info object * * @return whether VBUSY should be set * @retval 0 VBUSY does NOT need to be set * @retval 1 VBUSY SHOULD be set * * @pre VOL_LOCK is held * * @internal volume package internal use only */ int VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo) { return ((vopinfo->com.command == FSYNC_VOL_OFF && vopinfo->com.reason == FSYNC_SALVAGE) || (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME && (vopinfo->com.reason == V_CLONE || vopinfo->com.reason == V_DUMP))); } /***************************************************/ /* online salvager routines */ /***************************************************/ #if defined(AFS_DEMAND_ATTACH_FS) /** * offline a volume to let it be salvaged. * * @param[in] vp Volume to offline * * @return whether we offlined the volume successfully * @retval 0 volume was not offlined * @retval 1 volume is now offline * * @note This is similar to VCheckOffline, but slightly different. We do not * deal with vp->goingOffline, and we try to avoid touching the volume * header except just to set needsSalvaged * * @pre VOL_LOCK held * @pre vp->nUsers == 0 * @pre V_attachState(vp) == VOL_STATE_SALVAGE_REQ */ static int VOfflineForSalvage_r(struct Volume *vp) { Error error; VCreateReservation_r(vp); VWaitExclusiveState_r(vp); if (vp->nUsers || V_attachState(vp) == VOL_STATE_SALVAGING) { /* Someone's using the volume, or someone got to scheduling the salvage * before us. I don't think either of these should be possible, as we * should gain no new heavyweight references while we're trying to * salvage, but just to be sure... */ VCancelReservation_r(vp); return 0; } VChangeState_r(vp, VOL_STATE_OFFLINING); VLRU_Delete_r(vp); if (vp->header) { V_needsSalvaged(vp) = 1; /* ignore error; updating needsSalvaged is just best effort */ VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF); } VCloseVolumeHandles_r(vp); FreeVolumeHeader(vp); /* volume has been effectively offlined; we can mark it in the SALVAGING * state now, which lets FSSYNC give it away */ VChangeState_r(vp, VOL_STATE_SALVAGING); VCancelReservation_r(vp); return 1; } /** * check whether a salvage needs to be performed on this volume. 
* * @param[in] vp pointer to volume object * * @return status code * @retval 0 no salvage scheduled * @retval 1 a salvage has been scheduled with the salvageserver * * @pre VOL_LOCK is held * * @post if salvage request flag is set and nUsers and nWaiters are zero, * then a salvage will be requested * * @note this is one of the event handlers called by VCancelReservation_r * * @note the caller must check if the volume needs to be freed after calling * this; the volume may not have any references or be on any lists after * we return, and we do not free it * * @see VCancelReservation_r * * @internal volume package internal use only. */ static int VCheckSalvage(Volume * vp) { int ret = 0; #if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT) if (vp->nUsers) return ret; if (!vp->salvage.requested) { return ret; } /* prevent recursion; some of the code below creates and removes * lightweight refs, which can call VCheckSalvage */ if (vp->salvage.scheduling) { return ret; } vp->salvage.scheduling = 1; if (V_attachState(vp) == VOL_STATE_SALVAGE_REQ) { if (!VOfflineForSalvage_r(vp)) { vp->salvage.scheduling = 0; return ret; } } if (vp->salvage.requested) { VScheduleSalvage_r(vp); ret = 1; } vp->salvage.scheduling = 0; #endif /* SALVSYNC_BUILD_CLIENT || FSSYNC_BUILD_CLIENT */ return ret; } /** * request volume salvage. * * @param[out] ec computed client error code * @param[in] vp volume object pointer * @param[in] reason reason code (passed to salvageserver via SALVSYNC) * @param[in] flags see flags note below * * @note flags: * VOL_SALVAGE_NO_OFFLINE do not need to wait to offline the volume; it has * not been fully attached * * @pre VOL_LOCK is held. * * @post volume state is changed. * for fileserver, salvage will be requested once refcount reaches zero. * * @return operation status code * @retval 0 volume salvage will occur * @retval 1 volume salvage could not be scheduled * * @note DAFS only * * @note in the fileserver, this call does not synchronously schedule a volume * salvage. rather, it sets volume state so that when volume refcounts * reach zero, a volume salvage will occur. by "refcounts", we mean both * nUsers and nWaiters must be zero. * * @internal volume package internal use only. */ int VRequestSalvage_r(Error * ec, Volume * vp, int reason, int flags) { int code = 0; /* * for DAFS volume utilities that are not supposed to schedule salvages, * just transition to error state instead */ if (!VCanScheduleSalvage()) { VChangeState_r(vp, VOL_STATE_ERROR); *ec = VSALVAGE; return 1; } if (programType != fileServer && !VCanUseFSSYNC()) { VChangeState_r(vp, VOL_STATE_ERROR); *ec = VSALVAGE; return 1; } if (!vp->salvage.requested) { vp->salvage.requested = 1; vp->salvage.reason = reason; vp->stats.last_salvage = FT_ApproxTime(); /* Note that it is not possible for us to reach this point if a * salvage is already running on this volume (even if the fileserver * was restarted during the salvage). If a salvage were running, the * salvager would have write-locked the volume header file, so when * we tried to lock the volume header, the lock would have failed, * and we would have failed during attachment prior to calling * VRequestSalvage. So we know that we can schedule salvages without * fear of a salvage already running for this volume. */ if (vp->stats.salvages < SALVAGE_COUNT_MAX) { /* if we don't need to offline the volume, we can go directly * to SALVAGING. SALVAGING says the volume is offline and is * either salvaging or ready to be handed to the salvager. 
* SALVAGE_REQ says that we want to salvage the volume, but we * are waiting for it to go offline first. */ if (flags & VOL_SALVAGE_NO_OFFLINE) { VChangeState_r(vp, VOL_STATE_SALVAGING); } else { VChangeState_r(vp, VOL_STATE_SALVAGE_REQ); if (vp->nUsers == 0) { /* normally VOfflineForSalvage_r would be called from * PutVolume et al when nUsers reaches 0, but if * it's already 0, just do it ourselves, since PutVolume * isn't going to get called */ VOfflineForSalvage_r(vp); } } /* If we are non-fileserver, we're telling the fileserver to * salvage the vol, so we don't need to give it back separately. */ vp->needsPutBack = 0; *ec = VSALVAGING; } else { Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid); /* make sure neither VScheduleSalvage_r nor * VUpdateSalvagePriority_r try to schedule another salvage */ vp->salvage.requested = vp->salvage.scheduled = 0; VChangeState_r(vp, VOL_STATE_ERROR); *ec = VSALVAGE; code = 1; } } return code; } /** * update salvageserver scheduling priority for a volume. * * @param[in] vp pointer to volume object * * @return operation status * @retval 0 success * @retval 1 request denied, or SALVSYNC communications failure * * @pre VOL_LOCK is held. * * @post in-core salvage priority counter is incremented. if at least * SALVAGE_PRIO_UPDATE_INTERVAL seconds have elapsed since the * last SALVSYNC_RAISEPRIO request, we contact the salvageserver * to update its priority queue. if no salvage is scheduled, * this function is a no-op. * * @note DAFS fileserver only * * @note this should be called whenever a VGetVolume fails due to a * pending salvage request * * @todo should set exclusive state and drop glock around salvsync call * * @internal volume package internal use only. */ int VUpdateSalvagePriority_r(Volume * vp) { int ret=0; #ifdef SALVSYNC_BUILD_CLIENT afs_uint32 now; int code; vp->salvage.prio++; now = FT_ApproxTime(); /* update the salvageserver priority queue occasionally so that * frequently requested volumes get moved to the head of the queue */ if ((vp->salvage.scheduled) && (vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) { code = SALVSYNC_SalvageVolume(vp->hashid, VPartitionPath(vp->partition), SALVSYNC_RAISEPRIO, vp->salvage.reason, vp->salvage.prio, NULL); vp->stats.last_salvage_req = now; if (code != SYNC_OK) { ret = 1; } } #endif /* SALVSYNC_BUILD_CLIENT */ return ret; } #if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT) /* A couple of little helper functions. These return true if we tried to * use this mechanism to schedule a salvage, false if we haven't tried. * If we did try a salvage then the results are contained in code. */ static_inline int try_SALVSYNC(Volume *vp, char *partName, int *code) { #ifdef SALVSYNC_BUILD_CLIENT if (VCanUseSALVSYNC()) { Log("Scheduling salvage for volume %lu on part %s over SALVSYNC\n", afs_printable_uint32_lu(vp->hashid), partName); /* can't use V_id() since there's no guarantee * we have the disk data header at this point */ *code = SALVSYNC_SalvageVolume(vp->hashid, partName, SALVSYNC_SALVAGE, vp->salvage.reason, vp->salvage.prio, NULL); return 1; } #endif return 0; } static_inline int try_FSSYNC(Volume *vp, char *partName, int *code) { #ifdef FSSYNC_BUILD_CLIENT if (VCanUseFSSYNC()) { Log("Scheduling salvage for volume %lu on part %s over FSSYNC\n", afs_printable_uint32_lu(vp->hashid), partName); /* * If we aren't the fileserver, tell the fileserver the volume * needs to be salvaged. 
We could directly tell the * salvageserver, but the fileserver keeps track of some stats * related to salvages, and handles some other salvage-related * complications for us. */ *code = FSYNC_VolOp(vp->hashid, partName, FSYNC_VOL_FORCE_ERROR, FSYNC_SALVAGE, NULL); return 1; } #endif /* FSSYNC_BUILD_CLIENT */ return 0; } /** * schedule a salvage with the salvage server or fileserver. * * @param[in] vp pointer to volume object * * @return operation status * @retval 0 salvage scheduled successfully * @retval 1 salvage not scheduled, or SALVSYNC/FSSYNC com error * * @pre * @arg VOL_LOCK is held. * @arg nUsers and nWaiters should be zero. * * @post salvageserver or fileserver is sent a salvage request * * @note If we are the fileserver, the request will be sent to the salvage * server over SALVSYNC. If we are not the fileserver, the request will be * sent to the fileserver over FSSYNC (FSYNC_VOL_FORCE_ERROR/FSYNC_SALVAGE). * * @note the caller must check if the volume needs to be freed after calling * this; the volume may not have any references or be on any lists after * we return, and we do not free it * * @note DAFS only * * @internal volume package internal use only. */ static int VScheduleSalvage_r(Volume * vp) { int ret=0; int code = 0; VolState state_save; VThreadOptions_t * thread_opts; char partName[16]; osi_Assert(VCanUseSALVSYNC() || VCanUseFSSYNC()); if (vp->nWaiters || vp->nUsers) { return 1; } /* prevent endless salvage,attach,salvage,attach,... loops */ if (vp->stats.salvages >= SALVAGE_COUNT_MAX) return 1; /* * don't perform salvsync ops on certain threads */ thread_opts = pthread_getspecific(VThread_key); if (thread_opts == NULL) { thread_opts = &VThread_defaults; } if (thread_opts->disallow_salvsync || vol_disallow_salvsync) { return 1; } if (vp->salvage.scheduled) { return ret; } VCreateReservation_r(vp); VWaitExclusiveState_r(vp); /* * XXX the scheduling process should really be done asynchronously * to avoid fssync deadlocks */ if (!vp->salvage.scheduled) { /* if we haven't previously scheduled a salvage, do so now * * set the volume to an exclusive state and drop the lock * around the SALVSYNC call */ strlcpy(partName, vp->partition->name, sizeof(partName)); state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ); VOL_UNLOCK; osi_Assert(try_SALVSYNC(vp, partName, &code) || try_FSSYNC(vp, partName, &code)); VOL_LOCK; VChangeState_r(vp, state_save); if (code == SYNC_OK) { vp->salvage.scheduled = 1; vp->stats.last_salvage_req = FT_ApproxTime(); if (VCanUseSALVSYNC()) { /* don't record these stats for non-fileservers; let the * fileserver take care of these */ vp->stats.salvages++; IncUInt64(&VStats.salvages); } } else { ret = 1; switch(code) { case SYNC_BAD_COMMAND: case SYNC_COM_ERROR: break; case SYNC_DENIED: Log("VScheduleSalvage_r: Salvage request for volume %lu " "denied\n", afs_printable_uint32_lu(vp->hashid)); break; case SYNC_FAILED: Log("VScheduleSalvage_r: Salvage request for volume %lu " "failed\n", afs_printable_uint32_lu(vp->hashid)); break; default: Log("VScheduleSalvage_r: Salvage request for volume %lu " "received unknown protocol error %d\n", afs_printable_uint32_lu(vp->hashid), code); break; } if (VCanUseFSSYNC()) { VChangeState_r(vp, VOL_STATE_ERROR); } } } /* NB: this is cancelling the reservation we obtained above, but we do * not call VCancelReservation_r, since that may trigger the vp dtor, * possibly free'ing the vp. We need to keep the vp around after * this, as the caller may reference vp without any refs. 
Instead, it * is the duty of the caller to inspect 'vp' after we return to see if * needs to be freed. */ osi_Assert(--vp->nWaiters >= 0); return ret; } #endif /* SALVSYNC_BUILD_CLIENT || FSSYNC_BUILD_CLIENT */ #ifdef SALVSYNC_BUILD_CLIENT /** * connect to the salvageserver SYNC service. * * @return operation status * @retval 0 failure * @retval 1 success * * @post connection to salvageserver SYNC service established * * @see VConnectSALV_r * @see VDisconnectSALV * @see VReconnectSALV */ int VConnectSALV(void) { int retVal; VOL_LOCK; retVal = VConnectSALV_r(); VOL_UNLOCK; return retVal; } /** * connect to the salvageserver SYNC service. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre VOL_LOCK is held. * * @post connection to salvageserver SYNC service established * * @see VConnectSALV * @see VDisconnectSALV_r * @see VReconnectSALV_r * @see SALVSYNC_clientInit * * @internal volume package internal use only. */ int VConnectSALV_r(void) { return SALVSYNC_clientInit(); } /** * disconnect from the salvageserver SYNC service. * * @return operation status * @retval 0 success * * @pre client should have a live connection to the salvageserver * * @post connection to salvageserver SYNC service destroyed * * @see VDisconnectSALV_r * @see VConnectSALV * @see VReconnectSALV */ int VDisconnectSALV(void) { VOL_LOCK; VDisconnectSALV_r(); VOL_UNLOCK; return 0; } /** * disconnect from the salvageserver SYNC service. * * @return operation status * @retval 0 success * * @pre * @arg VOL_LOCK is held. * @arg client should have a live connection to the salvageserver. * * @post connection to salvageserver SYNC service destroyed * * @see VDisconnectSALV * @see VConnectSALV_r * @see VReconnectSALV_r * @see SALVSYNC_clientFinis * * @internal volume package internal use only. */ int VDisconnectSALV_r(void) { return SALVSYNC_clientFinis(); } /** * disconnect and then re-connect to the salvageserver SYNC service. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre client should have a live connection to the salvageserver * * @post old connection is dropped, and a new one is established * * @see VConnectSALV * @see VDisconnectSALV * @see VReconnectSALV_r */ int VReconnectSALV(void) { int retVal; VOL_LOCK; retVal = VReconnectSALV_r(); VOL_UNLOCK; return retVal; } /** * disconnect and then re-connect to the salvageserver SYNC service. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre * @arg VOL_LOCK is held. * @arg client should have a live connection to the salvageserver. * * @post old connection is dropped, and a new one is established * * @see VConnectSALV_r * @see VDisconnectSALV * @see VReconnectSALV * @see SALVSYNC_clientReconnect * * @internal volume package internal use only. */ int VReconnectSALV_r(void) { return SALVSYNC_clientReconnect(); } #endif /* SALVSYNC_BUILD_CLIENT */ #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* FSSYNC routines */ /***************************************************/ /* This must be called by any volume utility which needs to run while the file server is also running. This is separated from VInitVolumePackage2 so that a utility can fork--and each of the children can independently initialize communication with the file server */ #ifdef FSSYNC_BUILD_CLIENT /** * connect to the fileserver SYNC service. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre * @arg VInit must equal 2. * @arg Program Type must not be fileserver or salvager. 
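 *
 * @note illustrative sketch of use by a volume utility that must run while
 *       the fileserver is also running (assumes VInitVolumePackage2 has
 *       already brought VInit to 2):
 *
 *           if (!VConnectFS()) {
 *               ...unable to reach the fileserver SYNC service...
 *           }
 *           ...perform volume operations...
 *           VDisconnectFS();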
* * @post connection to fileserver SYNC service established * * @see VConnectFS_r * @see VDisconnectFS * @see VChildProcReconnectFS */ int VConnectFS(void) { int retVal; VOL_LOCK; retVal = VConnectFS_r(); VOL_UNLOCK; return retVal; } /** * connect to the fileserver SYNC service. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre * @arg VInit must equal 2. * @arg Program Type must not be fileserver or salvager. * @arg VOL_LOCK is held. * * @post connection to fileserver SYNC service established * * @see VConnectFS * @see VDisconnectFS_r * @see VChildProcReconnectFS_r * * @internal volume package internal use only. */ int VConnectFS_r(void) { int rc; osi_Assert((VInit == 2) && (programType != fileServer) && (programType != salvager)); rc = FSYNC_clientInit(); if (rc) { VSetVInit_r(3); } return rc; } /** * disconnect from the fileserver SYNC service. * * @pre * @arg client should have a live connection to the fileserver. * @arg VOL_LOCK is held. * @arg Program Type must not be fileserver or salvager. * * @post connection to fileserver SYNC service destroyed * * @see VDisconnectFS * @see VConnectFS_r * @see VChildProcReconnectFS_r * * @internal volume package internal use only. */ void VDisconnectFS_r(void) { osi_Assert((programType != fileServer) && (programType != salvager)); FSYNC_clientFinis(); VSetVInit_r(2); } /** * disconnect from the fileserver SYNC service. * * @pre * @arg client should have a live connection to the fileserver. * @arg Program Type must not be fileserver or salvager. * * @post connection to fileserver SYNC service destroyed * * @see VDisconnectFS_r * @see VConnectFS * @see VChildProcReconnectFS */ void VDisconnectFS(void) { VOL_LOCK; VDisconnectFS_r(); VOL_UNLOCK; } /** * connect to the fileserver SYNC service from a child process following a fork. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre * @arg VOL_LOCK is held. * @arg current FSYNC handle is shared with a parent process * * @post current FSYNC handle is discarded and a new connection to the * fileserver SYNC service is established * * @see VChildProcReconnectFS * @see VConnectFS_r * @see VDisconnectFS_r * * @internal volume package internal use only. */ int VChildProcReconnectFS_r(void) { return FSYNC_clientChildProcReconnect(); } /** * connect to the fileserver SYNC service from a child process following a fork. * * @return operation status * @retval 0 failure * @retval 1 success * * @pre current FSYNC handle is shared with a parent process * * @post current FSYNC handle is discarded and a new connection to the * fileserver SYNC service is established * * @see VChildProcReconnectFS_r * @see VConnectFS * @see VDisconnectFS */ int VChildProcReconnectFS(void) { int ret; VOL_LOCK; ret = VChildProcReconnectFS_r(); VOL_UNLOCK; return ret; } #endif /* FSSYNC_BUILD_CLIENT */ /***************************************************/ /* volume bitmap routines */ /***************************************************/ /** * allocate a vnode bitmap number for the vnode * * @param[out] ec error code * @param[in] vp volume object pointer * @param[in] index vnode index number for the vnode * @param[in] flags flag values described in note * * @note for DAFS, flags parameter controls locking behavior. * If (flags & VOL_ALLOC_BITMAP_WAIT) is set, then this function * will create a reservation and block on any other exclusive * operations. Otherwise, this function assumes the caller * already has exclusive access to vp, and we just change the * volume state. 
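 *
 * @note illustrative sketch (names taken from this file) of allocating a
 *       slot for a new small vnode while holding VOL_LOCK:
 *
 *           Error ec;
 *           int bitNumber = VAllocBitmapEntry_r(&ec, vp,
 *                                               &vp->vnodeIndex[vSmall],
 *                                               VOL_ALLOC_BITMAP_WAIT);
 *           if (!ec) {
 *               ...bitNumber now names a free slot in the small-vnode
 *                  index; it can later be returned with
 *                  VFreeBitMapEntry_r...
 *           }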
* * @pre VOL_LOCK held * * @return bit number allocated */ /* */ int VAllocBitmapEntry_r(Error * ec, Volume * vp, struct vnodeIndex *index, int flags) { int ret = 0; byte *bp, *ep; #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; #endif /* AFS_DEMAND_ATTACH_FS */ *ec = 0; /* This test is probably redundant */ if (!VolumeWriteable(vp)) { *ec = (bit32) VREADONLY; return ret; } #ifdef AFS_DEMAND_ATTACH_FS if (flags & VOL_ALLOC_BITMAP_WAIT) { VCreateReservation_r(vp); VWaitExclusiveState_r(vp); } state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP); #endif /* AFS_DEMAND_ATTACH_FS */ #ifdef BITMAP_LATER if ((programType == fileServer) && !index->bitmap) { int i; #ifndef AFS_DEMAND_ATTACH_FS /* demand attach fs uses the volume state to avoid races. * specialStatus field is not used at all */ int wasVBUSY = 0; if (vp->specialStatus == VBUSY) { if (vp->goingOffline) { /* vos dump waiting for the volume to * go offline. We probably come here * from AddNewReadableResidency */ wasVBUSY = 1; } else { while (vp->specialStatus == VBUSY) { #ifdef AFS_PTHREAD_ENV VOL_UNLOCK; sleep(2); VOL_LOCK; #else /* !AFS_PTHREAD_ENV */ IOMGR_Sleep(2); #endif /* !AFS_PTHREAD_ENV */ } } } #endif /* !AFS_DEMAND_ATTACH_FS */ if (!index->bitmap) { #ifndef AFS_DEMAND_ATTACH_FS vp->specialStatus = VBUSY; /* Stop anyone else from using it. */ #endif /* AFS_DEMAND_ATTACH_FS */ for (i = 0; i < nVNODECLASSES; i++) { VGetBitmap_r(ec, vp, i); if (*ec) { #ifdef AFS_DEMAND_ATTACH_FS VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, 0 /*flags*/); #else /* AFS_DEMAND_ATTACH_FS */ DeleteVolumeFromHashTable(vp); vp->shuttingDown = 1; /* Let who has it free it. */ vp->specialStatus = 0; #endif /* AFS_DEMAND_ATTACH_FS */ goto done; } } #ifndef AFS_DEMAND_ATTACH_FS if (!wasVBUSY) vp->specialStatus = 0; /* Allow others to have access. */ #endif /* AFS_DEMAND_ATTACH_FS */ } } #endif /* BITMAP_LATER */ #ifdef AFS_DEMAND_ATTACH_FS VOL_UNLOCK; #endif /* AFS_DEMAND_ATTACH_FS */ bp = index->bitmap + index->bitmapOffset; ep = index->bitmap + index->bitmapSize; while (bp < ep) { if ((*(bit32 *) bp) != (bit32) 0xffffffff) { int o; index->bitmapOffset = (afs_uint32) (bp - index->bitmap); while (*bp == 0xff) bp++; o = ffs(~*bp) - 1; /* ffs is documented in BSTRING(3) */ *bp |= (1 << o); ret = ((bp - index->bitmap) * 8 + o); #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; #endif /* AFS_DEMAND_ATTACH_FS */ goto done; } bp += sizeof(bit32) /* i.e. 
4 */ ; } /* No bit map entry--must grow bitmap */ bp = (byte *) realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE); osi_Assert(bp != NULL); index->bitmap = bp; bp += index->bitmapSize; memset(bp, 0, VOLUME_BITMAP_GROWSIZE); index->bitmapOffset = index->bitmapSize; index->bitmapSize += VOLUME_BITMAP_GROWSIZE; *bp = 1; ret = index->bitmapOffset * 8; #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; #endif /* AFS_DEMAND_ATTACH_FS */ done: #ifdef AFS_DEMAND_ATTACH_FS VChangeState_r(vp, state_save); if (flags & VOL_ALLOC_BITMAP_WAIT) { VCancelReservation_r(vp); } #endif /* AFS_DEMAND_ATTACH_FS */ return ret; } int VAllocBitmapEntry(Error * ec, Volume * vp, struct vnodeIndex * index) { int retVal; VOL_LOCK; retVal = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT); VOL_UNLOCK; return retVal; } void VFreeBitMapEntry_r(Error * ec, Volume *vp, struct vnodeIndex *index, unsigned bitNumber, int flags) { unsigned int offset; *ec = 0; #ifdef AFS_DEMAND_ATTACH_FS if (flags & VOL_FREE_BITMAP_WAIT) { /* VAllocBitmapEntry_r allocs bitmap entries under an exclusive volume * state, so ensure we're not in an exclusive volume state when we update * the bitmap */ VCreateReservation_r(vp); VWaitExclusiveState_r(vp); } #endif #ifdef BITMAP_LATER if (!index->bitmap) goto done; #endif /* BITMAP_LATER */ offset = bitNumber >> 3; if (offset >= index->bitmapSize) { *ec = VNOVNODE; goto done; } if (offset < index->bitmapOffset) index->bitmapOffset = offset & ~3; /* Truncate to nearest bit32 */ *(index->bitmap + offset) &= ~(1 << (bitNumber & 0x7)); done: #ifdef AFS_DEMAND_ATTACH_FS VCancelReservation_r(vp); #endif return; /* make the compiler happy for non-DAFS */ } void VFreeBitMapEntry(Error * ec, Volume *vp, struct vnodeIndex *index, unsigned bitNumber) { VOL_LOCK; VFreeBitMapEntry_r(ec, vp, index, bitNumber, VOL_FREE_BITMAP_WAIT); VOL_UNLOCK; } /* this function will drop the glock internally. * for old pthread fileservers, this is safe thanks to vbusy. * * for demand attach fs, caller must have already called * VCreateReservation_r and VWaitExclusiveState_r */ static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class) { StreamHandle_t *file; afs_sfsize_t nVnodes, size; struct VnodeClassInfo *vcp = &VnodeClassInfo[class]; struct vnodeIndex *vip = &vp->vnodeIndex[class]; struct VnodeDiskObject *vnode; unsigned int unique = 0; FdHandle_t *fdP; #ifdef BITMAP_LATER byte *BitMap = 0; #endif /* BITMAP_LATER */ #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; #endif /* AFS_DEMAND_ATTACH_FS */ *ec = 0; #ifdef AFS_DEMAND_ATTACH_FS state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP); #endif /* AFS_DEMAND_ATTACH_FS */ VOL_UNLOCK; fdP = IH_OPEN(vip->handle); osi_Assert(fdP != NULL); file = FDH_FDOPEN(fdP, "r"); osi_Assert(file != NULL); vnode = (VnodeDiskObject *) malloc(vcp->diskSize); osi_Assert(vnode != NULL); size = OS_SIZE(fdP->fd_fd); osi_Assert(size != -1); nVnodes = (size <= vcp->diskSize ? 
0 : size - vcp->diskSize) >> vcp->logSize; vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4; /* The 10 is a little extra so * a few files can be created in this volume, * the whole thing is rounded up to nearest 4 * bytes, because the bit map allocator likes * it that way */ #ifdef BITMAP_LATER BitMap = (byte *) calloc(1, vip->bitmapSize); osi_Assert(BitMap != NULL); #else /* BITMAP_LATER */ vip->bitmap = (byte *) calloc(1, vip->bitmapSize); osi_Assert(vip->bitmap != NULL); vip->bitmapOffset = 0; #endif /* BITMAP_LATER */ if (STREAM_ASEEK(file, vcp->diskSize) != -1) { int bitNumber = 0; for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) { if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1) break; if (vnode->type != vNull) { if (vnode->vnodeMagic != vcp->magic) { Log("GetBitmap: addled vnode index in volume %s; volume needs salvage\n", V_name(vp)); *ec = VSALVAGE; break; } #ifdef BITMAP_LATER *(BitMap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7)); #else /* BITMAP_LATER */ *(vip->bitmap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7)); #endif /* BITMAP_LATER */ if (unique <= vnode->uniquifier) unique = vnode->uniquifier + 1; } #ifndef AFS_PTHREAD_ENV if ((bitNumber & 0x00ff) == 0x0ff) { /* every 256 iterations */ IOMGR_Poll(); } #endif /* !AFS_PTHREAD_ENV */ } } if (vp->nextVnodeUnique < unique) { Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp)); *ec = VSALVAGE; } /* Paranoia, partly justified--I think fclose after fdopen * doesn't seem to close fd. In any event, the documentation * doesn't specify, so it's safer to close it twice. */ STREAM_CLOSE(file); FDH_CLOSE(fdP); free(vnode); VOL_LOCK; #ifdef BITMAP_LATER /* There may have been a racing condition with some other thread, both * creating the bitmaps for this volume. If the other thread was faster * the pointer to bitmap should already be filled and we can free ours. */ if (vip->bitmap == NULL) { vip->bitmap = BitMap; vip->bitmapOffset = 0; } else free((byte *) BitMap); #endif /* BITMAP_LATER */ #ifdef AFS_DEMAND_ATTACH_FS VChangeState_r(vp, state_save); #endif /* AFS_DEMAND_ATTACH_FS */ } /***************************************************/ /* Volume Path and Volume Number utility routines */ /***************************************************/ /** * find the first occurrence of a volume header file and return the path. * * @param[out] ec outbound error code * @param[in] volumeId volume id to find * @param[out] partitionp pointer to disk partition path string * @param[out] namep pointer to volume header file name string * * @post path to first occurrence of volume header is returned in partitionp * and namep, or ec is set accordingly. * * @warning this function is NOT re-entrant -- partitionp and namep point to * static data segments * * @note if a volume utility inadvertently leaves behind a stale volume header * on a vice partition, it is possible for callers to get the wrong one, * depending on the order of the disk partition linked list. 
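 *
 * @note illustrative sketch of a typical lookup:
 *
 *           Error ec;
 *           char *partition, *name;
 *           VGetVolumePath(&ec, volumeId, &partition, &name);
 *           if (!ec) {
 *               ...the header lives at the path formed by concatenating
 *                  the two strings (name already begins with OS_DIRSEPC);
 *                  both point at static storage, per the warning above...
 *           }
 *
 *       On failure *ec is set to VNOVOL and both pointers are set to NULL.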
* */ void VGetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep) { static char partition[VMAXPATHLEN], name[VMAXPATHLEN]; char path[VMAXPATHLEN]; int found = 0; struct DiskPartition64 *dp; *ec = 0; name[0] = OS_DIRSEPC; snprintf(&name[1], (sizeof name) - 1, VFORMAT, afs_printable_uint32_lu(volumeId)); for (dp = DiskPartitionList; dp; dp = dp->next) { struct afs_stat_st status; strcpy(path, VPartitionPath(dp)); strcat(path, name); if (afs_stat(path, &status) == 0) { strcpy(partition, dp->name); found = 1; break; } } if (!found) { *ec = VNOVOL; *partitionp = *namep = NULL; } else { *partitionp = partition; *namep = name; } } /** * extract a volume number from a volume header filename string. * * @param[in] name volume header filename string * * @return volume number * * @note the string must be of the form VFORMAT. the only permissible * deviation is a leading OS_DIRSEPC character. * * @see VFORMAT */ int VolumeNumber(char *name) { if (*name == OS_DIRSEPC) name++; return strtoul(name + 1, NULL, 10); } /** * compute the volume header filename. * * @param[in] volumeId * * @return volume header filename * * @post volume header filename string is constructed * * @warning this function is NOT re-entrant -- the returned string is * stored in a static char array. see VolumeExternalName_r * for a re-entrant equivalent. * * @see VolumeExternalName_r * * @deprecated due to the above re-entrancy warning, this interface should * be considered deprecated. Please use VolumeExternalName_r * in its stead. */ char * VolumeExternalName(VolumeId volumeId) { static char name[VMAXPATHLEN]; snprintf(name, sizeof name, VFORMAT, afs_printable_uint32_lu(volumeId)); return name; } /** * compute the volume header filename. * * @param[in] volumeId * @param[inout] name array in which to store filename * @param[in] len length of name array * * @return result code from afs_snprintf * * @see VolumeExternalName * @see afs_snprintf * * @note re-entrant equivalent of VolumeExternalName */ int VolumeExternalName_r(VolumeId volumeId, char * name, size_t len) { return snprintf(name, len, VFORMAT, afs_printable_uint32_lu(volumeId)); } /***************************************************/ /* Volume Usage Statistics routines */ /***************************************************/ #if OPENAFS_VOL_STATS #define OneDay (86400) /* 24 hours' worth of seconds */ #else #define OneDay (24*60*60) /* 24 hours */ #endif /* OPENAFS_VOL_STATS */ static time_t Midnight(time_t t) { struct tm local, *l; time_t midnight; #if defined(AFS_PTHREAD_ENV) && !defined(AFS_NT40_ENV) l = localtime_r(&t, &local); #else l = localtime(&t); #endif if (l != NULL) { /* the following is strictly speaking problematic on the switching day to daylight saving time, after the switch, as tm_isdst does not match. Similarly, on the looong day when switching back the OneDay check will not do what naively expected! The effects are minor, though, and more a matter of interpreting the numbers. */ #ifndef AFS_PTHREAD_ENV local = *l; #endif local.tm_hour = local.tm_min=local.tm_sec = 0; midnight = mktime(&local); if (midnight != (time_t) -1) return(midnight); } return( (t/OneDay)*OneDay ); } /*------------------------------------------------------------------------ * [export] VAdjustVolumeStatistics * * Description: * If we've passed midnight, we need to update all the day use * statistics as well as zeroing the detailed volume statistics * (if we are implementing them). 
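 *
 *	Worked example (illustrative): if exactly two full days have
 *	elapsed since dayUseDate (ndays == 2), the code below copies
 *	weekUse[4..0] into weekUse[6..2], zeroes weekUse[0], records the
 *	old dayUse value in weekUse[1], and then resets dayUse and
 *	dayUseDate for the new day.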
* * Arguments: * vp : Pointer to the volume structure describing the lucky * volume being considered for update. * * Returns: * 0 (always!) * * Environment: * Nothing interesting. * * Side Effects: * As described. *------------------------------------------------------------------------*/ int VAdjustVolumeStatistics_r(Volume * vp) { unsigned int now = FT_ApproxTime(); if (now - V_dayUseDate(vp) > OneDay) { int ndays, i; ndays = (now - V_dayUseDate(vp)) / OneDay; for (i = 6; i > ndays - 1; i--) V_weekUse(vp)[i] = V_weekUse(vp)[i - ndays]; for (i = 0; i < ndays - 1 && i < 7; i++) V_weekUse(vp)[i] = 0; if (ndays <= 7) V_weekUse(vp)[ndays - 1] = V_dayUse(vp); V_dayUse(vp) = 0; V_dayUseDate(vp) = Midnight(now); #if OPENAFS_VOL_STATS /* * All we need to do is bzero the entire VOL_STATS_BYTES of * the detailed volume statistics area. */ memset((V_stat_area(vp)), 0, VOL_STATS_BYTES); #endif /* OPENAFS_VOL_STATS */ } /*It's been more than a day of collection */ /* * Always return happily. */ return (0); } /*VAdjustVolumeStatistics */ int VAdjustVolumeStatistics(Volume * vp) { int retVal; VOL_LOCK; retVal = VAdjustVolumeStatistics_r(vp); VOL_UNLOCK; return retVal; } void VBumpVolumeUsage_r(Volume * vp) { unsigned int now = FT_ApproxTime(); V_accessDate(vp) = now; if (now - V_dayUseDate(vp) > OneDay) VAdjustVolumeStatistics_r(vp); /* * Save the volume header image to disk after a threshold of bumps to dayUse, * at most every usage_rate_limit seconds. */ V_dayUse(vp)++; vp->usage_bumps_outstanding++; if (vp->usage_bumps_outstanding >= vol_opts.usage_threshold && vp->usage_bumps_next_write <= now) { Error error; vp->usage_bumps_outstanding = 0; vp->usage_bumps_next_write = now + vol_opts.usage_rate_limit; VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT); } } void VBumpVolumeUsage(Volume * vp) { VOL_LOCK; VBumpVolumeUsage_r(vp); VOL_UNLOCK; } void VSetDiskUsage_r(void) { #ifndef AFS_DEMAND_ATTACH_FS static int FifteenMinuteCounter = 0; #endif while (VInit < 2) { /* NOTE: Don't attempt to access the partitions list until the * initialization level indicates that all volumes are attached, * which implies that all partitions are initialized. */ #ifdef AFS_PTHREAD_ENV VOL_CV_WAIT(&vol_vinit_cond); #else /* AFS_PTHREAD_ENV */ IOMGR_Sleep(10); #endif /* AFS_PTHREAD_ENV */ } VResetDiskUsage_r(); #ifndef AFS_DEMAND_ATTACH_FS if (++FifteenMinuteCounter == 3) { FifteenMinuteCounter = 0; VScanUpdateList(); } #endif /* !AFS_DEMAND_ATTACH_FS */ } void VSetDiskUsage(void) { VOL_LOCK; VSetDiskUsage_r(); VOL_UNLOCK; } /***************************************************/ /* Volume Update List routines */ /***************************************************/ /* The number of minutes that a volume hasn't been updated before the * "Dont salvage" flag in the volume header will be turned on */ #define SALVAGE_INTERVAL (10*60) /* * demand attach fs * * volume update list functionality has been moved into the VLRU * the DONT_SALVAGE flag is now set during VLRU demotion */ #ifndef AFS_DEMAND_ATTACH_FS static VolumeId *UpdateList = NULL; /* Pointer to array of Volume ID's */ static int nUpdatedVolumes = 0; /* Updated with entry in UpdateList, salvage after crash flag on */ static int updateSize = 0; /* number of entries possible */ #define UPDATE_LIST_SIZE 128 /* initial size increment (must be a power of 2!) 
*/ #endif /* !AFS_DEMAND_ATTACH_FS */ void VAddToVolumeUpdateList_r(Error * ec, Volume * vp) { *ec = 0; vp->updateTime = FT_ApproxTime(); if (V_dontSalvage(vp) == 0) return; V_dontSalvage(vp) = 0; VSyncVolume_r(ec, vp, 0); #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(vp) &= ~(VOL_HDR_DONTSALV); #else /* !AFS_DEMAND_ATTACH_FS */ if (*ec) return; if (UpdateList == NULL) { updateSize = UPDATE_LIST_SIZE; UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize); } else { if (nUpdatedVolumes == updateSize) { updateSize <<= 1; if (updateSize > 524288) { Log("warning: there is likely a bug in the volume update scanner\n"); return; } UpdateList = (VolumeId *) realloc(UpdateList, sizeof(VolumeId) * updateSize); } } osi_Assert(UpdateList != NULL); UpdateList[nUpdatedVolumes++] = V_id(vp); #endif /* !AFS_DEMAND_ATTACH_FS */ } #ifndef AFS_DEMAND_ATTACH_FS static void VScanUpdateList(void) { int i, gap; Volume *vp; Error error; afs_uint32 now = FT_ApproxTime(); /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */ for (i = gap = 0; i < nUpdatedVolumes; i++) { if (gap) UpdateList[i - gap] = UpdateList[i]; /* XXX this routine needlessly messes up the Volume LRU by * breaking the LRU temporal-locality assumptions..... * we should use a special volume header allocator here */ vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]); if (error) { gap++; } else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) { V_dontSalvage(vp) = DONT_SALVAGE; VUpdateVolume_r(&error, vp, 0); /* No need to fsync--not critical */ gap++; } if (vp) { VPutVolume_r(vp); } #ifndef AFS_PTHREAD_ENV IOMGR_Poll(); #endif /* !AFS_PTHREAD_ENV */ } nUpdatedVolumes -= gap; } #endif /* !AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* Volume LRU routines */ /***************************************************/ /* demand attach fs * volume LRU * * with demand attach fs, we attempt to soft detach(1) * volumes which have not been accessed in a long time * in order to speed up fileserver shutdown * * (1) by soft detach we mean a process very similar * to VOffline, except the final state of the * Volume will be VOL_STATE_PREATTACHED, instead * of the usual VOL_STATE_UNATTACHED */ #ifdef AFS_DEMAND_ATTACH_FS /* implementation is reminiscent of a generational GC * * queue 0 is newly attached volumes. this queue is * sorted by attach timestamp * * queue 1 is volumes that have been around a bit * longer than queue 0. this queue is sorted by * attach timestamp * * queue 2 is volumes tha have been around the longest. * this queue is unsorted * * queue 3 is volumes that have been marked as * candidates for soft detachment. this queue is * unsorted */ #define VLRU_GENERATIONS 3 /**< number of generations in VLRU */ #define VLRU_QUEUES 5 /**< total number of VLRU queues */ /** * definition of a VLRU queue. */ struct VLRU_q { volatile struct rx_queue q; volatile int len; volatile int busy; pthread_cond_t cv; }; /** * main VLRU data structure. */ struct VLRU { struct VLRU_q q[VLRU_QUEUES]; /**< VLRU queues */ /* VLRU config */ /** time interval (in seconds) between promotion passes for * each young generation queue. */ afs_uint32 promotion_interval[VLRU_GENERATIONS-1]; /** time interval (in seconds) between soft detach candidate * scans for each generation queue. * * scan_interval[VLRU_QUEUE_CANDIDATE] defines how frequently * we perform a soft detach pass. 
*/ afs_uint32 scan_interval[VLRU_GENERATIONS+1]; /* scheduler state */ int next_idx; /**< next queue to receive attention */ afs_uint32 last_promotion[VLRU_GENERATIONS-1]; /**< timestamp of last promotion scan */ afs_uint32 last_scan[VLRU_GENERATIONS+1]; /**< timestamp of last detach scan */ int scanner_state; /**< state of scanner thread */ pthread_cond_t cv; /**< state transition CV */ }; /** global VLRU state */ static struct VLRU volume_LRU; /** * defined states for VLRU scanner thread. */ typedef enum { VLRU_SCANNER_STATE_OFFLINE = 0, /**< vlru scanner thread is offline */ VLRU_SCANNER_STATE_ONLINE = 1, /**< vlru scanner thread is online */ VLRU_SCANNER_STATE_SHUTTING_DOWN = 2, /**< vlru scanner thread is shutting down */ VLRU_SCANNER_STATE_PAUSING = 3, /**< vlru scanner thread is getting ready to pause */ VLRU_SCANNER_STATE_PAUSED = 4 /**< vlru scanner thread is paused */ } vlru_thread_state_t; /* vlru disk data header stuff */ #define VLRU_DISK_MAGIC 0x7a8b9cad /**< vlru disk entry magic number */ #define VLRU_DISK_VERSION 1 /**< vlru disk entry version number */ /** vlru default expiration time (for eventual fs state serialization of vlru data) */ #define VLRU_DUMP_EXPIRATION_TIME (60*60*24*7) /* expire vlru data after 1 week */ /** minimum volume inactivity (in seconds) before a volume becomes eligible for * soft detachment. */ static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH; /** time interval (in seconds) between VLRU scanner thread soft detach passes. */ static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL; /** maximum number of volumes to soft detach in a VLRU soft detach pass. */ static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX; /** VLRU control flag. non-zero value implies VLRU subsystem is activated. */ static afs_uint32 VLRU_enabled = 1; /* queue synchronization routines */ static void VLRU_BeginExclusive_r(struct VLRU_q * q); static void VLRU_EndExclusive_r(struct VLRU_q * q); static void VLRU_Wait_r(struct VLRU_q * q); /** * set VLRU subsystem tunable parameters. * * @param[in] option tunable option to modify * @param[in] val new value for tunable parameter * * @pre @c VInitVolumePackage2 has not yet been called. * * @post tunable parameter is modified * * @note DAFS only * * @note valid option parameters are: * @arg @c VLRU_SET_THRESH * set the period of inactivity after which * volumes are eligible for soft detachment * @arg @c VLRU_SET_INTERVAL * set the time interval between calls * to the volume LRU "garbage collector" * @arg @c VLRU_SET_MAX * set the max number of volumes to deallocate * in one GC pass */ void VLRU_SetOptions(int option, afs_uint32 val) { if (option == VLRU_SET_THRESH) { VLRU_offline_thresh = val; } else if (option == VLRU_SET_INTERVAL) { VLRU_offline_interval = val; } else if (option == VLRU_SET_MAX) { VLRU_offline_max = val; } else if (option == VLRU_SET_ENABLED) { VLRU_enabled = val; } VLRU_ComputeConstants(); } /** * compute VLRU internal timing parameters. * * @post VLRU scanner thread internal timing parameters are computed * * @note computes internal timing parameters based upon user-modifiable * tunable parameters. * * @note DAFS only * * @internal volume package internal use only. 
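 *
 * @note worked example with illustrative values (not the shipped defaults):
 *       with VLRU_offline_thresh = 7200 seconds and
 *       VLRU_offline_interval = 120 seconds, factor is 60 (greater than 16),
 *       so the candidate queue is scanned every 120 seconds, generation 0 is
 *       scanned every 900 seconds (thresh / 8), and promotions out of
 *       generations 0 and 1 are considered every 14400 and 28800 seconds
 *       respectively.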
*/ static void VLRU_ComputeConstants(void) { afs_uint32 factor = VLRU_offline_thresh / VLRU_offline_interval; /* compute the candidate scan interval */ volume_LRU.scan_interval[VLRU_QUEUE_CANDIDATE] = VLRU_offline_interval; /* compute the promotion intervals */ volume_LRU.promotion_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh * 2; volume_LRU.promotion_interval[VLRU_QUEUE_MID] = VLRU_offline_thresh * 4; if (factor > 16) { /* compute the gen 0 scan interval */ volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh / 8; } else { /* compute the gen 0 scan interval */ volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_interval * 2; } } /** * initialize VLRU subsystem. * * @pre this function has not yet been called * * @post VLRU subsystem is initialized and VLRU scanner thread is starting * * @note DAFS only * * @internal volume package internal use only. */ static void VInitVLRU(void) { pthread_t tid; pthread_attr_t attrs; int i; if (!VLRU_enabled) { Log("VLRU: disabled\n"); return; } /* initialize each of the VLRU queues */ for (i = 0; i < VLRU_QUEUES; i++) { queue_Init(&volume_LRU.q[i]); volume_LRU.q[i].len = 0; volume_LRU.q[i].busy = 0; CV_INIT(&volume_LRU.q[i].cv, "vol lru", CV_DEFAULT, 0); } /* setup the timing constants */ VLRU_ComputeConstants(); /* XXX put inside LogLevel check? */ Log("VLRU: starting scanner with the following configuration parameters:\n"); Log("VLRU: offlining volumes after minimum of %d seconds of inactivity\n", VLRU_offline_thresh); Log("VLRU: running VLRU soft detach pass every %d seconds\n", VLRU_offline_interval); Log("VLRU: taking up to %d volumes offline per pass\n", VLRU_offline_max); Log("VLRU: scanning generation 0 for inactive volumes every %d seconds\n", volume_LRU.scan_interval[0]); Log("VLRU: scanning for promotion/demotion between generations 0 and 1 every %d seconds\n", volume_LRU.promotion_interval[0]); Log("VLRU: scanning for promotion/demotion between generations 1 and 2 every %d seconds\n", volume_LRU.promotion_interval[1]); /* start up the VLRU scanner */ volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE; if (programType == fileServer) { CV_INIT(&volume_LRU.cv, "vol lru", CV_DEFAULT, 0); osi_Assert(pthread_attr_init(&attrs) == 0); osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0); osi_Assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0); } } /** * initialize the VLRU-related fields of a newly allocated volume object. * * @param[in] vp pointer to volume object * * @pre * @arg @c VOL_LOCK is held. * @arg volume object is not on a VLRU queue. * * @post VLRU fields are initialized to indicate that volume object is not * currently registered with the VLRU subsystem * * @note DAFS only * * @internal volume package interal use only. */ static void VLRU_Init_Node_r(Volume * vp) { if (!VLRU_enabled) return; osi_Assert(queue_IsNotOnQueue(&vp->vlru)); vp->vlru.idx = VLRU_QUEUE_INVALID; } /** * add a volume object to a VLRU queue. * * @param[in] vp pointer to volume object * * @pre * @arg @c VOL_LOCK is held. * @arg caller MUST hold a lightweight ref on @p vp. * @arg caller MUST NOT hold exclusive ownership of the VLRU queue. * * @post the volume object is added to the appropriate VLRU queue * * @note if @c vp->vlru.idx contains the index of a valid VLRU queue, * then the volume is added to that queue. Otherwise, the value * @c VLRU_QUEUE_NEW is stored into @c vp->vlru.idx and the * volume is added to the NEW generation queue. 
* * @note @c VOL_LOCK may be dropped internally * * @note Volume state is temporarily set to @c VOL_STATE_VLRU_ADD * during the add operation, and is restored to the previous * state prior to return. * * @note DAFS only * * @internal volume package internal use only. */ static void VLRU_Add_r(Volume * vp) { int idx; VolState state_save; if (!VLRU_enabled) return; if (queue_IsOnQueue(&vp->vlru)) return; state_save = VChangeState_r(vp, VOL_STATE_VLRU_ADD); idx = vp->vlru.idx; if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) { idx = VLRU_QUEUE_NEW; } VLRU_Wait_r(&volume_LRU.q[idx]); /* repeat check since VLRU_Wait_r may have dropped * the glock */ if (queue_IsNotOnQueue(&vp->vlru)) { vp->vlru.idx = idx; queue_Prepend(&volume_LRU.q[idx], &vp->vlru); volume_LRU.q[idx].len++; V_attachFlags(vp) |= VOL_ON_VLRU; vp->stats.last_promote = FT_ApproxTime(); } VChangeState_r(vp, state_save); } /** * delete a volume object from a VLRU queue. * * @param[in] vp pointer to volume object * * @pre * @arg @c VOL_LOCK is held. * @arg caller MUST hold a lightweight ref on @p vp. * @arg caller MUST NOT hold exclusive ownership of the VLRU queue. * * @post volume object is removed from the VLRU queue * * @note @c VOL_LOCK may be dropped internally * * @note DAFS only * * @todo We should probably set volume state to something exclusive * (as @c VLRU_Add_r does) prior to dropping @c VOL_LOCK. * * @internal volume package internal use only. */ static void VLRU_Delete_r(Volume * vp) { int idx; if (!VLRU_enabled) return; if (queue_IsNotOnQueue(&vp->vlru)) return; /* handle races */ do { idx = vp->vlru.idx; if (idx == VLRU_QUEUE_INVALID) return; VLRU_Wait_r(&volume_LRU.q[idx]); } while (idx != vp->vlru.idx); /* now remove from the VLRU and update * the appropriate counter */ queue_Remove(&vp->vlru); volume_LRU.q[idx].len--; vp->vlru.idx = VLRU_QUEUE_INVALID; V_attachFlags(vp) &= ~(VOL_ON_VLRU); } /** * tell the VLRU subsystem that a volume was just accessed. * * @param[in] vp pointer to volume object * * @pre * @arg @c VOL_LOCK is held * @arg caller MUST hold a lightweight ref on @p vp * @arg caller MUST NOT hold exclusive ownership of any VLRU queue * * @post volume VLRU access statistics are updated. If the volume was on * the VLRU soft detach candidate queue, it is moved to the NEW * generation queue. * * @note @c VOL_LOCK may be dropped internally * * @note DAFS only * * @internal volume package internal use only. */ static void VLRU_UpdateAccess_r(Volume * vp) { Volume * rvp = NULL; if (!VLRU_enabled) return; if (queue_IsNotOnQueue(&vp->vlru)) return; osi_Assert(V_attachFlags(vp) & VOL_ON_VLRU); /* update the access timestamp */ vp->stats.last_get = FT_ApproxTime(); /* * if the volume is on the soft detach candidate * list, we need to safely move it back to a * regular generation. this has to be done * carefully so we don't race against the scanner * thread.
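 *
 * in outline, the code below (descriptive summary only):
 *   1. takes a lightweight reservation on the volume
 *   2. waits for, then claims, exclusive access to the NEW and
 *      CANDIDATE queues via VLRU_Wait_r / VLRU_BeginExclusive_r
 *   3. re-checks vp->vlru.idx, since VOL_LOCK may have been dropped
 *      while waiting, before switching the volume back to the NEW queue
 *   4. releases exclusive access and cancels the reservation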
*/ /* if this volume is on the soft detach candidate queue, * then grab exclusive access to the necessary queues */ if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) { rvp = vp; VCreateReservation_r(rvp); VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]); VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]); VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); } /* make sure multiple threads don't race to update */ if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) { VLRU_SwitchQueues(vp, VLRU_QUEUE_NEW, 1); } if (rvp) { VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]); VCancelReservation_r(rvp); } } /** * switch a volume between two VLRU queues. * * @param[in] vp pointer to volume object * @param[in] new_idx index of VLRU queue onto which the volume will be moved * @param[in] append controls whether the volume will be appended or * prepended to the queue. A nonzero value means it will * be appended; zero means it will be prepended. * * @pre The new (and old, if applicable) queue(s) must either be owned * exclusively by the calling thread for asynchronous manipulation, * or the queue(s) must be quiescent and VOL_LOCK must be held. * Please see VLRU_BeginExclusive_r, VLRU_EndExclusive_r and VLRU_Wait_r * for further details of the queue asynchronous processing mechanism. * * @post If the volume object was already on a VLRU queue, it is * removed from the queue. Depending on the value of the append * parameter, the volume object is either appended or prepended * to the VLRU queue referenced by the new_idx parameter. * * @note DAFS only * * @see VLRU_BeginExclusive_r * @see VLRU_EndExclusive_r * @see VLRU_Wait_r * * @internal volume package internal use only. */ static void VLRU_SwitchQueues(Volume * vp, int new_idx, int append) { if (queue_IsNotOnQueue(&vp->vlru)) return; queue_Remove(&vp->vlru); volume_LRU.q[vp->vlru.idx].len--; /* put the volume back on the correct generational queue */ if (append) { queue_Append(&volume_LRU.q[new_idx], &vp->vlru); } else { queue_Prepend(&volume_LRU.q[new_idx], &vp->vlru); } volume_LRU.q[new_idx].len++; vp->vlru.idx = new_idx; } /** * VLRU background thread. * * The VLRU Scanner Thread is responsible for periodically scanning through * each VLRU queue looking for volumes which should be moved to another * queue, or soft detached. * * @param[in] args unused thread arguments parameter * * @return unused thread return value * @retval NULL always * * @internal volume package internal use only. 
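 *
 * @note Scheduling sketch (a descriptive summary of the loop below, not
 *       additional behaviour): each iteration computes the delay until
 *       the next generation 0 scan, candidate queue scan, or promotion
 *       pass is due, sleeps for the shortest such delay (or not at all
 *       if a pass is already overdue), and then runs either VLRU_Scan_r
 *       or VLRU_Promote_r followed by VLRU_Demote_r under VOL_LOCK.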
*/ static void * VLRU_ScannerThread(void * args) { afs_uint32 now, min_delay, delay; int i, min_idx, min_op, overdue, state; /* set t=0 for promotion cycle to be * fileserver startup */ now = FT_ApproxTime(); for (i=0; i < VLRU_GENERATIONS-1; i++) { volume_LRU.last_promotion[i] = now; } /* don't start the scanner until VLRU_offline_thresh * plus a small delay for VInitVolumePackage2 to finish * has gone by */ sleep(VLRU_offline_thresh + 60); /* set t=0 for scan cycle to be now */ now = FT_ApproxTime(); for (i=0; i < VLRU_GENERATIONS+1; i++) { volume_LRU.last_scan[i] = now; } VOL_LOCK; if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_OFFLINE) { volume_LRU.scanner_state = VLRU_SCANNER_STATE_ONLINE; } while ((state = volume_LRU.scanner_state) != VLRU_SCANNER_STATE_SHUTTING_DOWN) { /* check to see if we've been asked to pause */ if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) { volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED; CV_BROADCAST(&volume_LRU.cv); do { VOL_CV_WAIT(&volume_LRU.cv); } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED); } /* scheduling can happen outside the glock */ VOL_UNLOCK; /* figure out what is next on the schedule */ /* figure out a potential schedule for the new generation first */ overdue = 0; min_delay = volume_LRU.scan_interval[0] + volume_LRU.last_scan[0] - now; min_idx = 0; min_op = 0; if (min_delay > volume_LRU.scan_interval[0]) { /* unsigned overflow -- we're overdue to run this scan */ min_delay = 0; overdue = 1; } /* if we're not overdue for gen 0, figure out schedule for candidate gen */ if (!overdue) { i = VLRU_QUEUE_CANDIDATE; delay = volume_LRU.scan_interval[i] + volume_LRU.last_scan[i] - now; if (delay < min_delay) { min_delay = delay; min_idx = i; } if (delay > volume_LRU.scan_interval[i]) { /* unsigned overflow -- we're overdue to run this scan */ min_delay = 0; min_idx = i; overdue = 1; } } /* if we're still not overdue for something, figure out schedules for promotions */ for (i=0; !overdue && i < VLRU_GENERATIONS-1; i++) { delay = volume_LRU.promotion_interval[i] + volume_LRU.last_promotion[i] - now; if (delay < min_delay) { min_delay = delay; min_idx = i; min_op = 1; } if (delay > volume_LRU.promotion_interval[i]) { /* unsigned overflow -- we're overdue to run this promotion */ min_delay = 0; min_idx = i; min_op = 1; overdue = 1; break; } } /* sleep as needed */ if (min_delay) { sleep(min_delay); } /* do whatever is next */ VOL_LOCK; if (min_op) { VLRU_Promote_r(min_idx); VLRU_Demote_r(min_idx+1); } else { VLRU_Scan_r(min_idx); } now = FT_ApproxTime(); } Log("VLRU scanner asked to go offline (scanner_state=%d)\n", state); /* signal that scanner is down */ volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE; CV_BROADCAST(&volume_LRU.cv); VOL_UNLOCK; return NULL; } /** * promote volumes from one VLRU generation to the next. * * This routine scans a VLRU generation looking for volumes which are * eligible to be promoted to the next generation. All volumes which * meet the eligibility requirement are promoted. * * Promotion eligibility is based upon meeting both of the following * requirements: * * @arg The volume has been accessed since the last promotion: * @c (vp->stats.last_get >= vp->stats.last_promote) * @arg The last promotion occurred at least * @c volume_LRU.promotion_interval[idx] seconds ago * * As a performance optimization, promotions are "globbed". In other * words, we promote arbitrarily large contiguous sublists of elements * as one operation. 
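 *
 * @note In code form, the per-volume eligibility test used by the scan
 *       below is (paraphrased from the loop body, not new logic):
 *
 *           promote = ((vp->stats.last_promote + thresh <= now) &&
 *                      (vp->stats.last_get >= vp->stats.last_promote));
 *
 *       where thresh is volume_LRU.promotion_interval[idx].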
* * @param[in] idx VLRU queue index to scan * * @note DAFS only * * @internal VLRU internal use only. */ static void VLRU_Promote_r(int idx) { int len, chaining, promote; afs_uint32 now, thresh; struct rx_queue *qp, *nqp; Volume * vp, *start = NULL, *end = NULL; /* get exclusive access to two chains, and drop the glock */ VLRU_Wait_r(&volume_LRU.q[idx]); VLRU_BeginExclusive_r(&volume_LRU.q[idx]); VLRU_Wait_r(&volume_LRU.q[idx+1]); VLRU_BeginExclusive_r(&volume_LRU.q[idx+1]); VOL_UNLOCK; thresh = volume_LRU.promotion_interval[idx]; now = FT_ApproxTime(); len = chaining = 0; for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) { vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); promote = (((vp->stats.last_promote + thresh) <= now) && (vp->stats.last_get >= vp->stats.last_promote)); if (chaining) { if (promote) { vp->vlru.idx++; len++; start = vp; } else { /* promote and prepend chain */ queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru); chaining = 0; } } else { if (promote) { vp->vlru.idx++; len++; chaining = 1; start = end = vp; } } } if (chaining) { /* promote and prepend */ queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru); } if (len) { volume_LRU.q[idx].len -= len; volume_LRU.q[idx+1].len += len; } /* release exclusive access to the two chains */ VOL_LOCK; volume_LRU.last_promotion[idx] = now; VLRU_EndExclusive_r(&volume_LRU.q[idx+1]); VLRU_EndExclusive_r(&volume_LRU.q[idx]); } /* run the demotions */ static void VLRU_Demote_r(int idx) { Error ec; int len, chaining, demote; afs_uint32 now, thresh; struct rx_queue *qp, *nqp; Volume * vp, *start = NULL, *end = NULL; Volume ** salv_flag_vec = NULL; int salv_vec_offset = 0; osi_Assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD); /* get exclusive access to two chains, and drop the glock */ VLRU_Wait_r(&volume_LRU.q[idx-1]); VLRU_BeginExclusive_r(&volume_LRU.q[idx-1]); VLRU_Wait_r(&volume_LRU.q[idx]); VLRU_BeginExclusive_r(&volume_LRU.q[idx]); VOL_UNLOCK; /* no big deal if this allocation fails */ if (volume_LRU.q[idx].len) { salv_flag_vec = (Volume **) malloc(volume_LRU.q[idx].len * sizeof(Volume *)); } now = FT_ApproxTime(); thresh = volume_LRU.promotion_interval[idx-1]; len = chaining = 0; for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) { vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); demote = (((vp->stats.last_promote + thresh) <= now) && (vp->stats.last_get < (now - thresh))); /* we now do volume update list DONT_SALVAGE flag setting during * demotion passes */ if (salv_flag_vec && !(V_attachFlags(vp) & VOL_HDR_DONTSALV) && demote && (vp->updateTime < (now - SALVAGE_INTERVAL)) && (V_attachState(vp) == VOL_STATE_ATTACHED)) { salv_flag_vec[salv_vec_offset++] = vp; VCreateReservation_r(vp); } if (chaining) { if (demote) { vp->vlru.idx--; len++; start = vp; } else { /* demote and append chain */ queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru); chaining = 0; } } else { if (demote) { vp->vlru.idx--; len++; chaining = 1; start = end = vp; } } } if (chaining) { queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru); } if (len) { volume_LRU.q[idx].len -= len; volume_LRU.q[idx-1].len += len; } /* release exclusive access to the two chains */ VOL_LOCK; VLRU_EndExclusive_r(&volume_LRU.q[idx]); VLRU_EndExclusive_r(&volume_LRU.q[idx-1]); /* now go back and set the DONT_SALVAGE flags as appropriate */ if (salv_flag_vec) { int i; for (i = 0; i < salv_vec_offset; i++) { vp = salv_flag_vec[i]; if (!(V_attachFlags(vp) & VOL_HDR_DONTSALV) && 
(vp->updateTime < (now - SALVAGE_INTERVAL)) && (V_attachState(vp) == VOL_STATE_ATTACHED)) { ec = VHold_r(vp); if (!ec) { V_attachFlags(vp) |= VOL_HDR_DONTSALV; V_dontSalvage(vp) = DONT_SALVAGE; VUpdateVolume_r(&ec, vp, 0); VPutVolume_r(vp); } } VCancelReservation_r(vp); } free(salv_flag_vec); } } /* run a pass of the VLRU GC scanner */ static void VLRU_Scan_r(int idx) { afs_uint32 now, thresh; struct rx_queue *qp, *nqp; Volume * vp; int i, locked = 1; osi_Assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE); /* gain exclusive access to the idx VLRU */ VLRU_Wait_r(&volume_LRU.q[idx]); VLRU_BeginExclusive_r(&volume_LRU.q[idx]); if (idx != VLRU_QUEUE_CANDIDATE) { /* gain exclusive access to the candidate VLRU */ VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); } now = FT_ApproxTime(); thresh = now - VLRU_offline_thresh; /* perform candidate selection and soft detaching */ if (idx == VLRU_QUEUE_CANDIDATE) { /* soft detach some volumes from the candidate pool */ VOL_UNLOCK; locked = 0; for (i=0,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) { vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); if (i >= VLRU_offline_max) { break; } /* check timestamp to see if it's a candidate for soft detaching */ if (vp->stats.last_get <= thresh) { VOL_LOCK; if (VCheckSoftDetach(vp, thresh)) i++; VOL_UNLOCK; } } } else { /* scan for volumes to become soft detach candidates */ for (i=1,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue),i++) { vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); /* check timestamp to see if it's a candidate for soft detaching */ if (vp->stats.last_get <= thresh) { VCheckSoftDetachCandidate(vp, thresh); } if (!(i&0x7f)) { /* lock coarsening optimization */ VOL_UNLOCK; pthread_yield(); VOL_LOCK; } } } /* relinquish exclusive access to the VLRU chains */ if (!locked) { VOL_LOCK; } volume_LRU.last_scan[idx] = now; if (idx != VLRU_QUEUE_CANDIDATE) { VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]); } VLRU_EndExclusive_r(&volume_LRU.q[idx]); } /* check whether volume is safe to soft detach * caller MUST NOT hold a ref count on vp */ static int VCheckSoftDetach(Volume * vp, afs_uint32 thresh) { int ret=0; if (vp->nUsers || vp->nWaiters) return 0; if (vp->stats.last_get <= thresh) { ret = VSoftDetachVolume_r(vp, thresh); } return ret; } /* check whether volume should be made a * soft detach candidate */ static int VCheckSoftDetachCandidate(Volume * vp, afs_uint32 thresh) { int idx, ret = 0; if (vp->nUsers || vp->nWaiters) return 0; idx = vp->vlru.idx; osi_Assert(idx == VLRU_QUEUE_NEW); if (vp->stats.last_get <= thresh) { /* move to candidate pool */ queue_Remove(&vp->vlru); volume_LRU.q[VLRU_QUEUE_NEW].len--; queue_Prepend(&volume_LRU.q[VLRU_QUEUE_CANDIDATE], &vp->vlru); vp->vlru.idx = VLRU_QUEUE_CANDIDATE; volume_LRU.q[VLRU_QUEUE_CANDIDATE].len++; ret = 1; } return ret; } /* begin exclusive access on VLRU */ static void VLRU_BeginExclusive_r(struct VLRU_q * q) { osi_Assert(q->busy == 0); q->busy = 1; } /* end exclusive access on VLRU */ static void VLRU_EndExclusive_r(struct VLRU_q * q) { osi_Assert(q->busy); q->busy = 0; CV_BROADCAST(&q->cv); } /* wait for another thread to end exclusive access on VLRU */ static void VLRU_Wait_r(struct VLRU_q * q) { while(q->busy) { VOL_CV_WAIT(&q->cv); } } /* demand attach fs * volume soft detach * * caller MUST NOT hold a ref count on vp */ static int VSoftDetachVolume_r(Volume * vp, afs_uint32 thresh) { afs_uint32 ts_save; int ret = 0; 
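    /* Overview of the checks below (descriptive comment only, no change in
     * behaviour): the volume must still be on the CANDIDATE queue, must
     * have been idle for at least the soft detach threshold, must have no
     * users or waiters, and must not be in an exclusive attachment state.
     * Volumes already unattached, in error, offlining, salvaging, or
     * deleted are simply dropped from the VLRU.  Otherwise the volume is
     * held, re-checked for races (new users, waiters, or a fresh access),
     * taken offline, its cached header freed, and left in the PREATTACHED
     * state so the demand attach code can reattach it cheaply later. */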
osi_Assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE); ts_save = vp->stats.last_get; if (ts_save > thresh) return 0; if (vp->nUsers || vp->nWaiters) return 0; if (VIsExclusiveState(V_attachState(vp))) { return 0; } switch (V_attachState(vp)) { case VOL_STATE_UNATTACHED: case VOL_STATE_PREATTACHED: case VOL_STATE_ERROR: case VOL_STATE_GOING_OFFLINE: case VOL_STATE_SHUTTING_DOWN: case VOL_STATE_SALVAGING: case VOL_STATE_DELETED: volume_LRU.q[vp->vlru.idx].len--; /* create and cancel a reservation to * give the volume an opportunity to * be deallocated */ VCreateReservation_r(vp); queue_Remove(&vp->vlru); vp->vlru.idx = VLRU_QUEUE_INVALID; V_attachFlags(vp) &= ~(VOL_ON_VLRU); VCancelReservation_r(vp); return 0; default: break; } /* hold the volume and take it offline. * no need for reservations, as VHold_r * takes care of that internally. */ if (VHold_r(vp) == 0) { /* vhold drops the glock, so now we should * check to make sure we aren't racing against * other threads. if we are racing, offlining vp * would be wasteful, and block the scanner for a while */ if (vp->nWaiters || (vp->nUsers > 1) || (vp->shuttingDown) || (vp->goingOffline) || (vp->stats.last_get != ts_save)) { /* looks like we're racing someone else. bail */ VPutVolume_r(vp); vp = NULL; } else { /* pull it off the VLRU */ osi_Assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE); volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--; queue_Remove(&vp->vlru); vp->vlru.idx = VLRU_QUEUE_INVALID; V_attachFlags(vp) &= ~(VOL_ON_VLRU); /* take if offline */ VOffline_r(vp, "volume has been soft detached"); /* invalidate the volume header cache */ FreeVolumeHeader(vp); /* update stats */ IncUInt64(&VStats.soft_detaches); vp->stats.soft_detaches++; /* put in pre-attached state so demand * attacher can work on it */ VChangeState_r(vp, VOL_STATE_PREATTACHED); ret = 1; } } return ret; } #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* Volume Header Cache routines */ /***************************************************/ /** * volume header cache. */ struct volume_hdr_LRU_t volume_hdr_LRU; /** * initialize the volume header cache. * * @param[in] howMany number of header cache entries to preallocate * * @pre VOL_LOCK held. Function has never been called before. * * @post howMany cache entries are allocated, initialized, and added * to the LRU list. Header cache statistics are initialized. * * @note only applicable to fileServer program type. Should only be * called once during volume package initialization. * * @internal volume package internal use only. */ static void VInitVolumeHeaderCache(afs_uint32 howMany) { struct volHeader *hp; if (programType != fileServer) return; queue_Init(&volume_hdr_LRU); volume_hdr_LRU.stats.free = 0; volume_hdr_LRU.stats.used = howMany; volume_hdr_LRU.stats.attached = 0; hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader))); osi_Assert(hp != NULL); while (howMany--) /* We are using ReleaseVolumeHeader to initialize the values on the header list * to ensure they have the right values */ ReleaseVolumeHeader(hp++); } /* get a volume header off of the volume header LRU. * * @return volume header * @retval NULL no usable volume header is available on the LRU * * @pre VOL_LOCK held * * @post for DAFS, if the returned header is associated with a volume, that * volume is NOT in an exclusive state * * @internal volume package internal use only. 
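 *
 * @note the non-DAFS variant simply pops the first (least recently
 *       released) entry off the LRU; the DAFS variant below additionally
 *       skips entries whose associated volume is currently in an
 *       exclusive state (descriptive note only).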
*/ #ifdef AFS_DEMAND_ATTACH_FS static struct volHeader* GetVolHeaderFromLRU(void) { struct volHeader *hd = NULL, *qh, *nqh; /* Usually, a volume in an exclusive state will not have its header on * the LRU. However, it is possible for this to occur when a salvage * request is received over FSSYNC, and possibly in other corner cases. * So just skip over headers whose volumes are in an exclusive state. We * could VWaitExclusiveState_r instead, but not waiting is faster and * easier to do */ for (queue_Scan(&volume_hdr_LRU, qh, nqh, volHeader)) { if (!qh->back || !VIsExclusiveState(V_attachState(qh->back))) { queue_Remove(qh); hd = qh; break; } } return hd; } #else /* AFS_DEMAND_ATTACH_FS */ static struct volHeader* GetVolHeaderFromLRU(void) { struct volHeader *hd = NULL; if (queue_IsNotEmpty(&volume_hdr_LRU)) { hd = queue_First(&volume_hdr_LRU, volHeader); queue_Remove(hd); } return hd; } #endif /* !AFS_DEMAND_ATTACH_FS */ /** * get a volume header and attach it to the volume object. * * @param[in] vp pointer to volume object * * @return cache entry status * @retval 0 volume header was newly attached; cache data is invalid * @retval 1 volume header was previously attached; cache data is valid * * @pre VOL_LOCK held. For DAFS, lightweight ref must be held on volume object. * * @post volume header attached to volume object. if necessary, header cache * entry on LRU is synchronized to disk. Header is removed from LRU list. * * @note VOL_LOCK may be dropped * * @warning this interface does not load header data from disk. it merely * attaches a header object to the volume object, and may sync the old * header cache data out to disk in the process. * * @internal volume package internal use only. */ static int GetVolumeHeader(Volume * vp) { Error error; struct volHeader *hd; int old; static int everLogged = 0; #ifdef AFS_DEMAND_ATTACH_FS VolState vp_save = 0, back_save = 0; /* XXX debug 9/19/05 we've apparently got * a ref counting bug somewhere that's * breaking the nUsers == 0 => header on LRU * assumption */ if (vp->header && queue_IsNotOnQueue(vp->header)) { Log("nUsers == 0, but header not on LRU\n"); return 1; } #endif old = (vp->header != NULL); /* old == volume already has a header */ if (programType != fileServer) { /* for volume utilities, we allocate volHeaders as needed */ if (!vp->header) { hd = (struct volHeader *)calloc(1, sizeof(*vp->header)); osi_Assert(hd != NULL); vp->header = hd; hd->back = vp; #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(vp) |= VOL_HDR_ATTACHED; #endif } } else { /* for the fileserver, we keep a volume header cache */ if (old) { /* the header we previously dropped in the lru is * still available. pull it off the lru and return */ hd = vp->header; queue_Remove(hd); osi_Assert(hd->back == vp); #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(vp) &= ~(VOL_HDR_IN_LRU); #endif } else { hd = GetVolHeaderFromLRU(); if (!hd) { /* LRU is empty, so allocate a new volHeader * this is probably indicative of a leak, so let the user know */ hd = (struct volHeader *)calloc(1, sizeof(struct volHeader)); osi_Assert(hd != NULL); if (!everLogged) { Log("****Allocated more volume headers, probably leak****\n"); everLogged = 1; } volume_hdr_LRU.stats.free++; } if (hd->back) { /* this header used to belong to someone else. * we'll need to check if the header needs to * be sync'd out to disk */ #ifdef AFS_DEMAND_ATTACH_FS /* GetVolHeaderFromLRU had better not give us back a header * with a volume in exclusive state... 
*/ osi_Assert(!VIsExclusiveState(V_attachState(hd->back))); #endif if (hd->diskstuff.inUse) { /* volume was in use, so we'll need to sync * its header to disk */ #ifdef AFS_DEMAND_ATTACH_FS back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING); vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING); VCreateReservation_r(hd->back); VOL_UNLOCK; #endif WriteVolumeHeader_r(&error, hd->back); /* Ignore errors; catch them later */ #ifdef AFS_DEMAND_ATTACH_FS VOL_LOCK; #endif } hd->back->header = NULL; #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU); if (hd->diskstuff.inUse) { VChangeState_r(hd->back, back_save); VCancelReservation_r(hd->back); VChangeState_r(vp, vp_save); } #endif } else { volume_hdr_LRU.stats.attached++; } hd->back = vp; vp->header = hd; #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(vp) |= VOL_HDR_ATTACHED; #endif } volume_hdr_LRU.stats.free--; volume_hdr_LRU.stats.used++; } IncUInt64(&VStats.hdr_gets); #ifdef AFS_DEMAND_ATTACH_FS IncUInt64(&vp->stats.hdr_gets); vp->stats.last_hdr_get = FT_ApproxTime(); #endif return old; } /** * make sure volume header is attached and contains valid cache data. * * @param[out] ec outbound error code * @param[in] vp pointer to volume object * * @pre VOL_LOCK held. For DAFS, lightweight ref held on vp. * * @post header cache entry attached, and loaded with valid data, or * *ec is nonzero, and the header is released back into the LRU. * * @internal volume package internal use only. */ static void LoadVolumeHeader(Error * ec, Volume * vp) { #ifdef AFS_DEMAND_ATTACH_FS VolState state_save; afs_uint32 now; *ec = 0; if (vp->nUsers == 0 && !GetVolumeHeader(vp)) { IncUInt64(&VStats.hdr_loads); state_save = VChangeState_r(vp, VOL_STATE_HDR_LOADING); VOL_UNLOCK; ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION); IncUInt64(&vp->stats.hdr_loads); now = FT_ApproxTime(); VOL_LOCK; if (!*ec) { V_attachFlags(vp) |= VOL_HDR_LOADED; vp->stats.last_hdr_load = now; } VChangeState_r(vp, state_save); } #else /* AFS_DEMAND_ATTACH_FS */ *ec = 0; if (vp->nUsers == 0 && !GetVolumeHeader(vp)) { IncUInt64(&VStats.hdr_loads); ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp), sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION); } #endif /* AFS_DEMAND_ATTACH_FS */ if (*ec) { /* maintain (nUsers==0) => header in LRU invariant */ FreeVolumeHeader(vp); } } /** * release a header cache entry back into the LRU list. * * @param[in] hd pointer to volume header cache object * * @pre VOL_LOCK held. * * @post header cache object appended onto end of LRU list. * * @note only applicable to fileServer program type. * * @note used to place a header cache entry back into the * LRU pool without invalidating it as a cache entry. * * @internal volume package internal use only. */ static void ReleaseVolumeHeader(struct volHeader *hd) { if (programType != fileServer) return; if (!hd || queue_IsOnQueue(hd)) /* no header, or header already released */ return; queue_Append(&volume_hdr_LRU, hd); #ifdef AFS_DEMAND_ATTACH_FS if (hd->back) { V_attachFlags(hd->back) |= VOL_HDR_IN_LRU; } #endif volume_hdr_LRU.stats.free++; volume_hdr_LRU.stats.used--; } /** * free/invalidate a volume header cache entry. * * @param[in] vp pointer to volume object * * @pre VOL_LOCK is held. * * @post For fileserver, header cache entry is returned to LRU, and it is * invalidated as a cache entry. For volume utilities, the header * cache entry is freed. 
* * @note For fileserver, this should be utilized instead of ReleaseVolumeHeader * whenever it is necessary to invalidate the header cache entry. * * @see ReleaseVolumeHeader * * @internal volume package internal use only. */ static void FreeVolumeHeader(Volume * vp) { struct volHeader *hd = vp->header; if (!hd) return; if (programType == fileServer) { ReleaseVolumeHeader(hd); hd->back = NULL; } else { free(hd); } #ifdef AFS_DEMAND_ATTACH_FS V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED); #endif volume_hdr_LRU.stats.attached--; vp->header = NULL; } /***************************************************/ /* Volume Hash Table routines */ /***************************************************/ /** * set size of volume object hash table. * * @param[in] logsize log(2) of desired hash table size * * @return operation status * @retval 0 success * @retval -1 failure * * @pre MUST be called prior to VInitVolumePackage2 * * @post Volume Hash Table will have 2^logsize buckets */ int VSetVolHashSize(int logsize) { /* 64 to 268435456 hash buckets seems like a reasonable range */ if ((logsize < 6 ) || (logsize > 28)) { return -1; } if (!VInit) { VolumeHashTable.Size = 1 << logsize; VolumeHashTable.Mask = VolumeHashTable.Size - 1; } else { /* we can't yet support runtime modification of this * parameter. we'll need a configuration rwlock to * make runtime modification feasible.... */ return -1; } return 0; } /** * initialize dynamic data structures for volume hash table. * * @post hash table is allocated, and fields are initialized. * * @internal volume package internal use only. */ static void VInitVolumeHash(void) { int i; VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size, sizeof(VolumeHashChainHead)); osi_Assert(VolumeHashTable.Table != NULL); for (i=0; i < VolumeHashTable.Size; i++) { queue_Init(&VolumeHashTable.Table[i]); #ifdef AFS_DEMAND_ATTACH_FS CV_INIT(&VolumeHashTable.Table[i].chain_busy_cv, "vhash busy", CV_DEFAULT, 0); #endif /* AFS_DEMAND_ATTACH_FS */ } } /** * add a volume object to the hash table. * * @param[in] vp pointer to volume object * @param[in] hashid hash of volume id * * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight * reference on vp. * * @post volume is added to hash chain. * * @internal volume package internal use only. * * @note For DAFS, VOL_LOCK may be dropped in order to wait for an * asynchronous hash chain reordering to finish. */ static void AddVolumeToHashTable(Volume * vp, int hashid) { VolumeHashChainHead * head; if (queue_IsOnQueue(vp)) return; head = &VolumeHashTable.Table[VOLUME_HASH(hashid)]; #ifdef AFS_DEMAND_ATTACH_FS /* wait for the hash chain to become available */ VHashWait_r(head); V_attachFlags(vp) |= VOL_IN_HASH; vp->chainCacheCheck = ++head->cacheCheck; #endif /* AFS_DEMAND_ATTACH_FS */ head->len++; vp->hashid = hashid; queue_Append(head, vp); vp->vnodeHashOffset = VolumeHashOffset_r(); } /** * delete a volume object from the hash table. * * @param[in] vp pointer to volume object * * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight * reference on vp. * * @post volume is removed from hash chain. * * @internal volume package internal use only. * * @note For DAFS, VOL_LOCK may be dropped in order to wait for an * asynchronous hash chain reordering to finish. 
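 *
 * @note the volume's hashid field is intentionally left intact after
 *       removal (see the comment in the function body) so that the
 *       online salvage code can still identify the volume afterwards.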
*/ static void DeleteVolumeFromHashTable(Volume * vp) { VolumeHashChainHead * head; if (!queue_IsOnQueue(vp)) return; head = &VolumeHashTable.Table[VOLUME_HASH(vp->hashid)]; #ifdef AFS_DEMAND_ATTACH_FS /* wait for the hash chain to become available */ VHashWait_r(head); V_attachFlags(vp) &= ~(VOL_IN_HASH); head->cacheCheck++; #endif /* AFS_DEMAND_ATTACH_FS */ head->len--; queue_Remove(vp); /* do NOT reset hashid to zero, as the online * salvager package may need to know the volume id * after the volume is removed from the hash */ } /** * lookup a volume object in the hash table given a volume id. * * @param[out] ec error code return * @param[in] volumeId volume id * @param[in] hint volume object which we believe could be the correct mapping * * @return volume object pointer * @retval NULL no such volume id is registered with the hash table. * * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight ref on hint. * * @post volume object with the given id is returned. volume object and * hash chain access statistics are updated. hash chain may have * been reordered. * * @note For DAFS, VOL_LOCK may be dropped in order to wait for an * asynchronous hash chain reordering operation to finish, or * in order for us to perform an asynchronous chain reordering. * * @note Hash chain reorderings occur when the access count for the * volume object being looked up exceeds the sum of the previous * node's (the node ahead of it in the hash chain linked list) * access count plus the constant VOLUME_HASH_REORDER_THRESHOLD. * * @note For DAFS, the hint parameter allows us to short-circuit if the * cacheCheck fields match between the hash chain head and the * hint volume object. */ Volume * VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint) { int looks = 0; Volume * vp, *np; #ifdef AFS_DEMAND_ATTACH_FS Volume *pp; #endif VolumeHashChainHead * head; *ec = 0; head = &VolumeHashTable.Table[VOLUME_HASH(volumeId)]; #ifdef AFS_DEMAND_ATTACH_FS /* wait for the hash chain to become available */ VHashWait_r(head); /* check to see if we can short circuit without walking the hash chain */ if (hint && (hint->chainCacheCheck == head->cacheCheck)) { IncUInt64(&hint->stats.hash_short_circuits); return hint; } #endif /* AFS_DEMAND_ATTACH_FS */ /* someday we need to either do per-chain locks, RWlocks, * or both for volhash access. * (and move to a data structure with better cache locality) */ /* search the chain for this volume id */ for(queue_Scan(head, vp, np, Volume)) { looks++; if ((vp->hashid == volumeId)) { break; } } if (queue_IsEnd(head, vp)) { vp = NULL; } #ifdef AFS_DEMAND_ATTACH_FS /* update hash chain statistics */ { afs_uint64 lks; FillInt64(lks, 0, looks); AddUInt64(head->looks, lks, &head->looks); AddUInt64(VStats.hash_looks, lks, &VStats.hash_looks); IncUInt64(&head->gets); } if (vp) { afs_uint64 thresh; IncUInt64(&vp->stats.hash_lookups); /* for demand attach fileserver, we permit occasional hash chain reordering * so that frequently looked up volumes move towards the head of the chain */ pp = queue_Prev(vp, Volume); if (!queue_IsEnd(head, pp)) { FillInt64(thresh, 0, VOLUME_HASH_REORDER_THRESHOLD); AddUInt64(thresh, pp->stats.hash_lookups, &thresh); if (GEInt64(vp->stats.hash_lookups, thresh)) { VReorderHash_r(head, pp, vp); } } /* update the short-circuit cache check */ vp->chainCacheCheck = head->cacheCheck; } #endif /* AFS_DEMAND_ATTACH_FS */ return vp; } #ifdef AFS_DEMAND_ATTACH_FS /* perform volume hash chain reordering. 
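 * (descriptive note: this is invoked from VLookupVolume_r when a volume's
 * hash_lookups count reaches its predecessor's count plus at least
 * VOLUME_HASH_REORDER_THRESHOLD)
 *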
* * advance a subchain beginning at vp ahead of * the adjacent subchain ending at pp */ static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp) { Volume *tp, *np, *lp; afs_uint64 move_thresh; /* this should never be called if the chain is already busy, so * no need to wait for other exclusive chain ops to finish */ /* this is a rather heavy set of operations, * so let's set the chain busy flag and drop * the vol_glock */ VHashBeginExclusive_r(head); VOL_UNLOCK; /* scan forward in the chain from vp looking for the last element * in the chain we want to advance */ FillInt64(move_thresh, 0, VOLUME_HASH_REORDER_CHAIN_THRESH); AddUInt64(move_thresh, pp->stats.hash_lookups, &move_thresh); for(queue_ScanFrom(head, vp, tp, np, Volume)) { if (LTInt64(tp->stats.hash_lookups, move_thresh)) { break; } } lp = queue_Prev(tp, Volume); /* scan backwards from pp to determine where to splice and * insert the subchain we're advancing */ for(queue_ScanBackwardsFrom(head, pp, tp, np, Volume)) { if (GTInt64(tp->stats.hash_lookups, move_thresh)) { break; } } tp = queue_Next(tp, Volume); /* rebalance chain(vp,...,lp) ahead of chain(tp,...,pp) */ queue_MoveChainBefore(tp,vp,lp); VOL_LOCK; IncUInt64(&VStats.hash_reorders); head->cacheCheck++; IncUInt64(&head->reorders); /* wake up any threads waiting for the hash chain */ VHashEndExclusive_r(head); } /* demand-attach fs volume hash * asynchronous exclusive operations */ /** * begin an asynchronous exclusive operation on a volume hash chain. * * @param[in] head pointer to volume hash chain head object * * @pre VOL_LOCK held. hash chain is quiescent. * * @post hash chain marked busy. * * @note this interface is used in conjunction with VHashEndExclusive_r and * VHashWait_r to perform asynchronous (wrt VOL_LOCK) operations on a * volume hash chain. Its main use case is hash chain reordering, which * has the potential to be a highly latent operation. * * @see VHashEndExclusive_r * @see VHashWait_r * * @note DAFS only * * @internal volume package internal use only. */ static void VHashBeginExclusive_r(VolumeHashChainHead * head) { osi_Assert(head->busy == 0); head->busy = 1; } /** * relinquish exclusive ownership of a volume hash chain. * * @param[in] head pointer to volume hash chain head object * * @pre VOL_LOCK held. thread owns the hash chain exclusively. * * @post hash chain is marked quiescent. threads awaiting use of * chain are awakened. * * @see VHashBeginExclusive_r * @see VHashWait_r * * @note DAFS only * * @internal volume package internal use only. */ static void VHashEndExclusive_r(VolumeHashChainHead * head) { osi_Assert(head->busy); head->busy = 0; CV_BROADCAST(&head->chain_busy_cv); } /** * wait for all asynchronous operations on a hash chain to complete. * * @param[in] head pointer to volume hash chain head object * * @pre VOL_LOCK held. * * @post hash chain object is quiescent. * * @see VHashBeginExclusive_r * @see VHashEndExclusive_r * * @note DAFS only * * @note This interface should be called before any attempt to * traverse the hash chain. It is permissible for a thread * to gain exclusive access to the chain, and then perform * latent operations on the chain asynchronously wrt the * VOL_LOCK. * * @warning if waiting is necessary, VOL_LOCK is dropped * * @internal volume package internal use only. 
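 *
 * @note Typical sequence for the chain exclusion primitives (an
 *       illustrative sketch assuming VOL_LOCK is held, not a verbatim
 *       excerpt from a particular caller):
 *
 *           VHashWait_r(head);             wait for chain to go quiescent
 *           VHashBeginExclusive_r(head);   mark the chain busy
 *           VOL_UNLOCK;
 *           (latent chain operations)
 *           VOL_LOCK;
 *           VHashEndExclusive_r(head);     wake any waiting threads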
*/ static void VHashWait_r(VolumeHashChainHead * head) { while (head->busy) { VOL_CV_WAIT(&head->chain_busy_cv); } } #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* Volume by Partition List routines */ /***************************************************/ /* * demand attach fileserver adds a * linked list of volumes to each * partition object, thus allowing * for quick enumeration of all * volumes on a partition */ #ifdef AFS_DEMAND_ATTACH_FS /** * add a volume to its disk partition VByPList. * * @param[in] vp pointer to volume object * * @pre either the disk partition VByPList is owned exclusively * by the calling thread, or the list is quiescent and * VOL_LOCK is held. * * @post volume is added to disk partition VByPList * * @note DAFS only * * @warning it is the caller's responsibility to ensure list * quiescence. * * @see VVByPListWait_r * @see VVByPListBeginExclusive_r * @see VVByPListEndExclusive_r * * @internal volume package internal use only. */ static void AddVolumeToVByPList_r(Volume * vp) { if (queue_IsNotOnQueue(&vp->vol_list)) { queue_Append(&vp->partition->vol_list, &vp->vol_list); V_attachFlags(vp) |= VOL_ON_VBYP_LIST; vp->partition->vol_list.len++; } } /** * delete a volume from its disk partition VByPList. * * @param[in] vp pointer to volume object * * @pre either the disk partition VByPList is owned exclusively * by the calling thread, or the list is quiescent and * VOL_LOCK is held. * * @post volume is removed from the disk partition VByPList * * @note DAFS only * * @warning it is the caller's responsibility to ensure list * quiescence. * * @see VVByPListWait_r * @see VVByPListBeginExclusive_r * @see VVByPListEndExclusive_r * * @internal volume package internal use only. */ static void DeleteVolumeFromVByPList_r(Volume * vp) { if (queue_IsOnQueue(&vp->vol_list)) { queue_Remove(&vp->vol_list); V_attachFlags(vp) &= ~(VOL_ON_VBYP_LIST); vp->partition->vol_list.len--; } } /** * begin an asynchronous exclusive operation on a VByPList. * * @param[in] dp pointer to disk partition object * * @pre VOL_LOCK held. VByPList is quiescent. * * @post VByPList marked busy. * * @note this interface is used in conjunction with VVByPListEndExclusive_r and * VVByPListWait_r to perform asynchronous (wrt VOL_LOCK) operations on a * VByPList. * * @see VVByPListEndExclusive_r * @see VVByPListWait_r * * @note DAFS only * * @internal volume package internal use only. */ /* take exclusive control over the list */ static void VVByPListBeginExclusive_r(struct DiskPartition64 * dp) { osi_Assert(dp->vol_list.busy == 0); dp->vol_list.busy = 1; } /** * relinquish exclusive ownership of a VByPList. * * @param[in] dp pointer to disk partition object * * @pre VOL_LOCK held. thread owns the VByPList exclusively. * * @post VByPList is marked quiescent. threads awaiting use of * the list are awakened. * * @see VVByPListBeginExclusive_r * @see VVByPListWait_r * * @note DAFS only * * @internal volume package internal use only. */ static void VVByPListEndExclusive_r(struct DiskPartition64 * dp) { osi_Assert(dp->vol_list.busy); dp->vol_list.busy = 0; CV_BROADCAST(&dp->vol_list.cv); } /** * wait for all asynchronous operations on a VByPList to complete. * * @param[in] dp pointer to disk partition object * * @pre VOL_LOCK is held. * * @post disk partition's VByP list is quiescent * * @note DAFS only * * @note This interface should be called before any attempt to * traverse the VByPList. 
It is permissible for a thread * to gain exclusive access to the list, and then perform * latent operations on the list asynchronously wrt the * VOL_LOCK. * * @warning if waiting is necessary, VOL_LOCK is dropped * * @see VVByPListEndExclusive_r * @see VVByPListBeginExclusive_r * * @internal volume package internal use only. */ static void VVByPListWait_r(struct DiskPartition64 * dp) { while (dp->vol_list.busy) { VOL_CV_WAIT(&dp->vol_list.cv); } } #endif /* AFS_DEMAND_ATTACH_FS */ /***************************************************/ /* Volume Cache Statistics routines */ /***************************************************/ void VPrintCacheStats_r(void) { struct VnodeClassInfo *vcp; vcp = &VnodeClassInfo[vLarge]; Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes); vcp = &VnodeClassInfo[vSmall]; Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes); Log("Volume header cache, %d entries, %"AFS_INT64_FMT" gets, " "%"AFS_INT64_FMT" replacements\n", VStats.hdr_cache_size, VStats.hdr_gets, VStats.hdr_loads); } void VPrintCacheStats(void) { VOL_LOCK; VPrintCacheStats_r(); VOL_UNLOCK; } #ifdef AFS_DEMAND_ATTACH_FS static double UInt64ToDouble(afs_uint64 * x) { static double c32 = 4.0 * 1.073741824 * 1000000000.0; afs_uint32 h, l; SplitInt64(*x, h, l); return (((double)h) * c32) + ((double) l); } static char * DoubleToPrintable(double x, char * buf, int len) { static double billion = 1000000000.0; afs_uint32 y[3]; y[0] = (afs_uint32) (x / (billion * billion)); y[1] = (afs_uint32) ((x - (((double)y[0]) * billion * billion)) / billion); y[2] = (afs_uint32) (x - ((((double)y[0]) * billion * billion) + (((double)y[1]) * billion))); if (y[0]) { snprintf(buf, len, "%d%09d%09d", y[0], y[1], y[2]); } else if (y[1]) { snprintf(buf, len, "%d%09d", y[1], y[2]); } else { snprintf(buf, len, "%d", y[2]); } buf[len-1] = '\0'; return buf; } struct VLRUExtStatsEntry { VolumeId volid; }; struct VLRUExtStats { afs_uint32 len; afs_uint32 used; struct { afs_uint32 start; afs_uint32 len; } queue_info[VLRU_QUEUE_INVALID]; struct VLRUExtStatsEntry * vec; }; /** * add a 256-entry fudge factor onto the vector in case state changes * out from under us. */ #define VLRU_EXT_STATS_VEC_LEN_FUDGE 256 /** * collect extended statistics for the VLRU subsystem. 
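 *
 * Walks each VLRU generation queue under exclusive access and records the
 * volume id of every queued volume into a single flat vector, noting the
 * start offset and length of each queue's slice (descriptive summary of
 * the implementation that follows).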
* * @param[out] stats pointer to stats structure to be populated * @param[in] nvols number of volumes currently known to exist * * @pre VOL_LOCK held * * @post stats->vec allocated and populated * * @return operation status * @retval 0 success * @retval 1 failure */ static int VVLRUExtStats_r(struct VLRUExtStats * stats, afs_uint32 nvols) { afs_uint32 cur, idx, len; struct rx_queue * qp, * nqp; Volume * vp; struct VLRUExtStatsEntry * vec; len = nvols + VLRU_EXT_STATS_VEC_LEN_FUDGE; vec = stats->vec = calloc(len, sizeof(struct VLRUExtStatsEntry)); if (vec == NULL) { return 1; } cur = 0; for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) { VLRU_Wait_r(&volume_LRU.q[idx]); VLRU_BeginExclusive_r(&volume_LRU.q[idx]); VOL_UNLOCK; stats->queue_info[idx].start = cur; for (queue_Scan(&volume_LRU.q[idx], qp, nqp, rx_queue)) { if (cur == len) { /* out of space in vec */ break; } vp = (Volume *)((char *)qp - offsetof(Volume, vlru)); vec[cur].volid = vp->hashid; cur++; } stats->queue_info[idx].len = cur - stats->queue_info[idx].start; VOL_LOCK; VLRU_EndExclusive_r(&volume_LRU.q[idx]); } stats->len = len; stats->used = cur; return 0; } #define ENUMTOSTRING(en) #en #define ENUMCASE(en) \ case en: return ENUMTOSTRING(en) static char * vlru_idx_to_string(int idx) { switch (idx) { ENUMCASE(VLRU_QUEUE_NEW); ENUMCASE(VLRU_QUEUE_MID); ENUMCASE(VLRU_QUEUE_OLD); ENUMCASE(VLRU_QUEUE_CANDIDATE); ENUMCASE(VLRU_QUEUE_HELD); ENUMCASE(VLRU_QUEUE_INVALID); default: return "**UNKNOWN**"; } } void VPrintExtendedCacheStats_r(int flags) { int i; afs_uint32 vol_sum = 0; struct stats { double min; double max; double sum; double avg; }; struct stats looks, gets, reorders, len; struct stats ch_looks, ch_gets, ch_reorders; char pr_buf[4][32]; VolumeHashChainHead *head; Volume *vp, *np; struct VLRUExtStats vlru_stats; /* zero out stats */ memset(&looks, 0, sizeof(struct stats)); memset(&gets, 0, sizeof(struct stats)); memset(&reorders, 0, sizeof(struct stats)); memset(&len, 0, sizeof(struct stats)); memset(&ch_looks, 0, sizeof(struct stats)); memset(&ch_gets, 0, sizeof(struct stats)); memset(&ch_reorders, 0, sizeof(struct stats)); for (i = 0; i < VolumeHashTable.Size; i++) { head = &VolumeHashTable.Table[i]; VHashWait_r(head); VHashBeginExclusive_r(head); VOL_UNLOCK; ch_looks.sum = UInt64ToDouble(&head->looks); ch_gets.sum = UInt64ToDouble(&head->gets); ch_reorders.sum = UInt64ToDouble(&head->reorders); /* update global statistics */ { looks.sum += ch_looks.sum; gets.sum += ch_gets.sum; reorders.sum += ch_reorders.sum; len.sum += (double)head->len; vol_sum += head->len; if (i == 0) { len.min = (double) head->len; len.max = (double) head->len; looks.min = ch_looks.sum; looks.max = ch_looks.sum; gets.min = ch_gets.sum; gets.max = ch_gets.sum; reorders.min = ch_reorders.sum; reorders.max = ch_reorders.sum; } else { if (((double)head->len) < len.min) len.min = (double) head->len; if (((double)head->len) > len.max) len.max = (double) head->len; if (ch_looks.sum < looks.min) looks.min = ch_looks.sum; else if (ch_looks.sum > looks.max) looks.max = ch_looks.sum; if (ch_gets.sum < gets.min) gets.min = ch_gets.sum; else if (ch_gets.sum > gets.max) gets.max = ch_gets.sum; if (ch_reorders.sum < reorders.min) reorders.min = ch_reorders.sum; else if (ch_reorders.sum > reorders.max) reorders.max = ch_reorders.sum; } } if ((flags & VOL_STATS_PER_CHAIN2) && queue_IsNotEmpty(head)) { /* compute detailed per-chain stats */ struct stats hdr_loads, hdr_gets; double v_looks, v_loads, v_gets; /* initialize stats with data from first element 
in chain */ vp = queue_First(head, Volume); v_looks = UInt64ToDouble(&vp->stats.hash_lookups); v_loads = UInt64ToDouble(&vp->stats.hdr_loads); v_gets = UInt64ToDouble(&vp->stats.hdr_gets); ch_gets.min = ch_gets.max = v_looks; hdr_loads.min = hdr_loads.max = v_loads; hdr_gets.min = hdr_gets.max = v_gets; hdr_loads.sum = hdr_gets.sum = 0; vp = queue_Next(vp, Volume); /* pull in stats from remaining elements in chain */ for (queue_ScanFrom(head, vp, vp, np, Volume)) { v_looks = UInt64ToDouble(&vp->stats.hash_lookups); v_loads = UInt64ToDouble(&vp->stats.hdr_loads); v_gets = UInt64ToDouble(&vp->stats.hdr_gets); hdr_loads.sum += v_loads; hdr_gets.sum += v_gets; if (v_looks < ch_gets.min) ch_gets.min = v_looks; else if (v_looks > ch_gets.max) ch_gets.max = v_looks; if (v_loads < hdr_loads.min) hdr_loads.min = v_loads; else if (v_loads > hdr_loads.max) hdr_loads.max = v_loads; if (v_gets < hdr_gets.min) hdr_gets.min = v_gets; else if (v_gets > hdr_gets.max) hdr_gets.max = v_gets; } /* compute per-chain averages */ ch_gets.avg = ch_gets.sum / ((double)head->len); hdr_loads.avg = hdr_loads.sum / ((double)head->len); hdr_gets.avg = hdr_gets.sum / ((double)head->len); /* dump per-chain stats */ Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n", i, head->len, DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1]))); Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(ch_gets.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(ch_gets.max, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(ch_gets.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(ch_gets.sum, pr_buf[3], sizeof(pr_buf[3]))); Log("\tHDR gets : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(hdr_gets.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(hdr_gets.max, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(hdr_gets.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(hdr_gets.sum, pr_buf[3], sizeof(pr_buf[3]))); Log("\tHDR loads : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(hdr_loads.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(hdr_loads.max, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(hdr_loads.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(hdr_loads.sum, pr_buf[3], sizeof(pr_buf[3]))); } else if (flags & VOL_STATS_PER_CHAIN) { /* dump simple per-chain stats */ Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n", i, head->len, DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2]))); } VOL_LOCK; VHashEndExclusive_r(head); } VOL_UNLOCK; /* compute global averages */ len.avg = len.sum / ((double)VolumeHashTable.Size); looks.avg = looks.sum / ((double)VolumeHashTable.Size); gets.avg = gets.sum / ((double)VolumeHashTable.Size); reorders.avg = reorders.sum / ((double)VolumeHashTable.Size); /* dump global stats */ Log("Volume hash summary: %d buckets\n", VolumeHashTable.Size); Log(" chain length : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(len.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(len.max, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(len.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(len.sum, pr_buf[3], sizeof(pr_buf[3]))); Log(" looks : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(looks.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(looks.max, pr_buf[1], sizeof(pr_buf[1])), 
DoubleToPrintable(looks.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(looks.sum, pr_buf[3], sizeof(pr_buf[3]))); Log(" gets : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(gets.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(gets.max, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(gets.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(gets.sum, pr_buf[3], sizeof(pr_buf[3]))); Log(" reorders : min=%s, max=%s, avg=%s, total=%s\n", DoubleToPrintable(reorders.min, pr_buf[0], sizeof(pr_buf[0])), DoubleToPrintable(reorders.max, pr_buf[1], sizeof(pr_buf[1])), DoubleToPrintable(reorders.avg, pr_buf[2], sizeof(pr_buf[2])), DoubleToPrintable(reorders.sum, pr_buf[3], sizeof(pr_buf[3]))); /* print extended disk related statistics */ { struct DiskPartition64 * diskP; afs_uint32 vol_count[VOLMAXPARTS+1]; byte part_exists[VOLMAXPARTS+1]; Device id; int i; memset(vol_count, 0, sizeof(vol_count)); memset(part_exists, 0, sizeof(part_exists)); VOL_LOCK; for (diskP = DiskPartitionList; diskP; diskP = diskP->next) { id = diskP->index; vol_count[id] = diskP->vol_list.len; part_exists[id] = 1; } VOL_UNLOCK; for (i = 0; i <= VOLMAXPARTS; i++) { if (part_exists[i]) { /* XXX while this is currently safe, it is a violation * of the VGetPartitionById_r interface contract. */ diskP = VGetPartitionById_r(i, 0); if (diskP) { Log("Partition %s has %d online volumes\n", VPartitionPath(diskP), diskP->vol_list.len); } } } VOL_LOCK; } /* print extended VLRU statistics */ if (VVLRUExtStats_r(&vlru_stats, vol_sum) == 0) { afs_uint32 idx, cur, lpos; VolumeId line[5]; VOL_UNLOCK; Log("VLRU State Dump:\n\n"); for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) { Log("\t%s:\n", vlru_idx_to_string(idx)); lpos = 0; for (cur = vlru_stats.queue_info[idx].start; cur < vlru_stats.queue_info[idx].len; cur++) { line[lpos++] = vlru_stats.vec[cur].volid; if (lpos==5) { Log("\t\t%u, %u, %u, %u, %u,\n", line[0], line[1], line[2], line[3], line[4]); lpos = 0; } } if (lpos) { while (lpos < 5) { line[lpos++] = 0; } Log("\t\t%u, %u, %u, %u, %u\n", line[0], line[1], line[2], line[3], line[4]); } Log("\n"); } free(vlru_stats.vec); VOL_LOCK; } } void VPrintExtendedCacheStats(int flags) { VOL_LOCK; VPrintExtendedCacheStats_r(flags); VOL_UNLOCK; } #endif /* AFS_DEMAND_ATTACH_FS */ afs_int32 VCanScheduleSalvage(void) { return vol_opts.canScheduleSalvage; } afs_int32 VCanUseFSSYNC(void) { return vol_opts.canUseFSSYNC; } afs_int32 VCanUseSALVSYNC(void) { return vol_opts.canUseSALVSYNC; } afs_int32 VCanUnsafeAttach(void) { return vol_opts.unsafe_attach; }