/*
* Copyright 2000, International Business Machines Corporation and others.
* All Rights Reserved.
- *
+ *
* This software has been released under the terms of the IBM Public
* License. For details, see the LICENSE file in the top-level source
* directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2005-2008 Sine Nomine Associates
*/
/* 1/1/89: NB: this stuff is all going to be replaced. Don't take it too seriously */
#include <afsconfig.h>
#include <afs/param.h>
-RCSID
- ("$Header$");
#include <rx/xdr.h>
#include <afs/afsint.h>
#include <ctype.h>
+#include <signal.h>
#ifndef AFS_NT40_ENV
#include <sys/param.h>
#if !defined(AFS_SGI_ENV)
#if defined(AFS_SGI_ENV)
#include <fcntl.h>
#include <mntent.h>
-#ifdef AFS_SGI_EFS_IOPS_ENV
-#define ROOTINO EFS_ROOTINO
-#include <sys/fs/efs.h>
-#include "sgiefs/efs.h" /* until 5.1 release */
-#endif
-
#else
#ifndef AFS_LINUX20_ENV
#ifdef AFS_NT40_ENV
#include <io.h>
#endif
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
#include "vnode.h"
#include "volume.h"
#include "partition.h"
-#ifdef AFS_PTHREAD_ENV
-#include <assert.h>
-#else /* AFS_PTHREAD_ENV */
-#include "afs/assert.h"
-#endif /* AFS_PTHREAD_ENV */
+#include "volume_inline.h"
+#include "common.h"
+#include "afs/afs_assert.h"
#include "vutils.h"
-#include "fssync.h"
#ifndef AFS_NT40_ENV
+#include <afs/dir.h>
#include <unistd.h>
#endif
+#if !defined(offsetof)
+#include <stddef.h>
+#endif
+
#ifdef O_LARGEFILE
#define afs_stat stat64
#define afs_fstat fstat64
#ifdef AFS_PTHREAD_ENV
pthread_mutex_t vol_glock_mutex;
-pthread_mutex_t vol_attach_mutex;
-pthread_mutex_t vol_fsync_mutex;
pthread_mutex_t vol_trans_mutex;
pthread_cond_t vol_put_volume_cond;
pthread_cond_t vol_sleep_cond;
+pthread_cond_t vol_init_attach_cond;
+pthread_cond_t vol_vinit_cond;
+int vol_attach_threads = 1;
#endif /* AFS_PTHREAD_ENV */
+/* start-time configurable I/O parameters */
+ih_init_params vol_io_params;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+pthread_mutex_t vol_salvsync_mutex;
+
+/*
+ * Set this to 1 to disallow SALVSYNC communication in all threads; used
+ * during shutdown, since the salvageserver may have gone away.
+ */
+static volatile sig_atomic_t vol_disallow_salvsync = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
#ifdef AFS_OSF_ENV
extern void *calloc(), *realloc();
#endif
-/*@printflike@*/ extern void Log(const char *format, ...);
-
/* Forward declarations */
-static Volume *attach2(Error * ec, char *path,
- register struct VolumeHeader *header,
- struct DiskPartition *partp, int isbusy);
+static Volume *attach2(Error * ec, VolId volumeId, char *path,
+ struct DiskPartition64 *partp, Volume * vp,
+ int isbusy, int mode);
+static void ReallyFreeVolume(Volume * vp);
+#ifdef AFS_DEMAND_ATTACH_FS
static void FreeVolume(Volume * vp);
+#else /* !AFS_DEMAND_ATTACH_FS */
+#define FreeVolume(vp) ReallyFreeVolume(vp)
static void VScanUpdateList(void);
-static void InitLRU(int howMany);
-static int GetVolumeHeader(register Volume * vp);
-static void ReleaseVolumeHeader(register struct volHeader *hd);
-static void FreeVolumeHeader(register Volume * vp);
-static void AddVolumeToHashTable(register Volume * vp, int hashid);
-static void DeleteVolumeFromHashTable(register Volume * vp);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+static void VInitVolumeHeaderCache(afs_uint32 howMany);
+static int GetVolumeHeader(Volume * vp);
+static void ReleaseVolumeHeader(struct volHeader *hd);
+static void FreeVolumeHeader(Volume * vp);
+static void AddVolumeToHashTable(Volume * vp, int hashid);
+static void DeleteVolumeFromHashTable(Volume * vp);
+#if 0
static int VHold(Volume * vp);
+#endif
static int VHold_r(Volume * vp);
-static void GetBitmap(Error * ec, Volume * vp, VnodeClass class);
-static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp,
- char **namep);
+static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class);
static void VReleaseVolumeHandles_r(Volume * vp);
static void VCloseVolumeHandles_r(Volume * vp);
+static void LoadVolumeHeader(Error * ec, Volume * vp);
+static int VCheckOffline(Volume * vp);
+static int VCheckDetach(Volume * vp);
+static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags);
int LogLevel; /* Vice loglevel--not defined as extern so that it will be
* defined when not linked with vice, XXXX */
ProgramType programType; /* The type of program using the package */
+static VolumePackageOptions vol_opts;
+
+/* extended volume package statistics */
+VolPkgStats VStats;
+
+#ifdef VOL_LOCK_DEBUG
+pthread_t vol_glock_holder = 0;
+#endif
#define VOLUME_BITMAP_GROWSIZE 16 /* bytes, => 128vnodes */
/* Must be a multiple of 4 (1 word) !! */
-#define VOLUME_HASH_TABLE_SIZE 128 /* Must be a power of 2!! */
-#define VOLUME_HASH(volumeId) (volumeId&(VOLUME_HASH_TABLE_SIZE-1))
-private Volume *VolumeHashTable[VOLUME_HASH_TABLE_SIZE];
+
+/* this parameter needs to be tunable at runtime.
+ * 128 was really inadequate for largish servers -- at 16384 volumes this
+ * puts average chain length at 128, thus an average 65 deref's to find a volptr.
+ * talk about bad spatial locality...
+ *
+ * an AVL or splay tree might work a lot better, but we'll just increase
+ * the default hash table size for now
+ */
+#define DEFAULT_VOLUME_HASH_SIZE 256 /* Must be a power of 2!! */
+#define DEFAULT_VOLUME_HASH_MASK (DEFAULT_VOLUME_HASH_SIZE-1)
+#define VOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask))
+
+/*
+ * turn volume hash chains into partially ordered lists.
+ * when the threshold is exceeded between two adjacent elements,
+ * perform a chain rebalancing operation.
+ *
+ * keep the threshold high in order to keep cache line invalidates
+ * low "enough" on SMPs
+ */
+#define VOLUME_HASH_REORDER_THRESHOLD 200
+
+/*
+ * when possible, don't just reorder single elements, but reorder
+ * entire chains of elements at once. a chain of elements that
+ * exceed the element previous to the pivot by at least CHAIN_THRESH
+ * accesses are moved in front of the chain whose elements have at
+ * least CHAIN_THRESH less accesses than the pivot element
+ */
+#define VOLUME_HASH_REORDER_CHAIN_THRESH (VOLUME_HASH_REORDER_THRESHOLD / 2)
+
+#include "rx/rx_queue.h"
+
+
+VolumeHashTable_t VolumeHashTable = {
+ DEFAULT_VOLUME_HASH_SIZE,
+ DEFAULT_VOLUME_HASH_MASK,
+ NULL
+};
+
+
+static void VInitVolumeHash(void);
+
#ifndef AFS_HAVE_FFS
/* This macro is used where an ffs() call does not exist. Was in util/ffs.c */
}
#endif /* !AFS_HAVE_FFS */
-struct Lock vol_listLock; /* Lock obtained when listing volumes: prevents a volume from being missed if the volume is attached during a list volumes */
+#ifdef AFS_PTHREAD_ENV
+/**
+ * disk partition queue element
+ */
+typedef struct diskpartition_queue_t {
+ struct rx_queue queue; /**< queue header */
+ struct DiskPartition64 *diskP; /**< disk partition table entry */
+} diskpartition_queue_t;
+
+#ifndef AFS_DEMAND_ATTACH_FS
+
+typedef struct vinitvolumepackage_thread_t {
+ struct rx_queue queue;
+ pthread_cond_t thread_done_cv;
+ int n_threads_complete;
+} vinitvolumepackage_thread_t;
+static void * VInitVolumePackageThread(void * args);
+
+#else /* !AFS_DEMAND_ATTTACH_FS */
+#define VINIT_BATCH_MAX_SIZE 512
+
+/**
+ * disk partition work queue
+ */
+struct partition_queue {
+ struct rx_queue head; /**< diskpartition_queue_t queue */
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+};
+
+/**
+ * volumes parameters for preattach
+ */
+struct volume_init_batch {
+ struct rx_queue queue; /**< queue header */
+ int thread; /**< posting worker thread */
+ int last; /**< indicates thread is done */
+ int size; /**< number of volume ids in batch */
+ Volume *batch[VINIT_BATCH_MAX_SIZE]; /**< volumes ids to preattach */
+};
+
+/**
+ * volume parameters work queue
+ */
+struct volume_init_queue {
+ struct rx_queue head; /**< volume_init_batch queue */
+ pthread_mutex_t mutex;
+ pthread_cond_t cv;
+};
+
+/**
+ * volume init worker thread parameters
+ */
+struct vinitvolumepackage_thread_param {
+ int nthreads; /**< total number of worker threads */
+ int thread; /**< thread number for this worker thread */
+ struct partition_queue *pq; /**< queue partitions to scan */
+ struct volume_init_queue *vq; /**< queue of volume to preattach */
+};
+
+static void *VInitVolumePackageThread(void *args);
+static struct DiskPartition64 *VInitNextPartition(struct partition_queue *pq);
+static VolId VInitNextVolumeId(DIR *dirp);
+static int VInitPreAttachVolumes(int nthreads, struct volume_init_queue *vq);
+
+#endif /* !AFS_DEMAND_ATTACH_FS */
+#endif /* AFS_PTHREAD_ENV */
+
+#ifndef AFS_DEMAND_ATTACH_FS
+static int VAttachVolumesByPartition(struct DiskPartition64 *diskP,
+ int * nAttached, int * nUnattached);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* demand attach fileserver extensions */
+
+/* XXX
+ * in the future we will support serialization of VLRU state into the fs_state
+ * disk dumps
+ *
+ * these structures are the beginning of that effort
+ */
+struct VLRU_DiskHeader {
+ struct versionStamp stamp; /* magic and structure version number */
+ afs_uint32 mtime; /* time of dump to disk */
+ afs_uint32 num_records; /* number of VLRU_DiskEntry records */
+};
+
+struct VLRU_DiskEntry {
+ afs_uint32 vid; /* volume ID */
+ afs_uint32 idx; /* generation */
+ afs_uint32 last_get; /* timestamp of last get */
+};
+
+struct VLRU_StartupQueue {
+ struct VLRU_DiskEntry * entry;
+ int num_entries;
+ int next_idx;
+};
+
+typedef struct vshutdown_thread_t {
+ struct rx_queue q;
+ pthread_mutex_t lock;
+ pthread_cond_t cv;
+ pthread_cond_t master_cv;
+ int n_threads;
+ int n_threads_complete;
+ int vol_remaining;
+ int schedule_version;
+ int pass;
+ byte n_parts;
+ byte n_parts_done_pass;
+ byte part_thread_target[VOLMAXPARTS+1];
+ byte part_done_pass[VOLMAXPARTS+1];
+ struct rx_queue * part_pass_head[VOLMAXPARTS+1];
+ int stats[4][VOLMAXPARTS+1];
+} vshutdown_thread_t;
+static void * VShutdownThread(void * args);
+
+
+static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode);
+static int VCheckFree(Volume * vp);
+
+/* VByP List */
+static void AddVolumeToVByPList_r(Volume * vp);
+static void DeleteVolumeFromVByPList_r(Volume * vp);
+static void VVByPListBeginExclusive_r(struct DiskPartition64 * dp);
+static void VVByPListEndExclusive_r(struct DiskPartition64 * dp);
+static void VVByPListWait_r(struct DiskPartition64 * dp);
+
+/* online salvager */
+static int VCheckSalvage(Volume * vp);
+#if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT)
+static int VScheduleSalvage_r(Volume * vp);
+#endif
-extern struct Lock FSYNC_handler_lock;
+/* Volume hash table */
+static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp);
+static void VHashBeginExclusive_r(VolumeHashChainHead * head);
+static void VHashEndExclusive_r(VolumeHashChainHead * head);
+static void VHashWait_r(VolumeHashChainHead * head);
+
+/* shutdown */
+static int ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass);
+static int ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
+ struct rx_queue ** idx);
+static void ShutdownController(vshutdown_thread_t * params);
+static void ShutdownCreateSchedule(vshutdown_thread_t * params);
+
+/* VLRU */
+static void VLRU_ComputeConstants(void);
+static void VInitVLRU(void);
+static void VLRU_Init_Node_r(Volume * vp);
+static void VLRU_Add_r(Volume * vp);
+static void VLRU_Delete_r(Volume * vp);
+static void VLRU_UpdateAccess_r(Volume * vp);
+static void * VLRU_ScannerThread(void * args);
+static void VLRU_Scan_r(int idx);
+static void VLRU_Promote_r(int idx);
+static void VLRU_Demote_r(int idx);
+static void VLRU_SwitchQueues(Volume * vp, int new_idx, int append);
+
+/* soft detach */
+static int VCheckSoftDetach(Volume * vp, afs_uint32 thresh);
+static int VCheckSoftDetachCandidate(Volume * vp, afs_uint32 thresh);
+static int VSoftDetachVolume_r(Volume * vp, afs_uint32 thresh);
+
+
+pthread_key_t VThread_key;
+VThreadOptions_t VThread_defaults = {
+ 0 /**< allow salvsync */
+};
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+struct Lock vol_listLock; /* Lock obtained when listing volumes:
+ * prevents a volume from being missed
+ * if the volume is attached during a
+ * list volumes */
-static int TimeZoneCorrection; /* Number of seconds west of GMT */
/* Common message used when the volume goes off line */
char *VSalvageMessage =
* 3 - initialized, all volumes have been attached, and
* VConnectFS() has completed. */
+static int vinit_attach_abort = 0;
bit32 VolumeCacheCheck; /* Incremented everytime a volume goes on line--
* used to stamp volume headers and in-core
* vnodes. When the volume goes on-line the
- * vnode will be invalidated */
+ * vnode will be invalidated
+ * access only with VOL_LOCK held */
+
+
+
+
+/***************************************************/
+/* Startup routines */
+/***************************************************/
+
+#if defined(FAST_RESTART) && defined(AFS_DEMAND_ATTACH_FS)
+# error FAST_RESTART and DAFS are incompatible. For the DAFS equivalent \
+ of FAST_RESTART, use the -unsafe-nosalvage fileserver argument
+#endif
+
+/**
+ * assign default values to a VolumePackageOptions struct.
+ *
+ * Always call this on a VolumePackageOptions struct first, then set any
+ * specific options you want, then call VInitVolumePackage2.
+ *
+ * @param[in] pt caller's program type
+ * @param[out] opts volume package options
+ */
+void
+VOptDefaults(ProgramType pt, VolumePackageOptions *opts)
+{
+ opts->nLargeVnodes = opts->nSmallVnodes = 5;
+ opts->volcache = 0;
+
+ opts->canScheduleSalvage = 0;
+ opts->canUseFSSYNC = 0;
+ opts->canUseSALVSYNC = 0;
+
+#ifdef FAST_RESTART
+ opts->unsafe_attach = 1;
+#else /* !FAST_RESTART */
+ opts->unsafe_attach = 0;
+#endif /* !FAST_RESTART */
+
+ switch (pt) {
+ case fileServer:
+ opts->canScheduleSalvage = 1;
+ opts->canUseSALVSYNC = 1;
+ break;
+
+ case salvageServer:
+ opts->canUseFSSYNC = 1;
+ break;
-int VolumeCacheSize = 200, VolumeGets = 0, VolumeReplacements = 0, Vlooks = 0;
+ case volumeServer:
+ opts->nLargeVnodes = 0;
+ opts->nSmallVnodes = 0;
+ opts->canScheduleSalvage = 1;
+ opts->canUseFSSYNC = 1;
+ break;
+
+ default:
+ /* noop */
+ break;
+ }
+}
+
+/**
+ * Set VInit to a certain value, and signal waiters.
+ *
+ * @param[in] value the value to set VInit to
+ *
+ * @pre VOL_LOCK held
+ */
+static void
+VSetVInit_r(int value)
+{
+ VInit = value;
+ CV_BROADCAST(&vol_vinit_cond);
+}
int
-VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
- int connect, int volcache)
+VInitVolumePackage2(ProgramType pt, VolumePackageOptions * opts)
{
int errors = 0; /* Number of errors while finding vice partitions. */
- struct timeval tv;
- struct timezone tz;
programType = pt;
+ vol_opts = *opts;
-#ifdef AFS_PTHREAD_ENV
- assert(pthread_mutex_init(&vol_glock_mutex, NULL) == 0);
- assert(pthread_mutex_init(&vol_attach_mutex, NULL) == 0);
- assert(pthread_mutex_init(&vol_fsync_mutex, NULL) == 0);
- assert(pthread_mutex_init(&vol_trans_mutex, NULL) == 0);
- assert(pthread_cond_init(&vol_put_volume_cond, NULL) == 0);
- assert(pthread_cond_init(&vol_sleep_cond, NULL) == 0);
-#else /* AFS_PTHREAD_ENV */
+ memset(&VStats, 0, sizeof(VStats));
+ VStats.hdr_cache_size = 200;
+
+ VInitPartitionPackage();
+ VInitVolumeHash();
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (programType == fileServer) {
+ VInitVLRU();
+ } else {
+ VLRU_SetOptions(VLRU_SET_ENABLED, 0);
+ }
+ osi_Assert(pthread_key_create(&VThread_key, NULL) == 0);
+#endif
+
+ MUTEX_INIT(&vol_glock_mutex, "vol glock", MUTEX_DEFAULT, 0);
+ MUTEX_INIT(&vol_trans_mutex, "vol trans", MUTEX_DEFAULT, 0);
+ CV_INIT(&vol_put_volume_cond, "vol put", CV_DEFAULT, 0);
+ CV_INIT(&vol_sleep_cond, "vol sleep", CV_DEFAULT, 0);
+ CV_INIT(&vol_init_attach_cond, "vol init attach", CV_DEFAULT, 0);
+ CV_INIT(&vol_vinit_cond, "vol init", CV_DEFAULT, 0);
+#ifndef AFS_PTHREAD_ENV
IOMGR_Initialize();
#endif /* AFS_PTHREAD_ENV */
Lock_Init(&vol_listLock);
- Lock_Init(&FSYNC_handler_lock);
+
srandom(time(0)); /* For VGetVolumeInfo */
- gettimeofday(&tv, &tz);
- TimeZoneCorrection = tz.tz_minuteswest * 60;
- /* Ok, we have done enough initialization that fileserver can
- * start accepting calls, even though the volumes may not be
+#ifdef AFS_DEMAND_ATTACH_FS
+ MUTEX_INIT(&vol_salvsync_mutex, "salvsync", MUTEX_DEFAULT, 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ /* Ok, we have done enough initialization that fileserver can
+ * start accepting calls, even though the volumes may not be
* available just yet.
*/
VInit = 1;
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_SERVER)
+ if (programType == salvageServer) {
+ SALVSYNC_salvInit();
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+#ifdef FSSYNC_BUILD_SERVER
if (programType == fileServer) {
- /* File server or "stand" */
FSYNC_fsInit();
}
+#endif
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT)
+ if (VCanUseSALVSYNC()) {
+ /* establish a connection to the salvager at this point */
+ osi_Assert(VConnectSALV() != 0);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
- if (volcache > VolumeCacheSize)
- VolumeCacheSize = volcache;
- InitLRU(VolumeCacheSize);
+ if (opts->volcache > VStats.hdr_cache_size)
+ VStats.hdr_cache_size = opts->volcache;
+ VInitVolumeHeaderCache(VStats.hdr_cache_size);
- VInitVnodes(vLarge, nLargeVnodes);
- VInitVnodes(vSmall, nSmallVnodes);
+ VInitVnodes(vLarge, opts->nLargeVnodes);
+ VInitVnodes(vSmall, opts->nSmallVnodes);
errors = VAttachPartitions();
if (errors)
return -1;
- if (programType == fileServer) {
- DIR *dirp;
- struct dirent *dp;
- struct DiskPartition *diskP;
-
-
- /* Attach all the volumes in this partition */
- for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
- int nAttached = 0, nUnattached = 0;
- Log("Partition %s: attaching volumes\n", diskP->name);
- dirp = opendir(VPartitionPath(diskP));
- assert(dirp);
- while ((dp = readdir(dirp))) {
- char *p;
- p = strrchr(dp->d_name, '.');
- if (p != NULL && strcmp(p, VHDREXT) == 0) {
- Error error;
- Volume *vp;
- vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
- V_VOLUPD);
- (*(vp ? &nAttached : &nUnattached))++;
- if (error == VOFFLINE)
- Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
- else if (LogLevel >= 5) {
- Log("Partition %s: attached volume %d (%s)\n",
- diskP->name, VolumeNumber(dp->d_name),
- dp->d_name);
- }
- if (vp) {
- VPutVolume(vp);
- }
- }
- }
- Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached);
- closedir(dirp);
- }
+ if (programType != fileServer) {
+ errors = VInitAttachVolumes(programType);
+ if (errors) {
+ return -1;
+ }
}
- VInit = 2; /* Initialized, and all volumes have been attached */
- if (programType == volumeUtility && connect) {
+#ifdef FSSYNC_BUILD_CLIENT
+ if (VCanUseFSSYNC()) {
if (!VConnectFS()) {
- Log("Unable to connect to file server; aborted\n");
- exit(1);
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (programType == salvageServer) {
+ Log("Unable to connect to file server; aborted\n");
+ exit(1);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+ Log("Unable to connect to file server; will retry at need\n");
}
}
+#endif /* FSSYNC_BUILD_CLIENT */
return 0;
}
-/* This must be called by any volume utility which needs to run while the
- file server is also running. This is separated from VInitVolumePackage so
- that a utility can fork--and each of the children can independently
- initialize communication with the file server */
+
+#if !defined(AFS_PTHREAD_ENV)
+/**
+ * Attach volumes in vice partitions
+ *
+ * @param[in] pt calling program type
+ *
+ * @return 0
+ * @note This is the original, non-threaded version of attach parititions.
+ *
+ * @post VInit state is 2
+ */
int
-VConnectFS(void)
+VInitAttachVolumes(ProgramType pt)
{
- int retVal;
+ osi_Assert(VInit==1);
+ if (pt == fileServer) {
+ struct DiskPartition64 *diskP;
+ /* Attach all the volumes in this partition */
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ int nAttached = 0, nUnattached = 0;
+ osi_Assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
+ }
+ }
VOL_LOCK;
- retVal = VConnectFS_r();
+ VSetVInit_r(2); /* Initialized, and all volumes have been attached */
+ LWP_NoYieldSignal(VInitAttachVolumes);
VOL_UNLOCK;
- return retVal;
+ return 0;
}
+#endif /* !AFS_PTHREAD_ENV */
+#if defined(AFS_PTHREAD_ENV) && !defined(AFS_DEMAND_ATTACH_FS)
+/**
+ * Attach volumes in vice partitions
+ *
+ * @param[in] pt calling program type
+ *
+ * @return 0
+ * @note Threaded version of attach parititions.
+ *
+ * @post VInit state is 2
+ */
int
-VConnectFS_r(void)
+VInitAttachVolumes(ProgramType pt)
{
- int rc;
- assert(VInit == 2 && programType == volumeUtility);
- rc = FSYNC_clientInit();
- if (rc)
- VInit = 3;
- return rc;
-}
+ osi_Assert(VInit==1);
+ if (pt == fileServer) {
+ struct DiskPartition64 *diskP;
+ struct vinitvolumepackage_thread_t params;
+ struct diskpartition_queue_t * dpq;
+ int i, threads, parts;
+ pthread_t tid;
+ pthread_attr_t attrs;
+
+ CV_INIT(¶ms.thread_done_cv, "thread done", CV_DEFAULT, 0);
+ queue_Init(¶ms);
+ params.n_threads_complete = 0;
+
+ /* create partition work queue */
+ for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
+ dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
+ osi_Assert(dpq != NULL);
+ dpq->diskP = diskP;
+ queue_Append(¶ms,dpq);
+ }
-void
-VDisconnectFS_r(void)
-{
- assert(programType == volumeUtility);
- FSYNC_clientFinis();
- VInit = 2;
+ threads = MIN(parts, vol_attach_threads);
+
+ if (threads > 1) {
+ /* spawn off a bunch of initialization threads */
+ osi_Assert(pthread_attr_init(&attrs) == 0);
+ osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+
+ Log("VInitVolumePackage: beginning parallel fileserver startup\n");
+ Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n",
+ threads, parts);
+
+ VOL_LOCK;
+ for (i=0; i < threads; i++) {
+ AFS_SIGSET_DECL;
+ AFS_SIGSET_CLEAR();
+ osi_Assert(pthread_create
+ (&tid, &attrs, &VInitVolumePackageThread,
+ ¶ms) == 0);
+ AFS_SIGSET_RESTORE();
+ }
+
+ while(params.n_threads_complete < threads) {
+ VOL_CV_WAIT(¶ms.thread_done_cv);
+ }
+ VOL_UNLOCK;
+
+ osi_Assert(pthread_attr_destroy(&attrs) == 0);
+ } else {
+ /* if we're only going to run one init thread, don't bother creating
+ * another LWP */
+ Log("VInitVolumePackage: beginning single-threaded fileserver startup\n");
+ Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n",
+ parts);
+
+ VInitVolumePackageThread(¶ms);
+ }
+
+ CV_DESTROY(¶ms.thread_done_cv);
+ }
+ VOL_LOCK;
+ VSetVInit_r(2); /* Initialized, and all volumes have been attached */
+ CV_BROADCAST(&vol_init_attach_cond);
+ VOL_UNLOCK;
+ return 0;
}
-void
-VDisconnectFS(void)
-{
+static void *
+VInitVolumePackageThread(void * args) {
+
+ struct DiskPartition64 *diskP;
+ struct vinitvolumepackage_thread_t * params;
+ struct diskpartition_queue_t * dpq;
+
+ params = (vinitvolumepackage_thread_t *) args;
+
+
VOL_LOCK;
- VDisconnectFS_r();
+ /* Attach all the volumes in this partition */
+ while (queue_IsNotEmpty(params)) {
+ int nAttached = 0, nUnattached = 0;
+
+ if (vinit_attach_abort) {
+ Log("Aborting initialization\n");
+ goto done;
+ }
+
+ dpq = queue_First(params,diskpartition_queue_t);
+ queue_Remove(dpq);
+ VOL_UNLOCK;
+ diskP = dpq->diskP;
+ free(dpq);
+
+ osi_Assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
+
+ VOL_LOCK;
+ }
+
+done:
+ params->n_threads_complete++;
+ CV_SIGNAL(¶ms->thread_done_cv);
VOL_UNLOCK;
+ return NULL;
}
+#endif /* AFS_PTHREAD_ENV && !AFS_DEMAND_ATTACH_FS */
-void
-VShutdown_r(void)
+#if defined(AFS_DEMAND_ATTACH_FS)
+/**
+ * Attach volumes in vice partitions
+ *
+ * @param[in] pt calling program type
+ *
+ * @return 0
+ * @note Threaded version of attach partitions.
+ *
+ * @post VInit state is 2
+ */
+int
+VInitAttachVolumes(ProgramType pt)
{
- int i;
- register Volume *vp, *np;
- register afs_int32 code;
-
- Log("VShutdown: shutting down on-line volumes...\n");
- for (i = 0; i < VOLUME_HASH_TABLE_SIZE; i++) {
- /* try to hold first volume in the hash table */
- for (vp = VolumeHashTable[i]; vp; vp = vp->hashNext) {
- code = VHold_r(vp);
- if (code == 0)
- break; /* got it */
- /* otherwise we go around again, trying another volume */
+ osi_Assert(VInit==1);
+ if (pt == fileServer) {
+
+ struct DiskPartition64 *diskP;
+ struct partition_queue pq;
+ struct volume_init_queue vq;
+
+ int i, threads, parts;
+ pthread_t tid;
+ pthread_attr_t attrs;
+
+ /* create partition work queue */
+ queue_Init(&pq);
+ CV_INIT(&(pq.cv), "partq", CV_DEFAULT, 0);
+ MUTEX_INIT(&(pq.mutex), "partq", MUTEX_DEFAULT, 0);
+ for (parts = 0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
+ struct diskpartition_queue_t *dp;
+ dp = (struct diskpartition_queue_t*)malloc(sizeof(struct diskpartition_queue_t));
+ osi_Assert(dp != NULL);
+ dp->diskP = diskP;
+ queue_Append(&pq, dp);
}
- while (vp) {
- if (LogLevel >= 5)
- Log("VShutdown: Attempting to take volume %u offline.\n",
- vp->hashid);
- /* first compute np before releasing vp, in case vp disappears
- * after releasing. Hold it, so it doesn't disapear. If we
- * can't hold it, try the next one in the chain. Invariant
- * at the top of this loop is that vp is held (has extra ref count).
- */
- for (np = vp->hashNext; np; np = np->hashNext) {
- code = VHold_r(np);
- if (code == 0)
- break; /* got it */
- }
- /* next, take the volume offline (drops reference count) */
- VOffline_r(vp, "File server was shut down");
- vp = np; /* next guy to try */
+
+ /* number of worker threads; at least one, not to exceed the number of partitions */
+ threads = MIN(parts, vol_attach_threads);
+
+ /* create volume work queue */
+ queue_Init(&vq);
+ CV_INIT(&(vq.cv), "volq", CV_DEFAULT, 0);
+ MUTEX_INIT(&(vq.mutex), "volq", MUTEX_DEFAULT, 0);
+
+ osi_Assert(pthread_attr_init(&attrs) == 0);
+ osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+
+ Log("VInitVolumePackage: beginning parallel fileserver startup\n");
+ Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n",
+ threads, parts);
+
+ /* create threads to scan disk partitions. */
+ for (i=0; i < threads; i++) {
+ struct vinitvolumepackage_thread_param *params;
+ AFS_SIGSET_DECL;
+
+ params = (struct vinitvolumepackage_thread_param *)malloc(sizeof(struct vinitvolumepackage_thread_param));
+ osi_Assert(params);
+ params->pq = &pq;
+ params->vq = &vq;
+ params->nthreads = threads;
+ params->thread = i+1;
+
+ AFS_SIGSET_CLEAR();
+ osi_Assert(pthread_create (&tid, &attrs, &VInitVolumePackageThread, (void*)params) == 0);
+ AFS_SIGSET_RESTORE();
}
+
+ VInitPreAttachVolumes(threads, &vq);
+
+ osi_Assert(pthread_attr_destroy(&attrs) == 0);
+ CV_DESTROY(&pq.cv);
+ MUTEX_DESTROY(&pq.mutex);
+ CV_DESTROY(&vq.cv);
+ MUTEX_DESTROY(&vq.mutex);
}
- Log("VShutdown: complete.\n");
-}
-void
-VShutdown(void)
-{
VOL_LOCK;
- VShutdown_r();
+ VSetVInit_r(2); /* Initialized, and all volumes have been attached */
+ CV_BROADCAST(&vol_init_attach_cond);
VOL_UNLOCK;
+
+ return 0;
}
+/**
+ * Volume package initialization worker thread. Scan partitions for volume
+ * header files. Gather batches of volume ids and dispatch them to
+ * the main thread to be preattached. The volume preattachement is done
+ * in the main thread to avoid global volume lock contention.
+ */
+static void *
+VInitVolumePackageThread(void *args)
+{
+ struct vinitvolumepackage_thread_param *params;
+ struct DiskPartition64 *partition;
+ struct partition_queue *pq;
+ struct volume_init_queue *vq;
+ struct volume_init_batch *vb;
+
+ osi_Assert(args);
+ params = (struct vinitvolumepackage_thread_param *)args;
+ pq = params->pq;
+ vq = params->vq;
+ osi_Assert(pq);
+ osi_Assert(vq);
+
+ vb = (struct volume_init_batch*)malloc(sizeof(struct volume_init_batch));
+ osi_Assert(vb);
+ vb->thread = params->thread;
+ vb->last = 0;
+ vb->size = 0;
+
+ Log("Scanning partitions on thread %d of %d\n", params->thread, params->nthreads);
+ while((partition = VInitNextPartition(pq))) {
+ DIR *dirp;
+ VolId vid;
+
+ Log("Partition %s: pre-attaching volumes\n", partition->name);
+ dirp = opendir(VPartitionPath(partition));
+ if (!dirp) {
+ Log("opendir on Partition %s failed, errno=%d!\n", partition->name, errno);
+ continue;
+ }
+ while ((vid = VInitNextVolumeId(dirp))) {
+ Volume *vp = (Volume*)malloc(sizeof(Volume));
+ osi_Assert(vp);
+ memset(vp, 0, sizeof(Volume));
+ vp->device = partition->device;
+ vp->partition = partition;
+ vp->hashid = vid;
+ queue_Init(&vp->vnode_list);
+ CV_INIT(&V_attachCV(vp), "partattach", CV_DEFAULT, 0);
+
+ vb->batch[vb->size++] = vp;
+ if (vb->size == VINIT_BATCH_MAX_SIZE) {
+ MUTEX_ENTER(&vq->mutex);
+ queue_Append(vq, vb);
+ CV_BROADCAST(&vq->cv);
+ MUTEX_EXIT(&vq->mutex);
+
+ vb = (struct volume_init_batch*)malloc(sizeof(struct volume_init_batch));
+ osi_Assert(vb);
+ vb->thread = params->thread;
+ vb->size = 0;
+ vb->last = 0;
+ }
+ }
+ closedir(dirp);
+ }
-static void
-ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
- bit32 version)
+ vb->last = 1;
+ MUTEX_ENTER(&vq->mutex);
+ queue_Append(vq, vb);
+ CV_BROADCAST(&vq->cv);
+ MUTEX_EXIT(&vq->mutex);
+
+ Log("Partition scan thread %d of %d ended\n", params->thread, params->nthreads);
+ free(params);
+ return NULL;
+}
+
+/**
+ * Read next element from the pre-populated partition list.
+ */
+static struct DiskPartition64*
+VInitNextPartition(struct partition_queue *pq)
{
- struct versionStamp *vsn;
- FdHandle_t *fdP;
+ struct DiskPartition64 *partition;
+ struct diskpartition_queue_t *dp; /* queue element */
- *ec = 0;
- if (h == NULL) {
- *ec = VSALVAGE;
- return;
+ if (vinit_attach_abort) {
+ Log("Aborting volume preattach thread.\n");
+ return NULL;
}
- fdP = IH_OPEN(h);
- if (fdP == NULL) {
- *ec = VSALVAGE;
- return;
+ /* get next partition to scan */
+ MUTEX_ENTER(&pq->mutex);
+ if (queue_IsEmpty(pq)) {
+ MUTEX_EXIT(&pq->mutex);
+ return NULL;
}
+ dp = queue_First(pq, diskpartition_queue_t);
+ queue_Remove(dp);
+ MUTEX_EXIT(&pq->mutex);
- if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
- *ec = VSALVAGE;
- FDH_REALLYCLOSE(fdP);
- return;
- }
- vsn = (struct versionStamp *)to;
- if (FDH_READ(fdP, to, size) != size || vsn->magic != magic) {
- *ec = VSALVAGE;
- FDH_REALLYCLOSE(fdP);
- return;
- }
- FDH_CLOSE(fdP);
+ osi_Assert(dp);
+ osi_Assert(dp->diskP);
- /* Check is conditional, in case caller wants to inspect version himself */
- if (version && vsn->version != version) {
- *ec = VSALVAGE;
+ partition = dp->diskP;
+ free(dp);
+ return partition;
+}
+
+/**
+ * Find next volume id on the partition.
+ */
+static VolId
+VInitNextVolumeId(DIR *dirp)
+{
+ struct dirent *d;
+ VolId vid = 0;
+ char *ext;
+
+ while((d = readdir(dirp))) {
+ if (vinit_attach_abort) {
+ Log("Aborting volume preattach thread.\n");
+ break;
+ }
+ ext = strrchr(d->d_name, '.');
+ if (d->d_name[0] == 'V' && ext && strcmp(ext, VHDREXT) == 0) {
+ vid = VolumeNumber(d->d_name);
+ if (vid) {
+ break;
+ }
+ Log("Warning: bogus volume header file: %s\n", d->d_name);
+ }
}
+ return vid;
}
-/* VolumeHeaderToDisk
- * Allows for storing 64 bit inode numbers in on-disk volume header
- * file.
+/**
+ * Preattach volumes in batches to avoid lock contention.
*/
-void
-VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
+static int
+VInitPreAttachVolumes(int nthreads, struct volume_init_queue *vq)
{
+ struct volume_init_batch *vb;
+ int i;
- memset((char *)dh, 0, sizeof(VolumeDiskHeader_t));
- dh->stamp = h->stamp;
- dh->id = h->id;
- dh->parent = h->parent;
+ while (nthreads) {
+ /* dequeue next volume */
+ MUTEX_ENTER(&vq->mutex);
+ if (queue_IsEmpty(vq)) {
+ CV_WAIT(&vq->cv, &vq->mutex);
+ }
+ vb = queue_First(vq, volume_init_batch);
+ queue_Remove(vb);
+ MUTEX_EXIT(&vq->mutex);
+
+ if (vb->size) {
+ VOL_LOCK;
+ for (i = 0; i<vb->size; i++) {
+ Volume *vp;
+ Volume *dup;
+ Error ec = 0;
+
+ vp = vb->batch[i];
+ dup = VLookupVolume_r(&ec, vp->hashid, NULL);
+ if (ec) {
+ Log("Error looking up volume, code=%d\n", ec);
+ }
+ else if (dup) {
+ Log("Warning: Duplicate volume id %d detected.\n", vp->hashid);
+ }
+ else {
+ /* put pre-attached volume onto the hash table
+ * and bring it up to the pre-attached state */
+ AddVolumeToHashTable(vp, vp->hashid);
+ AddVolumeToVByPList_r(vp);
+ VLRU_Init_Node_r(vp);
+ VChangeState_r(vp, VOL_STATE_PREATTACHED);
+ }
+ }
+ VOL_UNLOCK;
+ }
+
+ if (vb->last) {
+ nthreads--;
+ }
+ free(vb);
+ }
+ return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
-#ifdef AFS_64BIT_IOPS_ENV
- dh->volumeInfo_lo = (afs_int32) h->volumeInfo & 0xffffffff;
- dh->volumeInfo_hi = (afs_int32) (h->volumeInfo >> 32) & 0xffffffff;
- dh->smallVnodeIndex_lo = (afs_int32) h->smallVnodeIndex & 0xffffffff;
- dh->smallVnodeIndex_hi =
- (afs_int32) (h->smallVnodeIndex >> 32) & 0xffffffff;
- dh->largeVnodeIndex_lo = (afs_int32) h->largeVnodeIndex & 0xffffffff;
- dh->largeVnodeIndex_hi =
- (afs_int32) (h->largeVnodeIndex >> 32) & 0xffffffff;
- dh->linkTable_lo = (afs_int32) h->linkTable & 0xffffffff;
- dh->linkTable_hi = (afs_int32) (h->linkTable >> 32) & 0xffffffff;
-#else
- dh->volumeInfo_lo = h->volumeInfo;
- dh->smallVnodeIndex_lo = h->smallVnodeIndex;
- dh->largeVnodeIndex_lo = h->largeVnodeIndex;
- dh->linkTable_lo = h->linkTable;
-#endif
+#if !defined(AFS_DEMAND_ATTACH_FS)
+/*
+ * attach all volumes on a given disk partition
+ */
+static int
+VAttachVolumesByPartition(struct DiskPartition64 *diskP, int * nAttached, int * nUnattached)
+{
+ DIR * dirp;
+ struct dirent * dp;
+ int ret = 0;
+
+ Log("Partition %s: attaching volumes\n", diskP->name);
+ dirp = opendir(VPartitionPath(diskP));
+ if (!dirp) {
+ Log("opendir on Partition %s failed!\n", diskP->name);
+ return 1;
+ }
+
+ while ((dp = readdir(dirp))) {
+ char *p;
+ p = strrchr(dp->d_name, '.');
+
+ if (vinit_attach_abort) {
+ Log("Partition %s: abort attach volumes\n", diskP->name);
+ goto done;
+ }
+
+ if (p != NULL && strcmp(p, VHDREXT) == 0) {
+ Error error;
+ Volume *vp;
+ vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
+ V_VOLUPD);
+ (*(vp ? nAttached : nUnattached))++;
+ if (error == VOFFLINE)
+ Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
+ else if (LogLevel >= 5) {
+ Log("Partition %s: attached volume %d (%s)\n",
+ diskP->name, VolumeNumber(dp->d_name),
+ dp->d_name);
+ }
+ if (vp) {
+ VPutVolume(vp);
+ }
+ }
+ }
+
+ Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached);
+done:
+ closedir(dirp);
+ return ret;
}
+#endif /* !AFS_DEMAND_ATTACH_FS */
-/* DiskToVolumeHeader
- * Reads volume header file from disk, convering 64 bit inodes
- * if required. Makes the assumption that AFS has *always*
- * zero'd the volume header file so that high parts of inode
- * numbers are 0 in older (SGI EFS) volume header files.
+/***************************************************/
+/* Shutdown routines */
+/***************************************************/
+
+/*
+ * demand attach fs
+ * highly multithreaded volume package shutdown
+ *
+ * with the demand attach fileserver extensions,
+ * VShutdown has been modified to be multithreaded.
+ * In order to achieve optimal use of many threads,
+ * the shutdown code involves one control thread and
+ * n shutdown worker threads. The control thread
+ * periodically examines the number of volumes available
+ * for shutdown on each partition, and produces a worker
+ * thread allocation schedule. The idea is to eliminate
+ * redundant scheduling computation on the workers by
+ * having a single master scheduler.
+ *
+ * The scheduler's objectives are:
+ * (1) fairness
+ * each partition with volumes remaining gets allocated
+ * at least 1 thread (assuming sufficient threads)
+ * (2) performance
+ * threads are allocated proportional to the number of
+ * volumes remaining to be offlined. This ensures that
+ * the OS I/O scheduler has many requests to elevator
+ * seek on partitions that will (presumably) take the
+ * longest amount of time (from now) to finish shutdown
+ * (3) keep threads busy
+ * when there are extra threads, they are assigned to
+ * partitions using a simple round-robin algorithm
+ *
+ * In the future, we may wish to add the ability to adapt
+ * to the relative performance patterns of each disk
+ * partition.
+ *
+ *
+ * demand attach fs
+ * multi-step shutdown process
+ *
+ * demand attach shutdown is a four-step process. Each
+ * shutdown "pass" shuts down increasingly more difficult
+ * volumes. The main purpose is to achieve better cache
+ * utilization during shutdown.
+ *
+ * pass 0
+ * shutdown volumes in the unattached, pre-attached
+ * and error states
+ * pass 1
+ * shutdown attached volumes with cached volume headers
+ * pass 2
+ * shutdown all volumes in non-exclusive states
+ * pass 3
+ * shutdown all remaining volumes
*/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+
void
-DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
+VShutdown_r(void)
{
- memset((char *)h, 0, sizeof(VolumeHeader_t));
- h->stamp = dh->stamp;
- h->id = dh->id;
- h->parent = dh->parent;
+ int i;
+ struct DiskPartition64 * diskP;
+ struct diskpartition_queue_t * dpq;
+ vshutdown_thread_t params;
+ pthread_t tid;
+ pthread_attr_t attrs;
+
+ memset(¶ms, 0, sizeof(vshutdown_thread_t));
+
+ if (VInit < 2) {
+ Log("VShutdown: aborting attach volumes\n");
+ vinit_attach_abort = 1;
+ VOL_CV_WAIT(&vol_init_attach_cond);
+ }
-#ifdef AFS_64BIT_IOPS_ENV
- h->volumeInfo =
- (Inode) dh->volumeInfo_lo | ((Inode) dh->volumeInfo_hi << 32);
+ for (params.n_parts=0, diskP = DiskPartitionList;
+ diskP; diskP = diskP->next, params.n_parts++);
- h->smallVnodeIndex =
- (Inode) dh->smallVnodeIndex_lo | ((Inode) dh->
- smallVnodeIndex_hi << 32);
+ Log("VShutdown: shutting down on-line volumes on %d partition%s...\n",
+ params.n_parts, params.n_parts > 1 ? "s" : "");
- h->largeVnodeIndex =
- (Inode) dh->largeVnodeIndex_lo | ((Inode) dh->
- largeVnodeIndex_hi << 32);
- h->linkTable =
- (Inode) dh->linkTable_lo | ((Inode) dh->linkTable_hi << 32);
-#else
- h->volumeInfo = dh->volumeInfo_lo;
- h->smallVnodeIndex = dh->smallVnodeIndex_lo;
- h->largeVnodeIndex = dh->largeVnodeIndex_lo;
- h->linkTable = dh->linkTable_lo;
-#endif
+ if (vol_attach_threads > 1) {
+ /* prepare for parallel shutdown */
+ params.n_threads = vol_attach_threads;
+ MUTEX_INIT(¶ms.lock, "params", MUTEX_DEFAULT, 0);
+ CV_INIT(¶ms.cv, "params", CV_DEFAULT, 0);
+ CV_INIT(¶ms.master_cv, "params master", CV_DEFAULT, 0);
+ osi_Assert(pthread_attr_init(&attrs) == 0);
+ osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+ queue_Init(¶ms);
+
+ /* setup the basic partition information structures for
+ * parallel shutdown */
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ /* XXX debug */
+ struct rx_queue * qp, * nqp;
+ Volume * vp;
+ int count = 0;
+
+ VVByPListWait_r(diskP);
+ VVByPListBeginExclusive_r(diskP);
+
+ /* XXX debug */
+ for (queue_Scan(&diskP->vol_list, qp, nqp, rx_queue)) {
+ vp = (Volume *)((char *)qp - offsetof(Volume, vol_list));
+ if (vp->header)
+ count++;
+ }
+ Log("VShutdown: partition %s has %d volumes with attached headers\n",
+ VPartitionPath(diskP), count);
+
+
+ /* build up the pass 0 shutdown work queue */
+ dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
+ osi_Assert(dpq != NULL);
+ dpq->diskP = diskP;
+ queue_Prepend(¶ms, dpq);
+
+ params.part_pass_head[diskP->index] = queue_First(&diskP->vol_list, rx_queue);
+ }
+
+ Log("VShutdown: beginning parallel fileserver shutdown\n");
+ Log("VShutdown: using %d threads to offline volumes on %d partition%s\n",
+ vol_attach_threads, params.n_parts, params.n_parts > 1 ? "s" : "" );
+
+ /* do pass 0 shutdown */
+ MUTEX_ENTER(¶ms.lock);
+ for (i=0; i < params.n_threads; i++) {
+ osi_Assert(pthread_create
+ (&tid, &attrs, &VShutdownThread,
+ ¶ms) == 0);
+ }
+
+ /* wait for all the pass 0 shutdowns to complete */
+ while (params.n_threads_complete < params.n_threads) {
+ CV_WAIT(¶ms.master_cv, ¶ms.lock);
+ }
+ params.n_threads_complete = 0;
+ params.pass = 1;
+ CV_BROADCAST(¶ms.cv);
+ MUTEX_EXIT(¶ms.lock);
+
+ Log("VShutdown: pass 0 completed using the 1 thread per partition algorithm\n");
+ Log("VShutdown: starting passes 1 through 3 using finely-granular mp-fast algorithm\n");
+
+ /* run the parallel shutdown scheduler. it will drop the glock internally */
+ ShutdownController(¶ms);
+
+ /* wait for all the workers to finish pass 3 and terminate */
+ while (params.pass < 4) {
+ VOL_CV_WAIT(¶ms.cv);
+ }
+
+ osi_Assert(pthread_attr_destroy(&attrs) == 0);
+ CV_DESTROY(¶ms.cv);
+ CV_DESTROY(¶ms.master_cv);
+ MUTEX_DESTROY(¶ms.lock);
+
+ /* drop the VByPList exclusive reservations */
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ VVByPListEndExclusive_r(diskP);
+ Log("VShutdown: %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
+ VPartitionPath(diskP),
+ params.stats[0][diskP->index],
+ params.stats[1][diskP->index],
+ params.stats[2][diskP->index],
+ params.stats[3][diskP->index]);
+ }
+
+ Log("VShutdown: shutdown finished using %d threads\n", params.n_threads);
+ } else {
+ /* if we're only going to run one shutdown thread, don't bother creating
+ * another LWP */
+ Log("VShutdown: beginning single-threaded fileserver shutdown\n");
+
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ VShutdownByPartition_r(diskP);
+ }
+ }
+
+ Log("VShutdown: complete.\n");
}
+#else /* AFS_DEMAND_ATTACH_FS */
void
-WriteVolumeHeader_r(ec, vp)
- Error *ec;
- Volume *vp;
+VShutdown_r(void)
{
- IHandle_t *h = V_diskDataHandle(vp);
- FdHandle_t *fdP;
-
- *ec = 0;
+ int i;
+ Volume *vp, *np;
+ afs_int32 code;
- fdP = IH_OPEN(h);
- if (fdP == NULL) {
- *ec = VSALVAGE;
- return;
- }
- if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
- *ec = VSALVAGE;
- FDH_REALLYCLOSE(fdP);
- return;
+ if (VInit < 2) {
+ Log("VShutdown: aborting attach volumes\n");
+ vinit_attach_abort = 1;
+#ifdef AFS_PTHREAD_ENV
+ VOL_CV_WAIT(&vol_init_attach_cond);
+#else
+ LWP_WaitProcess(VInitAttachVolumes);
+#endif /* AFS_PTHREAD_ENV */
}
- if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)))
- != sizeof(V_disk(vp))) {
- *ec = VSALVAGE;
- FDH_REALLYCLOSE(fdP);
- return;
+
+ Log("VShutdown: shutting down on-line volumes...\n");
+ for (i = 0; i < VolumeHashTable.Size; i++) {
+ /* try to hold first volume in the hash table */
+ for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
+ code = VHold_r(vp);
+ if (code == 0) {
+ if (LogLevel >= 5)
+ Log("VShutdown: Attempting to take volume %u offline.\n",
+ vp->hashid);
+
+ /* next, take the volume offline (drops reference count) */
+ VOffline_r(vp, "File server was shut down");
+ }
+ }
}
- FDH_CLOSE(fdP);
+ Log("VShutdown: complete.\n");
}
+#endif /* AFS_DEMAND_ATTACH_FS */
-/* Attach an existing volume, given its pathname, and return a
- pointer to the volume header information. The volume also
- normally goes online at this time. An offline volume
- must be reattached to make it go online */
-Volume *
-VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
+
+void
+VShutdown(void)
{
- Volume *retVal;
- VATTACH_LOCK;
+ osi_Assert(VInit>0);
VOL_LOCK;
- retVal = VAttachVolumeByName_r(ec, partition, name, mode);
+ VShutdown_r();
VOL_UNLOCK;
- VATTACH_UNLOCK;
- return retVal;
}
-Volume *
-VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
+/**
+ * stop new activity (e.g. SALVSYNC) from occurring
+ *
+ * Use this to make the volume package less busy; for example, during
+ * shutdown. This doesn't actually shutdown/detach anything in the
+ * volume package, but prevents certain processes from ocurring. For
+ * example, preventing new SALVSYNC communication in DAFS. In theory, we
+ * could also use this to prevent new volume attachment, or prevent
+ * other programs from checking out volumes, etc.
+ */
+void
+VSetTranquil(void)
{
- register Volume *vp;
- int fd, n;
- struct afs_stat status;
- struct VolumeDiskHeader diskHeader;
- struct VolumeHeader iheader;
- struct DiskPartition *partp;
- char path[64];
- int isbusy = 0;
- *ec = 0;
- if (programType == volumeUtility) {
- assert(VInit == 3);
- VLockPartition_r(partition);
- }
- if (programType == fileServer) {
- vp = VGetVolume_r(ec, VolumeNumber(name));
- if (vp) {
- if (V_inUse(vp))
- return vp;
- if (vp->specialStatus == VBUSY)
- isbusy = 1;
- VDetachVolume_r(ec, vp);
- if (*ec) {
- Log("VAttachVolume: Error detaching volume (%s)\n", name);
- }
- }
- }
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* make sure we don't try to contact the salvageserver, since it may
+ * not be around anymore */
+ vol_disallow_salvsync = 1;
+#endif
+}
- if (!(partp = VGetPartition_r(partition, 0))) {
- *ec = VNOVOL;
- Log("VAttachVolume: Error getting partition (%s)\n", partition);
- goto done;
- }
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * shutdown control thread
+ */
+static void
+ShutdownController(vshutdown_thread_t * params)
+{
+ /* XXX debug */
+ struct DiskPartition64 * diskP;
+ Device id;
+ vshutdown_thread_t shadow;
- *ec = 0;
- strcpy(path, VPartitionPath(partp));
- strcat(path, "/");
- strcat(path, name);
- VOL_UNLOCK;
- if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
- Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
- if (fd > -1)
- close(fd);
+ ShutdownCreateSchedule(params);
+
+ while ((params->pass < 4) &&
+ (params->n_threads_complete < params->n_threads)) {
+ /* recompute schedule once per second */
+
+ memcpy(&shadow, params, sizeof(vshutdown_thread_t));
+
+ VOL_UNLOCK;
+ /* XXX debug */
+ Log("ShutdownController: schedule version=%d, vol_remaining=%d, pass=%d\n",
+ shadow.schedule_version, shadow.vol_remaining, shadow.pass);
+ Log("ShutdownController: n_threads_complete=%d, n_parts_done_pass=%d\n",
+ shadow.n_threads_complete, shadow.n_parts_done_pass);
+ for (diskP = DiskPartitionList; diskP; diskP=diskP->next) {
+ id = diskP->index;
+ Log("ShutdownController: part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n",
+ id,
+ diskP->vol_list.len,
+ shadow.part_thread_target[id],
+ shadow.part_done_pass[id],
+ shadow.part_pass_head[id]);
+ }
+
+ sleep(1);
VOL_LOCK;
- *ec = VNOVOL;
- goto done;
+
+ ShutdownCreateSchedule(params);
}
- n = read(fd, &diskHeader, sizeof(diskHeader));
- close(fd);
- VOL_LOCK;
- if (n != sizeof(diskHeader)
- || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
- Log("VAttachVolume: Error reading volume header %s\n", path);
- *ec = VSALVAGE;
- goto done;
+}
+
+/* create the shutdown thread work schedule.
+ * this scheduler tries to implement fairness
+ * by allocating at least 1 thread to each
+ * partition with volumes to be shutdown,
+ * and then it attempts to allocate remaining
+ * threads based upon the amount of work left
+ */
+static void
+ShutdownCreateSchedule(vshutdown_thread_t * params)
+{
+ struct DiskPartition64 * diskP;
+ int sum, thr_workload, thr_left;
+ int part_residue[VOLMAXPARTS+1];
+ Device id;
+
+ /* compute the total number of outstanding volumes */
+ sum = 0;
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ sum += diskP->vol_list.len;
}
- if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
- Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
- *ec = VSALVAGE;
- goto done;
+
+ params->schedule_version++;
+ params->vol_remaining = sum;
+
+ if (!sum)
+ return;
+
+ /* compute average per-thread workload */
+ thr_workload = sum / params->n_threads;
+ if (sum % params->n_threads)
+ thr_workload++;
+
+ thr_left = params->n_threads;
+ memset(&part_residue, 0, sizeof(part_residue));
+
+ /* for fairness, give every partition with volumes remaining
+ * at least one thread */
+ for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
+ id = diskP->index;
+ if (diskP->vol_list.len) {
+ params->part_thread_target[id] = 1;
+ thr_left--;
+ } else {
+ params->part_thread_target[id] = 0;
+ }
}
- DiskToVolumeHeader(&iheader, &diskHeader);
- if (programType == volumeUtility && mode != V_SECRETLY) {
- if (FSYNC_askfs(iheader.id, partition, FSYNC_NEEDVOLUME, mode)
- == FSYNC_DENIED) {
- Log("VAttachVolume: attach of volume %u apparently denied by file server\n", iheader.id);
- *ec = VNOVOL; /* XXXX */
- goto done;
+ if (thr_left && thr_workload) {
+ /* compute length-weighted workloads */
+ int delta;
+
+ for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
+ id = diskP->index;
+ delta = (diskP->vol_list.len / thr_workload) -
+ params->part_thread_target[id];
+ if (delta < 0) {
+ continue;
+ }
+ if (delta < thr_left) {
+ params->part_thread_target[id] += delta;
+ thr_left -= delta;
+ } else {
+ params->part_thread_target[id] += thr_left;
+ thr_left = 0;
+ break;
+ }
}
}
- vp = attach2(ec, path, &iheader, partp, isbusy);
- if (programType == volumeUtility && vp) {
- /* duplicate computation in fssync.c about whether the server
- * takes the volume offline or not. If the volume isn't
- * offline, we must not return it when we detach the volume,
- * or the server will abort */
- if (mode == V_READONLY
- || (!VolumeWriteable(vp) && (mode == V_CLONE || mode == V_DUMP)))
- vp->needsPutBack = 0;
- else
- vp->needsPutBack = 1;
+ if (thr_left) {
+ /* try to assign any leftover threads to partitions that
+ * had volume lengths closer to needing thread_target+1 */
+ int max_residue, max_id = 0;
+
+ /* compute the residues */
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ part_residue[id] = diskP->vol_list.len -
+ (params->part_thread_target[id] * thr_workload);
+ }
+
+ /* now try to allocate remaining threads to partitions with the
+ * highest residues */
+ while (thr_left) {
+ max_residue = 0;
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ if (part_residue[id] > max_residue) {
+ max_residue = part_residue[id];
+ max_id = id;
+ }
+ }
+
+ if (!max_residue) {
+ break;
+ }
+
+ params->part_thread_target[max_id]++;
+ thr_left--;
+ part_residue[max_id] = 0;
+ }
}
- /* OK, there's a problem here, but one that I don't know how to
- * fix right now, and that I don't think should arise often.
- * Basically, we should only put back this volume to the server if
- * it was given to us by the server, but since we don't have a vp,
- * we can't run the VolumeWriteable function to find out as we do
- * above when computing vp->needsPutBack. So we send it back, but
- * there's a path in VAttachVolume on the server which may abort
- * if this volume doesn't have a header. Should be pretty rare
- * for all of that to happen, but if it does, probably the right
- * fix is for the server to allow the return of readonly volumes
- * that it doesn't think are really checked out. */
- if (programType == volumeUtility && vp == NULL && mode != V_SECRETLY) {
- FSYNC_askfs(iheader.id, partition, FSYNC_ON, 0);
- } else if (programType == fileServer && vp) {
- V_needsCallback(vp) = 0;
-#ifdef notdef
- if (VInit >= 2 && V_BreakVolumeCallbacks) {
- Log("VAttachVolume: Volume %u was changed externally; breaking callbacks\n", V_id(vp));
- (*V_BreakVolumeCallbacks) (V_id(vp));
+
+ if (thr_left) {
+ /* punt and give any remaining threads equally to each partition */
+ int alloc;
+ if (thr_left >= params->n_parts) {
+ alloc = thr_left / params->n_parts;
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ params->part_thread_target[id] += alloc;
+ thr_left -= alloc;
+ }
}
-#endif
- VUpdateVolume_r(ec, vp);
- if (*ec) {
- Log("VAttachVolume: Error updating volume\n");
- if (vp)
- VPutVolume_r(vp);
- goto done;
+
+ /* finish off the last of the threads */
+ for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) {
+ id = diskP->index;
+ params->part_thread_target[id]++;
+ thr_left--;
}
- if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
- /* This is a hack: by temporarily settint the incore
- * dontSalvage flag ON, the volume will be put back on the
- * Update list (with dontSalvage OFF again). It will then
- * come back in N minutes with DONT_SALVAGE eventually
- * set. This is the way that volumes that have never had
- * it set get it set; or that volumes that have been
- * offline without DONT SALVAGE having been set also
- * eventually get it set */
- V_dontSalvage(vp) = DONT_SALVAGE;
- VAddToVolumeUpdateList_r(ec, vp);
- if (*ec) {
- Log("VAttachVolume: Error adding volume to update list\n");
- if (vp)
- VPutVolume_r(vp);
- goto done;
- }
- }
- if (LogLevel)
- Log("VOnline: volume %u (%s) attached and online\n", V_id(vp),
- V_name(vp));
- }
- done:
- if (programType == volumeUtility) {
- VUnlockPartition_r(partition);
}
- if (*ec)
- return NULL;
- else
- return vp;
}
-private Volume *
-attach2(Error * ec, char *path, register struct VolumeHeader * header,
- struct DiskPartition * partp, int isbusy)
+/* worker thread for parallel shutdown */
+static void *
+VShutdownThread(void * args)
{
- register Volume *vp;
-
- VOL_UNLOCK;
- vp = (Volume *) calloc(1, sizeof(Volume));
- assert(vp != NULL);
- vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
- vp->device = partp->device;
- vp->partition = partp;
- IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent,
- header->largeVnodeIndex);
- IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header->parent,
- header->smallVnodeIndex);
- IH_INIT(vp->diskDataHandle, partp->device, header->parent,
- header->volumeInfo);
- IH_INIT(vp->linkHandle, partp->device, header->parent, header->linkTable);
- vp->cacheCheck = ++VolumeCacheCheck;
- /* just in case this ever rolls over */
- if (!vp->cacheCheck)
- vp->cacheCheck = ++VolumeCacheCheck;
- vp->shuttingDown = 0;
- vp->goingOffline = 0;
- vp->nUsers = 1;
- VOL_LOCK;
- GetVolumeHeader(vp);
- VOL_UNLOCK;
- (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
- sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
- VOL_LOCK;
- if (*ec) {
- Log("VAttachVolume: Error reading diskDataHandle vol header %s; error=%u\n", path, *ec);
+ vshutdown_thread_t * params;
+ int found, pass, schedule_version_save, count;
+ struct DiskPartition64 *diskP;
+ struct diskpartition_queue_t * dpq;
+ Device id;
+
+ params = (vshutdown_thread_t *) args;
+
+ /* acquire the shutdown pass 0 lock */
+ MUTEX_ENTER(¶ms->lock);
+
+ /* if there's still pass 0 work to be done,
+ * get a work entry, and do a pass 0 shutdown */
+ if (queue_IsNotEmpty(params)) {
+ dpq = queue_First(params, diskpartition_queue_t);
+ queue_Remove(dpq);
+ MUTEX_EXIT(¶ms->lock);
+ diskP = dpq->diskP;
+ free(dpq);
+ id = diskP->index;
+
+ count = 0;
+ while (ShutdownVolumeWalk_r(diskP, 0, ¶ms->part_pass_head[id]))
+ count++;
+ params->stats[0][diskP->index] = count;
+ MUTEX_ENTER(¶ms->lock);
}
- if (!*ec) {
- struct IndexFileHeader iHead;
-#if OPENAFS_VOL_STATS
- /*
- * We just read in the diskstuff part of the header. If the detailed
- * volume stats area has not yet been initialized, we should bzero the
- * area and mark it as initialized.
- */
- if (!(V_stat_initialized(vp))) {
- memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
- V_stat_initialized(vp) = 1;
- }
-#endif /* OPENAFS_VOL_STATS */
- VOL_UNLOCK;
- (void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle,
- (char *)&iHead, sizeof(iHead),
- SMALLINDEXMAGIC, SMALLINDEXVERSION);
- VOL_LOCK;
- if (*ec) {
- Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec);
- }
- }
- if (!*ec) {
- struct IndexFileHeader iHead;
- VOL_UNLOCK;
- (void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle,
- (char *)&iHead, sizeof(iHead),
- LARGEINDEXMAGIC, LARGEINDEXVERSION);
- VOL_LOCK;
- if (*ec) {
- Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec);
- }
- }
-#ifdef AFS_NAMEI_ENV
- if (!*ec) {
- struct versionStamp stamp;
- VOL_UNLOCK;
- (void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp,
- sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION);
- VOL_LOCK;
- if (*ec) {
- Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec);
- }
- }
-#endif
- if (*ec) {
- Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
- FreeVolume(vp);
- return NULL;
+ params->n_threads_complete++;
+ if (params->n_threads_complete == params->n_threads) {
+ /* notify control thread that all workers have completed pass 0 */
+ CV_SIGNAL(¶ms->master_cv);
}
- if (V_needsSalvaged(vp)) {
- if (vp->specialStatus)
- vp->specialStatus = 0;
- Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
- *ec = VSALVAGE;
- FreeVolume(vp);
- return NULL;
+ while (params->pass == 0) {
+ CV_WAIT(¶ms->cv, ¶ms->lock);
}
- if (programType == fileServer) {
-#ifndef FAST_RESTART
- if (V_inUse(vp) && VolumeWriteable(vp)) {
- if (!V_needsSalvaged(vp)) {
- V_needsSalvaged(vp) = 1;
- VUpdateVolume_r(ec, vp);
+
+ /* switch locks */
+ MUTEX_EXIT(¶ms->lock);
+ VOL_LOCK;
+
+ pass = params->pass;
+ osi_Assert(pass > 0);
+
+ /* now escalate through the more complicated shutdowns */
+ while (pass <= 3) {
+ schedule_version_save = params->schedule_version;
+ found = 0;
+ /* find a disk partition to work on */
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ if (params->part_thread_target[id] && !params->part_done_pass[id]) {
+ params->part_thread_target[id]--;
+ found = 1;
+ break;
}
- FreeVolume(vp);
- Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
- *ec = VSALVAGE;
- return NULL;
- }
-#endif /* FAST_RESTART */
- if (V_destroyMe(vp) == DESTROY_ME) {
- FreeVolume(vp);
- Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
- *ec = VNOVOL;
- return NULL;
}
- }
- AddVolumeToHashTable(vp, V_id(vp));
- vp->nextVnodeUnique = V_uniquifier(vp);
- vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL;
-#ifndef BITMAP_LATER
- if (programType == fileServer && VolumeWriteable(vp)) {
- int i;
- for (i = 0; i < nVNODECLASSES; i++) {
- VOL_UNLOCK;
- GetBitmap(ec, vp, i);
- VOL_LOCK;
- if (*ec) {
- FreeVolume(vp);
- Log("VAttachVolume: error getting bitmap for volume (%s)\n",
- path);
- return NULL;
+ if (!found) {
+ /* hmm. for some reason the controller thread couldn't find anything for
+ * us to do. let's see if there's anything we can do */
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ if (diskP->vol_list.len && !params->part_done_pass[id]) {
+ found = 1;
+ break;
+ } else if (!params->part_done_pass[id]) {
+ params->part_done_pass[id] = 1;
+ params->n_parts_done_pass++;
+ if (pass == 3) {
+ Log("VShutdown: done shutting down volumes on partition %s.\n",
+ VPartitionPath(diskP));
+ }
+ }
}
}
- }
-#endif /* BITMAP_LATER */
- if (programType == fileServer) {
- if (vp->specialStatus)
- vp->specialStatus = 0;
- if (V_blessed(vp) && V_inService(vp) && !V_needsSalvaged(vp)) {
- V_inUse(vp) = 1;
- V_offlineMessage(vp)[0] = '\0';
- }
- }
+ /* do work on this partition until either the controller
+ * creates a new schedule, or we run out of things to do
+ * on this partition */
+ if (found) {
+ count = 0;
+ while (!params->part_done_pass[id] &&
+ (schedule_version_save == params->schedule_version)) {
+ /* ShutdownVolumeWalk_r will drop the glock internally */
+ if (!ShutdownVolumeWalk_r(diskP, pass, ¶ms->part_pass_head[id])) {
+ if (!params->part_done_pass[id]) {
+ params->part_done_pass[id] = 1;
+ params->n_parts_done_pass++;
+ if (pass == 3) {
+ Log("VShutdown: done shutting down volumes on partition %s.\n",
+ VPartitionPath(diskP));
+ }
+ }
+ break;
+ }
+ count++;
+ }
- return vp;
-}
+ params->stats[pass][id] += count;
+ } else {
+ /* ok, everyone is done this pass, proceed */
+
+ /* barrier lock */
+ params->n_threads_complete++;
+ while (params->pass == pass) {
+ if (params->n_threads_complete == params->n_threads) {
+ /* we are the last thread to complete, so we will
+ * reinitialize worker pool state for the next pass */
+ params->n_threads_complete = 0;
+ params->n_parts_done_pass = 0;
+ params->pass++;
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ params->part_done_pass[id] = 0;
+ params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue);
+ }
-/* Attach an existing volume.
- The volume also normally goes online at this time.
- An offline volume must be reattached to make it go online.
- */
+ /* compute a new thread schedule before releasing all the workers */
+ ShutdownCreateSchedule(params);
+
+ /* wake up all the workers */
+ CV_BROADCAST(¶ms->cv);
+
+ VOL_UNLOCK;
+ Log("VShutdown: pass %d completed using %d threads on %d partitions\n",
+ pass, params->n_threads, params->n_parts);
+ VOL_LOCK;
+ } else {
+ VOL_CV_WAIT(¶ms->cv);
+ }
+ }
+ pass = params->pass;
+ }
+
+ /* for fairness */
+ VOL_UNLOCK;
+ pthread_yield();
+ VOL_LOCK;
+ }
-Volume *
-VAttachVolume(Error * ec, VolumeId volumeId, int mode)
-{
- Volume *retVal;
- VATTACH_LOCK;
- VOL_LOCK;
- retVal = VAttachVolume_r(ec, volumeId, mode);
VOL_UNLOCK;
- VATTACH_UNLOCK;
- return retVal;
+
+ return NULL;
}
-Volume *
-VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
+/* shut down all volumes on a given disk partition
+ *
+ * note that this function will not allow mp-fast
+ * shutdown of a partition */
+int
+VShutdownByPartition_r(struct DiskPartition64 * dp)
{
- char *part, *name;
- GetVolumePath(ec, volumeId, &part, &name);
- if (*ec) {
- register Volume *vp;
- Error error;
- vp = VGetVolume_r(&error, volumeId);
- if (vp) {
- assert(V_inUse(vp) == 0);
- VDetachVolume_r(ec, vp);
- }
- return NULL;
+ int pass;
+ int pass_stats[4];
+ int total;
+
+ /* wait for other exclusive ops to finish */
+ VVByPListWait_r(dp);
+
+ /* begin exclusive access */
+ VVByPListBeginExclusive_r(dp);
+
+ /* pick the low-hanging fruit first,
+ * then do the complicated ones last
+ * (has the advantage of keeping
+ * in-use volumes up until the bitter end) */
+ for (pass = 0, total=0; pass < 4; pass++) {
+ pass_stats[pass] = ShutdownVByPForPass_r(dp, pass);
+ total += pass_stats[pass];
}
- return VAttachVolumeByName_r(ec, part, name, mode);
+
+ /* end exclusive access */
+ VVByPListEndExclusive_r(dp);
+
+ Log("VShutdownByPartition: shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
+ total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]);
+
+ return 0;
}
-/* Increment a reference count to a volume, sans context swaps. Requires
- * possibly reading the volume header in from the disk, since there's
- * an invariant in the volume package that nUsers>0 ==> vp->header is valid.
+/* internal shutdown functionality
*
- * N.B. This call can fail if we can't read in the header!! In this case
- * we still guarantee we won't context swap, but the ref count won't be
- * incremented (otherwise we'd violate the invariant).
+ * for multi-pass shutdown:
+ * 0 to only "shutdown" {pre,un}attached and error state volumes
+ * 1 to also shutdown attached volumes w/ volume header loaded
+ * 2 to also shutdown attached volumes w/o volume header loaded
+ * 3 to also shutdown exclusive state volumes
+ *
+ * caller MUST hold exclusive access on the hash chain
+ * because we drop vol_glock_mutex internally
+ *
+ * this function is reentrant for passes 1--3
+ * (e.g. multiple threads can cooperate to
+ * shutdown a partition mp-fast)
+ *
+ * pass 0 is not scaleable because the volume state data is
+ * synchronized by vol_glock mutex, and the locking overhead
+ * is too high to drop the lock long enough to do linked list
+ * traversal
*/
static int
-VHold_r(register Volume * vp)
+ShutdownVByPForPass_r(struct DiskPartition64 * dp, int pass)
{
- Error error;
+ struct rx_queue * q = queue_First(&dp->vol_list, rx_queue);
+ int i = 0;
- if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
- VolumeReplacements++;
- ReadHeader(&error, V_diskDataHandle(vp), (char *)&V_disk(vp),
- sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
- if (error)
- return error;
- }
- vp->nUsers++;
- return 0;
+ while (ShutdownVolumeWalk_r(dp, pass, &q))
+ i++;
+
+ return i;
}
+/* conditionally shutdown one volume on partition dp
+ * returns 1 if a volume was shutdown in this pass,
+ * 0 otherwise */
static int
-VHold(register Volume * vp)
+ShutdownVolumeWalk_r(struct DiskPartition64 * dp, int pass,
+ struct rx_queue ** idx)
{
- int retVal;
- VOL_LOCK;
- retVal = VHold_r(vp);
- VOL_UNLOCK;
- return retVal;
-}
+ struct rx_queue *qp, *nqp;
+ Volume * vp;
-void
-VTakeOffline_r(register Volume * vp)
-{
- assert(vp->nUsers > 0);
- assert(programType == fileServer);
- vp->goingOffline = 1;
- V_needsSalvaged(vp) = 1;
-}
+ qp = *idx;
-void
-VTakeOffline(register Volume * vp)
-{
- VOL_LOCK;
- VTakeOffline_r(vp);
- VOL_UNLOCK;
-}
+ for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) {
+ vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list));
-void
-VPutVolume_r(register Volume * vp)
-{
- assert(--vp->nUsers >= 0);
- if (vp->nUsers == 0) {
- ReleaseVolumeHeader(vp->header);
- if (vp->goingOffline) {
- Error error;
- assert(programType == fileServer);
- vp->goingOffline = 0;
- V_inUse(vp) = 0;
- VUpdateVolume_r(&error, vp);
- VCloseVolumeHandles_r(vp);
- if (LogLevel) {
- Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
- V_name(vp));
- if (V_offlineMessage(vp)[0])
- Log(" (%s)", V_offlineMessage(vp));
- Log("\n");
+ switch (pass) {
+ case 0:
+ if ((V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+ (V_attachState(vp) != VOL_STATE_ERROR) &&
+ (V_attachState(vp) != VOL_STATE_DELETED) &&
+ (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
+ break;
}
-#ifdef AFS_PTHREAD_ENV
- assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
- LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
- }
- if (vp->shuttingDown) {
- VReleaseVolumeHandles_r(vp);
- FreeVolume(vp);
- if (programType == fileServer)
-#ifdef AFS_PTHREAD_ENV
- assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
- LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
+ case 1:
+ if ((V_attachState(vp) == VOL_STATE_ATTACHED) &&
+ (vp->header == NULL)) {
+ break;
+ }
+ case 2:
+ if (VIsExclusiveState(V_attachState(vp))) {
+ break;
+ }
+ case 3:
+ *idx = nqp;
+ DeleteVolumeFromVByPList_r(vp);
+ VShutdownVolume_r(vp);
+ vp = NULL;
+ return 1;
}
}
-}
-void
-VPutVolume(register Volume * vp)
-{
- VOL_LOCK;
- VPutVolume_r(vp);
- VOL_UNLOCK;
+ return 0;
}
-/* Get a pointer to an attached volume. The pointer is returned regardless
- of whether or not the volume is in service or on/off line. An error
- code, however, is returned with an indication of the volume's status */
-Volume *
-VGetVolume(Error * ec, VolId volumeId)
+/*
+ * shutdown a specific volume
+ */
+/* caller MUST NOT hold a heavyweight ref on vp */
+int
+VShutdownVolume_r(Volume * vp)
{
- Volume *retVal;
- VOL_LOCK;
- retVal = VGetVolume_r(ec, volumeId);
- VOL_UNLOCK;
- return retVal;
-}
+ int code;
-Volume *
-VGetVolume_r(Error * ec, VolId volumeId)
-{
- Volume *vp;
- unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V4 = 0, V5 = 0, V6 =
- 0, V7 = 0, V8 = 0, V9 = 0;
- unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0;
+ VCreateReservation_r(vp);
- for (;;) {
- *ec = 0;
- V0++;
- for (vp = VolumeHashTable[VOLUME_HASH(volumeId)];
- vp && vp->hashid != volumeId; vp = vp->hashNext)
- Vlooks++;
+ if (LogLevel >= 5) {
+ Log("VShutdownVolume_r: vid=%u, device=%d, state=%hu\n",
+ vp->hashid, vp->partition->device, V_attachState(vp));
+ }
- if (!vp) {
- V1++;
- if (VInit < 2) {
- V2++;
- /* Until we have reached an initialization level of 2
- * we don't know whether this volume exists or not.
- * We can't sleep and retry later because before a volume
- * is attached, the caller tries to get it first. Just
- * return VOFFLINE and the caller can choose whether to
- * retry the command or not. */
- *ec = VOFFLINE;
- break;
- }
+ /* wait for other blocking ops to finish */
+ VWaitExclusiveState_r(vp);
- *ec = VNOVOL;
- break;
- }
+ osi_Assert(VIsValidState(V_attachState(vp)));
- V3++;
- VolumeGets++;
- if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
- V5++;
- VolumeReplacements++;
- ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
- sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
- VOLUMEINFOVERSION);
- if (*ec) {
- V6++;
- /* Only log the error if it was a totally unexpected error. Simply
- * a missing inode is likely to be caused by the volume being deleted */
- if (errno != ENXIO || LogLevel)
- Log("Volume %u: couldn't reread volume header\n",
- vp->hashid);
- FreeVolume(vp);
- vp = 0;
- break;
- }
- }
- V7++;
- if (vp->shuttingDown) {
- V8++;
- *ec = VNOVOL;
- vp = 0;
- break;
- }
- if (programType == fileServer) {
- V9++;
- if (vp->goingOffline) {
- V10++;
-#ifdef AFS_PTHREAD_ENV
- pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex);
-#else /* AFS_PTHREAD_ENV */
- LWP_WaitProcess(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
- continue;
- }
- if (vp->specialStatus) {
- V11++;
- *ec = vp->specialStatus;
- } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
- V12++;
- *ec = VNOVOL;
- } else if (V_inUse(vp) == 0) {
- V13++;
- *ec = VOFFLINE;
- } else {
- V14++;
- }
+ switch(V_attachState(vp)) {
+ case VOL_STATE_SALVAGING:
+ /* Leave salvaging volumes alone. Any in-progress salvages will
+ * continue working after viced shuts down. This is intentional.
+ */
+
+ case VOL_STATE_PREATTACHED:
+ case VOL_STATE_ERROR:
+ VChangeState_r(vp, VOL_STATE_UNATTACHED);
+ case VOL_STATE_UNATTACHED:
+ case VOL_STATE_DELETED:
+ break;
+ case VOL_STATE_GOING_OFFLINE:
+ case VOL_STATE_SHUTTING_DOWN:
+ case VOL_STATE_ATTACHED:
+ code = VHold_r(vp);
+ if (!code) {
+ if (LogLevel >= 5)
+ Log("VShutdown: Attempting to take volume %u offline.\n",
+ vp->hashid);
+
+ /* take the volume offline (drops reference count) */
+ VOffline_r(vp, "File server was shut down");
}
break;
+ default:
+ break;
}
- V15++;
- /* if no error, bump nUsers */
- if (vp)
- vp->nUsers++;
- assert(vp || *ec);
- return vp;
+ VCancelReservation_r(vp);
+ vp = NULL;
+ return 0;
}
+#endif /* AFS_DEMAND_ATTACH_FS */
-/* For both VForceOffline and VOffline, we close all relevant handles.
- * For VOffline, if we re-attach the volume, the files may possible be
- * different than before.
+/***************************************************/
+/* Header I/O routines */
+/***************************************************/
+
+/* open a descriptor for the inode (h),
+ * read in an on-disk structure into buffer (to) of size (size),
+ * verify versionstamp in structure has magic (magic) and
+ * optionally verify version (version) if (version) is nonzero
*/
static void
-VReleaseVolumeHandles_r(Volume * vp)
+ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
+ bit32 version)
{
- DFlushVolume(V_id(vp));
- VReleaseVnodeFiles_r(vp);
+ struct versionStamp *vsn;
+ FdHandle_t *fdP;
- /* Too time consuming and unnecessary for the volserver */
- if (programType != volumeUtility) {
- IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
- IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
- IH_CONDSYNC(vp->diskDataHandle);
-#ifdef AFS_NT40_ENV
- IH_CONDSYNC(vp->linkHandle);
-#endif /* AFS_NT40_ENV */
+ *ec = 0;
+ if (h == NULL) {
+ *ec = VSALVAGE;
+ return;
}
- IH_RELEASE(vp->vnodeIndex[vLarge].handle);
- IH_RELEASE(vp->vnodeIndex[vSmall].handle);
- IH_RELEASE(vp->diskDataHandle);
- IH_RELEASE(vp->linkHandle);
-}
-
-/* Force the volume offline, set the salvage flag. No further references to
- * the volume through the volume package will be honored. */
-void
-VForceOffline_r(Volume * vp)
-{
- Error error;
- if (!V_inUse(vp))
+ fdP = IH_OPEN(h);
+ if (fdP == NULL) {
+ *ec = VSALVAGE;
return;
- strcpy(V_offlineMessage(vp),
- "Forced offline due to internal error: volume needs to be salvaged");
- Log("Volume %u forced offline: it needs salvaging!\n", V_id(vp));
- V_inUse(vp) = 0;
- vp->goingOffline = 0;
- V_needsSalvaged(vp) = 1;
- VUpdateVolume_r(&error, vp);
-#ifdef AFS_PTHREAD_ENV
- assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
- LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
+ }
- VReleaseVolumeHandles_r(vp);
+ vsn = (struct versionStamp *)to;
+ if (FDH_PREAD(fdP, to, size, 0) != size || vsn->magic != magic) {
+ *ec = VSALVAGE;
+ FDH_REALLYCLOSE(fdP);
+ return;
+ }
+ FDH_CLOSE(fdP);
+ /* Check is conditional, in case caller wants to inspect version himself */
+ if (version && vsn->version != version) {
+ *ec = VSALVAGE;
+ }
}
void
-VForceOffline(Volume * vp)
+WriteVolumeHeader_r(Error * ec, Volume * vp)
{
- VOL_LOCK;
- VForceOffline_r(vp);
- VOL_UNLOCK;
-}
+ IHandle_t *h = V_diskDataHandle(vp);
+ FdHandle_t *fdP;
-/* The opposite of VAttachVolume. The volume header is written to disk, with
- the inUse bit turned off. A copy of the header is maintained in memory,
- however (which is why this is VOffline, not VDetach).
- */
-void
-VOffline_r(Volume * vp, char *message)
-{
- Error error;
- VolumeId vid = V_id(vp);
- assert(programType != volumeUtility);
- if (!V_inUse(vp)) {
- VPutVolume_r(vp);
+ *ec = 0;
+
+ fdP = IH_OPEN(h);
+ if (fdP == NULL) {
+ *ec = VSALVAGE;
return;
}
- if (V_offlineMessage(vp)[0] == '\0')
- strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
- V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
- vp->goingOffline = 1;
- VPutVolume_r(vp);
- vp = VGetVolume_r(&error, vid); /* Wait for it to go offline */
- if (vp) /* In case it was reattached... */
- VPutVolume_r(vp);
-}
-
-void
-VOffline(Volume * vp, char *message)
-{
- VOL_LOCK;
- VOffline_r(vp, message);
- VOL_UNLOCK;
+ if (FDH_PWRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)), 0)
+ != sizeof(V_disk(vp))) {
+ *ec = VSALVAGE;
+ FDH_REALLYCLOSE(fdP);
+ return;
+ }
+ FDH_CLOSE(fdP);
}
-/* For VDetachVolume, we close all cached file descriptors, but keep
- * the Inode handles in case we need to read from a busy volume.
+/* VolumeHeaderToDisk
+ * Allows for storing 64 bit inode numbers in on-disk volume header
+ * file.
*/
-static void
-VCloseVolumeHandles_r(Volume * vp)
+/* convert in-memory representation of a volume header to the
+ * on-disk representation of a volume header */
+void
+VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
{
- DFlushVolume(V_id(vp));
- VCloseVnodeFiles_r(vp);
- /* Too time consuming and unnecessary for the volserver */
- if (programType != volumeUtility) {
- IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
- IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
- IH_CONDSYNC(vp->diskDataHandle);
-#ifdef AFS_NT40_ENV
- IH_CONDSYNC(vp->linkHandle);
-#endif /* AFS_NT40_ENV */
- }
+ memset(dh, 0, sizeof(VolumeDiskHeader_t));
+ dh->stamp = h->stamp;
+ dh->id = h->id;
+ dh->parent = h->parent;
- IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
- IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
- IH_REALLYCLOSE(vp->diskDataHandle);
- IH_REALLYCLOSE(vp->linkHandle);
+#ifdef AFS_64BIT_IOPS_ENV
+ dh->volumeInfo_lo = (afs_int32) h->volumeInfo & 0xffffffff;
+ dh->volumeInfo_hi = (afs_int32) (h->volumeInfo >> 32) & 0xffffffff;
+ dh->smallVnodeIndex_lo = (afs_int32) h->smallVnodeIndex & 0xffffffff;
+ dh->smallVnodeIndex_hi =
+ (afs_int32) (h->smallVnodeIndex >> 32) & 0xffffffff;
+ dh->largeVnodeIndex_lo = (afs_int32) h->largeVnodeIndex & 0xffffffff;
+ dh->largeVnodeIndex_hi =
+ (afs_int32) (h->largeVnodeIndex >> 32) & 0xffffffff;
+ dh->linkTable_lo = (afs_int32) h->linkTable & 0xffffffff;
+ dh->linkTable_hi = (afs_int32) (h->linkTable >> 32) & 0xffffffff;
+#else
+ dh->volumeInfo_lo = h->volumeInfo;
+ dh->smallVnodeIndex_lo = h->smallVnodeIndex;
+ dh->largeVnodeIndex_lo = h->largeVnodeIndex;
+ dh->linkTable_lo = h->linkTable;
+#endif
}
-/* This gets used for the most part by utility routines that don't want
- * to keep all the volume headers around. Generally, the file server won't
- * call this routine, because then the offline message in the volume header
- * (or other information) will still be available to clients. For NAMEI, also
- * close the file handles.
+/* DiskToVolumeHeader
+ * Converts an on-disk representation of a volume header to
+ * the in-memory representation of a volume header.
+ *
+ * Makes the assumption that AFS has *always*
+ * zero'd the volume header file so that high parts of inode
+ * numbers are 0 in older (SGI EFS) volume header files.
*/
void
-VDetachVolume_r(Error * ec, Volume * vp)
+DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
{
- VolumeId volume;
- struct DiskPartition *tpartp;
- int notifyServer, useDone;
+ memset(h, 0, sizeof(VolumeHeader_t));
+ h->stamp = dh->stamp;
+ h->id = dh->id;
+ h->parent = dh->parent;
- *ec = 0; /* always "succeeds" */
- if (programType == volumeUtility) {
- notifyServer = vp->needsPutBack;
- useDone = (V_destroyMe(vp) == DESTROY_ME);
- }
- tpartp = vp->partition;
- volume = V_id(vp);
- DeleteVolumeFromHashTable(vp);
- vp->shuttingDown = 1;
- VPutVolume_r(vp);
- /* Will be detached sometime in the future--this is OK since volume is offline */
+#ifdef AFS_64BIT_IOPS_ENV
+ h->volumeInfo =
+ (Inode) dh->volumeInfo_lo | ((Inode) dh->volumeInfo_hi << 32);
- if (programType == volumeUtility && notifyServer) {
- /*
- * Note: The server is not notified in the case of a bogus volume
- * explicitly to make it possible to create a volume, do a partial
- * restore, then abort the operation without ever putting the volume
- * online. This is essential in the case of a volume move operation
- * between two partitions on the same server. In that case, there
- * would be two instances of the same volume, one of them bogus,
- * which the file server would attempt to put on line
- */
- if (useDone)
- /* don't put online */
- FSYNC_askfs(volume, tpartp->name, FSYNC_DONE, 0);
- else {
- /* fs can use it again */
- FSYNC_askfs(volume, tpartp->name, FSYNC_ON, 0);
- /* Dettaching it so break all callbacks on it */
- if (V_BreakVolumeCallbacks) {
- Log("volume %u detached; breaking all call backs\n", volume);
- (*V_BreakVolumeCallbacks) (volume);
- }
- }
- }
+ h->smallVnodeIndex =
+ (Inode) dh->smallVnodeIndex_lo | ((Inode) dh->
+ smallVnodeIndex_hi << 32);
+
+ h->largeVnodeIndex =
+ (Inode) dh->largeVnodeIndex_lo | ((Inode) dh->
+ largeVnodeIndex_hi << 32);
+ h->linkTable =
+ (Inode) dh->linkTable_lo | ((Inode) dh->linkTable_hi << 32);
+#else
+ h->volumeInfo = dh->volumeInfo_lo;
+ h->smallVnodeIndex = dh->smallVnodeIndex_lo;
+ h->largeVnodeIndex = dh->largeVnodeIndex_lo;
+ h->linkTable = dh->linkTable_lo;
+#endif
}
-void
-VDetachVolume(Error * ec, Volume * vp)
+
+/***************************************************/
+/* Volume Attachment routines */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * pre-attach a volume given its path.
+ *
+ * @param[out] ec outbound error code
+ * @param[in] partition partition path string
+ * @param[in] name volume id string
+ *
+ * @return volume object pointer
+ *
+ * @note A pre-attached volume will only have its partition
+ * and hashid fields initialized. At first call to
+ * VGetVolume, the volume will be fully attached.
+ *
+ */
+Volume *
+VPreAttachVolumeByName(Error * ec, char *partition, char *name)
{
+ Volume * vp;
VOL_LOCK;
- VDetachVolume_r(ec, vp);
+ vp = VPreAttachVolumeByName_r(ec, partition, name);
VOL_UNLOCK;
+ return vp;
}
-
-VnodeId
-VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex
- *index)
-{
- register byte *bp, *ep;
- *ec = 0;
- /* This test is probably redundant */
+/**
+ * pre-attach a volume given its path.
+ *
+ * @param[out] ec outbound error code
+ * @param[in] partition path to vice partition
+ * @param[in] name volume id string
+ *
+ * @return volume object pointer
+ *
+ * @pre VOL_LOCK held
+ *
+ * @internal volume package internal use only.
+ */
+Volume *
+VPreAttachVolumeByName_r(Error * ec, char *partition, char *name)
+{
+ return VPreAttachVolumeById_r(ec,
+ partition,
+ VolumeNumber(name));
+}
+
+/**
+ * pre-attach a volume given its path and numeric volume id.
+ *
+ * @param[out] ec error code return
+ * @param[in] partition path to vice partition
+ * @param[in] volumeId numeric volume id
+ *
+ * @return volume object pointer
+ *
+ * @pre VOL_LOCK held
+ *
+ * @internal volume package internal use only.
+ */
+Volume *
+VPreAttachVolumeById_r(Error * ec,
+ char * partition,
+ VolId volumeId)
+{
+ Volume *vp;
+ struct DiskPartition64 *partp;
+
+ *ec = 0;
+
+ osi_Assert(programType == fileServer);
+
+ if (!(partp = VGetPartition_r(partition, 0))) {
+ *ec = VNOVOL;
+ Log("VPreAttachVolumeById_r: Error getting partition (%s)\n", partition);
+ return NULL;
+ }
+
+ vp = VLookupVolume_r(ec, volumeId, NULL);
+ if (*ec) {
+ return NULL;
+ }
+
+ return VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
+}
+
+/**
+ * preattach a volume.
+ *
+ * @param[out] ec outbound error code
+ * @param[in] partp pointer to partition object
+ * @param[in] vp pointer to volume object
+ * @param[in] vid volume id
+ *
+ * @return volume object pointer
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @warning Returned volume object pointer does not have to
+ * equal the pointer passed in as argument vp. There
+ * are potential race conditions which can result in
+ * the pointers having different values. It is up to
+ * the caller to make sure that references are handled
+ * properly in this case.
+ *
+ * @note If there is already a volume object registered with
+ * the same volume id, its pointer MUST be passed as
+ * argument vp. Failure to do so will result in a silent
+ * failure to preattach.
+ *
+ * @internal volume package internal use only.
+ */
+Volume *
+VPreAttachVolumeByVp_r(Error * ec,
+ struct DiskPartition64 * partp,
+ Volume * vp,
+ VolId vid)
+{
+ Volume *nvp = NULL;
+
+ *ec = 0;
+
+ /* check to see if pre-attach already happened */
+ if (vp &&
+ (V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+ (V_attachState(vp) != VOL_STATE_DELETED) &&
+ (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
+ !VIsErrorState(V_attachState(vp))) {
+ /*
+ * pre-attach is a no-op in all but the following cases:
+ *
+ * - volume is unattached
+ * - volume is in an error state
+ * - volume is pre-attached
+ */
+ Log("VPreattachVolumeByVp_r: volume %u not in quiescent state\n", vid);
+ goto done;
+ } else if (vp) {
+ /* we're re-attaching a volume; clear out some old state */
+ memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage));
+
+ if (V_partition(vp) != partp) {
+ /* XXX potential race */
+ DeleteVolumeFromVByPList_r(vp);
+ }
+ } else {
+ /* if we need to allocate a new Volume struct,
+ * go ahead and drop the vol glock, otherwise
+ * do the basic setup synchronised, as it's
+ * probably not worth dropping the lock */
+ VOL_UNLOCK;
+
+ /* allocate the volume structure */
+ vp = nvp = (Volume *) malloc(sizeof(Volume));
+ osi_Assert(vp != NULL);
+ memset(vp, 0, sizeof(Volume));
+ queue_Init(&vp->vnode_list);
+ CV_INIT(&V_attachCV(vp), "vp attach", CV_DEFAULT, 0);
+ }
+
+ /* link the volume with its associated vice partition */
+ vp->device = partp->device;
+ vp->partition = partp;
+
+ vp->hashid = vid;
+ vp->specialStatus = 0;
+
+ /* if we dropped the lock, reacquire the lock,
+ * check for pre-attach races, and then add
+ * the volume to the hash table */
+ if (nvp) {
+ VOL_LOCK;
+ nvp = VLookupVolume_r(ec, vid, NULL);
+ if (*ec) {
+ free(vp);
+ vp = NULL;
+ goto done;
+ } else if (nvp) { /* race detected */
+ free(vp);
+ vp = nvp;
+ goto done;
+ } else {
+ /* hack to make up for VChangeState_r() decrementing
+ * the old state counter */
+ VStats.state_levels[0]++;
+ }
+ }
+
+ /* put pre-attached volume onto the hash table
+ * and bring it up to the pre-attached state */
+ AddVolumeToHashTable(vp, vp->hashid);
+ AddVolumeToVByPList_r(vp);
+ VLRU_Init_Node_r(vp);
+ VChangeState_r(vp, VOL_STATE_PREATTACHED);
+
+ if (LogLevel >= 5)
+ Log("VPreAttachVolumeByVp_r: volume %u pre-attached\n", vp->hashid);
+
+ done:
+ if (*ec)
+ return NULL;
+ else
+ return vp;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/* Attach an existing volume, given its pathname, and return a
+ pointer to the volume header information. The volume also
+ normally goes online at this time. An offline volume
+ must be reattached to make it go online */
+Volume *
+VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
+{
+ Volume *retVal;
+ VOL_LOCK;
+ retVal = VAttachVolumeByName_r(ec, partition, name, mode);
+ VOL_UNLOCK;
+ return retVal;
+}
+
+Volume *
+VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
+{
+ Volume *vp = NULL;
+ struct DiskPartition64 *partp;
+ char path[64];
+ int isbusy = 0;
+ VolId volumeId;
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolumeStats stats_save;
+ Volume *svp = NULL;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ *ec = 0;
+
+ volumeId = VolumeNumber(name);
+
+ if (!(partp = VGetPartition_r(partition, 0))) {
+ *ec = VNOVOL;
+ Log("VAttachVolume: Error getting partition (%s)\n", partition);
+ goto done;
+ }
+
+ if (VRequiresPartLock()) {
+ osi_Assert(VInit == 3);
+ VLockPartition_r(partition);
+ } else if (programType == fileServer) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* lookup the volume in the hash table */
+ vp = VLookupVolume_r(ec, volumeId, NULL);
+ if (*ec) {
+ return NULL;
+ }
+
+ if (vp) {
+ /* save any counters that are supposed to
+ * be monotonically increasing over the
+ * lifetime of the fileserver */
+ memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
+ } else {
+ memset(&stats_save, 0, sizeof(VolumeStats));
+ }
+
+ /* if there's something in the hash table, and it's not
+ * in the pre-attach state, then we may need to detach
+ * it before proceeding */
+ if (vp && (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+
+ /* at this point state must be one of:
+ * - UNATTACHED
+ * - ATTACHED
+ * - SHUTTING_DOWN
+ * - GOING_OFFLINE
+ * - SALVAGING
+ * - ERROR
+ * - DELETED
+ */
+
+ if (vp->specialStatus == VBUSY)
+ isbusy = 1;
+
+ /* if it's already attached, see if we can return it */
+ if (V_attachState(vp) == VOL_STATE_ATTACHED) {
+ VGetVolumeByVp_r(ec, vp);
+ if (V_inUse(vp) == fileServer) {
+ VCancelReservation_r(vp);
+ return vp;
+ }
+
+ /* otherwise, we need to detach, and attempt to re-attach */
+ VDetachVolume_r(ec, vp);
+ if (*ec) {
+ Log("VAttachVolume: Error detaching old volume instance (%s)\n", name);
+ }
+ } else {
+ /* if it isn't fully attached, delete from the hash tables,
+ and let the refcounter handle the rest */
+ DeleteVolumeFromHashTable(vp);
+ DeleteVolumeFromVByPList_r(vp);
+ }
+
+ VCancelReservation_r(vp);
+ vp = NULL;
+ }
+
+ /* pre-attach volume if it hasn't been done yet */
+ if (!vp ||
+ (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+ (V_attachState(vp) == VOL_STATE_DELETED) ||
+ (V_attachState(vp) == VOL_STATE_ERROR)) {
+ svp = vp;
+ vp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
+ if (*ec) {
+ return NULL;
+ }
+ }
+
+ osi_Assert(vp != NULL);
+
+ /* handle pre-attach races
+ *
+ * multiple threads can race to pre-attach a volume,
+ * but we can't let them race beyond that
+ *
+ * our solution is to let the first thread to bring
+ * the volume into an exclusive state win; the other
+ * threads just wait until it finishes bringing the
+ * volume online, and then they do a vgetvolumebyvp
+ */
+ if (svp && (svp != vp)) {
+ /* wait for other exclusive ops to finish */
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+
+ /* get a heavyweight ref, kill the lightweight ref, and return */
+ VGetVolumeByVp_r(ec, vp);
+ VCancelReservation_r(vp);
+ return vp;
+ }
+
+ /* at this point, we are chosen as the thread to do
+ * demand attachment for this volume. all other threads
+ * doing a getvolume on vp->hashid will block until we finish */
+
+ /* make sure any old header cache entries are invalidated
+ * before proceeding */
+ FreeVolumeHeader(vp);
+
+ VChangeState_r(vp, VOL_STATE_ATTACHING);
+
+ /* restore any saved counters */
+ memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
+#else /* AFS_DEMAND_ATTACH_FS */
+ vp = VGetVolume_r(ec, volumeId);
+ if (vp) {
+ if (V_inUse(vp) == fileServer)
+ return vp;
+ if (vp->specialStatus == VBUSY)
+ isbusy = 1;
+ VDetachVolume_r(ec, vp);
+ if (*ec) {
+ Log("VAttachVolume: Error detaching volume (%s)\n", name);
+ }
+ vp = NULL;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+ }
+
+ *ec = 0;
+ strcpy(path, VPartitionPath(partp));
+
+ VOL_UNLOCK;
+
+ strcat(path, "/");
+ strcat(path, name);
+
+ if (!vp) {
+ vp = (Volume *) calloc(1, sizeof(Volume));
+ osi_Assert(vp != NULL);
+ vp->hashid = volumeId;
+ vp->device = partp->device;
+ vp->partition = partp;
+ queue_Init(&vp->vnode_list);
+#ifdef AFS_DEMAND_ATTACH_FS
+ CV_INIT(&V_attachCV(vp), "vp attach", CV_DEFAULT, 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+ }
+
+ /* attach2 is entered without any locks, and returns
+ * with vol_glock_mutex held */
+ vp = attach2(ec, volumeId, path, partp, vp, isbusy, mode);
+
+ if (VCanUseFSSYNC() && vp) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ if ((mode == V_VOLUPD) || (VolumeWriteable(vp) && (mode == V_CLONE))) {
+ /* mark volume header as in use so that volser crashes lead to a
+ * salvage attempt */
+ VUpdateVolume_r(ec, vp, 0);
+ }
+ /* for dafs, we should tell the fileserver, except for V_PEEK
+ * where we know it is not necessary */
+ if (mode == V_PEEK) {
+ vp->needsPutBack = 0;
+ } else {
+ vp->needsPutBack = 1;
+ }
+#else /* !AFS_DEMAND_ATTACH_FS */
+ /* duplicate computation in fssync.c about whether the server
+ * takes the volume offline or not. If the volume isn't
+ * offline, we must not return it when we detach the volume,
+ * or the server will abort */
+ if (mode == V_READONLY || mode == V_PEEK
+ || (!VolumeWriteable(vp) && (mode == V_CLONE || mode == V_DUMP)))
+ vp->needsPutBack = 0;
+ else
+ vp->needsPutBack = 1;
+#endif /* !AFS_DEMAND_ATTACH_FS */
+ }
+ /* OK, there's a problem here, but one that I don't know how to
+ * fix right now, and that I don't think should arise often.
+ * Basically, we should only put back this volume to the server if
+ * it was given to us by the server, but since we don't have a vp,
+ * we can't run the VolumeWriteable function to find out as we do
+ * above when computing vp->needsPutBack. So we send it back, but
+ * there's a path in VAttachVolume on the server which may abort
+ * if this volume doesn't have a header. Should be pretty rare
+ * for all of that to happen, but if it does, probably the right
+ * fix is for the server to allow the return of readonly volumes
+ * that it doesn't think are really checked out. */
+#ifdef FSSYNC_BUILD_CLIENT
+ if (VCanUseFSSYNC() && vp == NULL &&
+ mode != V_SECRETLY && mode != V_PEEK) {
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* If we couldn't attach but we scheduled a salvage, we already
+ * notified the fileserver; don't online it now */
+ if (*ec != VSALVAGING)
+#endif /* AFS_DEMAND_ATTACH_FS */
+ FSYNC_VolOp(volumeId, partition, FSYNC_VOL_ON, 0, NULL);
+ } else
+#endif
+ if (programType == fileServer && vp) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ /*
+ * we can get here in cases where we don't "own"
+ * the volume (e.g. volume owned by a utility).
+ * short circuit around potential disk header races.
+ */
+ if (V_attachState(vp) != VOL_STATE_ATTACHED) {
+ goto done;
+ }
+#endif
+ VUpdateVolume_r(ec, vp, 0);
+ if (*ec) {
+ Log("VAttachVolume: Error updating volume\n");
+ if (vp)
+ VPutVolume_r(vp);
+ goto done;
+ }
+ if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
+#ifndef AFS_DEMAND_ATTACH_FS
+ /* This is a hack: by temporarily setting the incore
+ * dontSalvage flag ON, the volume will be put back on the
+ * Update list (with dontSalvage OFF again). It will then
+ * come back in N minutes with DONT_SALVAGE eventually
+ * set. This is the way that volumes that have never had
+ * it set get it set; or that volumes that have been
+ * offline without DONT SALVAGE having been set also
+ * eventually get it set */
+ V_dontSalvage(vp) = DONT_SALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
+ VAddToVolumeUpdateList_r(ec, vp);
+ if (*ec) {
+ Log("VAttachVolume: Error adding volume to update list\n");
+ if (vp)
+ VPutVolume_r(vp);
+ goto done;
+ }
+ }
+ if (LogLevel)
+ Log("VOnline: volume %u (%s) attached and online\n", V_id(vp),
+ V_name(vp));
+ }
+
+ done:
+ if (VRequiresPartLock()) {
+ VUnlockPartition_r(partition);
+ }
+ if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* attach failed; make sure we're in error state */
+ if (vp && !VIsErrorState(V_attachState(vp))) {
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+ return NULL;
+ } else {
+ return vp;
+ }
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* VAttachVolumeByVp_r
+ *
+ * finish attaching a volume that is
+ * in a less than fully attached state
+ */
+/* caller MUST hold a ref count on vp */
+static Volume *
+VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
+{
+ char name[VMAXPATHLEN];
+ int reserve = 0;
+ struct DiskPartition64 *partp;
+ char path[64];
+ int isbusy = 0;
+ VolId volumeId;
+ Volume * nvp = NULL;
+ VolumeStats stats_save;
+ *ec = 0;
+
+ /* volume utility should never call AttachByVp */
+ osi_Assert(programType == fileServer);
+
+ volumeId = vp->hashid;
+ partp = vp->partition;
+ VolumeExternalName_r(volumeId, name, sizeof(name));
+
+
+ /* if another thread is performing a blocking op, wait */
+ VWaitExclusiveState_r(vp);
+
+ memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
+
+ /* if it's already attached, see if we can return it */
+ if (V_attachState(vp) == VOL_STATE_ATTACHED) {
+ VGetVolumeByVp_r(ec, vp);
+ if (V_inUse(vp) == fileServer) {
+ return vp;
+ } else {
+ if (vp->specialStatus == VBUSY)
+ isbusy = 1;
+ VDetachVolume_r(ec, vp);
+ if (*ec) {
+ Log("VAttachVolume: Error detaching volume (%s)\n", name);
+ }
+ vp = NULL;
+ }
+ }
+
+ /* pre-attach volume if it hasn't been done yet */
+ if (!vp ||
+ (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+ (V_attachState(vp) == VOL_STATE_DELETED) ||
+ (V_attachState(vp) == VOL_STATE_ERROR)) {
+ nvp = VPreAttachVolumeByVp_r(ec, partp, vp, volumeId);
+ if (*ec) {
+ return NULL;
+ }
+ if (nvp != vp) {
+ reserve = 1;
+ VCreateReservation_r(nvp);
+ vp = nvp;
+ }
+ }
+
+ osi_Assert(vp != NULL);
+ VChangeState_r(vp, VOL_STATE_ATTACHING);
+
+ /* restore monotonically increasing stats */
+ memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
+
+ *ec = 0;
+
+ /* compute path to disk header */
+ strcpy(path, VPartitionPath(partp));
+
+ VOL_UNLOCK;
+
+ strcat(path, "/");
+ strcat(path, name);
+
+ /* do volume attach
+ *
+ * NOTE: attach2 is entered without any locks, and returns
+ * with vol_glock_mutex held */
+ vp = attach2(ec, volumeId, path, partp, vp, isbusy, mode);
+
+ /*
+ * the event that an error was encountered, or
+ * the volume was not brought to an attached state
+ * for any reason, skip to the end. We cannot
+ * safely call VUpdateVolume unless we "own" it.
+ */
+ if (*ec ||
+ (vp == NULL) ||
+ (V_attachState(vp) != VOL_STATE_ATTACHED)) {
+ goto done;
+ }
+
+ VUpdateVolume_r(ec, vp, 0);
+ if (*ec) {
+ Log("VAttachVolume: Error updating volume %u\n", vp->hashid);
+ VPutVolume_r(vp);
+ goto done;
+ }
+ if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
+#ifndef AFS_DEMAND_ATTACH_FS
+ /* This is a hack: by temporarily setting the incore
+ * dontSalvage flag ON, the volume will be put back on the
+ * Update list (with dontSalvage OFF again). It will then
+ * come back in N minutes with DONT_SALVAGE eventually
+ * set. This is the way that volumes that have never had
+ * it set get it set; or that volumes that have been
+ * offline without DONT SALVAGE having been set also
+ * eventually get it set */
+ V_dontSalvage(vp) = DONT_SALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
+ VAddToVolumeUpdateList_r(ec, vp);
+ if (*ec) {
+ Log("VAttachVolume: Error adding volume %u to update list\n", vp->hashid);
+ if (vp)
+ VPutVolume_r(vp);
+ goto done;
+ }
+ }
+ if (LogLevel)
+ Log("VOnline: volume %u (%s) attached and online\n", V_id(vp),
+ V_name(vp));
+ done:
+ if (reserve) {
+ VCancelReservation_r(nvp);
+ reserve = 0;
+ }
+ if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) {
+ if (vp && !VIsErrorState(V_attachState(vp))) {
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ }
+ return NULL;
+ } else {
+ return vp;
+ }
+}
+
+/**
+ * lock a volume on disk (non-blocking).
+ *
+ * @param[in] vp The volume to lock
+ * @param[in] locktype READ_LOCK or WRITE_LOCK
+ *
+ * @return operation status
+ * @retval 0 success, lock was obtained
+ * @retval EBUSY a conflicting lock was held by another process
+ * @retval EIO error acquiring lock
+ *
+ * @pre If we're in the fileserver, vp is in an exclusive state
+ *
+ * @pre vp is not already locked
+ */
+static int
+VLockVolumeNB(Volume *vp, int locktype)
+{
+ int code;
+
+ osi_Assert(programType != fileServer || VIsExclusiveState(V_attachState(vp)));
+ osi_Assert(!(V_attachFlags(vp) & VOL_LOCKED));
+
+ code = VLockVolumeByIdNB(vp->hashid, vp->partition, locktype);
+ if (code == 0) {
+ V_attachFlags(vp) |= VOL_LOCKED;
+ }
+
+ return code;
+}
+
+/**
+ * unlock a volume on disk that was locked with VLockVolumeNB.
+ *
+ * @param[in] vp volume to unlock
+ *
+ * @pre If we're in the fileserver, vp is in an exclusive state
+ *
+ * @pre vp has already been locked
+ */
+static void
+VUnlockVolume(Volume *vp)
+{
+ osi_Assert(programType != fileServer || VIsExclusiveState(V_attachState(vp)));
+ osi_Assert((V_attachFlags(vp) & VOL_LOCKED));
+
+ VUnlockVolumeById(vp->hashid, vp->partition);
+
+ V_attachFlags(vp) &= ~VOL_LOCKED;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/**
+ * read in a vol header, possibly lock the vol header, and possibly check out
+ * the vol header from the fileserver, as part of volume attachment.
+ *
+ * @param[out] ec error code
+ * @param[in] vp volume pointer object
+ * @param[in] partp disk partition object of the attaching partition
+ * @param[in] mode attachment mode such as V_VOLUPD, V_DUMP, etc (see
+ * volume.h)
+ * @param[in] peek 1 to just try to read in the volume header and make sure
+ * we don't try to lock the vol, or check it out from
+ * FSSYNC or anything like that; 0 otherwise, for 'normal'
+ * operation
+ *
+ * @note As part of DAFS volume attachment, the volume header may be either
+ * read- or write-locked to ensure mutual exclusion of certain volume
+ * operations. In some cases in order to determine whether we need to
+ * read- or write-lock the header, we need to read in the header to see
+ * if the volume is RW or not. So, if we read in the header under a
+ * read-lock and determine that we actually need a write-lock on the
+ * volume header, this function will drop the read lock, acquire a write
+ * lock, and read the header in again.
+ */
+static void
+attach_volume_header(Error *ec, Volume *vp, struct DiskPartition64 *partp,
+ int mode, int peek)
+{
+ struct VolumeDiskHeader diskHeader;
+ struct VolumeHeader header;
+ int code;
+ int first_try = 1;
+ int lock_tries = 0, checkout_tries = 0;
+ int retry;
+ VolumeId volid = vp->hashid;
+#ifdef FSSYNC_BUILD_CLIENT
+ int checkout, done_checkout = 0;
+#endif /* FSSYNC_BUILD_CLIENT */
+#ifdef AFS_DEMAND_ATTACH_FS
+ int locktype = 0, use_locktype = -1;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ retry:
+ retry = 0;
+ *ec = 0;
+
+ if (lock_tries > VOL_MAX_CHECKOUT_RETRIES) {
+ Log("VAttachVolume: retried too many times trying to lock header for "
+ "vol %lu part %s; giving up\n", afs_printable_uint32_lu(volid),
+ VPartitionPath(partp));
+ *ec = VNOVOL;
+ goto done;
+ }
+ if (checkout_tries > VOL_MAX_CHECKOUT_RETRIES) {
+ Log("VAttachVolume: retried too many times trying to checkout "
+ "vol %lu part %s; giving up\n", afs_printable_uint32_lu(volid),
+ VPartitionPath(partp));
+ *ec = VNOVOL;
+ goto done;
+ }
+
+ if (VReadVolumeDiskHeader(volid, partp, NULL)) {
+ /* short-circuit the 'volume does not exist' case */
+ *ec = VNOVOL;
+ goto done;
+ }
+
+#ifdef FSSYNC_BUILD_CLIENT
+ checkout = !done_checkout;
+ done_checkout = 1;
+ if (!peek && checkout && VMustCheckoutVolume(mode)) {
+ SYNC_response res;
+ memset(&res, 0, sizeof(res));
+
+ if (FSYNC_VolOp(volid, VPartitionPath(partp), FSYNC_VOL_NEEDVOLUME, mode, &res)
+ != SYNC_OK) {
+
+ if (res.hdr.reason == FSYNC_SALVAGE) {
+ Log("VAttachVolume: file server says volume %lu is salvaging\n",
+ afs_printable_uint32_lu(volid));
+ *ec = VSALVAGING;
+ } else {
+ Log("VAttachVolume: attach of volume %lu apparently denied by file server\n",
+ afs_printable_uint32_lu(volid));
+ *ec = VNOVOL; /* XXXX */
+ }
+ goto done;
+ }
+ }
+#endif
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (use_locktype < 0) {
+ /* don't know whether vol is RO or RW; assume it's RO and we can retry
+ * if it turns out to be RW */
+ locktype = VVolLockType(mode, 0);
+
+ } else {
+ /* a previous try says we should use use_locktype to lock the volume,
+ * so use that */
+ locktype = use_locktype;
+ }
+
+ if (!peek && locktype) {
+ code = VLockVolumeNB(vp, locktype);
+ if (code) {
+ if (code == EBUSY) {
+ Log("VAttachVolume: another program has vol %lu locked\n",
+ afs_printable_uint32_lu(volid));
+ } else {
+ Log("VAttachVolume: error %d trying to lock vol %lu\n",
+ code, afs_printable_uint32_lu(volid));
+ }
+
+ *ec = VNOVOL;
+ goto done;
+ }
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ code = VReadVolumeDiskHeader(volid, partp, &diskHeader);
+ if (code) {
+ if (code == EIO) {
+ *ec = VSALVAGE;
+ } else {
+ *ec = VNOVOL;
+ }
+ goto done;
+ }
+
+ DiskToVolumeHeader(&header, &diskHeader);
+
+ IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header.parent,
+ header.largeVnodeIndex);
+ IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header.parent,
+ header.smallVnodeIndex);
+ IH_INIT(vp->diskDataHandle, partp->device, header.parent,
+ header.volumeInfo);
+ IH_INIT(vp->linkHandle, partp->device, header.parent, header.linkTable);
+
+ if (first_try) {
+ /* only need to do this once */
+ VOL_LOCK;
+ GetVolumeHeader(vp);
+ VOL_UNLOCK;
+ }
+
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
+ /* demand attach changes the V_PEEK mechanism
+ *
+ * we can now suck the current disk data structure over
+ * the fssync interface without going to disk
+ *
+ * (technically, we don't need to restrict this feature
+ * to demand attach fileservers. However, I'm trying
+ * to limit the number of common code changes)
+ */
+ if (VCanUseFSSYNC() && (mode == V_PEEK || peek)) {
+ SYNC_response res;
+ res.payload.len = sizeof(VolumeDiskData);
+ res.payload.buf = &vp->header->diskstuff;
+
+ if (FSYNC_VolOp(vp->hashid,
+ partp->name,
+ FSYNC_VOL_QUERY_HDR,
+ FSYNC_WHATEVER,
+ &res) == SYNC_OK) {
+ goto disk_header_loaded;
+ }
+ }
+#endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */
+ (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+ sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* update stats */
+ VOL_LOCK;
+ IncUInt64(&VStats.hdr_loads);
+ IncUInt64(&vp->stats.hdr_loads);
+ VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ if (*ec) {
+ Log("VAttachVolume: Error reading diskDataHandle header for vol %lu; "
+ "error=%u\n", afs_printable_uint32_lu(volid), *ec);
+ goto done;
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+# ifdef FSSYNC_BUILD_CLIENT
+ disk_header_loaded:
+# endif /* FSSYNC_BUILD_CLIENT */
+
+ /* if the lock type we actually used to lock the volume is different than
+ * the lock type we should have used, retry with the lock type we should
+ * use */
+ use_locktype = VVolLockType(mode, VolumeWriteable(vp));
+ if (locktype != use_locktype) {
+ retry = 1;
+ lock_tries++;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ *ec = 0;
+
+ done:
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
+ if (!peek && *ec == 0 && retry == 0 && VMustCheckoutVolume(mode)) {
+
+ code = FSYNC_VerifyCheckout(volid, VPartitionPath(partp), FSYNC_VOL_NEEDVOLUME, mode);
+
+ if (code == SYNC_DENIED) {
+ /* must retry checkout; fileserver no longer thinks we have
+ * the volume */
+ retry = 1;
+ checkout_tries++;
+ done_checkout = 0;
+
+ } else if (code != SYNC_OK) {
+ *ec = VNOVOL;
+ }
+ }
+#endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */
+
+ if (*ec || retry) {
+ /* either we are going to be called again for a second pass, or we
+ * encountered an error; clean up in either case */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if ((V_attachFlags(vp) & VOL_LOCKED)) {
+ VUnlockVolume(vp);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+ if (vp->linkHandle) {
+ IH_RELEASE(vp->vnodeIndex[vLarge].handle);
+ IH_RELEASE(vp->vnodeIndex[vSmall].handle);
+ IH_RELEASE(vp->diskDataHandle);
+ IH_RELEASE(vp->linkHandle);
+ }
+ }
+
+ if (*ec) {
+ return;
+ }
+ if (retry) {
+ first_try = 0;
+ goto retry;
+ }
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+static void
+attach_check_vop(Error *ec, VolumeId volid, struct DiskPartition64 *partp,
+ Volume *vp)
+{
+ *ec = 0;
+
+ if (vp->pending_vol_op) {
+
+ VOL_LOCK;
+
+ if (vp->pending_vol_op->vol_op_state == FSSYNC_VolOpRunningUnknown) {
+ int code;
+ code = VVolOpLeaveOnlineNoHeader_r(vp, vp->pending_vol_op);
+ if (code == 1) {
+ vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOnline;
+ } else if (code == 0) {
+ vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOffline;
+
+ } else {
+ /* we need the vol header to determine if the volume can be
+ * left online for the vop, so... get the header */
+
+ VOL_UNLOCK;
+
+ /* attach header with peek=1 to avoid checking out the volume
+ * or locking it; we just want the header info, we're not
+ * messing with the volume itself at all */
+ attach_volume_header(ec, vp, partp, V_PEEK, 1);
+ if (*ec) {
+ return;
+ }
+
+ VOL_LOCK;
+
+ if (VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+ vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOnline;
+ } else {
+ vp->pending_vol_op->vol_op_state = FSSYNC_VolOpRunningOffline;
+ }
+
+ /* make sure we grab a new vol header and re-open stuff on
+ * actual attachment; we can't keep the data we grabbed, since
+ * it was not done under a lock and thus not safe */
+ FreeVolumeHeader(vp);
+ VReleaseVolumeHandles_r(vp);
+ }
+ }
+ /* see if the pending volume op requires exclusive access */
+ switch (vp->pending_vol_op->vol_op_state) {
+ case FSSYNC_VolOpPending:
+ /* this should never happen */
+ osi_Assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpPending);
+ break;
+
+ case FSSYNC_VolOpRunningUnknown:
+ /* this should never happen; we resolved 'unknown' above */
+ osi_Assert(vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown);
+ break;
+
+ case FSSYNC_VolOpRunningOffline:
+ /* mark the volume down */
+ *ec = VOFFLINE;
+ VChangeState_r(vp, VOL_STATE_UNATTACHED);
+
+ /* do not set V_offlineMessage here; we don't have ownership of
+ * the volume (and probably do not have the header loaded), so we
+ * can't alter the disk header */
+
+ /* check to see if we should set the specialStatus flag */
+ if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) {
+ vp->specialStatus = VBUSY;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ VOL_UNLOCK;
+ }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/**
+ * volume attachment helper function.
+ *
+ * @param[out] ec error code
+ * @param[in] volumeId volume ID of the attaching volume
+ * @param[in] path full path to the volume header .vol file
+ * @param[in] partp disk partition object for the attaching partition
+ * @param[in] vp volume object; vp->hashid, vp->device, vp->partition,
+ * vp->vnode_list, and V_attachCV (for DAFS) should already
+ * be initialized
+ * @param[in] isbusy 1 if vp->specialStatus should be set to VBUSY; that is,
+ * if there is a volume operation running for this volume
+ * that should set the volume to VBUSY during its run. 0
+ * otherwise. (see VVolOpSetVBusy_r)
+ * @param[in] mode attachment mode such as V_VOLUPD, V_DUMP, etc (see
+ * volume.h)
+ *
+ * @return pointer to the semi-attached volume pointer
+ * @retval NULL an error occurred (check value of *ec)
+ * @retval vp volume successfully attaching
+ *
+ * @pre no locks held
+ *
+ * @post VOL_LOCK held
+ */
+static Volume *
+attach2(Error * ec, VolId volumeId, char *path, struct DiskPartition64 *partp,
+ Volume * vp, int isbusy, int mode)
+{
+ /* have we read in the header successfully? */
+ int read_header = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* should we FreeVolume(vp) instead of VCheckFree(vp) in the error
+ * cleanup? */
+ int forcefree = 0;
+
+ /* in the case of an error, to what state should the volume be
+ * transitioned? */
+ VolState error_state = VOL_STATE_ERROR;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ *ec = 0;
+
+ vp->vnodeIndex[vLarge].handle = NULL;
+ vp->vnodeIndex[vSmall].handle = NULL;
+ vp->diskDataHandle = NULL;
+ vp->linkHandle = NULL;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ attach_check_vop(ec, volumeId, partp, vp);
+ if (!*ec) {
+ attach_volume_header(ec, vp, partp, mode, 0);
+ }
+#else
+ attach_volume_header(ec, vp, partp, mode, 0);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+ if (*ec == VNOVOL) {
+ /* if the volume doesn't exist, skip straight to 'error' so we don't
+ * request a salvage */
+ goto unlocked_error;
+ }
+
+ if (!*ec) {
+ read_header = 1;
+
+ vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
+ vp->shuttingDown = 0;
+ vp->goingOffline = 0;
+ vp->nUsers = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+ vp->stats.last_attach = FT_ApproxTime();
+ vp->stats.attaches++;
+#endif
+
+ VOL_LOCK;
+ IncUInt64(&VStats.attaches);
+ vp->cacheCheck = ++VolumeCacheCheck;
+ /* just in case this ever rolls over */
+ if (!vp->cacheCheck)
+ vp->cacheCheck = ++VolumeCacheCheck;
+ VOL_UNLOCK;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(vp) |= VOL_HDR_LOADED;
+ vp->stats.last_hdr_load = vp->stats.last_attach;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ }
+
+ if (!*ec) {
+ struct IndexFileHeader iHead;
+
+#if OPENAFS_VOL_STATS
+ /*
+ * We just read in the diskstuff part of the header. If the detailed
+ * volume stats area has not yet been initialized, we should bzero the
+ * area and mark it as initialized.
+ */
+ if (!(V_stat_initialized(vp))) {
+ memset((V_stat_area(vp)), 0, VOL_STATS_BYTES);
+ V_stat_initialized(vp) = 1;
+ }
+#endif /* OPENAFS_VOL_STATS */
+
+ (void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle,
+ (char *)&iHead, sizeof(iHead),
+ SMALLINDEXMAGIC, SMALLINDEXVERSION);
+
+ if (*ec) {
+ Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec);
+ }
+ }
+
+ if (!*ec) {
+ struct IndexFileHeader iHead;
+
+ (void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle,
+ (char *)&iHead, sizeof(iHead),
+ LARGEINDEXMAGIC, LARGEINDEXVERSION);
+
+ if (*ec) {
+ Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec);
+ }
+ }
+
+#ifdef AFS_NAMEI_ENV
+ if (!*ec) {
+ struct versionStamp stamp;
+
+ (void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp,
+ sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION);
+
+ if (*ec) {
+ Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec);
+ }
+ }
+#endif /* AFS_NAMEI_ENV */
+
+#if defined(AFS_DEMAND_ATTACH_FS)
+ if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) {
+ VOL_LOCK;
+ if (!VCanScheduleSalvage()) {
+ Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
+ }
+ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+ VOL_SALVAGE_NO_OFFLINE);
+ vp->nUsers = 0;
+
+ goto locked_error;
+ } else if (*ec) {
+ /* volume operation in progress */
+ goto unlocked_error;
+ }
+#else /* AFS_DEMAND_ATTACH_FS */
+ if (*ec) {
+ Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
+ goto unlocked_error;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ if (V_needsSalvaged(vp)) {
+ if (vp->specialStatus)
+ vp->specialStatus = 0;
+ VOL_LOCK;
+#if defined(AFS_DEMAND_ATTACH_FS)
+ if (!VCanScheduleSalvage()) {
+ Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
+ }
+ VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER |
+ VOL_SALVAGE_NO_OFFLINE);
+ vp->nUsers = 0;
+
+#else /* AFS_DEMAND_ATTACH_FS */
+ *ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ goto locked_error;
+ }
+
+ VOL_LOCK;
+ vp->nextVnodeUnique = V_uniquifier(vp);
+
+ if (VShouldCheckInUse(mode) && V_inUse(vp) && VolumeWriteable(vp)) {
+ if (!V_needsSalvaged(vp)) {
+ V_needsSalvaged(vp) = 1;
+ VUpdateVolume_r(ec, vp, 0);
+ }
+#if defined(AFS_DEMAND_ATTACH_FS)
+ if (!VCanScheduleSalvage()) {
+ Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
+ }
+ VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER |
+ VOL_SALVAGE_NO_OFFLINE);
+ vp->nUsers = 0;
+
+#else /* AFS_DEMAND_ATTACH_FS */
+ Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
+ *ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ goto locked_error;
+ }
+
+ if (programType == fileServer && V_destroyMe(vp) == DESTROY_ME) {
+ /* Only check destroyMe if we are the fileserver, since the
+ * volserver et al sometimes need to work with volumes with
+ * destroyMe set. Examples are 'temporary' volumes the
+ * volserver creates, and when we create a volume (destroyMe
+ * is set on creation; sometimes a separate volserver
+ * transaction is created to clear destroyMe).
+ */
+
+#if defined(AFS_DEMAND_ATTACH_FS)
+ /* schedule a salvage so the volume goes away on disk */
+ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+ VOL_SALVAGE_NO_OFFLINE);
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ vp->nUsers = 0;
+ forcefree = 1;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
+ *ec = VNOVOL;
+ goto locked_error;
+ }
+
+ vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL;
+#ifndef BITMAP_LATER
+ if (programType == fileServer && VolumeWriteable(vp)) {
+ int i;
+ for (i = 0; i < nVNODECLASSES; i++) {
+ VGetBitmap_r(ec, vp, i);
+ if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+ VOL_SALVAGE_NO_OFFLINE);
+ vp->nUsers = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ Log("VAttachVolume: error getting bitmap for volume (%s)\n",
+ path);
+ goto locked_error;
+ }
+ }
+ }
+#endif /* BITMAP_LATER */
+
+ if (VInit >= 2 && V_needsCallback(vp)) {
+ if (V_BreakVolumeCallbacks) {
+ Log("VAttachVolume: Volume %lu was changed externally; breaking callbacks\n",
+ afs_printable_uint32_lu(V_id(vp)));
+ V_needsCallback(vp) = 0;
+ VOL_UNLOCK;
+ (*V_BreakVolumeCallbacks) (V_id(vp));
+ VOL_LOCK;
+
+ VUpdateVolume_r(ec, vp, 0);
+ }
+#ifdef FSSYNC_BUILD_CLIENT
+ else if (VCanUseFSSYNC()) {
+ afs_int32 fsync_code;
+
+ V_needsCallback(vp) = 0;
+ VOL_UNLOCK;
+ fsync_code = FSYNC_VolOp(V_id(vp), NULL, FSYNC_VOL_BREAKCBKS, FSYNC_WHATEVER, NULL);
+ VOL_LOCK;
+
+ if (fsync_code) {
+ V_needsCallback(vp) = 1;
+ Log("Error trying to tell the fileserver to break callbacks for "
+ "changed volume %lu; error code %ld\n",
+ afs_printable_uint32_lu(V_id(vp)),
+ afs_printable_int32_ld(fsync_code));
+ } else {
+ VUpdateVolume_r(ec, vp, 0);
+ }
+ }
+#endif /* FSSYNC_BUILD_CLIENT */
+
+ if (*ec) {
+ Log("VAttachVolume: error %d clearing needsCallback on volume "
+ "%lu; needs salvage\n", (int)*ec,
+ afs_printable_uint32_lu(V_id(vp)));
+#ifdef AFS_DEMAND_ATTACH_FS
+ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER |
+ VOL_SALVAGE_NO_OFFLINE);
+ vp->nUsers = 0;
+#else /* !AFS_DEMAND_ATTACH_FS */
+ *ec = VSALVAGE;
+#endif /* !AFS_DEMAND_ATTACh_FS */
+ goto locked_error;
+ }
+ }
+
+ if (programType == fileServer) {
+ if (vp->specialStatus)
+ vp->specialStatus = 0;
+ if (V_blessed(vp) && V_inService(vp) && !V_needsSalvaged(vp)) {
+ V_inUse(vp) = fileServer;
+ V_offlineMessage(vp)[0] = '\0';
+ }
+ if (!V_inUse(vp)) {
+ *ec = VNOVOL;
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* Put the vol into PREATTACHED state, so if someone tries to
+ * access it again, we try to attach, see that we're not blessed,
+ * and give a VNOVOL error again. Putting it into UNATTACHED state
+ * would result in a VOFFLINE error instead. */
+ error_state = VOL_STATE_PREATTACHED;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ /* mimic e.g. GetVolume errors */
+ if (!V_blessed(vp)) {
+ Log("Volume %lu offline: not blessed\n", afs_printable_uint32_lu(V_id(vp)));
+ FreeVolumeHeader(vp);
+ } else if (!V_inService(vp)) {
+ Log("Volume %lu offline: not in service\n", afs_printable_uint32_lu(V_id(vp)));
+ FreeVolumeHeader(vp);
+ } else {
+ Log("Volume %lu offline: needs salvage\n", afs_printable_uint32_lu(V_id(vp)));
+ *ec = VSALVAGE;
+#ifdef AFS_DEMAND_ATTACH_FS
+ error_state = VOL_STATE_ERROR;
+ /* see if we can recover */
+ VRequestSalvage_r(ec, vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
+#endif
+ }
+#ifdef AFS_DEMAND_ATTACH_FS
+ vp->nUsers = 0;
+#endif
+ goto locked_error;
+ }
+ } else {
+#ifdef AFS_DEMAND_ATTACH_FS
+ if ((mode != V_PEEK) && (mode != V_SECRETLY))
+ V_inUse(vp) = programType;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ V_checkoutMode(vp) = mode;
+ }
+
+ AddVolumeToHashTable(vp, V_id(vp));
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (VCanUnlockAttached() && (V_attachFlags(vp) & VOL_LOCKED)) {
+ VUnlockVolume(vp);
+ }
+ if ((programType != fileServer) ||
+ (V_inUse(vp) == fileServer)) {
+ AddVolumeToVByPList_r(vp);
+ VLRU_Add_r(vp);
+ VChangeState_r(vp, VOL_STATE_ATTACHED);
+ } else {
+ VChangeState_r(vp, VOL_STATE_UNATTACHED);
+ }
+#endif
+
+ return vp;
+
+unlocked_error:
+ VOL_LOCK;
+locked_error:
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (!VIsErrorState(V_attachState(vp))) {
+ VChangeState_r(vp, error_state);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ if (read_header) {
+ VReleaseVolumeHandles_r(vp);
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VCheckSalvage(vp);
+ if (forcefree) {
+ FreeVolume(vp);
+ } else {
+ VCheckFree(vp);
+ }
+#else /* !AFS_DEMAND_ATTACH_FS */
+ FreeVolume(vp);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+ return NULL;
+}
+
+/* Attach an existing volume.
+ The volume also normally goes online at this time.
+ An offline volume must be reattached to make it go online.
+ */
+
+Volume *
+VAttachVolume(Error * ec, VolumeId volumeId, int mode)
+{
+ Volume *retVal;
+ VOL_LOCK;
+ retVal = VAttachVolume_r(ec, volumeId, mode);
+ VOL_UNLOCK;
+ return retVal;
+}
+
+Volume *
+VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
+{
+ char *part, *name;
+ VGetVolumePath(ec, volumeId, &part, &name);
+ if (*ec) {
+ Volume *vp;
+ Error error;
+ vp = VGetVolume_r(&error, volumeId);
+ if (vp) {
+ osi_Assert(V_inUse(vp) == 0);
+ VDetachVolume_r(ec, vp);
+ }
+ return NULL;
+ }
+ return VAttachVolumeByName_r(ec, part, name, mode);
+}
+
+/* Increment a reference count to a volume, sans context swaps. Requires
+ * possibly reading the volume header in from the disk, since there's
+ * an invariant in the volume package that nUsers>0 ==> vp->header is valid.
+ *
+ * N.B. This call can fail if we can't read in the header!! In this case
+ * we still guarantee we won't context swap, but the ref count won't be
+ * incremented (otherwise we'd violate the invariant).
+ */
+/* NOTE: with the demand attach fileserver extensions, the global lock
+ * is dropped within VHold */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VHold_r(Volume * vp)
+{
+ Error error;
+
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+
+ LoadVolumeHeader(&error, vp);
+ if (error) {
+ VCancelReservation_r(vp);
+ return error;
+ }
+ vp->nUsers++;
+ VCancelReservation_r(vp);
+ return 0;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VHold_r(Volume * vp)
+{
+ Error error;
+
+ LoadVolumeHeader(&error, vp);
+ if (error)
+ return error;
+ vp->nUsers++;
+ return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#if 0
+static int
+VHold(Volume * vp)
+{
+ int retVal;
+ VOL_LOCK;
+ retVal = VHold_r(vp);
+ VOL_UNLOCK;
+ return retVal;
+}
+#endif
+
+
+/***************************************************/
+/* get and put volume routines */
+/***************************************************/
+
+/**
+ * put back a heavyweight reference to a volume object.
+ *
+ * @param[in] vp volume object pointer
+ *
+ * @pre VOL_LOCK held
+ *
+ * @post heavyweight volume reference put back.
+ * depending on state, volume may have been taken offline,
+ * detached, salvaged, freed, etc.
+ *
+ * @internal volume package internal use only
+ */
+void
+VPutVolume_r(Volume * vp)
+{
+ osi_Assert(--vp->nUsers >= 0);
+ if (vp->nUsers == 0) {
+ VCheckOffline(vp);
+ ReleaseVolumeHeader(vp->header);
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (!VCheckDetach(vp)) {
+ VCheckSalvage(vp);
+ VCheckFree(vp);
+ }
+#else /* AFS_DEMAND_ATTACH_FS */
+ VCheckDetach(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
+ }
+}
+
+void
+VPutVolume(Volume * vp)
+{
+ VOL_LOCK;
+ VPutVolume_r(vp);
+ VOL_UNLOCK;
+}
+
+
+/* Get a pointer to an attached volume. The pointer is returned regardless
+ of whether or not the volume is in service or on/off line. An error
+ code, however, is returned with an indication of the volume's status */
+Volume *
+VGetVolume(Error * ec, Error * client_ec, VolId volumeId)
+{
+ Volume *retVal;
+ VOL_LOCK;
+ retVal = GetVolume(ec, client_ec, volumeId, NULL, 0);
+ VOL_UNLOCK;
+ return retVal;
+}
+
+/* same as VGetVolume, but if a volume is waiting to go offline, we return
+ * that it is actually offline, instead of waiting for it to go offline */
+Volume *
+VGetVolumeNoWait(Error * ec, Error * client_ec, VolId volumeId)
+{
+ Volume *retVal;
+ VOL_LOCK;
+ retVal = GetVolume(ec, client_ec, volumeId, NULL, 1);
+ VOL_UNLOCK;
+ return retVal;
+}
+
+Volume *
+VGetVolume_r(Error * ec, VolId volumeId)
+{
+ return GetVolume(ec, NULL, volumeId, NULL, 0);
+}
+
+/* try to get a volume we've previously looked up */
+/* for demand attach fs, caller MUST NOT hold a ref count on vp */
+Volume *
+VGetVolumeByVp_r(Error * ec, Volume * vp)
+{
+ return GetVolume(ec, NULL, vp->hashid, vp, 0);
+}
+
+/**
+ * private interface for getting a volume handle
+ *
+ * @param[out] ec error code (0 if no error)
+ * @param[out] client_ec wire error code to be given to clients
+ * @param[in] volumeId ID of the volume we want
+ * @param[in] hint optional hint for hash lookups, or NULL
+ * @param[in] nowait 0 to wait for a 'goingOffline' volume to go offline
+ * before returning, 1 to return immediately
+ *
+ * @return a volume handle for the specified volume
+ * @retval NULL an error occurred, or the volume is in such a state that
+ * we cannot load a header or return any volume struct
+ *
+ * @note for DAFS, caller must NOT hold a ref count on 'hint'
+ */
+static Volume *
+GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int nowait)
+{
+ Volume *vp = hint;
+ /* pull this profiling/debugging code out of regular builds */
+#ifdef notdef
+#define VGET_CTR_INC(x) x++
+ unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V5 = 0, V6 =
+ 0, V7 = 0, V8 = 0, V9 = 0;
+ unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0;
+#else
+#define VGET_CTR_INC(x)
+#endif
+#ifdef AFS_DEMAND_ATTACH_FS
+ Volume *avp, * rvp = hint;
+#endif
+
+ /*
+ * if VInit is zero, the volume package dynamic
+ * data structures have not been initialized yet,
+ * and we must immediately return an error
+ */
+ if (VInit == 0) {
+ vp = NULL;
+ *ec = VOFFLINE;
+ if (client_ec) {
+ *client_ec = VOFFLINE;
+ }
+ goto not_inited;
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (rvp) {
+ VCreateReservation_r(rvp);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ for (;;) {
+ *ec = 0;
+ if (client_ec)
+ *client_ec = 0;
+ VGET_CTR_INC(V0);
+
+ vp = VLookupVolume_r(ec, volumeId, vp);
+ if (*ec) {
+ vp = NULL;
+ break;
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (rvp && (rvp != vp)) {
+ /* break reservation on old vp */
+ VCancelReservation_r(rvp);
+ rvp = NULL;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ if (!vp) {
+ VGET_CTR_INC(V1);
+ if (VInit < 2) {
+ VGET_CTR_INC(V2);
+ /* Until we have reached an initialization level of 2
+ * we don't know whether this volume exists or not.
+ * We can't sleep and retry later because before a volume
+ * is attached, the caller tries to get it first. Just
+ * return VOFFLINE and the caller can choose whether to
+ * retry the command or not. */
+ *ec = VOFFLINE;
+ break;
+ }
+
+ *ec = VNOVOL;
+ break;
+ }
+
+ VGET_CTR_INC(V3);
+ IncUInt64(&VStats.hdr_gets);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* block if someone else is performing an exclusive op on this volume */
+ if (rvp != vp) {
+ rvp = vp;
+ VCreateReservation_r(rvp);
+ }
+ VWaitExclusiveState_r(vp);
+
+ /* short circuit with VNOVOL in the following circumstances:
+ *
+ * - VOL_STATE_ERROR
+ * - VOL_STATE_SHUTTING_DOWN
+ */
+ if ((V_attachState(vp) == VOL_STATE_ERROR) ||
+ (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN) ||
+ (V_attachState(vp) == VOL_STATE_GOING_OFFLINE)) {
+ *ec = VNOVOL;
+ vp = NULL;
+ break;
+ }
+
+ /*
+ * short circuit with VOFFLINE for VOL_STATE_UNATTACHED and
+ * VNOVOL for VOL_STATE_DELETED
+ */
+ if ((V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+ (V_attachState(vp) == VOL_STATE_DELETED)) {
+ if (vp->specialStatus) {
+ *ec = vp->specialStatus;
+ } else if (V_attachState(vp) == VOL_STATE_DELETED) {
+ *ec = VNOVOL;
+ } else {
+ *ec = VOFFLINE;
+ }
+ vp = NULL;
+ break;
+ }
+
+ /* allowable states:
+ * - PREATTACHED
+ * - ATTACHED
+ * - SALVAGING
+ * - SALVAGE_REQ
+ */
+
+ if (vp->salvage.requested) {
+ VUpdateSalvagePriority_r(vp);
+ }
+
+ if (V_attachState(vp) == VOL_STATE_PREATTACHED) {
+ avp = VAttachVolumeByVp_r(ec, vp, 0);
+ if (avp) {
+ if (vp != avp) {
+ /* VAttachVolumeByVp_r can return a pointer
+ * != the vp passed to it under certain
+ * conditions; make sure we don't leak
+ * reservations if that happens */
+ vp = avp;
+ VCancelReservation_r(rvp);
+ rvp = avp;
+ VCreateReservation_r(rvp);
+ }
+ VPutVolume_r(avp);
+ }
+ if (*ec) {
+ int endloop = 0;
+ switch (*ec) {
+ case VSALVAGING:
+ break;
+ case VOFFLINE:
+ if (!vp->pending_vol_op) {
+ endloop = 1;
+ }
+ break;
+ default:
+ *ec = VNOVOL;
+ endloop = 1;
+ }
+ if (endloop) {
+ vp = NULL;
+ break;
+ }
+ }
+ }
+
+ if (VIsSalvaging(vp) || (*ec == VSALVAGING)) {
+ if (client_ec) {
+ /* see CheckVnode() in afsfileprocs.c for an explanation
+ * of this error code logic */
+ afs_uint32 now = FT_ApproxTime();
+ if ((vp->stats.last_salvage + (10 * 60)) >= now) {
+ *client_ec = VBUSY;
+ } else {
+ *client_ec = VRESTARTING;
+ }
+ }
+ *ec = VSALVAGING;
+ vp = NULL;
+ break;
+ }
+#endif
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /*
+ * this test MUST happen after VAttachVolymeByVp, so vol_op_state is
+ * not VolOpRunningUnknown (attach2 would have converted it to Online
+ * or Offline)
+ */
+
+ /* only valid before/during demand attachment */
+ osi_Assert(!vp->pending_vol_op || vp->pending_vol_op->vol_op_state != FSSYNC_VolOpRunningUnknown);
+
+ /* deny getvolume due to running mutually exclusive vol op */
+ if (vp->pending_vol_op && vp->pending_vol_op->vol_op_state==FSSYNC_VolOpRunningOffline) {
+ /*
+ * volume cannot remain online during this volume operation.
+ * notify client.
+ */
+ if (vp->specialStatus) {
+ /*
+ * special status codes outrank normal VOFFLINE code
+ */
+ *ec = vp->specialStatus;
+ if (client_ec) {
+ *client_ec = vp->specialStatus;
+ }
+ } else {
+ if (client_ec) {
+ /* see CheckVnode() in afsfileprocs.c for an explanation
+ * of this error code logic */
+ afs_uint32 now = FT_ApproxTime();
+ if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
+ *client_ec = VBUSY;
+ } else {
+ *client_ec = VRESTARTING;
+ }
+ }
+ *ec = VOFFLINE;
+ }
+ VChangeState_r(vp, VOL_STATE_UNATTACHED);
+ FreeVolumeHeader(vp);
+ vp = NULL;
+ break;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ LoadVolumeHeader(ec, vp);
+ if (*ec) {
+ VGET_CTR_INC(V6);
+ /* Only log the error if it was a totally unexpected error. Simply
+ * a missing inode is likely to be caused by the volume being deleted */
+ if (errno != ENXIO || LogLevel)
+ Log("Volume %u: couldn't reread volume header\n",
+ vp->hashid);
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (VCanScheduleSalvage()) {
+ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+ } else {
+ FreeVolume(vp);
+ vp = NULL;
+ }
+#else /* AFS_DEMAND_ATTACH_FS */
+ FreeVolume(vp);
+ vp = NULL;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ break;
+ }
+
+ VGET_CTR_INC(V7);
+ if (vp->shuttingDown) {
+ VGET_CTR_INC(V8);
+ *ec = VNOVOL;
+ vp = NULL;
+ break;
+ }
+
+ if (programType == fileServer) {
+ VGET_CTR_INC(V9);
+ if (vp->goingOffline && !nowait) {
+ VGET_CTR_INC(V10);
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* wait for the volume to go offline */
+ if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+ VWaitStateChange_r(vp);
+ }
+#elif defined(AFS_PTHREAD_ENV)
+ VOL_CV_WAIT(&vol_put_volume_cond);
+#else /* AFS_PTHREAD_ENV */
+ LWP_WaitProcess(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+ continue;
+ }
+ if (vp->specialStatus) {
+ VGET_CTR_INC(V11);
+ *ec = vp->specialStatus;
+ } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
+ VGET_CTR_INC(V12);
+ *ec = VNOVOL;
+ } else if (V_inUse(vp) == 0 || vp->goingOffline) {
+ VGET_CTR_INC(V13);
+ *ec = VOFFLINE;
+ } else {
+ VGET_CTR_INC(V14);
+ }
+ }
+ break;
+ }
+ VGET_CTR_INC(V15);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* if no error, bump nUsers */
+ if (vp) {
+ vp->nUsers++;
+ VLRU_UpdateAccess_r(vp);
+ }
+ if (rvp) {
+ VCancelReservation_r(rvp);
+ rvp = NULL;
+ }
+ if (client_ec && !*client_ec) {
+ *client_ec = *ec;
+ }
+#else /* AFS_DEMAND_ATTACH_FS */
+ /* if no error, bump nUsers */
+ if (vp) {
+ vp->nUsers++;
+ }
+ if (client_ec) {
+ *client_ec = *ec;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ not_inited:
+ osi_Assert(vp || *ec);
+ return vp;
+}
+
+
+/***************************************************/
+/* Volume offline/detach routines */
+/***************************************************/
+
+/* caller MUST hold a heavyweight ref on vp */
+#ifdef AFS_DEMAND_ATTACH_FS
+void
+VTakeOffline_r(Volume * vp)
+{
+ Error error;
+
+ osi_Assert(vp->nUsers > 0);
+ osi_Assert(programType == fileServer);
+
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+
+ vp->goingOffline = 1;
+ V_needsSalvaged(vp) = 1;
+
+ VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, 0);
+ VCancelReservation_r(vp);
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+void
+VTakeOffline_r(Volume * vp)
+{
+ osi_Assert(vp->nUsers > 0);
+ osi_Assert(programType == fileServer);
+
+ vp->goingOffline = 1;
+ V_needsSalvaged(vp) = 1;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+void
+VTakeOffline(Volume * vp)
+{
+ VOL_LOCK;
+ VTakeOffline_r(vp);
+ VOL_UNLOCK;
+}
+
+/**
+ * force a volume offline.
+ *
+ * @param[in] vp volume object pointer
+ * @param[in] flags flags (see note below)
+ *
+ * @note the flag VOL_FORCEOFF_NOUPDATE is a recursion control flag
+ * used when VUpdateVolume_r needs to call VForceOffline_r
+ * (which in turn would normally call VUpdateVolume_r)
+ *
+ * @see VUpdateVolume_r
+ *
+ * @pre VOL_LOCK must be held.
+ * for DAFS, caller must hold ref.
+ *
+ * @note for DAFS, it _is safe_ to call this function from an
+ * exclusive state
+ *
+ * @post needsSalvaged flag is set.
+ * for DAFS, salvage is requested.
+ * no further references to the volume through the volume
+ * package will be honored.
+ * all file descriptor and vnode caches are invalidated.
+ *
+ * @warning this is a heavy-handed interface. it results in
+ * a volume going offline regardless of the current
+ * reference count state.
+ *
+ * @internal volume package internal use only
+ */
+void
+VForceOffline_r(Volume * vp, int flags)
+{
+ Error error;
+ if (!V_inUse(vp)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ VChangeState_r(vp, VOL_STATE_ERROR);
+#endif
+ return;
+ }
+
+ strcpy(V_offlineMessage(vp),
+ "Forced offline due to internal error: volume needs to be salvaged");
+ Log("Volume %u forced offline: it needs salvaging!\n", V_id(vp));
+
+ V_inUse(vp) = 0;
+ vp->goingOffline = 0;
+ V_needsSalvaged(vp) = 1;
+ if (!(flags & VOL_FORCEOFF_NOUPDATE)) {
+ VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF);
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VRequestSalvage_r(&error, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#ifdef AFS_PTHREAD_ENV
+ CV_BROADCAST(&vol_put_volume_cond);
+#else /* AFS_PTHREAD_ENV */
+ LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+
+ VReleaseVolumeHandles_r(vp);
+}
+
+/**
+ * force a volume offline.
+ *
+ * @param[in] vp volume object pointer
+ *
+ * @see VForceOffline_r
+ */
+void
+VForceOffline(Volume * vp)
+{
+ VOL_LOCK;
+ VForceOffline_r(vp, 0);
+ VOL_UNLOCK;
+}
+
+/* The opposite of VAttachVolume. The volume header is written to disk, with
+ the inUse bit turned off. A copy of the header is maintained in memory,
+ however (which is why this is VOffline, not VDetach).
+ */
+void
+VOffline_r(Volume * vp, char *message)
+{
+#ifndef AFS_DEMAND_ATTACH_FS
+ Error error;
+ VolumeId vid = V_id(vp);
+#endif
+
+ osi_Assert(programType != volumeUtility && programType != volumeServer);
+ if (!V_inUse(vp)) {
+ VPutVolume_r(vp);
+ return;
+ }
+ if (V_offlineMessage(vp)[0] == '\0')
+ strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
+ V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
+ vp->goingOffline = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+ VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
+ VCreateReservation_r(vp);
+ VPutVolume_r(vp);
+
+ /* wait for the volume to go offline */
+ if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+ VWaitStateChange_r(vp);
+ }
+ VCancelReservation_r(vp);
+#else /* AFS_DEMAND_ATTACH_FS */
+ VPutVolume_r(vp);
+ vp = VGetVolume_r(&error, vid); /* Wait for it to go offline */
+ if (vp) /* In case it was reattached... */
+ VPutVolume_r(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * Take a volume offline in order to perform a volume operation.
+ *
+ * @param[inout] ec address in which to store error code
+ * @param[in] vp volume object pointer
+ * @param[in] message volume offline status message
+ *
+ * @pre
+ * - VOL_LOCK is held
+ * - caller MUST hold a heavyweight ref on vp
+ *
+ * @post
+ * - volume is taken offline
+ * - if possible, volume operation is promoted to running state
+ * - on failure, *ec is set to nonzero
+ *
+ * @note Although this function does not return any value, it may
+ * still fail to promote our pending volume operation to
+ * a running state. Any caller MUST check the value of *ec,
+ * and MUST NOT blindly assume success.
+ *
+ * @warning if the caller does not hold a lightweight ref on vp,
+ * then it MUST NOT reference vp after this function
+ * returns to the caller.
+ *
+ * @internal volume package internal use only
+ */
+void
+VOfflineForVolOp_r(Error *ec, Volume *vp, char *message)
+{
+ osi_Assert(vp->pending_vol_op);
+ if (!V_inUse(vp)) {
+ VPutVolume_r(vp);
+ *ec = 1;
+ return;
+ }
+ if (V_offlineMessage(vp)[0] == '\0')
+ strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
+ V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
+ vp->goingOffline = 1;
+ VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
+ VCreateReservation_r(vp);
+ VPutVolume_r(vp);
+
+ /* Wait for the volume to go offline */
+ while (!VIsOfflineState(V_attachState(vp))) {
+ /* do not give corrupted volumes to the volserver */
+ if (vp->salvage.requested && vp->pending_vol_op->com.programType != salvageServer) {
+ *ec = 1;
+ goto error;
+ }
+ VWaitStateChange_r(vp);
+ }
+ *ec = 0;
+ error:
+ VCancelReservation_r(vp);
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+void
+VOffline(Volume * vp, char *message)
+{
+ VOL_LOCK;
+ VOffline_r(vp, message);
+ VOL_UNLOCK;
+}
+
+/* This gets used for the most part by utility routines that don't want
+ * to keep all the volume headers around. Generally, the file server won't
+ * call this routine, because then the offline message in the volume header
+ * (or other information) won't be available to clients. For NAMEI, also
+ * close the file handles. However, the fileserver does call this during
+ * an attach following a volume operation.
+ */
+void
+VDetachVolume_r(Error * ec, Volume * vp)
+{
+#ifdef FSSYNC_BUILD_CLIENT
+ VolumeId volume;
+ struct DiskPartition64 *tpartp;
+ int notifyServer = 0;
+ int useDone = FSYNC_VOL_ON;
+
+ if (VCanUseFSSYNC()) {
+ notifyServer = vp->needsPutBack;
+ if (V_destroyMe(vp) == DESTROY_ME)
+ useDone = FSYNC_VOL_DONE;
+#ifdef AFS_DEMAND_ATTACH_FS
+ else if (!V_blessed(vp) || !V_inService(vp))
+ useDone = FSYNC_VOL_LEAVE_OFF;
+#endif
+ }
+ tpartp = vp->partition;
+ volume = V_id(vp);
+#endif /* FSSYNC_BUILD_CLIENT */
+
+ *ec = 0; /* always "succeeds" */
+ DeleteVolumeFromHashTable(vp);
+ vp->shuttingDown = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+ DeleteVolumeFromVByPList_r(vp);
+ VLRU_Delete_r(vp);
+ VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN);
+#else
+ if (programType != fileServer)
+ V_inUse(vp) = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ VPutVolume_r(vp);
+ /* Will be detached sometime in the future--this is OK since volume is offline */
+
+ /* XXX the following code should really be moved to VCheckDetach() since the volume
+ * is not technically detached until the refcounts reach zero
+ */
+#ifdef FSSYNC_BUILD_CLIENT
+ if (VCanUseFSSYNC() && notifyServer) {
+ /*
+ * Note: The server is not notified in the case of a bogus volume
+ * explicitly to make it possible to create a volume, do a partial
+ * restore, then abort the operation without ever putting the volume
+ * online. This is essential in the case of a volume move operation
+ * between two partitions on the same server. In that case, there
+ * would be two instances of the same volume, one of them bogus,
+ * which the file server would attempt to put on line
+ */
+ FSYNC_VolOp(volume, tpartp->name, useDone, 0, NULL);
+ /* XXX this code path is only hit by volume utilities, thus
+ * V_BreakVolumeCallbacks will always be NULL. if we really
+ * want to break callbacks in this path we need to use FSYNC_VolOp() */
+#ifdef notdef
+ /* Dettaching it so break all callbacks on it */
+ if (V_BreakVolumeCallbacks) {
+ Log("volume %u detached; breaking all call backs\n", volume);
+ (*V_BreakVolumeCallbacks) (volume);
+ }
+#endif
+ }
+#endif /* FSSYNC_BUILD_CLIENT */
+}
+
+void
+VDetachVolume(Error * ec, Volume * vp)
+{
+ VOL_LOCK;
+ VDetachVolume_r(ec, vp);
+ VOL_UNLOCK;
+}
+
+
+/***************************************************/
+/* Volume fd/inode handle closing routines */
+/***************************************************/
+
+/* For VDetachVolume, we close all cached file descriptors, but keep
+ * the Inode handles in case we need to read from a busy volume.
+ */
+/* for demand attach, caller MUST hold ref count on vp */
+static void
+VCloseVolumeHandles_r(Volume * vp)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+
+ state_save = VChangeState_r(vp, VOL_STATE_OFFLINING);
+#endif
+
+ /* demand attach fs
+ *
+ * XXX need to investigate whether we can perform
+ * DFlushVolume outside of vol_glock_mutex...
+ *
+ * VCloseVnodeFiles_r drops the glock internally */
+ DFlushVolume(vp->hashid);
+ VCloseVnodeFiles_r(vp);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_UNLOCK;
+#endif
+
+ /* Too time consuming and unnecessary for the volserver */
+ if (programType == fileServer) {
+ IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
+ IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
+ IH_CONDSYNC(vp->diskDataHandle);
+#ifdef AFS_NT40_ENV
+ IH_CONDSYNC(vp->linkHandle);
+#endif /* AFS_NT40_ENV */
+ }
+
+ IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
+ IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
+ IH_REALLYCLOSE(vp->diskDataHandle);
+ IH_REALLYCLOSE(vp->linkHandle);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if ((V_attachFlags(vp) & VOL_LOCKED)) {
+ VUnlockVolume(vp);
+ }
+
+ VOL_LOCK;
+ VChangeState_r(vp, state_save);
+#endif
+}
+
+/* For both VForceOffline and VOffline, we close all relevant handles.
+ * For VOffline, if we re-attach the volume, the files may possible be
+ * different than before.
+ */
+/* for demand attach, caller MUST hold a ref count on vp */
+static void
+VReleaseVolumeHandles_r(Volume * vp)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+
+ state_save = VChangeState_r(vp, VOL_STATE_DETACHING);
+#endif
+
+ /* XXX need to investigate whether we can perform
+ * DFlushVolume outside of vol_glock_mutex... */
+ DFlushVolume(vp->hashid);
+
+ VReleaseVnodeFiles_r(vp); /* releases the glock internally */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_UNLOCK;
+#endif
+
+ /* Too time consuming and unnecessary for the volserver */
+ if (programType == fileServer) {
+ IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
+ IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
+ IH_CONDSYNC(vp->diskDataHandle);
+#ifdef AFS_NT40_ENV
+ IH_CONDSYNC(vp->linkHandle);
+#endif /* AFS_NT40_ENV */
+ }
+
+ IH_RELEASE(vp->vnodeIndex[vLarge].handle);
+ IH_RELEASE(vp->vnodeIndex[vSmall].handle);
+ IH_RELEASE(vp->diskDataHandle);
+ IH_RELEASE(vp->linkHandle);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if ((V_attachFlags(vp) & VOL_LOCKED)) {
+ VUnlockVolume(vp);
+ }
+
+ VOL_LOCK;
+ VChangeState_r(vp, state_save);
+#endif
+}
+
+
+/***************************************************/
+/* Volume write and fsync routines */
+/***************************************************/
+
+void
+VUpdateVolume_r(Error * ec, Volume * vp, int flags)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+
+ if (flags & VOL_UPDATE_WAIT) {
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+ }
+#endif
+
+ *ec = 0;
+ if (programType == fileServer)
+ V_uniquifier(vp) =
+ (V_inUse(vp) ? V_nextVnodeUnique(vp) +
+ 200 : V_nextVnodeUnique(vp));
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
+ VOL_UNLOCK;
+#endif
+
+ WriteVolumeHeader_r(ec, vp);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_LOCK;
+ VChangeState_r(vp, state_save);
+ if (flags & VOL_UPDATE_WAIT) {
+ VCancelReservation_r(vp);
+ }
+#endif
+
+ if (*ec) {
+ Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
+ V_id(vp), V_name(vp));
+ /* try to update on-disk header,
+ * while preventing infinite recursion */
+ if (!(flags & VOL_UPDATE_NOFORCEOFF)) {
+ VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE);
+ }
+ }
+}
+
+void
+VUpdateVolume(Error * ec, Volume * vp)
+{
+ VOL_LOCK;
+ VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
+ VOL_UNLOCK;
+}
+
+void
+VSyncVolume_r(Error * ec, Volume * vp, int flags)
+{
+ FdHandle_t *fdP;
+ int code;
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+#endif
+
+ if (flags & VOL_SYNC_WAIT) {
+ VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
+ } else {
+ VUpdateVolume_r(ec, vp, 0);
+ }
+ if (!*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
+ VOL_UNLOCK;
+#endif
+ fdP = IH_OPEN(V_diskDataHandle(vp));
+ osi_Assert(fdP != NULL);
+ code = FDH_SYNC(fdP);
+ osi_Assert(code == 0);
+ FDH_CLOSE(fdP);
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_LOCK;
+ VChangeState_r(vp, state_save);
+#endif
+ }
+}
+
+void
+VSyncVolume(Error * ec, Volume * vp)
+{
+ VOL_LOCK;
+ VSyncVolume_r(ec, vp, VOL_SYNC_WAIT);
+ VOL_UNLOCK;
+}
+
+
+/***************************************************/
+/* Volume dealloaction routines */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+static void
+FreeVolume(Volume * vp)
+{
+ /* free the heap space, iff it's safe.
+ * otherwise, pull it out of the hash table, so it
+ * will get deallocated when all refs to it go away */
+ if (!VCheckFree(vp)) {
+ DeleteVolumeFromHashTable(vp);
+ DeleteVolumeFromVByPList_r(vp);
+
+ /* make sure we invalidate the header cache entry */
+ FreeVolumeHeader(vp);
+ }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+static void
+ReallyFreeVolume(Volume * vp)
+{
+ int i;
+ if (!vp)
+ return;
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* debug */
+ VChangeState_r(vp, VOL_STATE_FREED);
+ if (vp->pending_vol_op)
+ free(vp->pending_vol_op);
+#endif /* AFS_DEMAND_ATTACH_FS */
+ for (i = 0; i < nVNODECLASSES; i++)
+ if (vp->vnodeIndex[i].bitmap)
+ free(vp->vnodeIndex[i].bitmap);
+ FreeVolumeHeader(vp);
+#ifndef AFS_DEMAND_ATTACH_FS
+ DeleteVolumeFromHashTable(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
+ free(vp);
+}
+
+/* check to see if we should shutdown this volume
+ * returns 1 if volume was freed, 0 otherwise */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VCheckDetach(Volume * vp)
+{
+ int ret = 0;
+ Error ec = 0;
+
+ if (vp->nUsers || vp->nWaiters)
+ return ret;
+
+ if (vp->shuttingDown) {
+ ret = 1;
+ if ((programType != fileServer) &&
+ (V_inUse(vp) == programType) &&
+ ((V_checkoutMode(vp) == V_VOLUPD) ||
+ (V_checkoutMode(vp) == V_SECRETLY) ||
+ ((V_checkoutMode(vp) == V_CLONE) &&
+ (VolumeWriteable(vp))))) {
+ V_inUse(vp) = 0;
+ VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
+ if (ec) {
+ Log("VCheckDetach: volume header update for volume %u "
+ "failed with errno %d\n", vp->hashid, errno);
+ }
+ }
+ VReleaseVolumeHandles_r(vp);
+ VCheckSalvage(vp);
+ ReallyFreeVolume(vp);
+ if (programType == fileServer) {
+ CV_BROADCAST(&vol_put_volume_cond);
+ }
+ }
+ return ret;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VCheckDetach(Volume * vp)
+{
+ int ret = 0;
+ Error ec = 0;
+
+ if (vp->nUsers)
+ return ret;
+
+ if (vp->shuttingDown) {
+ ret = 1;
+ if ((programType != fileServer) &&
+ (V_inUse(vp) == programType) &&
+ ((V_checkoutMode(vp) == V_VOLUPD) ||
+ (V_checkoutMode(vp) == V_SECRETLY) ||
+ ((V_checkoutMode(vp) == V_CLONE) &&
+ (VolumeWriteable(vp))))) {
+ V_inUse(vp) = 0;
+ VUpdateVolume_r(&ec, vp, VOL_UPDATE_NOFORCEOFF);
+ if (ec) {
+ Log("VCheckDetach: volume header update for volume %u failed with errno %d\n",
+ vp->hashid, errno);
+ }
+ }
+ VReleaseVolumeHandles_r(vp);
+ ReallyFreeVolume(vp);
+ if (programType == fileServer) {
+#if defined(AFS_PTHREAD_ENV)
+ CV_BROADCAST(&vol_put_volume_cond);
+#else /* AFS_PTHREAD_ENV */
+ LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+ }
+ }
+ return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/* check to see if we should offline this volume
+ * return 1 if volume went offline, 0 otherwise */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VCheckOffline(Volume * vp)
+{
+ int ret = 0;
+
+ if (vp->goingOffline && !vp->nUsers) {
+ Error error;
+ osi_Assert(programType == fileServer);
+ osi_Assert((V_attachState(vp) != VOL_STATE_ATTACHED) &&
+ (V_attachState(vp) != VOL_STATE_FREED) &&
+ (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
+ (V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+ (V_attachState(vp) != VOL_STATE_DELETED));
+
+ /* valid states:
+ *
+ * VOL_STATE_GOING_OFFLINE
+ * VOL_STATE_SHUTTING_DOWN
+ * VIsErrorState(V_attachState(vp))
+ * VIsExclusiveState(V_attachState(vp))
+ */
+
+ VCreateReservation_r(vp);
+ VChangeState_r(vp, VOL_STATE_OFFLINING);
+
+ ret = 1;
+ /* must clear the goingOffline flag before we drop the glock */
+ vp->goingOffline = 0;
+ V_inUse(vp) = 0;
+
+ VLRU_Delete_r(vp);
+
+ /* perform async operations */
+ VUpdateVolume_r(&error, vp, 0);
+ VCloseVolumeHandles_r(vp);
+
+ if (LogLevel) {
+ if (V_offlineMessage(vp)[0]) {
+ Log("VOffline: Volume %lu (%s) is now offline (%s)\n",
+ afs_printable_uint32_lu(V_id(vp)), V_name(vp),
+ V_offlineMessage(vp));
+ } else {
+ Log("VOffline: Volume %lu (%s) is now offline\n",
+ afs_printable_uint32_lu(V_id(vp)), V_name(vp));
+ }
+ }
+
+ /* invalidate the volume header cache entry */
+ FreeVolumeHeader(vp);
+
+ /* if nothing changed state to error or salvaging,
+ * drop state to unattached */
+ if (!VIsErrorState(V_attachState(vp))) {
+ VChangeState_r(vp, VOL_STATE_UNATTACHED);
+ }
+ VCancelReservation_r(vp);
+ /* no usage of vp is safe beyond this point */
+ }
+ return ret;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VCheckOffline(Volume * vp)
+{
+ int ret = 0;
+
+ if (vp->goingOffline && !vp->nUsers) {
+ Error error;
+ osi_Assert(programType == fileServer);
+
+ ret = 1;
+ vp->goingOffline = 0;
+ V_inUse(vp) = 0;
+ VUpdateVolume_r(&error, vp, 0);
+ VCloseVolumeHandles_r(vp);
+ if (LogLevel) {
+ if (V_offlineMessage(vp)[0]) {
+ Log("VOffline: Volume %lu (%s) is now offline (%s)\n",
+ afs_printable_uint32_lu(V_id(vp)), V_name(vp),
+ V_offlineMessage(vp));
+ } else {
+ Log("VOffline: Volume %lu (%s) is now offline\n",
+ afs_printable_uint32_lu(V_id(vp)), V_name(vp));
+ }
+ }
+ FreeVolumeHeader(vp);
+#ifdef AFS_PTHREAD_ENV
+ CV_BROADCAST(&vol_put_volume_cond);
+#else /* AFS_PTHREAD_ENV */
+ LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+ }
+ return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/***************************************************/
+/* demand attach fs ref counting routines */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* the following two functions handle reference counting for
+ * asynchronous operations on volume structs.
+ *
+ * their purpose is to prevent a VDetachVolume or VShutdown
+ * from free()ing the Volume struct during an async i/o op */
+
+/* register with the async volume op ref counter */
+/* VCreateReservation_r moved into inline code header because it
+ * is now needed in vnode.c -- tkeiser 11/20/2007
+ */
+
+/**
+ * decrement volume-package internal refcount.
+ *
+ * @param vp volume object pointer
+ *
+ * @internal volume package internal use only
+ *
+ * @pre
+ * @arg VOL_LOCK is held
+ * @arg lightweight refcount held
+ *
+ * @post volume waiters refcount is decremented; volume may
+ * have been deallocated/shutdown/offlined/salvaged/
+ * whatever during the process
+ *
+ * @warning once you have tossed your last reference (you can acquire
+ * lightweight refs recursively) it is NOT SAFE to reference
+ * a volume object pointer ever again
+ *
+ * @see VCreateReservation_r
+ *
+ * @note DEMAND_ATTACH_FS only
+ */
+void
+VCancelReservation_r(Volume * vp)
+{
+ osi_Assert(--vp->nWaiters >= 0);
+ if (vp->nWaiters == 0) {
+ VCheckOffline(vp);
+ if (!VCheckDetach(vp)) {
+ VCheckSalvage(vp);
+ VCheckFree(vp);
+ }
+ }
+}
+
+/* check to see if we should free this volume now
+ * return 1 if volume was freed, 0 otherwise */
+static int
+VCheckFree(Volume * vp)
+{
+ int ret = 0;
+ if ((vp->nUsers == 0) &&
+ (vp->nWaiters == 0) &&
+ !(V_attachFlags(vp) & (VOL_IN_HASH |
+ VOL_ON_VBYP_LIST |
+ VOL_IS_BUSY |
+ VOL_ON_VLRU))) {
+ ReallyFreeVolume(vp);
+ ret = 1;
+ }
+ return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* online volume operations routines */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * register a volume operation on a given volume.
+ *
+ * @param[in] vp volume object
+ * @param[in] vopinfo volume operation info object
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @post volume operation info object attached to volume object.
+ * volume operation statistics updated.
+ *
+ * @note by "attached" we mean a copy of the passed in object is made
+ *
+ * @internal volume package internal use only
+ */
+int
+VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
+{
+ FSSYNC_VolOp_info * info;
+
+ /* attach a vol op info node to the volume struct */
+ info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info));
+ osi_Assert(info != NULL);
+ memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info));
+ vp->pending_vol_op = info;
+
+ /* update stats */
+ vp->stats.last_vol_op = FT_ApproxTime();
+ vp->stats.vol_ops++;
+ IncUInt64(&VStats.vol_ops);
+
+ return 0;
+}
+
+/**
+ * deregister the volume operation attached to this volume.
+ *
+ * @param[in] vp volume object pointer
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @post the volume operation info object is detached from the volume object
+ *
+ * @internal volume package internal use only
+ */
+int
+VDeregisterVolOp_r(Volume * vp)
+{
+ if (vp->pending_vol_op) {
+ free(vp->pending_vol_op);
+ vp->pending_vol_op = NULL;
+ }
+ return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/**
+ * determine whether it is safe to leave a volume online during
+ * the volume operation described by the vopinfo object.
+ *
+ * @param[in] vp volume object
+ * @param[in] vopinfo volume operation info object
+ *
+ * @return whether it is safe to leave volume online
+ * @retval 0 it is NOT SAFE to leave the volume online
+ * @retval 1 it is safe to leave the volume online during the operation
+ *
+ * @pre
+ * @arg VOL_LOCK is held
+ * @arg disk header attached to vp (heavyweight ref on vp will guarantee
+ * this condition is met)
+ *
+ * @internal volume package internal use only
+ */
+int
+VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
+{
+ return (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline ||
+ (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+ (vopinfo->com.reason == V_READONLY ||
+ (!VolumeWriteable(vp) &&
+ (vopinfo->com.reason == V_CLONE ||
+ vopinfo->com.reason == V_DUMP)))));
+}
+
+/**
+ * same as VVolOpLeaveOnline_r, but does not require a volume with an attached
+ * header.
+ *
+ * @param[in] vp volume object
+ * @param[in] vopinfo volume operation info object
+ *
+ * @return whether it is safe to leave volume online
+ * @retval 0 it is NOT SAFE to leave the volume online
+ * @retval 1 it is safe to leave the volume online during the operation
+ * @retval -1 unsure; volume header is required in order to know whether or
+ * not is is safe to leave the volume online
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @internal volume package internal use only
+ */
+int
+VVolOpLeaveOnlineNoHeader_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
+{
+ /* follow the logic in VVolOpLeaveOnline_r; this is the same, except
+ * assume that we don't know VolumeWriteable; return -1 if the answer
+ * depends on VolumeWriteable */
+
+ if (vopinfo->vol_op_state == FSSYNC_VolOpRunningOnline) {
+ return 1;
+ }
+ if (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+ vopinfo->com.reason == V_READONLY) {
+
+ return 1;
+ }
+ if (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+ (vopinfo->com.reason == V_CLONE ||
+ vopinfo->com.reason == V_DUMP)) {
+
+ /* must know VolumeWriteable */
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ * determine whether VBUSY should be set during this volume operation.
+ *
+ * @param[in] vp volume object
+ * @param[in] vopinfo volume operation info object
+ *
+ * @return whether VBUSY should be set
+ * @retval 0 VBUSY does NOT need to be set
+ * @retval 1 VBUSY SHOULD be set
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @internal volume package internal use only
+ */
+int
+VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
+{
+ return ((vopinfo->com.command == FSYNC_VOL_OFF &&
+ vopinfo->com.reason == FSYNC_SALVAGE) ||
+ (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+ (vopinfo->com.reason == V_CLONE ||
+ vopinfo->com.reason == V_DUMP)));
+}
+
+
+/***************************************************/
+/* online salvager routines */
+/***************************************************/
+#if defined(AFS_DEMAND_ATTACH_FS)
+
+/**
+ * offline a volume to let it be salvaged.
+ *
+ * @param[in] vp Volume to offline
+ *
+ * @return whether we offlined the volume successfully
+ * @retval 0 volume was not offlined
+ * @retval 1 volume is now offline
+ *
+ * @note This is similar to VCheckOffline, but slightly different. We do not
+ * deal with vp->goingOffline, and we try to avoid touching the volume
+ * header except just to set needsSalvaged
+ *
+ * @pre VOL_LOCK held
+ * @pre vp->nUsers == 0
+ * @pre V_attachState(vp) == VOL_STATE_SALVAGE_REQ
+ */
+static int
+VOfflineForSalvage_r(struct Volume *vp)
+{
+ Error error;
+
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+
+ if (vp->nUsers || V_attachState(vp) == VOL_STATE_SALVAGING) {
+ /* Someone's using the volume, or someone got to scheduling the salvage
+ * before us. I don't think either of these should be possible, as we
+ * should gain no new heavyweight references while we're trying to
+ * salvage, but just to be sure... */
+ VCancelReservation_r(vp);
+ return 0;
+ }
+
+ VChangeState_r(vp, VOL_STATE_OFFLINING);
+
+ VLRU_Delete_r(vp);
+ if (vp->header) {
+ V_needsSalvaged(vp) = 1;
+ /* ignore error; updating needsSalvaged is just best effort */
+ VUpdateVolume_r(&error, vp, VOL_UPDATE_NOFORCEOFF);
+ }
+ VCloseVolumeHandles_r(vp);
+
+ FreeVolumeHeader(vp);
+
+ /* volume has been effectively offlined; we can mark it in the SALVAGING
+ * state now, which lets FSSYNC give it away */
+ VChangeState_r(vp, VOL_STATE_SALVAGING);
+
+ VCancelReservation_r(vp);
+
+ return 1;
+}
+
+/**
+ * check whether a salvage needs to be performed on this volume.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @return status code
+ * @retval 0 no salvage scheduled
+ * @retval 1 a salvage has been scheduled with the salvageserver
+ *
+ * @pre VOL_LOCK is held
+ *
+ * @post if salvage request flag is set and nUsers and nWaiters are zero,
+ * then a salvage will be requested
+ *
+ * @note this is one of the event handlers called by VCancelReservation_r
+ *
+ * @note the caller must check if the volume needs to be freed after calling
+ * this; the volume may not have any references or be on any lists after
+ * we return, and we do not free it
+ *
+ * @see VCancelReservation_r
+ *
+ * @internal volume package internal use only.
+ */
+static int
+VCheckSalvage(Volume * vp)
+{
+ int ret = 0;
+#if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT)
+ if (vp->nUsers)
+ return ret;
+ if (!vp->salvage.requested) {
+ return ret;
+ }
+
+ /* prevent recursion; some of the code below creates and removes
+ * lightweight refs, which can call VCheckSalvage */
+ if (vp->salvage.scheduling) {
+ return ret;
+ }
+ vp->salvage.scheduling = 1;
+
+ if (V_attachState(vp) == VOL_STATE_SALVAGE_REQ) {
+ if (!VOfflineForSalvage_r(vp)) {
+ vp->salvage.scheduling = 0;
+ return ret;
+ }
+ }
+
+ if (vp->salvage.requested) {
+ VScheduleSalvage_r(vp);
+ ret = 1;
+ }
+ vp->salvage.scheduling = 0;
+#endif /* SALVSYNC_BUILD_CLIENT || FSSYNC_BUILD_CLIENT */
+ return ret;
+}
+
+/**
+ * request volume salvage.
+ *
+ * @param[out] ec computed client error code
+ * @param[in] vp volume object pointer
+ * @param[in] reason reason code (passed to salvageserver via SALVSYNC)
+ * @param[in] flags see flags note below
+ *
+ * @note flags:
+ * VOL_SALVAGE_INVALIDATE_HEADER causes volume header cache entry
+ * to be invalidated.
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post volume state is changed.
+ * for fileserver, salvage will be requested once refcount reaches zero.
+ *
+ * @return operation status code
+ * @retval 0 volume salvage will occur
+ * @retval 1 volume salvage could not be scheduled
+ *
+ * @note DAFS only
+ *
+ * @note in the fileserver, this call does not synchronously schedule a volume
+ * salvage. rather, it sets volume state so that when volume refcounts
+ * reach zero, a volume salvage will occur. by "refcounts", we mean both
+ * nUsers and nWaiters must be zero.
+ *
+ * @internal volume package internal use only.
+ */
+int
+VRequestSalvage_r(Error * ec, Volume * vp, int reason, int flags)
+{
+ int code = 0;
+ /*
+ * for DAFS volume utilities that are not supposed to schedule salvages,
+ * just transition to error state instead
+ */
+ if (!VCanScheduleSalvage()) {
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ *ec = VSALVAGE;
+ return 1;
+ }
+
+ if (programType != fileServer && !VCanUseFSSYNC()) {
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ *ec = VSALVAGE;
+ return 1;
+ }
+
+ if (!vp->salvage.requested) {
+ vp->salvage.requested = 1;
+ vp->salvage.reason = reason;
+ vp->stats.last_salvage = FT_ApproxTime();
+
+ /* Note that it is not possible for us to reach this point if a
+ * salvage is already running on this volume (even if the fileserver
+ * was restarted during the salvage). If a salvage were running, the
+ * salvager would have write-locked the volume header file, so when
+ * we tried to lock the volume header, the lock would have failed,
+ * and we would have failed during attachment prior to calling
+ * VRequestSalvage. So we know that we can schedule salvages without
+ * fear of a salvage already running for this volume. */
+
+ if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
+
+ /* if we don't need to offline the volume, we can go directly
+ * to SALVAGING. SALVAGING says the volume is offline and is
+ * either salvaging or ready to be handed to the salvager.
+ * SALVAGE_REQ says that we want to salvage the volume, but we
+ * are waiting for it to go offline first. */
+ if (flags & VOL_SALVAGE_NO_OFFLINE) {
+ VChangeState_r(vp, VOL_STATE_SALVAGING);
+ } else {
+ VChangeState_r(vp, VOL_STATE_SALVAGE_REQ);
+ if (vp->nUsers == 0) {
+ /* normally VOfflineForSalvage_r would be called from
+ * PutVolume et al when nUsers reaches 0, but if
+ * it's already 0, just do it ourselves, since PutVolume
+ * isn't going to get called */
+ VOfflineForSalvage_r(vp);
+ }
+ }
+ *ec = VSALVAGING;
+ } else {
+ Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
+
+ /* make sure neither VScheduleSalvage_r nor
+ * VUpdateSalvagePriority_r try to schedule another salvage */
+ vp->salvage.requested = vp->salvage.scheduled = 0;
+
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ *ec = VSALVAGE;
+ code = 1;
+ }
+ if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
+ /* Instead of ReleaseVolumeHeader, we do FreeVolumeHeader()
+ so that the the next VAttachVolumeByVp_r() invocation
+ of attach2() will pull in a cached header
+ entry and fail, then load a fresh one from disk and attach
+ it to the volume.
+ */
+ FreeVolumeHeader(vp);
+ }
+ }
+ return code;
+}
+
+/**
+ * update salvageserver scheduling priority for a volume.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @return operation status
+ * @retval 0 success
+ * @retval 1 request denied, or SALVSYNC communications failure
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post in-core salvage priority counter is incremented. if at least
+ * SALVAGE_PRIO_UPDATE_INTERVAL seconds have elapsed since the
+ * last SALVSYNC_RAISEPRIO request, we contact the salvageserver
+ * to update its priority queue. if no salvage is scheduled,
+ * this function is a no-op.
+ *
+ * @note DAFS fileserver only
+ *
+ * @note this should be called whenever a VGetVolume fails due to a
+ * pending salvage request
+ *
+ * @todo should set exclusive state and drop glock around salvsync call
+ *
+ * @internal volume package internal use only.
+ */
+int
+VUpdateSalvagePriority_r(Volume * vp)
+{
+ int ret=0;
+
+#ifdef SALVSYNC_BUILD_CLIENT
+ afs_uint32 now;
+ int code;
+
+ vp->salvage.prio++;
+ now = FT_ApproxTime();
+
+ /* update the salvageserver priority queue occasionally so that
+ * frequently requested volumes get moved to the head of the queue
+ */
+ if ((vp->salvage.scheduled) &&
+ (vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) {
+ code = SALVSYNC_SalvageVolume(vp->hashid,
+ VPartitionPath(vp->partition),
+ SALVSYNC_RAISEPRIO,
+ vp->salvage.reason,
+ vp->salvage.prio,
+ NULL);
+ vp->stats.last_salvage_req = now;
+ if (code != SYNC_OK) {
+ ret = 1;
+ }
+ }
+#endif /* SALVSYNC_BUILD_CLIENT */
+ return ret;
+}
+
+
+#if defined(SALVSYNC_BUILD_CLIENT) || defined(FSSYNC_BUILD_CLIENT)
+
+/* A couple of little helper functions. These return true if we tried to
+ * use this mechanism to schedule a salvage, false if we haven't tried.
+ * If we did try a salvage then the results are contained in code.
+ */
+
+static_inline int
+try_SALVSYNC(Volume *vp, char *partName, int *code) {
+#ifdef SALVSYNC_BUILD_CLIENT
+ if (VCanUseSALVSYNC()) {
+ Log("Scheduling salvage for volume %lu on part %s over SALVSYNC\n",
+ afs_printable_uint32_lu(vp->hashid), partName);
+
+ /* can't use V_id() since there's no guarantee
+ * we have the disk data header at this point */
+ *code = SALVSYNC_SalvageVolume(vp->hashid,
+ partName,
+ SALVSYNC_SALVAGE,
+ vp->salvage.reason,
+ vp->salvage.prio,
+ NULL);
+ return 1;
+ }
+#endif
+ return 0;
+}
+
+static_inline int
+try_FSSYNC(Volume *vp, char *partName, int *code) {
+#ifdef FSSYNC_BUILD_CLIENT
+ if (VCanUseFSSYNC()) {
+ Log("Scheduling salvage for volume %lu on part %s over FSSYNC\n",
+ afs_printable_uint32_lu(vp->hashid), partName);
+
+ /*
+ * If we aren't the fileserver, tell the fileserver the volume
+ * needs to be salvaged. We could directly tell the
+ * salvageserver, but the fileserver keeps track of some stats
+ * related to salvages, and handles some other salvage-related
+ * complications for us.
+ */
+ *code = FSYNC_VolOp(vp->hashid, partName,
+ FSYNC_VOL_FORCE_ERROR, FSYNC_SALVAGE, NULL);
+ return 1;
+ }
+#endif /* FSSYNC_BUILD_CLIENT */
+ return 0;
+}
+
+/**
+ * schedule a salvage with the salvage server or fileserver.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @return operation status
+ * @retval 0 salvage scheduled successfully
+ * @retval 1 salvage not scheduled, or SALVSYNC/FSSYNC com error
+ *
+ * @pre
+ * @arg VOL_LOCK is held.
+ * @arg nUsers and nWaiters should be zero.
+ *
+ * @post salvageserver or fileserver is sent a salvage request
+ *
+ * @note If we are the fileserver, the request will be sent to the salvage
+ * server over SALVSYNC. If we are not the fileserver, the request will be
+ * sent to the fileserver over FSSYNC (FSYNC_VOL_FORCE_ERROR/FSYNC_SALVAGE).
+ *
+ * @note the caller must check if the volume needs to be freed after calling
+ * this; the volume may not have any references or be on any lists after
+ * we return, and we do not free it
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static int
+VScheduleSalvage_r(Volume * vp)
+{
+ int ret=0;
+ int code;
+ VolState state_save;
+ VThreadOptions_t * thread_opts;
+ char partName[16];
+
+ osi_Assert(VCanUseSALVSYNC() || VCanUseFSSYNC());
+
+ if (vp->nWaiters || vp->nUsers) {
+ return 1;
+ }
+
+ /* prevent endless salvage,attach,salvage,attach,... loops */
+ if (vp->stats.salvages >= SALVAGE_COUNT_MAX)
+ return 1;
+
+ /*
+ * don't perform salvsync ops on certain threads
+ */
+ thread_opts = pthread_getspecific(VThread_key);
+ if (thread_opts == NULL) {
+ thread_opts = &VThread_defaults;
+ }
+ if (thread_opts->disallow_salvsync || vol_disallow_salvsync) {
+ return 1;
+ }
+
+ if (vp->salvage.scheduled) {
+ return ret;
+ }
+
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+
+ /*
+ * XXX the scheduling process should really be done asynchronously
+ * to avoid fssync deadlocks
+ */
+ if (!vp->salvage.scheduled) {
+ /* if we haven't previously scheduled a salvage, do so now
+ *
+ * set the volume to an exclusive state and drop the lock
+ * around the SALVSYNC call
+ */
+ strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName));
+ state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
+ VOL_UNLOCK;
+
+ osi_Assert(try_SALVSYNC(vp, partName, &code) ||
+ try_FSSYNC(vp, partName, &code));
+
+ VOL_LOCK;
+ VChangeState_r(vp, state_save);
+
+ if (code == SYNC_OK) {
+ vp->salvage.scheduled = 1;
+ vp->stats.last_salvage_req = FT_ApproxTime();
+ if (VCanUseSALVSYNC()) {
+ /* don't record these stats for non-fileservers; let the
+ * fileserver take care of these */
+ vp->stats.salvages++;
+ IncUInt64(&VStats.salvages);
+ }
+ } else {
+ ret = 1;
+ switch(code) {
+ case SYNC_BAD_COMMAND:
+ case SYNC_COM_ERROR:
+ break;
+ case SYNC_DENIED:
+ Log("VScheduleSalvage_r: Salvage request for volume %lu "
+ "denied\n", afs_printable_uint32_lu(vp->hashid));
+ break;
+ default:
+ Log("VScheduleSalvage_r: Salvage request for volume %lu "
+ "received unknown protocol error %d\n",
+ afs_printable_uint32_lu(vp->hashid), code);
+ break;
+ }
+
+ if (VCanUseFSSYNC()) {
+ VChangeState_r(vp, VOL_STATE_ERROR);
+ }
+ }
+ }
+
+ /* NB: this is cancelling the reservation we obtained above, but we do
+ * not call VCancelReservation_r, since that may trigger the vp dtor,
+ * possibly free'ing the vp. We need to keep the vp around after
+ * this, as the caller may reference vp without any refs. Instead, it
+ * is the duty of the caller to inspect 'vp' after we return to see if
+ * needs to be freed. */
+ osi_Assert(--vp->nWaiters >= 0);
+ return ret;
+}
+#endif /* SALVSYNC_BUILD_CLIENT || FSSYNC_BUILD_CLIENT */
+
+#ifdef SALVSYNC_BUILD_CLIENT
+
+/**
+ * connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @post connection to salvageserver SYNC service established
+ *
+ * @see VConnectSALV_r
+ * @see VDisconnectSALV
+ * @see VReconnectSALV
+ */
+int
+VConnectSALV(void)
+{
+ int retVal;
+ VOL_LOCK;
+ retVal = VConnectSALV_r();
+ VOL_UNLOCK;
+ return retVal;
+}
+
+/**
+ * connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post connection to salvageserver SYNC service established
+ *
+ * @see VConnectSALV
+ * @see VDisconnectSALV_r
+ * @see VReconnectSALV_r
+ * @see SALVSYNC_clientInit
+ *
+ * @internal volume package internal use only.
+ */
+int
+VConnectSALV_r(void)
+{
+ return SALVSYNC_clientInit();
+}
+
+/**
+ * disconnect from the salvageserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 success
+ *
+ * @pre client should have a live connection to the salvageserver
+ *
+ * @post connection to salvageserver SYNC service destroyed
+ *
+ * @see VDisconnectSALV_r
+ * @see VConnectSALV
+ * @see VReconnectSALV
+ */
+int
+VDisconnectSALV(void)
+{
+ VOL_LOCK;
+ VDisconnectSALV_r();
+ VOL_UNLOCK;
+ return 0;
+}
+
+/**
+ * disconnect from the salvageserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 success
+ *
+ * @pre
+ * @arg VOL_LOCK is held.
+ * @arg client should have a live connection to the salvageserver.
+ *
+ * @post connection to salvageserver SYNC service destroyed
+ *
+ * @see VDisconnectSALV
+ * @see VConnectSALV_r
+ * @see VReconnectSALV_r
+ * @see SALVSYNC_clientFinis
+ *
+ * @internal volume package internal use only.
+ */
+int
+VDisconnectSALV_r(void)
+{
+ return SALVSYNC_clientFinis();
+}
+
+/**
+ * disconnect and then re-connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre client should have a live connection to the salvageserver
+ *
+ * @post old connection is dropped, and a new one is established
+ *
+ * @see VConnectSALV
+ * @see VDisconnectSALV
+ * @see VReconnectSALV_r
+ */
+int
+VReconnectSALV(void)
+{
+ int retVal;
+ VOL_LOCK;
+ retVal = VReconnectSALV_r();
+ VOL_UNLOCK;
+ return retVal;
+}
+
+/**
+ * disconnect and then re-connect to the salvageserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre
+ * @arg VOL_LOCK is held.
+ * @arg client should have a live connection to the salvageserver.
+ *
+ * @post old connection is dropped, and a new one is established
+ *
+ * @see VConnectSALV_r
+ * @see VDisconnectSALV
+ * @see VReconnectSALV
+ * @see SALVSYNC_clientReconnect
+ *
+ * @internal volume package internal use only.
+ */
+int
+VReconnectSALV_r(void)
+{
+ return SALVSYNC_clientReconnect();
+}
+#endif /* SALVSYNC_BUILD_CLIENT */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* FSSYNC routines */
+/***************************************************/
+
+/* This must be called by any volume utility which needs to run while the
+ file server is also running. This is separated from VInitVolumePackage2 so
+ that a utility can fork--and each of the children can independently
+ initialize communication with the file server */
+#ifdef FSSYNC_BUILD_CLIENT
+/**
+ * connect to the fileserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre
+ * @arg VInit must equal 2.
+ * @arg Program Type must not be fileserver or salvager.
+ *
+ * @post connection to fileserver SYNC service established
+ *
+ * @see VConnectFS_r
+ * @see VDisconnectFS
+ * @see VChildProcReconnectFS
+ */
+int
+VConnectFS(void)
+{
+ int retVal;
+ VOL_LOCK;
+ retVal = VConnectFS_r();
+ VOL_UNLOCK;
+ return retVal;
+}
+
+/**
+ * connect to the fileserver SYNC service.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre
+ * @arg VInit must equal 2.
+ * @arg Program Type must not be fileserver or salvager.
+ * @arg VOL_LOCK is held.
+ *
+ * @post connection to fileserver SYNC service established
+ *
+ * @see VConnectFS
+ * @see VDisconnectFS_r
+ * @see VChildProcReconnectFS_r
+ *
+ * @internal volume package internal use only.
+ */
+int
+VConnectFS_r(void)
+{
+ int rc;
+ osi_Assert((VInit == 2) &&
+ (programType != fileServer) &&
+ (programType != salvager));
+ rc = FSYNC_clientInit();
+ if (rc) {
+ VSetVInit_r(3);
+ }
+ return rc;
+}
+
+/**
+ * disconnect from the fileserver SYNC service.
+ *
+ * @pre
+ * @arg client should have a live connection to the fileserver.
+ * @arg VOL_LOCK is held.
+ * @arg Program Type must not be fileserver or salvager.
+ *
+ * @post connection to fileserver SYNC service destroyed
+ *
+ * @see VDisconnectFS
+ * @see VConnectFS_r
+ * @see VChildProcReconnectFS_r
+ *
+ * @internal volume package internal use only.
+ */
+void
+VDisconnectFS_r(void)
+{
+ osi_Assert((programType != fileServer) &&
+ (programType != salvager));
+ FSYNC_clientFinis();
+ VSetVInit_r(2);
+}
+
+/**
+ * disconnect from the fileserver SYNC service.
+ *
+ * @pre
+ * @arg client should have a live connection to the fileserver.
+ * @arg Program Type must not be fileserver or salvager.
+ *
+ * @post connection to fileserver SYNC service destroyed
+ *
+ * @see VDisconnectFS_r
+ * @see VConnectFS
+ * @see VChildProcReconnectFS
+ */
+void
+VDisconnectFS(void)
+{
+ VOL_LOCK;
+ VDisconnectFS_r();
+ VOL_UNLOCK;
+}
+
+/**
+ * connect to the fileserver SYNC service from a child process following a fork.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre
+ * @arg VOL_LOCK is held.
+ * @arg current FSYNC handle is shared with a parent process
+ *
+ * @post current FSYNC handle is discarded and a new connection to the
+ * fileserver SYNC service is established
+ *
+ * @see VChildProcReconnectFS
+ * @see VConnectFS_r
+ * @see VDisconnectFS_r
+ *
+ * @internal volume package internal use only.
+ */
+int
+VChildProcReconnectFS_r(void)
+{
+ return FSYNC_clientChildProcReconnect();
+}
+
+/**
+ * connect to the fileserver SYNC service from a child process following a fork.
+ *
+ * @return operation status
+ * @retval 0 failure
+ * @retval 1 success
+ *
+ * @pre current FSYNC handle is shared with a parent process
+ *
+ * @post current FSYNC handle is discarded and a new connection to the
+ * fileserver SYNC service is established
+ *
+ * @see VChildProcReconnectFS_r
+ * @see VConnectFS
+ * @see VDisconnectFS
+ */
+int
+VChildProcReconnectFS(void)
+{
+ int ret;
+ VOL_LOCK;
+ ret = VChildProcReconnectFS_r();
+ VOL_UNLOCK;
+ return ret;
+}
+#endif /* FSSYNC_BUILD_CLIENT */
+
+
+/***************************************************/
+/* volume bitmap routines */
+/***************************************************/
+
+/**
+ * allocate a vnode bitmap number for the vnode
+ *
+ * @param[out] ec error code
+ * @param[in] vp volume object pointer
+ * @param[in] index vnode index number for the vnode
+ * @param[in] flags flag values described in note
+ *
+ * @note for DAFS, flags parameter controls locking behavior.
+ * If (flags & VOL_ALLOC_BITMAP_WAIT) is set, then this function
+ * will create a reservation and block on any other exclusive
+ * operations. Otherwise, this function assumes the caller
+ * already has exclusive access to vp, and we just change the
+ * volume state.
+ *
+ * @pre VOL_LOCK held
+ *
+ * @return bit number allocated
+ */
+/*
+
+ */
+int
+VAllocBitmapEntry_r(Error * ec, Volume * vp,
+ struct vnodeIndex *index, int flags)
+{
+ int ret = 0;
+ byte *bp, *ep;
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ *ec = 0;
+
+ /* This test is probably redundant */
if (!VolumeWriteable(vp)) {
*ec = (bit32) VREADONLY;
- return 0;
+ return ret;
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (flags & VOL_ALLOC_BITMAP_WAIT) {
+ VCreateReservation_r(vp);
+ VWaitExclusiveState_r(vp);
+ }
+ state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#ifdef BITMAP_LATER
+ if ((programType == fileServer) && !index->bitmap) {
+ int i;
+#ifndef AFS_DEMAND_ATTACH_FS
+ /* demand attach fs uses the volume state to avoid races.
+ * specialStatus field is not used at all */
+ int wasVBUSY = 0;
+ if (vp->specialStatus == VBUSY) {
+ if (vp->goingOffline) { /* vos dump waiting for the volume to
+ * go offline. We probably come here
+ * from AddNewReadableResidency */
+ wasVBUSY = 1;
+ } else {
+ while (vp->specialStatus == VBUSY) {
+#ifdef AFS_PTHREAD_ENV
+ VOL_UNLOCK;
+ sleep(2);
+ VOL_LOCK;
+#else /* !AFS_PTHREAD_ENV */
+ IOMGR_Sleep(2);
+#endif /* !AFS_PTHREAD_ENV */
+ }
+ }
+ }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+ if (!index->bitmap) {
+#ifndef AFS_DEMAND_ATTACH_FS
+ vp->specialStatus = VBUSY; /* Stop anyone else from using it. */
+#endif /* AFS_DEMAND_ATTACH_FS */
+ for (i = 0; i < nVNODECLASSES; i++) {
+ VGetBitmap_r(ec, vp, i);
+ if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+ VRequestSalvage_r(ec, vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+#else /* AFS_DEMAND_ATTACH_FS */
+ DeleteVolumeFromHashTable(vp);
+ vp->shuttingDown = 1; /* Let who has it free it. */
+ vp->specialStatus = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ goto done;
+ }
+ }
+#ifndef AFS_DEMAND_ATTACH_FS
+ if (!wasVBUSY)
+ vp->specialStatus = 0; /* Allow others to have access. */
+#endif /* AFS_DEMAND_ATTACH_FS */
+ }
+ }
+#endif /* BITMAP_LATER */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ bp = index->bitmap + index->bitmapOffset;
+ ep = index->bitmap + index->bitmapSize;
+ while (bp < ep) {
+ if ((*(bit32 *) bp) != (bit32) 0xffffffff) {
+ int o;
+ index->bitmapOffset = (afs_uint32) (bp - index->bitmap);
+ while (*bp == 0xff)
+ bp++;
+ o = ffs(~*bp) - 1; /* ffs is documented in BSTRING(3) */
+ *bp |= (1 << o);
+ ret = ((bp - index->bitmap) * 8 + o);
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_LOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+ goto done;
+ }
+ bp += sizeof(bit32) /* i.e. 4 */ ;
+ }
+ /* No bit map entry--must grow bitmap */
+ bp = (byte *)
+ realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE);
+ osi_Assert(bp != NULL);
+ index->bitmap = bp;
+ bp += index->bitmapSize;
+ memset(bp, 0, VOLUME_BITMAP_GROWSIZE);
+ index->bitmapOffset = index->bitmapSize;
+ index->bitmapSize += VOLUME_BITMAP_GROWSIZE;
+ *bp = 1;
+ ret = index->bitmapOffset * 8;
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_LOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ done:
+#ifdef AFS_DEMAND_ATTACH_FS
+ VChangeState_r(vp, state_save);
+ if (flags & VOL_ALLOC_BITMAP_WAIT) {
+ VCancelReservation_r(vp);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+ return ret;
+}
+
+int
+VAllocBitmapEntry(Error * ec, Volume * vp, struct vnodeIndex * index)
+{
+ int retVal;
+ VOL_LOCK;
+ retVal = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT);
+ VOL_UNLOCK;
+ return retVal;
+}
+
+void
+VFreeBitMapEntry_r(Error * ec, struct vnodeIndex *index,
+ unsigned bitNumber)
+{
+ unsigned int offset;
+
+ *ec = 0;
+#ifdef BITMAP_LATER
+ if (!index->bitmap)
+ return;
+#endif /* BITMAP_LATER */
+ offset = bitNumber >> 3;
+ if (offset >= index->bitmapSize) {
+ *ec = VNOVNODE;
+ return;
+ }
+ if (offset < index->bitmapOffset)
+ index->bitmapOffset = offset & ~3; /* Truncate to nearest bit32 */
+ *(index->bitmap + offset) &= ~(1 << (bitNumber & 0x7));
+}
+
+void
+VFreeBitMapEntry(Error * ec, struct vnodeIndex *index,
+ unsigned bitNumber)
+{
+ VOL_LOCK;
+ VFreeBitMapEntry_r(ec, index, bitNumber);
+ VOL_UNLOCK;
+}
+
+/* this function will drop the glock internally.
+ * for old pthread fileservers, this is safe thanks to vbusy.
+ *
+ * for demand attach fs, caller must have already called
+ * VCreateReservation_r and VWaitExclusiveState_r */
+static void
+VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
+{
+ StreamHandle_t *file;
+ afs_sfsize_t nVnodes, size;
+ struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
+ struct vnodeIndex *vip = &vp->vnodeIndex[class];
+ struct VnodeDiskObject *vnode;
+ unsigned int unique = 0;
+ FdHandle_t *fdP;
+#ifdef BITMAP_LATER
+ byte *BitMap = 0;
+#endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ *ec = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
+#endif /* AFS_DEMAND_ATTACH_FS */
+ VOL_UNLOCK;
+
+ fdP = IH_OPEN(vip->handle);
+ osi_Assert(fdP != NULL);
+ file = FDH_FDOPEN(fdP, "r");
+ osi_Assert(file != NULL);
+ vnode = (VnodeDiskObject *) malloc(vcp->diskSize);
+ osi_Assert(vnode != NULL);
+ size = OS_SIZE(fdP->fd_fd);
+ osi_Assert(size != -1);
+ nVnodes = (size <= vcp->diskSize ? 0 : size - vcp->diskSize)
+ >> vcp->logSize;
+ vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4; /* The 10 is a little extra so
+ * a few files can be created in this volume,
+ * the whole thing is rounded up to nearest 4
+ * bytes, because the bit map allocator likes
+ * it that way */
+#ifdef BITMAP_LATER
+ BitMap = (byte *) calloc(1, vip->bitmapSize);
+ osi_Assert(BitMap != NULL);
+#else /* BITMAP_LATER */
+ vip->bitmap = (byte *) calloc(1, vip->bitmapSize);
+ osi_Assert(vip->bitmap != NULL);
+ vip->bitmapOffset = 0;
+#endif /* BITMAP_LATER */
+ if (STREAM_ASEEK(file, vcp->diskSize) != -1) {
+ int bitNumber = 0;
+ for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) {
+ if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1)
+ break;
+ if (vnode->type != vNull) {
+ if (vnode->vnodeMagic != vcp->magic) {
+ Log("GetBitmap: addled vnode index in volume %s; volume needs salvage\n", V_name(vp));
+ *ec = VSALVAGE;
+ break;
+ }
+#ifdef BITMAP_LATER
+ *(BitMap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
+#else /* BITMAP_LATER */
+ *(vip->bitmap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
+#endif /* BITMAP_LATER */
+ if (unique <= vnode->uniquifier)
+ unique = vnode->uniquifier + 1;
+ }
+#ifndef AFS_PTHREAD_ENV
+ if ((bitNumber & 0x00ff) == 0x0ff) { /* every 256 iterations */
+ IOMGR_Poll();
+ }
+#endif /* !AFS_PTHREAD_ENV */
+ }
+ }
+ if (vp->nextVnodeUnique < unique) {
+ Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
+ *ec = VSALVAGE;
+ }
+ /* Paranoia, partly justified--I think fclose after fdopen
+ * doesn't seem to close fd. In any event, the documentation
+ * doesn't specify, so it's safer to close it twice.
+ */
+ STREAM_CLOSE(file);
+ FDH_CLOSE(fdP);
+ free(vnode);
+
+ VOL_LOCK;
+#ifdef BITMAP_LATER
+ /* There may have been a racing condition with some other thread, both
+ * creating the bitmaps for this volume. If the other thread was faster
+ * the pointer to bitmap should already be filled and we can free ours.
+ */
+ if (vip->bitmap == NULL) {
+ vip->bitmap = BitMap;
+ vip->bitmapOffset = 0;
+ } else
+ free((byte *) BitMap);
+#endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+ VChangeState_r(vp, state_save);
+#endif /* AFS_DEMAND_ATTACH_FS */
+}
+
+
+/***************************************************/
+/* Volume Path and Volume Number utility routines */
+/***************************************************/
+
+/**
+ * find the first occurrence of a volume header file and return the path.
+ *
+ * @param[out] ec outbound error code
+ * @param[in] volumeId volume id to find
+ * @param[out] partitionp pointer to disk partition path string
+ * @param[out] namep pointer to volume header file name string
+ *
+ * @post path to first occurrence of volume header is returned in partitionp
+ * and namep, or ec is set accordingly.
+ *
+ * @warning this function is NOT re-entrant -- partitionp and namep point to
+ * static data segments
+ *
+ * @note if a volume utility inadvertently leaves behind a stale volume header
+ * on a vice partition, it is possible for callers to get the wrong one,
+ * depending on the order of the disk partition linked list.
+ *
+ */
+void
+VGetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
+{
+ static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
+ char path[VMAXPATHLEN];
+ int found = 0;
+ struct DiskPartition64 *dp;
+
+ *ec = 0;
+ name[0] = '/';
+ (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, afs_printable_uint32_lu(volumeId));
+ for (dp = DiskPartitionList; dp; dp = dp->next) {
+ struct afs_stat status;
+ strcpy(path, VPartitionPath(dp));
+ strcat(path, name);
+ if (afs_stat(path, &status) == 0) {
+ strcpy(partition, dp->name);
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ *ec = VNOVOL;
+ *partitionp = *namep = NULL;
+ } else {
+ *partitionp = partition;
+ *namep = name;
+ }
+}
+
+/**
+ * extract a volume number from a volume header filename string.
+ *
+ * @param[in] name volume header filename string
+ *
+ * @return volume number
+ *
+ * @note the string must be of the form VFORMAT. the only permissible
+ * deviation is a leading '/' character.
+ *
+ * @see VFORMAT
+ */
+int
+VolumeNumber(char *name)
+{
+ if (*name == '/')
+ name++;
+ return atoi(name + 1);
+}
+
+/**
+ * compute the volume header filename.
+ *
+ * @param[in] volumeId
+ *
+ * @return volume header filename
+ *
+ * @post volume header filename string is constructed
+ *
+ * @warning this function is NOT re-entrant -- the returned string is
+ * stored in a static char array. see VolumeExternalName_r
+ * for a re-entrant equivalent.
+ *
+ * @see VolumeExternalName_r
+ *
+ * @deprecated due to the above re-entrancy warning, this interface should
+ * be considered deprecated. Please use VolumeExternalName_r
+ * in its stead.
+ */
+char *
+VolumeExternalName(VolumeId volumeId)
+{
+ static char name[VMAXPATHLEN];
+ (void)afs_snprintf(name, sizeof name, VFORMAT, afs_printable_uint32_lu(volumeId));
+ return name;
+}
+
+/**
+ * compute the volume header filename.
+ *
+ * @param[in] volumeId
+ * @param[inout] name array in which to store filename
+ * @param[in] len length of name array
+ *
+ * @return result code from afs_snprintf
+ *
+ * @see VolumeExternalName
+ * @see afs_snprintf
+ *
+ * @note re-entrant equivalent of VolumeExternalName
+ */
+int
+VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
+{
+ return afs_snprintf(name, len, VFORMAT, afs_printable_uint32_lu(volumeId));
+}
+
+
+/***************************************************/
+/* Volume Usage Statistics routines */
+/***************************************************/
+
+#if OPENAFS_VOL_STATS
+#define OneDay (86400) /* 24 hours' worth of seconds */
+#else
+#define OneDay (24*60*60) /* 24 hours */
+#endif /* OPENAFS_VOL_STATS */
+
+static time_t
+Midnight(time_t t) {
+ struct tm local, *l;
+ time_t midnight;
+
+#if defined(AFS_PTHREAD_ENV) && !defined(AFS_NT40_ENV)
+ l = localtime_r(&t, &local);
+#else
+ l = localtime(&t);
+#endif
+
+ if (l != NULL) {
+ /* the following is strictly speaking problematic on the
+ switching day to daylight saving time, after the switch,
+ as tm_isdst does not match. Similarly, on the looong day when
+ switching back the OneDay check will not do what naively expected!
+ The effects are minor, though, and more a matter of interpreting
+ the numbers. */
+#ifndef AFS_PTHREAD_ENV
+ local = *l;
+#endif
+ local.tm_hour = local.tm_min=local.tm_sec = 0;
+ midnight = mktime(&local);
+ if (midnight != (time_t) -1) return(midnight);
+ }
+ return( (t/OneDay)*OneDay );
+
+}
+
+/*------------------------------------------------------------------------
+ * [export] VAdjustVolumeStatistics
+ *
+ * Description:
+ * If we've passed midnight, we need to update all the day use
+ * statistics as well as zeroing the detailed volume statistics
+ * (if we are implementing them).
+ *
+ * Arguments:
+ * vp : Pointer to the volume structure describing the lucky
+ * volume being considered for update.
+ *
+ * Returns:
+ * 0 (always!)
+ *
+ * Environment:
+ * Nothing interesting.
+ *
+ * Side Effects:
+ * As described.
+ *------------------------------------------------------------------------*/
+
+int
+VAdjustVolumeStatistics_r(Volume * vp)
+{
+ unsigned int now = FT_ApproxTime();
+
+ if (now - V_dayUseDate(vp) > OneDay) {
+ int ndays, i;
+
+ ndays = (now - V_dayUseDate(vp)) / OneDay;
+ for (i = 6; i > ndays - 1; i--)
+ V_weekUse(vp)[i] = V_weekUse(vp)[i - ndays];
+ for (i = 0; i < ndays - 1 && i < 7; i++)
+ V_weekUse(vp)[i] = 0;
+ if (ndays <= 7)
+ V_weekUse(vp)[ndays - 1] = V_dayUse(vp);
+ V_dayUse(vp) = 0;
+ V_dayUseDate(vp) = Midnight(now);
+
+#if OPENAFS_VOL_STATS
+ /*
+ * All we need to do is bzero the entire VOL_STATS_BYTES of
+ * the detailed volume statistics area.
+ */
+ memset((V_stat_area(vp)), 0, VOL_STATS_BYTES);
+#endif /* OPENAFS_VOL_STATS */
+ }
+
+ /*It's been more than a day of collection */
+ /*
+ * Always return happily.
+ */
+ return (0);
+} /*VAdjustVolumeStatistics */
+
+int
+VAdjustVolumeStatistics(Volume * vp)
+{
+ int retVal;
+ VOL_LOCK;
+ retVal = VAdjustVolumeStatistics_r(vp);
+ VOL_UNLOCK;
+ return retVal;
+}
+
+void
+VBumpVolumeUsage_r(Volume * vp)
+{
+ unsigned int now = FT_ApproxTime();
+ V_accessDate(vp) = now;
+ if (now - V_dayUseDate(vp) > OneDay)
+ VAdjustVolumeStatistics_r(vp);
+ /*
+ * Save the volume header image to disk after every 128 bumps to dayUse.
+ */
+ if ((V_dayUse(vp)++ & 127) == 0) {
+ Error error;
+ VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT);
+ }
+}
+
+void
+VBumpVolumeUsage(Volume * vp)
+{
+ VOL_LOCK;
+ VBumpVolumeUsage_r(vp);
+ VOL_UNLOCK;
+}
+
+void
+VSetDiskUsage_r(void)
+{
+#ifndef AFS_DEMAND_ATTACH_FS
+ static int FifteenMinuteCounter = 0;
+#endif
+
+ while (VInit < 2) {
+ /* NOTE: Don't attempt to access the partitions list until the
+ * initialization level indicates that all volumes are attached,
+ * which implies that all partitions are initialized. */
+#ifdef AFS_PTHREAD_ENV
+ VOL_CV_WAIT(&vol_vinit_cond);
+#else /* AFS_PTHREAD_ENV */
+ IOMGR_Sleep(10);
+#endif /* AFS_PTHREAD_ENV */
+ }
+
+ VResetDiskUsage_r();
+
+#ifndef AFS_DEMAND_ATTACH_FS
+ if (++FifteenMinuteCounter == 3) {
+ FifteenMinuteCounter = 0;
+ VScanUpdateList();
+ }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+}
+
+void
+VSetDiskUsage(void)
+{
+ VOL_LOCK;
+ VSetDiskUsage_r();
+ VOL_UNLOCK;
+}
+
+
+/***************************************************/
+/* Volume Update List routines */
+/***************************************************/
+
+/* The number of minutes that a volume hasn't been updated before the
+ * "Dont salvage" flag in the volume header will be turned on */
+#define SALVAGE_INTERVAL (10*60)
+
+/*
+ * demand attach fs
+ *
+ * volume update list functionality has been moved into the VLRU
+ * the DONT_SALVAGE flag is now set during VLRU demotion
+ */
+
+#ifndef AFS_DEMAND_ATTACH_FS
+static VolumeId *UpdateList = NULL; /* Pointer to array of Volume ID's */
+static int nUpdatedVolumes = 0; /* Updated with entry in UpdateList, salvage after crash flag on */
+static int updateSize = 0; /* number of entries possible */
+#define UPDATE_LIST_SIZE 128 /* initial size increment (must be a power of 2!) */
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+void
+VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
+{
+ *ec = 0;
+ vp->updateTime = FT_ApproxTime();
+ if (V_dontSalvage(vp) == 0)
+ return;
+ V_dontSalvage(vp) = 0;
+ VSyncVolume_r(ec, vp, 0);
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(vp) &= ~(VOL_HDR_DONTSALV);
+#else /* !AFS_DEMAND_ATTACH_FS */
+ if (*ec)
+ return;
+ if (UpdateList == NULL) {
+ updateSize = UPDATE_LIST_SIZE;
+ UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize);
+ } else {
+ if (nUpdatedVolumes == updateSize) {
+ updateSize <<= 1;
+ if (updateSize > 524288) {
+ Log("warning: there is likely a bug in the volume update scanner\n");
+ return;
+ }
+ UpdateList =
+ (VolumeId *) realloc(UpdateList,
+ sizeof(VolumeId) * updateSize);
+ }
+ }
+ osi_Assert(UpdateList != NULL);
+ UpdateList[nUpdatedVolumes++] = V_id(vp);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+}
+
+#ifndef AFS_DEMAND_ATTACH_FS
+static void
+VScanUpdateList(void)
+{
+ int i, gap;
+ Volume *vp;
+ Error error;
+ afs_uint32 now = FT_ApproxTime();
+ /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */
+ for (i = gap = 0; i < nUpdatedVolumes; i++) {
+ if (gap)
+ UpdateList[i - gap] = UpdateList[i];
+
+ /* XXX this routine needlessly messes up the Volume LRU by
+ * breaking the LRU temporal-locality assumptions.....
+ * we should use a special volume header allocator here */
+ vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]);
+ if (error) {
+ gap++;
+ } else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) {
+ V_dontSalvage(vp) = DONT_SALVAGE;
+ VUpdateVolume_r(&error, vp, 0); /* No need to fsync--not critical */
+ gap++;
+ }
+
+ if (vp) {
+ VPutVolume_r(vp);
+ }
+
+#ifndef AFS_PTHREAD_ENV
+ IOMGR_Poll();
+#endif /* !AFS_PTHREAD_ENV */
+ }
+ nUpdatedVolumes -= gap;
+}
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume LRU routines */
+/***************************************************/
+
+/* demand attach fs
+ * volume LRU
+ *
+ * with demand attach fs, we attempt to soft detach(1)
+ * volumes which have not been accessed in a long time
+ * in order to speed up fileserver shutdown
+ *
+ * (1) by soft detach we mean a process very similar
+ * to VOffline, except the final state of the
+ * Volume will be VOL_STATE_PREATTACHED, instead
+ * of the usual VOL_STATE_UNATTACHED
+ */
+#ifdef AFS_DEMAND_ATTACH_FS
+
+/* implementation is reminiscent of a generational GC
+ *
+ * queue 0 is newly attached volumes. this queue is
+ * sorted by attach timestamp
+ *
+ * queue 1 is volumes that have been around a bit
+ * longer than queue 0. this queue is sorted by
+ * attach timestamp
+ *
+ * queue 2 is volumes tha have been around the longest.
+ * this queue is unsorted
+ *
+ * queue 3 is volumes that have been marked as
+ * candidates for soft detachment. this queue is
+ * unsorted
+ */
+#define VLRU_GENERATIONS 3 /**< number of generations in VLRU */
+#define VLRU_QUEUES 5 /**< total number of VLRU queues */
+
+/**
+ * definition of a VLRU queue.
+ */
+struct VLRU_q {
+ volatile struct rx_queue q;
+ volatile int len;
+ volatile int busy;
+ pthread_cond_t cv;
+};
+
+/**
+ * main VLRU data structure.
+ */
+struct VLRU {
+ struct VLRU_q q[VLRU_QUEUES]; /**< VLRU queues */
+
+ /* VLRU config */
+ /** time interval (in seconds) between promotion passes for
+ * each young generation queue. */
+ afs_uint32 promotion_interval[VLRU_GENERATIONS-1];
+
+ /** time interval (in seconds) between soft detach candidate
+ * scans for each generation queue.
+ *
+ * scan_interval[VLRU_QUEUE_CANDIDATE] defines how frequently
+ * we perform a soft detach pass. */
+ afs_uint32 scan_interval[VLRU_GENERATIONS+1];
+
+ /* scheduler state */
+ int next_idx; /**< next queue to receive attention */
+ afs_uint32 last_promotion[VLRU_GENERATIONS-1]; /**< timestamp of last promotion scan */
+ afs_uint32 last_scan[VLRU_GENERATIONS+1]; /**< timestamp of last detach scan */
+
+ int scanner_state; /**< state of scanner thread */
+ pthread_cond_t cv; /**< state transition CV */
+};
+
+/** global VLRU state */
+static struct VLRU volume_LRU;
+
+/**
+ * defined states for VLRU scanner thread.
+ */
+typedef enum {
+ VLRU_SCANNER_STATE_OFFLINE = 0, /**< vlru scanner thread is offline */
+ VLRU_SCANNER_STATE_ONLINE = 1, /**< vlru scanner thread is online */
+ VLRU_SCANNER_STATE_SHUTTING_DOWN = 2, /**< vlru scanner thread is shutting down */
+ VLRU_SCANNER_STATE_PAUSING = 3, /**< vlru scanner thread is getting ready to pause */
+ VLRU_SCANNER_STATE_PAUSED = 4 /**< vlru scanner thread is paused */
+} vlru_thread_state_t;
+
+/* vlru disk data header stuff */
+#define VLRU_DISK_MAGIC 0x7a8b9cad /**< vlru disk entry magic number */
+#define VLRU_DISK_VERSION 1 /**< vlru disk entry version number */
+
+/** vlru default expiration time (for eventual fs state serialization of vlru data) */
+#define VLRU_DUMP_EXPIRATION_TIME (60*60*24*7) /* expire vlru data after 1 week */
+
+
+/** minimum volume inactivity (in seconds) before a volume becomes eligible for
+ * soft detachment. */
+static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH;
+
+/** time interval (in seconds) between VLRU scanner thread soft detach passes. */
+static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL;
+
+/** maximum number of volumes to soft detach in a VLRU soft detach pass. */
+static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX;
+
+/** VLRU control flag. non-zero value implies VLRU subsystem is activated. */
+static afs_uint32 VLRU_enabled = 1;
+
+/* queue synchronization routines */
+static void VLRU_BeginExclusive_r(struct VLRU_q * q);
+static void VLRU_EndExclusive_r(struct VLRU_q * q);
+static void VLRU_Wait_r(struct VLRU_q * q);
+
+/**
+ * set VLRU subsystem tunable parameters.
+ *
+ * @param[in] option tunable option to modify
+ * @param[in] val new value for tunable parameter
+ *
+ * @pre @c VInitVolumePackage2 has not yet been called.
+ *
+ * @post tunable parameter is modified
+ *
+ * @note DAFS only
+ *
+ * @note valid option parameters are:
+ * @arg @c VLRU_SET_THRESH
+ * set the period of inactivity after which
+ * volumes are eligible for soft detachment
+ * @arg @c VLRU_SET_INTERVAL
+ * set the time interval between calls
+ * to the volume LRU "garbage collector"
+ * @arg @c VLRU_SET_MAX
+ * set the max number of volumes to deallocate
+ * in one GC pass
+ */
+void
+VLRU_SetOptions(int option, afs_uint32 val)
+{
+ if (option == VLRU_SET_THRESH) {
+ VLRU_offline_thresh = val;
+ } else if (option == VLRU_SET_INTERVAL) {
+ VLRU_offline_interval = val;
+ } else if (option == VLRU_SET_MAX) {
+ VLRU_offline_max = val;
+ } else if (option == VLRU_SET_ENABLED) {
+ VLRU_enabled = val;
+ }
+ VLRU_ComputeConstants();
+}
+
+/**
+ * compute VLRU internal timing parameters.
+ *
+ * @post VLRU scanner thread internal timing parameters are computed
+ *
+ * @note computes internal timing parameters based upon user-modifiable
+ * tunable parameters.
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VLRU_ComputeConstants(void)
+{
+ afs_uint32 factor = VLRU_offline_thresh / VLRU_offline_interval;
+
+ /* compute the candidate scan interval */
+ volume_LRU.scan_interval[VLRU_QUEUE_CANDIDATE] = VLRU_offline_interval;
+
+ /* compute the promotion intervals */
+ volume_LRU.promotion_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh * 2;
+ volume_LRU.promotion_interval[VLRU_QUEUE_MID] = VLRU_offline_thresh * 4;
+
+ if (factor > 16) {
+ /* compute the gen 0 scan interval */
+ volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh / 8;
+ } else {
+ /* compute the gen 0 scan interval */
+ volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_interval * 2;
+ }
+}
+
+/**
+ * initialize VLRU subsystem.
+ *
+ * @pre this function has not yet been called
+ *
+ * @post VLRU subsystem is initialized and VLRU scanner thread is starting
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VInitVLRU(void)
+{
+ pthread_t tid;
+ pthread_attr_t attrs;
+ int i;
+
+ if (!VLRU_enabled) {
+ Log("VLRU: disabled\n");
+ return;
+ }
+
+ /* initialize each of the VLRU queues */
+ for (i = 0; i < VLRU_QUEUES; i++) {
+ queue_Init(&volume_LRU.q[i]);
+ volume_LRU.q[i].len = 0;
+ volume_LRU.q[i].busy = 0;
+ CV_INIT(&volume_LRU.q[i].cv, "vol lru", CV_DEFAULT, 0);
+ }
+
+ /* setup the timing constants */
+ VLRU_ComputeConstants();
+
+ /* XXX put inside LogLevel check? */
+ Log("VLRU: starting scanner with the following configuration parameters:\n");
+ Log("VLRU: offlining volumes after minimum of %d seconds of inactivity\n", VLRU_offline_thresh);
+ Log("VLRU: running VLRU soft detach pass every %d seconds\n", VLRU_offline_interval);
+ Log("VLRU: taking up to %d volumes offline per pass\n", VLRU_offline_max);
+ Log("VLRU: scanning generation 0 for inactive volumes every %d seconds\n", volume_LRU.scan_interval[0]);
+ Log("VLRU: scanning for promotion/demotion between generations 0 and 1 every %d seconds\n", volume_LRU.promotion_interval[0]);
+ Log("VLRU: scanning for promotion/demotion between generations 1 and 2 every %d seconds\n", volume_LRU.promotion_interval[1]);
+
+ /* start up the VLRU scanner */
+ volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
+ if (programType == fileServer) {
+ CV_INIT(&volume_LRU.cv, "vol lru", CV_DEFAULT, 0);
+ osi_Assert(pthread_attr_init(&attrs) == 0);
+ osi_Assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+ osi_Assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0);
+ }
+}
+
+/**
+ * initialize the VLRU-related fields of a newly allocated volume object.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre
+ * @arg @c VOL_LOCK is held.
+ * @arg volume object is not on a VLRU queue.
+ *
+ * @post VLRU fields are initialized to indicate that volume object is not
+ * currently registered with the VLRU subsystem
+ *
+ * @note DAFS only
+ *
+ * @internal volume package interal use only.
+ */
+static void
+VLRU_Init_Node_r(Volume * vp)
+{
+ if (!VLRU_enabled)
+ return;
+
+ osi_Assert(queue_IsNotOnQueue(&vp->vlru));
+ vp->vlru.idx = VLRU_QUEUE_INVALID;
+}
+
+/**
+ * add a volume object to a VLRU queue.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre
+ * @arg @c VOL_LOCK is held.
+ * @arg caller MUST hold a lightweight ref on @p vp.
+ * @arg caller MUST NOT hold exclusive ownership of the VLRU queue.
+ *
+ * @post the volume object is added to the appropriate VLRU queue
+ *
+ * @note if @c vp->vlru.idx contains the index of a valid VLRU queue,
+ * then the volume is added to that queue. Otherwise, the value
+ * @c VLRU_QUEUE_NEW is stored into @c vp->vlru.idx and the
+ * volume is added to the NEW generation queue.
+ *
+ * @note @c VOL_LOCK may be dropped internally
+ *
+ * @note Volume state is temporarily set to @c VOL_STATE_VLRU_ADD
+ * during the add operation, and is restored to the previous
+ * state prior to return.
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VLRU_Add_r(Volume * vp)
+{
+ int idx;
+ VolState state_save;
+
+ if (!VLRU_enabled)
+ return;
+
+ if (queue_IsOnQueue(&vp->vlru))
+ return;
+
+ state_save = VChangeState_r(vp, VOL_STATE_VLRU_ADD);
+
+ idx = vp->vlru.idx;
+ if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) {
+ idx = VLRU_QUEUE_NEW;
+ }
+
+ VLRU_Wait_r(&volume_LRU.q[idx]);
+
+ /* repeat check since VLRU_Wait_r may have dropped
+ * the glock */
+ if (queue_IsNotOnQueue(&vp->vlru)) {
+ vp->vlru.idx = idx;
+ queue_Prepend(&volume_LRU.q[idx], &vp->vlru);
+ volume_LRU.q[idx].len++;
+ V_attachFlags(vp) |= VOL_ON_VLRU;
+ vp->stats.last_promote = FT_ApproxTime();
+ }
+
+ VChangeState_r(vp, state_save);
+}
+
+/**
+ * delete a volume object from a VLRU queue.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre
+ * @arg @c VOL_LOCK is held.
+ * @arg caller MUST hold a lightweight ref on @p vp.
+ * @arg caller MUST NOT hold exclusive ownership of the VLRU queue.
+ *
+ * @post volume object is removed from the VLRU queue
+ *
+ * @note @c VOL_LOCK may be dropped internally
+ *
+ * @note DAFS only
+ *
+ * @todo We should probably set volume state to something exlcusive
+ * (as @c VLRU_Add_r does) prior to dropping @c VOL_LOCK.
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VLRU_Delete_r(Volume * vp)
+{
+ int idx;
+
+ if (!VLRU_enabled)
+ return;
+
+ if (queue_IsNotOnQueue(&vp->vlru))
+ return;
+
+ /* handle races */
+ do {
+ idx = vp->vlru.idx;
+ if (idx == VLRU_QUEUE_INVALID)
+ return;
+ VLRU_Wait_r(&volume_LRU.q[idx]);
+ } while (idx != vp->vlru.idx);
+
+ /* now remove from the VLRU and update
+ * the appropriate counter */
+ queue_Remove(&vp->vlru);
+ volume_LRU.q[idx].len--;
+ vp->vlru.idx = VLRU_QUEUE_INVALID;
+ V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+}
+
+/**
+ * tell the VLRU subsystem that a volume was just accessed.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre
+ * @arg @c VOL_LOCK is held
+ * @arg caller MUST hold a lightweight ref on @p vp
+ * @arg caller MUST NOT hold exclusive ownership of any VLRU queue
+ *
+ * @post volume VLRU access statistics are updated. If the volume was on
+ * the VLRU soft detach candidate queue, it is moved to the NEW
+ * generation queue.
+ *
+ * @note @c VOL_LOCK may be dropped internally
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VLRU_UpdateAccess_r(Volume * vp)
+{
+ Volume * rvp = NULL;
+
+ if (!VLRU_enabled)
+ return;
+
+ if (queue_IsNotOnQueue(&vp->vlru))
+ return;
+
+ osi_Assert(V_attachFlags(vp) & VOL_ON_VLRU);
+
+ /* update the access timestamp */
+ vp->stats.last_get = FT_ApproxTime();
+
+ /*
+ * if the volume is on the soft detach candidate
+ * list, we need to safely move it back to a
+ * regular generation. this has to be done
+ * carefully so we don't race against the scanner
+ * thread.
+ */
+
+ /* if this volume is on the soft detach candidate queue,
+ * then grab exclusive access to the necessary queues */
+ if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
+ rvp = vp;
+ VCreateReservation_r(rvp);
+
+ VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+ VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
}
-#ifdef BITMAP_LATER
- if ((programType == fileServer) && !index->bitmap) {
+
+ /* make sure multiple threads don't race to update */
+ if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
+ VLRU_SwitchQueues(vp, VLRU_QUEUE_NEW, 1);
+ }
+
+ if (rvp) {
+ VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+ VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+ VCancelReservation_r(rvp);
+ }
+}
+
+/**
+ * switch a volume between two VLRU queues.
+ *
+ * @param[in] vp pointer to volume object
+ * @param[in] new_idx index of VLRU queue onto which the volume will be moved
+ * @param[in] append controls whether the volume will be appended or
+ * prepended to the queue. A nonzero value means it will
+ * be appended; zero means it will be prepended.
+ *
+ * @pre The new (and old, if applicable) queue(s) must either be owned
+ * exclusively by the calling thread for asynchronous manipulation,
+ * or the queue(s) must be quiescent and VOL_LOCK must be held.
+ * Please see VLRU_BeginExclusive_r, VLRU_EndExclusive_r and VLRU_Wait_r
+ * for further details of the queue asynchronous processing mechanism.
+ *
+ * @post If the volume object was already on a VLRU queue, it is
+ * removed from the queue. Depending on the value of the append
+ * parameter, the volume object is either appended or prepended
+ * to the VLRU queue referenced by the new_idx parameter.
+ *
+ * @note DAFS only
+ *
+ * @see VLRU_BeginExclusive_r
+ * @see VLRU_EndExclusive_r
+ * @see VLRU_Wait_r
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VLRU_SwitchQueues(Volume * vp, int new_idx, int append)
+{
+ if (queue_IsNotOnQueue(&vp->vlru))
+ return;
+
+ queue_Remove(&vp->vlru);
+ volume_LRU.q[vp->vlru.idx].len--;
+
+ /* put the volume back on the correct generational queue */
+ if (append) {
+ queue_Append(&volume_LRU.q[new_idx], &vp->vlru);
+ } else {
+ queue_Prepend(&volume_LRU.q[new_idx], &vp->vlru);
+ }
+
+ volume_LRU.q[new_idx].len++;
+ vp->vlru.idx = new_idx;
+}
+
+/**
+ * VLRU background thread.
+ *
+ * The VLRU Scanner Thread is responsible for periodically scanning through
+ * each VLRU queue looking for volumes which should be moved to another
+ * queue, or soft detached.
+ *
+ * @param[in] args unused thread arguments parameter
+ *
+ * @return unused thread return value
+ * @retval NULL always
+ *
+ * @internal volume package internal use only.
+ */
+static void *
+VLRU_ScannerThread(void * args)
+{
+ afs_uint32 now, min_delay, delay;
+ int i, min_idx, min_op, overdue, state;
+
+ /* set t=0 for promotion cycle to be
+ * fileserver startup */
+ now = FT_ApproxTime();
+ for (i=0; i < VLRU_GENERATIONS-1; i++) {
+ volume_LRU.last_promotion[i] = now;
+ }
+
+ /* don't start the scanner until VLRU_offline_thresh
+ * plus a small delay for VInitVolumePackage2 to finish
+ * has gone by */
+
+ sleep(VLRU_offline_thresh + 60);
+
+ /* set t=0 for scan cycle to be now */
+ now = FT_ApproxTime();
+ for (i=0; i < VLRU_GENERATIONS+1; i++) {
+ volume_LRU.last_scan[i] = now;
+ }
+
+ VOL_LOCK;
+ if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_OFFLINE) {
+ volume_LRU.scanner_state = VLRU_SCANNER_STATE_ONLINE;
+ }
+
+ while ((state = volume_LRU.scanner_state) != VLRU_SCANNER_STATE_SHUTTING_DOWN) {
+ /* check to see if we've been asked to pause */
+ if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) {
+ volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED;
+ CV_BROADCAST(&volume_LRU.cv);
+ do {
+ VOL_CV_WAIT(&volume_LRU.cv);
+ } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED);
+ }
+
+ /* scheduling can happen outside the glock */
+ VOL_UNLOCK;
+
+ /* figure out what is next on the schedule */
+
+ /* figure out a potential schedule for the new generation first */
+ overdue = 0;
+ min_delay = volume_LRU.scan_interval[0] + volume_LRU.last_scan[0] - now;
+ min_idx = 0;
+ min_op = 0;
+ if (min_delay > volume_LRU.scan_interval[0]) {
+ /* unsigned overflow -- we're overdue to run this scan */
+ min_delay = 0;
+ overdue = 1;
+ }
+
+ /* if we're not overdue for gen 0, figure out schedule for candidate gen */
+ if (!overdue) {
+ i = VLRU_QUEUE_CANDIDATE;
+ delay = volume_LRU.scan_interval[i] + volume_LRU.last_scan[i] - now;
+ if (delay < min_delay) {
+ min_delay = delay;
+ min_idx = i;
+ }
+ if (delay > volume_LRU.scan_interval[i]) {
+ /* unsigned overflow -- we're overdue to run this scan */
+ min_delay = 0;
+ min_idx = i;
+ overdue = 1;
+ }
+ }
+
+ /* if we're still not overdue for something, figure out schedules for promotions */
+ for (i=0; !overdue && i < VLRU_GENERATIONS-1; i++) {
+ delay = volume_LRU.promotion_interval[i] + volume_LRU.last_promotion[i] - now;
+ if (delay < min_delay) {
+ min_delay = delay;
+ min_idx = i;
+ min_op = 1;
+ }
+ if (delay > volume_LRU.promotion_interval[i]) {
+ /* unsigned overflow -- we're overdue to run this promotion */
+ min_delay = 0;
+ min_idx = i;
+ min_op = 1;
+ overdue = 1;
+ break;
+ }
+ }
+
+ /* sleep as needed */
+ if (min_delay) {
+ sleep(min_delay);
+ }
+
+ /* do whatever is next */
+ VOL_LOCK;
+ if (min_op) {
+ VLRU_Promote_r(min_idx);
+ VLRU_Demote_r(min_idx+1);
+ } else {
+ VLRU_Scan_r(min_idx);
+ }
+ now = FT_ApproxTime();
+ }
+
+ Log("VLRU scanner asked to go offline (scanner_state=%d)\n", state);
+
+ /* signal that scanner is down */
+ volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
+ CV_BROADCAST(&volume_LRU.cv);
+ VOL_UNLOCK;
+ return NULL;
+}
+
+/**
+ * promote volumes from one VLRU generation to the next.
+ *
+ * This routine scans a VLRU generation looking for volumes which are
+ * eligible to be promoted to the next generation. All volumes which
+ * meet the eligibility requirement are promoted.
+ *
+ * Promotion eligibility is based upon meeting both of the following
+ * requirements:
+ *
+ * @arg The volume has been accessed since the last promotion:
+ * @c (vp->stats.last_get >= vp->stats.last_promote)
+ * @arg The last promotion occurred at least
+ * @c volume_LRU.promotion_interval[idx] seconds ago
+ *
+ * As a performance optimization, promotions are "globbed". In other
+ * words, we promote arbitrarily large contiguous sublists of elements
+ * as one operation.
+ *
+ * @param[in] idx VLRU queue index to scan
+ *
+ * @note DAFS only
+ *
+ * @internal VLRU internal use only.
+ */
+static void
+VLRU_Promote_r(int idx)
+{
+ int len, chaining, promote;
+ afs_uint32 now, thresh;
+ struct rx_queue *qp, *nqp;
+ Volume * vp, *start = NULL, *end = NULL;
+
+ /* get exclusive access to two chains, and drop the glock */
+ VLRU_Wait_r(&volume_LRU.q[idx]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+ VLRU_Wait_r(&volume_LRU.q[idx+1]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[idx+1]);
+ VOL_UNLOCK;
+
+ thresh = volume_LRU.promotion_interval[idx];
+ now = FT_ApproxTime();
+
+ len = chaining = 0;
+ for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+ vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+ promote = (((vp->stats.last_promote + thresh) <= now) &&
+ (vp->stats.last_get >= vp->stats.last_promote));
+
+ if (chaining) {
+ if (promote) {
+ vp->vlru.idx++;
+ len++;
+ start = vp;
+ } else {
+ /* promote and prepend chain */
+ queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
+ chaining = 0;
+ }
+ } else {
+ if (promote) {
+ vp->vlru.idx++;
+ len++;
+ chaining = 1;
+ start = end = vp;
+ }
+ }
+ }
+
+ if (chaining) {
+ /* promote and prepend */
+ queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
+ }
+
+ if (len) {
+ volume_LRU.q[idx].len -= len;
+ volume_LRU.q[idx+1].len += len;
+ }
+
+ /* release exclusive access to the two chains */
+ VOL_LOCK;
+ volume_LRU.last_promotion[idx] = now;
+ VLRU_EndExclusive_r(&volume_LRU.q[idx+1]);
+ VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+}
+
+/* run the demotions */
+static void
+VLRU_Demote_r(int idx)
+{
+ Error ec;
+ int len, chaining, demote;
+ afs_uint32 now, thresh;
+ struct rx_queue *qp, *nqp;
+ Volume * vp, *start = NULL, *end = NULL;
+ Volume ** salv_flag_vec = NULL;
+ int salv_vec_offset = 0;
+
+ osi_Assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD);
+
+ /* get exclusive access to two chains, and drop the glock */
+ VLRU_Wait_r(&volume_LRU.q[idx-1]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[idx-1]);
+ VLRU_Wait_r(&volume_LRU.q[idx]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+ VOL_UNLOCK;
+
+ /* no big deal if this allocation fails */
+ if (volume_LRU.q[idx].len) {
+ salv_flag_vec = (Volume **) malloc(volume_LRU.q[idx].len * sizeof(Volume *));
+ }
+
+ now = FT_ApproxTime();
+ thresh = volume_LRU.promotion_interval[idx-1];
+
+ len = chaining = 0;
+ for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+ vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+ demote = (((vp->stats.last_promote + thresh) <= now) &&
+ (vp->stats.last_get < (now - thresh)));
+
+ /* we now do volume update list DONT_SALVAGE flag setting during
+ * demotion passes */
+ if (salv_flag_vec &&
+ !(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
+ demote &&
+ (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
+ (V_attachState(vp) == VOL_STATE_ATTACHED)) {
+ salv_flag_vec[salv_vec_offset++] = vp;
+ VCreateReservation_r(vp);
+ }
+
+ if (chaining) {
+ if (demote) {
+ vp->vlru.idx--;
+ len++;
+ start = vp;
+ } else {
+ /* demote and append chain */
+ queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
+ chaining = 0;
+ }
+ } else {
+ if (demote) {
+ vp->vlru.idx--;
+ len++;
+ chaining = 1;
+ start = end = vp;
+ }
+ }
+ }
+
+ if (chaining) {
+ queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
+ }
+
+ if (len) {
+ volume_LRU.q[idx].len -= len;
+ volume_LRU.q[idx-1].len += len;
+ }
+
+ /* release exclusive access to the two chains */
+ VOL_LOCK;
+ VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+ VLRU_EndExclusive_r(&volume_LRU.q[idx-1]);
+
+ /* now go back and set the DONT_SALVAGE flags as appropriate */
+ if (salv_flag_vec) {
int i;
- int wasVBUSY = 0;
- if (vp->specialStatus == VBUSY) {
- if (vp->goingOffline) { /* vos dump waiting for the volume to
- * go offline. We probably come here
- * from AddNewReadableResidency */
- wasVBUSY = 1;
+ for (i = 0; i < salv_vec_offset; i++) {
+ vp = salv_flag_vec[i];
+ if (!(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
+ (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
+ (V_attachState(vp) == VOL_STATE_ATTACHED)) {
+ ec = VHold_r(vp);
+ if (!ec) {
+ V_attachFlags(vp) |= VOL_HDR_DONTSALV;
+ V_dontSalvage(vp) = DONT_SALVAGE;
+ VUpdateVolume_r(&ec, vp, 0);
+ VPutVolume_r(vp);
+ }
+ }
+ VCancelReservation_r(vp);
+ }
+ free(salv_flag_vec);
+ }
+}
+
+/* run a pass of the VLRU GC scanner */
+static void
+VLRU_Scan_r(int idx)
+{
+ afs_uint32 now, thresh;
+ struct rx_queue *qp, *nqp;
+ Volume * vp;
+ int i, locked = 1;
+
+ osi_Assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE);
+
+ /* gain exclusive access to the idx VLRU */
+ VLRU_Wait_r(&volume_LRU.q[idx]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+
+ if (idx != VLRU_QUEUE_CANDIDATE) {
+ /* gain exclusive access to the candidate VLRU */
+ VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+ }
+
+ now = FT_ApproxTime();
+ thresh = now - VLRU_offline_thresh;
+
+ /* perform candidate selection and soft detaching */
+ if (idx == VLRU_QUEUE_CANDIDATE) {
+ /* soft detach some volumes from the candidate pool */
+ VOL_UNLOCK;
+ locked = 0;
+
+ for (i=0,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+ vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+ if (i >= VLRU_offline_max) {
+ break;
+ }
+ /* check timestamp to see if it's a candidate for soft detaching */
+ if (vp->stats.last_get <= thresh) {
+ VOL_LOCK;
+ if (VCheckSoftDetach(vp, thresh))
+ i++;
+ VOL_UNLOCK;
+ }
+ }
+ } else {
+ /* scan for volumes to become soft detach candidates */
+ for (i=1,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue),i++) {
+ vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+
+ /* check timestamp to see if it's a candidate for soft detaching */
+ if (vp->stats.last_get <= thresh) {
+ VCheckSoftDetachCandidate(vp, thresh);
+ }
+
+ if (!(i&0x7f)) { /* lock coarsening optimization */
+ VOL_UNLOCK;
+ pthread_yield();
+ VOL_LOCK;
+ }
+ }
+ }
+
+ /* relinquish exclusive access to the VLRU chains */
+ if (!locked) {
+ VOL_LOCK;
+ }
+ volume_LRU.last_scan[idx] = now;
+ if (idx != VLRU_QUEUE_CANDIDATE) {
+ VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+ }
+ VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+}
+
+/* check whether volume is safe to soft detach
+ * caller MUST NOT hold a ref count on vp */
+static int
+VCheckSoftDetach(Volume * vp, afs_uint32 thresh)
+{
+ int ret=0;
+
+ if (vp->nUsers || vp->nWaiters)
+ return 0;
+
+ if (vp->stats.last_get <= thresh) {
+ ret = VSoftDetachVolume_r(vp, thresh);
+ }
+
+ return ret;
+}
+
+/* check whether volume should be made a
+ * soft detach candidate */
+static int
+VCheckSoftDetachCandidate(Volume * vp, afs_uint32 thresh)
+{
+ int idx, ret = 0;
+ if (vp->nUsers || vp->nWaiters)
+ return 0;
+
+ idx = vp->vlru.idx;
+
+ osi_Assert(idx == VLRU_QUEUE_NEW);
+
+ if (vp->stats.last_get <= thresh) {
+ /* move to candidate pool */
+ queue_Remove(&vp->vlru);
+ volume_LRU.q[VLRU_QUEUE_NEW].len--;
+ queue_Prepend(&volume_LRU.q[VLRU_QUEUE_CANDIDATE], &vp->vlru);
+ vp->vlru.idx = VLRU_QUEUE_CANDIDATE;
+ volume_LRU.q[VLRU_QUEUE_CANDIDATE].len++;
+ ret = 1;
+ }
+
+ return ret;
+}
+
+
+/* begin exclusive access on VLRU */
+static void
+VLRU_BeginExclusive_r(struct VLRU_q * q)
+{
+ osi_Assert(q->busy == 0);
+ q->busy = 1;
+}
+
+/* end exclusive access on VLRU */
+static void
+VLRU_EndExclusive_r(struct VLRU_q * q)
+{
+ osi_Assert(q->busy);
+ q->busy = 0;
+ CV_BROADCAST(&q->cv);
+}
+
+/* wait for another thread to end exclusive access on VLRU */
+static void
+VLRU_Wait_r(struct VLRU_q * q)
+{
+ while(q->busy) {
+ VOL_CV_WAIT(&q->cv);
+ }
+}
+
+/* demand attach fs
+ * volume soft detach
+ *
+ * caller MUST NOT hold a ref count on vp */
+static int
+VSoftDetachVolume_r(Volume * vp, afs_uint32 thresh)
+{
+ afs_uint32 ts_save;
+ int ret = 0;
+
+ osi_Assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+
+ ts_save = vp->stats.last_get;
+ if (ts_save > thresh)
+ return 0;
+
+ if (vp->nUsers || vp->nWaiters)
+ return 0;
+
+ if (VIsExclusiveState(V_attachState(vp))) {
+ return 0;
+ }
+
+ switch (V_attachState(vp)) {
+ case VOL_STATE_UNATTACHED:
+ case VOL_STATE_PREATTACHED:
+ case VOL_STATE_ERROR:
+ case VOL_STATE_GOING_OFFLINE:
+ case VOL_STATE_SHUTTING_DOWN:
+ case VOL_STATE_SALVAGING:
+ case VOL_STATE_DELETED:
+ volume_LRU.q[vp->vlru.idx].len--;
+
+ /* create and cancel a reservation to
+ * give the volume an opportunity to
+ * be deallocated */
+ VCreateReservation_r(vp);
+ queue_Remove(&vp->vlru);
+ vp->vlru.idx = VLRU_QUEUE_INVALID;
+ V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+ VCancelReservation_r(vp);
+ return 0;
+ default:
+ break;
+ }
+
+ /* hold the volume and take it offline.
+ * no need for reservations, as VHold_r
+ * takes care of that internally. */
+ if (VHold_r(vp) == 0) {
+ /* vhold drops the glock, so now we should
+ * check to make sure we aren't racing against
+ * other threads. if we are racing, offlining vp
+ * would be wasteful, and block the scanner for a while
+ */
+ if (vp->nWaiters ||
+ (vp->nUsers > 1) ||
+ (vp->shuttingDown) ||
+ (vp->goingOffline) ||
+ (vp->stats.last_get != ts_save)) {
+ /* looks like we're racing someone else. bail */
+ VPutVolume_r(vp);
+ vp = NULL;
+ } else {
+ /* pull it off the VLRU */
+ osi_Assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+ volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--;
+ queue_Remove(&vp->vlru);
+ vp->vlru.idx = VLRU_QUEUE_INVALID;
+ V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+
+ /* take if offline */
+ VOffline_r(vp, "volume has been soft detached");
+
+ /* invalidate the volume header cache */
+ FreeVolumeHeader(vp);
+
+ /* update stats */
+ IncUInt64(&VStats.soft_detaches);
+ vp->stats.soft_detaches++;
+
+ /* put in pre-attached state so demand
+ * attacher can work on it */
+ VChangeState_r(vp, VOL_STATE_PREATTACHED);
+ ret = 1;
+ }
+ }
+ return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume Header Cache routines */
+/***************************************************/
+
+/**
+ * volume header cache.
+ */
+struct volume_hdr_LRU_t volume_hdr_LRU;
+
+/**
+ * initialize the volume header cache.
+ *
+ * @param[in] howMany number of header cache entries to preallocate
+ *
+ * @pre VOL_LOCK held. Function has never been called before.
+ *
+ * @post howMany cache entries are allocated, initialized, and added
+ * to the LRU list. Header cache statistics are initialized.
+ *
+ * @note only applicable to fileServer program type. Should only be
+ * called once during volume package initialization.
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VInitVolumeHeaderCache(afs_uint32 howMany)
+{
+ struct volHeader *hp;
+ if (programType != fileServer)
+ return;
+ queue_Init(&volume_hdr_LRU);
+ volume_hdr_LRU.stats.free = 0;
+ volume_hdr_LRU.stats.used = howMany;
+ volume_hdr_LRU.stats.attached = 0;
+ hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));
+ osi_Assert(hp != NULL);
+
+ while (howMany--)
+ /* We are using ReleaseVolumeHeader to initialize the values on the header list
+ * to ensure they have the right values
+ */
+ ReleaseVolumeHeader(hp++);
+}
+
+/**
+ * get a volume header and attach it to the volume object.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @return cache entry status
+ * @retval 0 volume header was newly attached; cache data is invalid
+ * @retval 1 volume header was previously attached; cache data is valid
+ *
+ * @pre VOL_LOCK held. For DAFS, lightweight ref must be held on volume object.
+ *
+ * @post volume header attached to volume object. if necessary, header cache
+ * entry on LRU is synchronized to disk. Header is removed from LRU list.
+ *
+ * @note VOL_LOCK may be dropped
+ *
+ * @warning this interface does not load header data from disk. it merely
+ * attaches a header object to the volume object, and may sync the old
+ * header cache data out to disk in the process.
+ *
+ * @internal volume package internal use only.
+ */
+static int
+GetVolumeHeader(Volume * vp)
+{
+ Error error;
+ struct volHeader *hd;
+ int old;
+ static int everLogged = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState vp_save = 0, back_save = 0;
+
+ /* XXX debug 9/19/05 we've apparently got
+ * a ref counting bug somewhere that's
+ * breaking the nUsers == 0 => header on LRU
+ * assumption */
+ if (vp->header && queue_IsNotOnQueue(vp->header)) {
+ Log("nUsers == 0, but header not on LRU\n");
+ return 1;
+ }
+#endif
+
+ old = (vp->header != NULL); /* old == volume already has a header */
+
+ if (programType != fileServer) {
+ /* for volume utilities, we allocate volHeaders as needed */
+ if (!vp->header) {
+ hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
+ osi_Assert(hd != NULL);
+ vp->header = hd;
+ hd->back = vp;
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+#endif
+ }
+ } else {
+ /* for the fileserver, we keep a volume header cache */
+ if (old) {
+ /* the header we previously dropped in the lru is
+ * still available. pull it off the lru and return */
+ hd = vp->header;
+ queue_Remove(hd);
+ osi_Assert(hd->back == vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(vp) &= ~(VOL_HDR_IN_LRU);
+#endif
+ } else {
+ /* we need to grab a new element off the LRU */
+ if (queue_IsNotEmpty(&volume_hdr_LRU)) {
+ /* grab an element and pull off of LRU */
+ hd = queue_First(&volume_hdr_LRU, volHeader);
+ queue_Remove(hd);
} else {
- VOL_UNLOCK;
- while (vp->specialStatus == VBUSY)
-#ifdef AFS_PTHREAD_ENV
- sleep(2);
-#else /* AFS_PTHREAD_ENV */
- IOMGR_Sleep(2);
-#endif /* AFS_PTHREAD_ENV */
- VOL_LOCK;
+ /* LRU is empty, so allocate a new volHeader
+ * this is probably indicative of a leak, so let the user know */
+ hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
+ osi_Assert(hd != NULL);
+ if (!everLogged) {
+ Log("****Allocated more volume headers, probably leak****\n");
+ everLogged = 1;
+ }
+ volume_hdr_LRU.stats.free++;
}
- }
- if (!index->bitmap) {
- vp->specialStatus = VBUSY; /* Stop anyone else from using it. */
- for (i = 0; i < nVNODECLASSES; i++) {
- VOL_UNLOCK;
- GetBitmap(ec, vp, i);
- VOL_LOCK;
- if (*ec) {
- vp->specialStatus = 0;
- vp->shuttingDown = 1; /* Let who has it free it. */
- return NULL;
+ if (hd->back) {
+ /* this header used to belong to someone else.
+ * we'll need to check if the header needs to
+ * be sync'd out to disk */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* if hd->back were in an exclusive state, then
+ * its volHeader would not be on the LRU... */
+ osi_Assert(!VIsExclusiveState(V_attachState(hd->back)));
+#endif
+
+ if (hd->diskstuff.inUse) {
+ /* volume was in use, so we'll need to sync
+ * its header to disk */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING);
+ vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING);
+ VCreateReservation_r(hd->back);
+ VOL_UNLOCK;
+#endif
+
+ WriteVolumeHeader_r(&error, hd->back);
+ /* Ignore errors; catch them later */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ VOL_LOCK;
+#endif
+ }
+
+ hd->back->header = NULL;
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU);
+
+ if (hd->diskstuff.inUse) {
+ VChangeState_r(hd->back, back_save);
+ VCancelReservation_r(hd->back);
+ VChangeState_r(vp, vp_save);
}
+#endif
+ } else {
+ volume_hdr_LRU.stats.attached++;
}
- if (!wasVBUSY)
- vp->specialStatus = 0; /* Allow others to have access. */
+ hd->back = vp;
+ vp->header = hd;
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+#endif
}
+ volume_hdr_LRU.stats.free--;
+ volume_hdr_LRU.stats.used++;
}
-#endif /* BITMAP_LATER */
- bp = index->bitmap + index->bitmapOffset;
- ep = index->bitmap + index->bitmapSize;
- while (bp < ep) {
- if ((*(bit32 *) bp) != (bit32) 0xffffffff) {
- int o;
- index->bitmapOffset = (afs_uint32) (bp - index->bitmap);
- while (*bp == 0xff)
- bp++;
- o = ffs(~*bp) - 1; /* ffs is documented in BSTRING(3) */
- *bp |= (1 << o);
- return (VnodeId) ((bp - index->bitmap) * 8 + o);
+ IncUInt64(&VStats.hdr_gets);
+#ifdef AFS_DEMAND_ATTACH_FS
+ IncUInt64(&vp->stats.hdr_gets);
+ vp->stats.last_hdr_get = FT_ApproxTime();
+#endif
+ return old;
+}
+
+
+/**
+ * make sure volume header is attached and contains valid cache data.
+ *
+ * @param[out] ec outbound error code
+ * @param[in] vp pointer to volume object
+ *
+ * @pre VOL_LOCK held. For DAFS, lightweight ref held on vp.
+ *
+ * @post header cache entry attached, and loaded with valid data, or
+ * *ec is nonzero, and the header is released back into the LRU.
+ *
+ * @internal volume package internal use only.
+ */
+static void
+LoadVolumeHeader(Error * ec, Volume * vp)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+ VolState state_save;
+ afs_uint32 now;
+ *ec = 0;
+
+ if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
+ IncUInt64(&VStats.hdr_loads);
+ state_save = VChangeState_r(vp, VOL_STATE_HDR_LOADING);
+ VOL_UNLOCK;
+
+ ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+ sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
+ VOLUMEINFOVERSION);
+ IncUInt64(&vp->stats.hdr_loads);
+ now = FT_ApproxTime();
+
+ VOL_LOCK;
+ if (!*ec) {
+ V_attachFlags(vp) |= VOL_HDR_LOADED;
+ vp->stats.last_hdr_load = now;
}
- bp += sizeof(bit32) /* i.e. 4 */ ;
+ VChangeState_r(vp, state_save);
}
- /* No bit map entry--must grow bitmap */
- bp = (byte *)
- realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE);
- assert(bp != NULL);
- index->bitmap = bp;
- bp += index->bitmapSize;
- memset(bp, 0, VOLUME_BITMAP_GROWSIZE);
- index->bitmapOffset = index->bitmapSize;
- index->bitmapSize += VOLUME_BITMAP_GROWSIZE;
- *bp = 1;
- return index->bitmapOffset * 8;
+#else /* AFS_DEMAND_ATTACH_FS */
+ *ec = 0;
+ if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
+ IncUInt64(&VStats.hdr_loads);
+
+ ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+ sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
+ VOLUMEINFOVERSION);
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+ if (*ec) {
+ /* maintain (nUsers==0) => header in LRU invariant */
+ FreeVolumeHeader(vp);
+ }
+}
+
+/**
+ * release a header cache entry back into the LRU list.
+ *
+ * @param[in] hd pointer to volume header cache object
+ *
+ * @pre VOL_LOCK held.
+ *
+ * @post header cache object appended onto end of LRU list.
+ *
+ * @note only applicable to fileServer program type.
+ *
+ * @note used to place a header cache entry back into the
+ * LRU pool without invalidating it as a cache entry.
+ *
+ * @internal volume package internal use only.
+ */
+static void
+ReleaseVolumeHeader(struct volHeader *hd)
+{
+ if (programType != fileServer)
+ return;
+ if (!hd || queue_IsOnQueue(hd)) /* no header, or header already released */
+ return;
+ queue_Append(&volume_hdr_LRU, hd);
+#ifdef AFS_DEMAND_ATTACH_FS
+ if (hd->back) {
+ V_attachFlags(hd->back) |= VOL_HDR_IN_LRU;
+ }
+#endif
+ volume_hdr_LRU.stats.free++;
+ volume_hdr_LRU.stats.used--;
+}
+
+/**
+ * free/invalidate a volume header cache entry.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post For fileserver, header cache entry is returned to LRU, and it is
+ * invalidated as a cache entry. For volume utilities, the header
+ * cache entry is freed.
+ *
+ * @note For fileserver, this should be utilized instead of ReleaseVolumeHeader
+ * whenever it is necessary to invalidate the header cache entry.
+ *
+ * @see ReleaseVolumeHeader
+ *
+ * @internal volume package internal use only.
+ */
+static void
+FreeVolumeHeader(Volume * vp)
+{
+ struct volHeader *hd = vp->header;
+ if (!hd)
+ return;
+ if (programType == fileServer) {
+ ReleaseVolumeHeader(hd);
+ hd->back = NULL;
+ } else {
+ free(hd);
+ }
+#ifdef AFS_DEMAND_ATTACH_FS
+ V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED);
+#endif
+ volume_hdr_LRU.stats.attached--;
+ vp->header = NULL;
+}
+
+
+/***************************************************/
+/* Volume Hash Table routines */
+/***************************************************/
+
+/**
+ * set size of volume object hash table.
+ *
+ * @param[in] logsize log(2) of desired hash table size
+ *
+ * @return operation status
+ * @retval 0 success
+ * @retval -1 failure
+ *
+ * @pre MUST be called prior to VInitVolumePackage2
+ *
+ * @post Volume Hash Table will have 2^logsize buckets
+ */
+int
+VSetVolHashSize(int logsize)
+{
+ /* 64 to 268435456 hash buckets seems like a reasonable range */
+ if ((logsize < 6 ) || (logsize > 28)) {
+ return -1;
+ }
+
+ if (!VInit) {
+ VolumeHashTable.Size = 1 << logsize;
+ VolumeHashTable.Mask = VolumeHashTable.Size - 1;
+ } else {
+ /* we can't yet support runtime modification of this
+ * parameter. we'll need a configuration rwlock to
+ * make runtime modification feasible.... */
+ return -1;
+ }
+ return 0;
+}
+
+/**
+ * initialize dynamic data structures for volume hash table.
+ *
+ * @post hash table is allocated, and fields are initialized.
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VInitVolumeHash(void)
+{
+ int i;
+
+ VolumeHashTable.Table = (VolumeHashChainHead *) calloc(VolumeHashTable.Size,
+ sizeof(VolumeHashChainHead));
+ osi_Assert(VolumeHashTable.Table != NULL);
+
+ for (i=0; i < VolumeHashTable.Size; i++) {
+ queue_Init(&VolumeHashTable.Table[i]);
+#ifdef AFS_DEMAND_ATTACH_FS
+ CV_INIT(&VolumeHashTable.Table[i].chain_busy_cv, "vhash busy", CV_DEFAULT, 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+ }
+}
+
+/**
+ * add a volume object to the hash table.
+ *
+ * @param[in] vp pointer to volume object
+ * @param[in] hashid hash of volume id
+ *
+ * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight
+ * reference on vp.
+ *
+ * @post volume is added to hash chain.
+ *
+ * @internal volume package internal use only.
+ *
+ * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
+ * asynchronous hash chain reordering to finish.
+ */
+static void
+AddVolumeToHashTable(Volume * vp, int hashid)
+{
+ VolumeHashChainHead * head;
+
+ if (queue_IsOnQueue(vp))
+ return;
+
+ head = &VolumeHashTable.Table[VOLUME_HASH(hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* wait for the hash chain to become available */
+ VHashWait_r(head);
+
+ V_attachFlags(vp) |= VOL_IN_HASH;
+ vp->chainCacheCheck = ++head->cacheCheck;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ head->len++;
+ vp->hashid = hashid;
+ queue_Append(head, vp);
+ vp->vnodeHashOffset = VolumeHashOffset_r();
}
-VnodeId
-VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
-{
- VnodeId retVal;
+/**
+ * delete a volume object from the hash table.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight
+ * reference on vp.
+ *
+ * @post volume is removed from hash chain.
+ *
+ * @internal volume package internal use only.
+ *
+ * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
+ * asynchronous hash chain reordering to finish.
+ */
+static void
+DeleteVolumeFromHashTable(Volume * vp)
+{
+ VolumeHashChainHead * head;
+
+ if (!queue_IsOnQueue(vp))
+ return;
+
+ head = &VolumeHashTable.Table[VOLUME_HASH(vp->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* wait for the hash chain to become available */
+ VHashWait_r(head);
+
+ V_attachFlags(vp) &= ~(VOL_IN_HASH);
+ head->cacheCheck++;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ head->len--;
+ queue_Remove(vp);
+ /* do NOT reset hashid to zero, as the online
+ * salvager package may need to know the volume id
+ * after the volume is removed from the hash */
+}
+
+/**
+ * lookup a volume object in the hash table given a volume id.
+ *
+ * @param[out] ec error code return
+ * @param[in] volumeId volume id
+ * @param[in] hint volume object which we believe could be the correct
+ mapping
+ *
+ * @return volume object pointer
+ * @retval NULL no such volume id is registered with the hash table.
+ *
+ * @pre VOL_LOCK is held. For DAFS, caller must hold a lightweight
+ ref on hint.
+ *
+ * @post volume object with the given id is returned. volume object and
+ * hash chain access statistics are updated. hash chain may have
+ * been reordered.
+ *
+ * @note For DAFS, VOL_LOCK may be dropped in order to wait for an
+ * asynchronous hash chain reordering operation to finish, or
+ * in order for us to perform an asynchronous chain reordering.
+ *
+ * @note Hash chain reorderings occur when the access count for the
+ * volume object being looked up exceeds the sum of the previous
+ * node's (the node ahead of it in the hash chain linked list)
+ * access count plus the constant VOLUME_HASH_REORDER_THRESHOLD.
+ *
+ * @note For DAFS, the hint parameter allows us to short-circuit if the
+ * cacheCheck fields match between the hash chain head and the
+ * hint volume object.
+ */
+Volume *
+VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
+{
+ int looks = 0;
+ Volume * vp, *np;
+#ifdef AFS_DEMAND_ATTACH_FS
+ Volume *pp;
+#endif
+ VolumeHashChainHead * head;
+ *ec = 0;
+
+ head = &VolumeHashTable.Table[VOLUME_HASH(volumeId)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* wait for the hash chain to become available */
+ VHashWait_r(head);
+
+ /* check to see if we can short circuit without walking the hash chain */
+ if (hint && (hint->chainCacheCheck == head->cacheCheck)) {
+ IncUInt64(&hint->stats.hash_short_circuits);
+ return hint;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ /* someday we need to either do per-chain locks, RWlocks,
+ * or both for volhash access.
+ * (and move to a data structure with better cache locality) */
+
+ /* search the chain for this volume id */
+ for(queue_Scan(head, vp, np, Volume)) {
+ looks++;
+ if ((vp->hashid == volumeId)) {
+ break;
+ }
+ }
+
+ if (queue_IsEnd(head, vp)) {
+ vp = NULL;
+ }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+ /* update hash chain statistics */
+ {
+ afs_uint64 lks;
+ FillInt64(lks, 0, looks);
+ AddUInt64(head->looks, lks, &head->looks);
+ AddUInt64(VStats.hash_looks, lks, &VStats.hash_looks);
+ IncUInt64(&head->gets);
+ }
+
+ if (vp) {
+ afs_uint64 thresh;
+ IncUInt64(&vp->stats.hash_lookups);
+
+ /* for demand attach fileserver, we permit occasional hash chain reordering
+ * so that frequently looked up volumes move towards the head of the chain */
+ pp = queue_Prev(vp, Volume);
+ if (!queue_IsEnd(head, pp)) {
+ FillInt64(thresh, 0, VOLUME_HASH_REORDER_THRESHOLD);
+ AddUInt64(thresh, pp->stats.hash_lookups, &thresh);
+ if (GEInt64(vp->stats.hash_lookups, thresh)) {
+ VReorderHash_r(head, pp, vp);
+ }
+ }
+
+ /* update the short-circuit cache check */
+ vp->chainCacheCheck = head->cacheCheck;
+ }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ return vp;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* perform volume hash chain reordering.
+ *
+ * advance a subchain beginning at vp ahead of
+ * the adjacent subchain ending at pp */
+static void
+VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp)
+{
+ Volume *tp, *np, *lp;
+ afs_uint64 move_thresh;
+
+ /* this should never be called if the chain is already busy, so
+ * no need to wait for other exclusive chain ops to finish */
+
+ /* this is a rather heavy set of operations,
+ * so let's set the chain busy flag and drop
+ * the vol_glock */
+ VHashBeginExclusive_r(head);
+ VOL_UNLOCK;
+
+ /* scan forward in the chain from vp looking for the last element
+ * in the chain we want to advance */
+ FillInt64(move_thresh, 0, VOLUME_HASH_REORDER_CHAIN_THRESH);
+ AddUInt64(move_thresh, pp->stats.hash_lookups, &move_thresh);
+ for(queue_ScanFrom(head, vp, tp, np, Volume)) {
+ if (LTInt64(tp->stats.hash_lookups, move_thresh)) {
+ break;
+ }
+ }
+ lp = queue_Prev(tp, Volume);
+
+ /* scan backwards from pp to determine where to splice and
+ * insert the subchain we're advancing */
+ for(queue_ScanBackwardsFrom(head, pp, tp, np, Volume)) {
+ if (GTInt64(tp->stats.hash_lookups, move_thresh)) {
+ break;
+ }
+ }
+ tp = queue_Next(tp, Volume);
+
+ /* rebalance chain(vp,...,lp) ahead of chain(tp,...,pp) */
+ queue_MoveChainBefore(tp,vp,lp);
+
VOL_LOCK;
- retVal = VAllocBitmapEntry_r(ec, vp, index);
- VOL_UNLOCK;
- return retVal;
+ IncUInt64(&VStats.hash_reorders);
+ head->cacheCheck++;
+ IncUInt64(&head->reorders);
+
+ /* wake up any threads waiting for the hash chain */
+ VHashEndExclusive_r(head);
}
-void
-VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
- unsigned bitNumber)
+
+/* demand-attach fs volume hash
+ * asynchronous exclusive operations */
+
+/**
+ * begin an asynchronous exclusive operation on a volume hash chain.
+ *
+ * @param[in] head pointer to volume hash chain head object
+ *
+ * @pre VOL_LOCK held. hash chain is quiescent.
+ *
+ * @post hash chain marked busy.
+ *
+ * @note this interface is used in conjunction with VHashEndExclusive_r and
+ * VHashWait_r to perform asynchronous (wrt VOL_LOCK) operations on a
+ * volume hash chain. Its main use case is hash chain reordering, which
+ * has the potential to be a highly latent operation.
+ *
+ * @see VHashEndExclusive_r
+ * @see VHashWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VHashBeginExclusive_r(VolumeHashChainHead * head)
{
- unsigned int offset;
- *ec = 0;
-#ifdef BITMAP_LATER
- if (!index->bitmap)
- return;
-#endif /* BITMAP_LATER */
- offset = bitNumber >> 3;
- if (offset >= index->bitmapSize) {
- *ec = VNOVNODE;
- return;
- }
- if (offset < index->bitmapOffset)
- index->bitmapOffset = offset & ~3; /* Truncate to nearest bit32 */
- *(index->bitmap + offset) &= ~(1 << (bitNumber & 0x7));
+ osi_Assert(head->busy == 0);
+ head->busy = 1;
}
-void
-VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
- unsigned bitNumber)
+/**
+ * relinquish exclusive ownership of a volume hash chain.
+ *
+ * @param[in] head pointer to volume hash chain head object
+ *
+ * @pre VOL_LOCK held. thread owns the hash chain exclusively.
+ *
+ * @post hash chain is marked quiescent. threads awaiting use of
+ * chain are awakened.
+ *
+ * @see VHashBeginExclusive_r
+ * @see VHashWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VHashEndExclusive_r(VolumeHashChainHead * head)
{
- VOL_LOCK;
- VFreeBitMapEntry_r(ec, index, bitNumber);
- VOL_UNLOCK;
+ osi_Assert(head->busy);
+ head->busy = 0;
+ CV_BROADCAST(&head->chain_busy_cv);
}
-void
-VUpdateVolume_r(Error * ec, Volume * vp)
+/**
+ * wait for all asynchronous operations on a hash chain to complete.
+ *
+ * @param[in] head pointer to volume hash chain head object
+ *
+ * @pre VOL_LOCK held.
+ *
+ * @post hash chain object is quiescent.
+ *
+ * @see VHashBeginExclusive_r
+ * @see VHashEndExclusive_r
+ *
+ * @note DAFS only
+ *
+ * @note This interface should be called before any attempt to
+ * traverse the hash chain. It is permissible for a thread
+ * to gain exclusive access to the chain, and then perform
+ * latent operations on the chain asynchronously wrt the
+ * VOL_LOCK.
+ *
+ * @warning if waiting is necessary, VOL_LOCK is dropped
+ *
+ * @internal volume package internal use only.
+ */
+static void
+VHashWait_r(VolumeHashChainHead * head)
{
- *ec = 0;
- if (programType == fileServer)
- V_uniquifier(vp) =
- (V_inUse(vp) ? V_nextVnodeUnique(vp) +
- 200 : V_nextVnodeUnique(vp));
- /*printf("Writing volume header for '%s'\n", V_name(vp)); */
- WriteVolumeHeader_r(ec, vp);
- if (*ec) {
- Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
- V_id(vp), V_name(vp));
- VForceOffline_r(vp);
+ while (head->busy) {
+ VOL_CV_WAIT(&head->chain_busy_cv);
}
}
+#endif /* AFS_DEMAND_ATTACH_FS */
-void
-VUpdateVolume(Error * ec, Volume * vp)
+
+/***************************************************/
+/* Volume by Partition List routines */
+/***************************************************/
+
+/*
+ * demand attach fileserver adds a
+ * linked list of volumes to each
+ * partition object, thus allowing
+ * for quick enumeration of all
+ * volumes on a partition
+ */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/**
+ * add a volume to its disk partition VByPList.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre either the disk partition VByPList is owned exclusively
+ * by the calling thread, or the list is quiescent and
+ * VOL_LOCK is held.
+ *
+ * @post volume is added to disk partition VByPList
+ *
+ * @note DAFS only
+ *
+ * @warning it is the caller's responsibility to ensure list
+ * quiescence.
+ *
+ * @see VVByPListWait_r
+ * @see VVByPListBeginExclusive_r
+ * @see VVByPListEndExclusive_r
+ *
+ * @internal volume package internal use only.
+ */
+static void
+AddVolumeToVByPList_r(Volume * vp)
{
- VOL_LOCK;
- VUpdateVolume_r(ec, vp);
- VOL_UNLOCK;
+ if (queue_IsNotOnQueue(&vp->vol_list)) {
+ queue_Append(&vp->partition->vol_list, &vp->vol_list);
+ V_attachFlags(vp) |= VOL_ON_VBYP_LIST;
+ vp->partition->vol_list.len++;
+ }
}
-void
-VSyncVolume_r(Error * ec, Volume * vp)
+/**
+ * delete a volume from its disk partition VByPList.
+ *
+ * @param[in] vp pointer to volume object
+ *
+ * @pre either the disk partition VByPList is owned exclusively
+ * by the calling thread, or the list is quiescent and
+ * VOL_LOCK is held.
+ *
+ * @post volume is removed from the disk partition VByPList
+ *
+ * @note DAFS only
+ *
+ * @warning it is the caller's responsibility to ensure list
+ * quiescence.
+ *
+ * @see VVByPListWait_r
+ * @see VVByPListBeginExclusive_r
+ * @see VVByPListEndExclusive_r
+ *
+ * @internal volume package internal use only.
+ */
+static void
+DeleteVolumeFromVByPList_r(Volume * vp)
{
- FdHandle_t *fdP;
- VUpdateVolume_r(ec, vp);
- if (!ec) {
- int code;
- fdP = IH_OPEN(V_diskDataHandle(vp));
- assert(fdP != NULL);
- code = FDH_SYNC(fdP);
- assert(code == 0);
- FDH_CLOSE(fdP);
+ if (queue_IsOnQueue(&vp->vol_list)) {
+ queue_Remove(&vp->vol_list);
+ V_attachFlags(vp) &= ~(VOL_ON_VBYP_LIST);
+ vp->partition->vol_list.len--;
}
}
-void
-VSyncVolume(Error * ec, Volume * vp)
+/**
+ * begin an asynchronous exclusive operation on a VByPList.
+ *
+ * @param[in] dp pointer to disk partition object
+ *
+ * @pre VOL_LOCK held. VByPList is quiescent.
+ *
+ * @post VByPList marked busy.
+ *
+ * @note this interface is used in conjunction with VVByPListEndExclusive_r and
+ * VVByPListWait_r to perform asynchronous (wrt VOL_LOCK) operations on a
+ * VByPList.
+ *
+ * @see VVByPListEndExclusive_r
+ * @see VVByPListWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
+/* take exclusive control over the list */
+static void
+VVByPListBeginExclusive_r(struct DiskPartition64 * dp)
{
- VOL_LOCK;
- VSyncVolume_r(ec, vp);
- VOL_UNLOCK;
+ osi_Assert(dp->vol_list.busy == 0);
+ dp->vol_list.busy = 1;
}
+/**
+ * relinquish exclusive ownership of a VByPList.
+ *
+ * @param[in] dp pointer to disk partition object
+ *
+ * @pre VOL_LOCK held. thread owns the VByPList exclusively.
+ *
+ * @post VByPList is marked quiescent. threads awaiting use of
+ * the list are awakened.
+ *
+ * @see VVByPListBeginExclusive_r
+ * @see VVByPListWait_r
+ *
+ * @note DAFS only
+ *
+ * @internal volume package internal use only.
+ */
static void
-FreeVolume(Volume * vp)
+VVByPListEndExclusive_r(struct DiskPartition64 * dp)
{
- int i;
- if (!vp)
- return;
- for (i = 0; i < nVNODECLASSES; i++)
- if (vp->vnodeIndex[i].bitmap)
- free(vp->vnodeIndex[i].bitmap);
- FreeVolumeHeader(vp);
- DeleteVolumeFromHashTable(vp);
- free(vp);
+ osi_Assert(dp->vol_list.busy);
+ dp->vol_list.busy = 0;
+ CV_BROADCAST(&dp->vol_list.cv);
}
+/**
+ * wait for all asynchronous operations on a VByPList to complete.
+ *
+ * @param[in] dp pointer to disk partition object
+ *
+ * @pre VOL_LOCK is held.
+ *
+ * @post disk partition's VByP list is quiescent
+ *
+ * @note DAFS only
+ *
+ * @note This interface should be called before any attempt to
+ * traverse the VByPList. It is permissible for a thread
+ * to gain exclusive access to the list, and then perform
+ * latent operations on the list asynchronously wrt the
+ * VOL_LOCK.
+ *
+ * @warning if waiting is necessary, VOL_LOCK is dropped
+ *
+ * @see VVByPListEndExclusive_r
+ * @see VVByPListBeginExclusive_r
+ *
+ * @internal volume package internal use only.
+ */
static void
-GetBitmap(Error * ec, Volume * vp, VnodeClass class)
+VVByPListWait_r(struct DiskPartition64 * dp)
{
- StreamHandle_t *file;
- int nVnodes;
- int size;
- struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
- struct vnodeIndex *vip = &vp->vnodeIndex[class];
- struct VnodeDiskObject *vnode;
- unsigned int unique = 0;
- FdHandle_t *fdP;
-#ifdef BITMAP_LATER
- byte *BitMap = 0;
-#endif /* BITMAP_LATER */
-
- *ec = 0;
-
- fdP = IH_OPEN(vip->handle);
- assert(fdP != NULL);
- file = FDH_FDOPEN(fdP, "r");
- assert(file != NULL);
- vnode = (VnodeDiskObject *) malloc(vcp->diskSize);
- assert(vnode != NULL);
- size = OS_SIZE(fdP->fd_fd);
- assert(size != -1);
- nVnodes = (size <= vcp->diskSize ? 0 : size - vcp->diskSize)
- >> vcp->logSize;
- vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4; /* The 10 is a little extra so
- * a few files can be created in this volume,
- * the whole thing is rounded up to nearest 4
- * bytes, because the bit map allocator likes
- * it that way */
-#ifdef BITMAP_LATER
- BitMap = (byte *) calloc(1, vip->bitmapSize);
- assert(BitMap != NULL);
-#else /* BITMAP_LATER */
- vip->bitmap = (byte *) calloc(1, vip->bitmapSize);
- assert(vip->bitmap != NULL);
- vip->bitmapOffset = 0;
-#endif /* BITMAP_LATER */
- if (STREAM_SEEK(file, vcp->diskSize, 0) != -1) {
- int bitNumber = 0;
- for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) {
- if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1)
- break;
- if (vnode->type != vNull) {
- if (vnode->vnodeMagic != vcp->magic) {
- Log("GetBitmap: addled vnode index in volume %s; volume needs salvage\n", V_name(vp));
- *ec = VSALVAGE;
- break;
- }
-#ifdef BITMAP_LATER
- *(BitMap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
-#else /* BITMAP_LATER */
- *(vip->bitmap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
-#endif /* BITMAP_LATER */
- if (unique <= vnode->uniquifier)
- unique = vnode->uniquifier + 1;
- }
-#ifndef AFS_PTHREAD_ENV
- if ((bitNumber & 0x00ff) == 0x0ff) { /* every 256 iterations */
- IOMGR_Poll();
- }
-#endif /* !AFS_PTHREAD_ENV */
- }
- }
- if (vp->nextVnodeUnique < unique) {
- Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
- *ec = VSALVAGE;
+ while (dp->vol_list.busy) {
+ VOL_CV_WAIT(&dp->vol_list.cv);
}
- /* Paranoia, partly justified--I think fclose after fdopen
- * doesn't seem to close fd. In any event, the documentation
- * doesn't specify, so it's safer to close it twice.
- */
- STREAM_CLOSE(file);
- FDH_CLOSE(fdP);
- free(vnode);
-#ifdef BITMAP_LATER
- /* There may have been a racing condition with some other thread, both
- * creating the bitmaps for this volume. If the other thread was faster
- * the pointer to bitmap should already be filled and we can free ours.
- */
- if (vip->bitmap == NULL) {
- vip->bitmap = BitMap;
- vip->bitmapOffset = 0;
- } else
- free((byte *) BitMap);
-#endif /* BITMAP_LATER */
}
+#endif /* AFS_DEMAND_ATTACH_FS */
-static void
-GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
-{
- static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
- char path[VMAXPATHLEN];
- int found = 0;
- struct DiskPartition *dp;
+/***************************************************/
+/* Volume Cache Statistics routines */
+/***************************************************/
- *ec = 0;
- name[0] = '/';
- (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, volumeId);
- for (dp = DiskPartitionList; dp; dp = dp->next) {
- struct afs_stat status;
- strcpy(path, VPartitionPath(dp));
- strcat(path, name);
- if (afs_stat(path, &status) == 0) {
- strcpy(partition, dp->name);
- found = 1;
- break;
- }
- }
- if (!found) {
- *ec = VNOVOL;
- *partitionp = *namep = NULL;
- } else {
- *partitionp = partition;
- *namep = name;
- }
+void
+VPrintCacheStats_r(void)
+{
+ afs_uint32 get_hi, get_lo, load_hi, load_lo;
+ struct VnodeClassInfo *vcp;
+ vcp = &VnodeClassInfo[vLarge];
+ Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
+ vcp = &VnodeClassInfo[vSmall];
+ Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
+ SplitInt64(VStats.hdr_gets, get_hi, get_lo);
+ SplitInt64(VStats.hdr_loads, load_hi, load_lo);
+ Log("Volume header cache, %d entries, %d gets, %d replacements\n",
+ VStats.hdr_cache_size, get_lo, load_lo);
}
-int
-VolumeNumber(char *name)
+void
+VPrintCacheStats(void)
{
- if (*name == '/')
- name++;
- return atoi(name + 1);
+ VOL_LOCK;
+ VPrintCacheStats_r();
+ VOL_UNLOCK;
}
-char *
-VolumeExternalName(VolumeId volumeId)
+#ifdef AFS_DEMAND_ATTACH_FS
+static double
+UInt64ToDouble(afs_uint64 * x)
{
- static char name[VMAXPATHLEN];
- (void)afs_snprintf(name, sizeof name, VFORMAT, volumeId);
- return name;
+ static double c32 = 4.0 * 1.073741824 * 1000000000.0;
+ afs_uint32 h, l;
+ SplitInt64(*x, h, l);
+ return (((double)h) * c32) + ((double) l);
}
-#if OPENAFS_VOL_STATS
-#define OneDay (86400) /* 24 hours' worth of seconds */
-#else
-#define OneDay (24*60*60) /* 24 hours */
-#endif /* OPENAFS_VOL_STATS */
+static char *
+DoubleToPrintable(double x, char * buf, int len)
+{
+ static double billion = 1000000000.0;
+ afs_uint32 y[3];
-#define Midnight(date) ((date-TimeZoneCorrection)/OneDay*OneDay+TimeZoneCorrection)
+ y[0] = (afs_uint32) (x / (billion * billion));
+ y[1] = (afs_uint32) ((x - (((double)y[0]) * billion * billion)) / billion);
+ y[2] = (afs_uint32) (x - ((((double)y[0]) * billion * billion) + (((double)y[1]) * billion)));
-/*------------------------------------------------------------------------
- * [export] VAdjustVolumeStatistics
- *
- * Description:
- * If we've passed midnight, we need to update all the day use
- * statistics as well as zeroing the detailed volume statistics
- * (if we are implementing them).
+ if (y[0]) {
+ snprintf(buf, len, "%d%09d%09d", y[0], y[1], y[2]);
+ } else if (y[1]) {
+ snprintf(buf, len, "%d%09d", y[1], y[2]);
+ } else {
+ snprintf(buf, len, "%d", y[2]);
+ }
+ buf[len-1] = '\0';
+ return buf;
+}
+
+struct VLRUExtStatsEntry {
+ VolumeId volid;
+};
+
+struct VLRUExtStats {
+ afs_uint32 len;
+ afs_uint32 used;
+ struct {
+ afs_uint32 start;
+ afs_uint32 len;
+ } queue_info[VLRU_QUEUE_INVALID];
+ struct VLRUExtStatsEntry * vec;
+};
+
+/**
+ * add a 256-entry fudge factor onto the vector in case state changes
+ * out from under us.
+ */
+#define VLRU_EXT_STATS_VEC_LEN_FUDGE 256
+
+/**
+ * collect extended statistics for the VLRU subsystem.
*
- * Arguments:
- * vp : Pointer to the volume structure describing the lucky
- * volume being considered for update.
+ * @param[out] stats pointer to stats structure to be populated
+ * @param[in] nvols number of volumes currently known to exist
*
- * Returns:
- * 0 (always!)
+ * @pre VOL_LOCK held
*
- * Environment:
- * Nothing interesting.
+ * @post stats->vec allocated and populated
*
- * Side Effects:
- * As described.
- *------------------------------------------------------------------------*/
-
-int
-VAdjustVolumeStatistics_r(register Volume * vp)
+ * @return operation status
+ * @retval 0 success
+ * @retval 1 failure
+ */
+static int
+VVLRUExtStats_r(struct VLRUExtStats * stats, afs_uint32 nvols)
{
- unsigned int now = FT_ApproxTime();
+ afs_uint32 cur, idx, len;
+ struct rx_queue * qp, * nqp;
+ Volume * vp;
+ struct VLRUExtStatsEntry * vec;
+
+ len = nvols + VLRU_EXT_STATS_VEC_LEN_FUDGE;
+ vec = stats->vec = calloc(len,
+ sizeof(struct VLRUExtStatsEntry));
+ if (vec == NULL) {
+ return 1;
+ }
- if (now - V_dayUseDate(vp) > OneDay) {
- register ndays, i;
+ cur = 0;
+ for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
+ VLRU_Wait_r(&volume_LRU.q[idx]);
+ VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+ VOL_UNLOCK;
- ndays = (now - V_dayUseDate(vp)) / OneDay;
- for (i = 6; i > ndays - 1; i--)
- V_weekUse(vp)[i] = V_weekUse(vp)[i - ndays];
- for (i = 0; i < ndays - 1 && i < 7; i++)
- V_weekUse(vp)[i] = 0;
- if (ndays <= 7)
- V_weekUse(vp)[ndays - 1] = V_dayUse(vp);
- V_dayUse(vp) = 0;
- V_dayUseDate(vp) = Midnight(now);
+ stats->queue_info[idx].start = cur;
-#if OPENAFS_VOL_STATS
- /*
- * All we need to do is bzero the entire VOL_STATS_BYTES of
- * the detailed volume statistics area.
- */
- memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
-#endif /* OPENAFS_VOL_STATS */
- }
+ for (queue_Scan(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+ if (cur == len) {
+ /* out of space in vec */
+ break;
+ }
+ vp = (Volume *)((char *)qp - offsetof(Volume, vlru));
+ vec[cur].volid = vp->hashid;
+ cur++;
+ }
- /*It's been more than a day of collection */
- /*
- * Always return happily.
- */
- return (0);
-} /*VAdjustVolumeStatistics */
+ stats->queue_info[idx].len = cur - stats->queue_info[idx].start;
-int
-VAdjustVolumeStatistics(register Volume * vp)
-{
- int retVal;
- VOL_LOCK;
- retVal = VAdjustVolumeStatistics_r(vp);
- VOL_UNLOCK;
- return retVal;
+ VOL_LOCK;
+ VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+ }
+
+ stats->len = len;
+ stats->used = cur;
+ return 0;
}
-void
-VBumpVolumeUsage_r(register Volume * vp)
+#define ENUMTOSTRING(en) #en
+#define ENUMCASE(en) \
+ case en: \
+ return ENUMTOSTRING(en); \
+ break
+
+static char *
+vlru_idx_to_string(int idx)
{
- unsigned int now = FT_ApproxTime();
- if (now - V_dayUseDate(vp) > OneDay)
- VAdjustVolumeStatistics_r(vp);
- /*
- * Save the volume header image to disk after every 128 bumps to dayUse.
- */
- if ((V_dayUse(vp)++ & 127) == 0) {
- Error error;
- VUpdateVolume_r(&error, vp);
+ switch (idx) {
+ ENUMCASE(VLRU_QUEUE_NEW);
+ ENUMCASE(VLRU_QUEUE_MID);
+ ENUMCASE(VLRU_QUEUE_OLD);
+ ENUMCASE(VLRU_QUEUE_CANDIDATE);
+ ENUMCASE(VLRU_QUEUE_HELD);
+ ENUMCASE(VLRU_QUEUE_INVALID);
+ default:
+ return "**UNKNOWN**";
}
}
void
-VBumpVolumeUsage(register Volume * vp)
+VPrintExtendedCacheStats_r(int flags)
{
- VOL_LOCK;
- VBumpVolumeUsage_r(vp);
- VOL_UNLOCK;
-}
+ int i;
+ afs_uint32 vol_sum = 0;
+ struct stats {
+ double min;
+ double max;
+ double sum;
+ double avg;
+ };
+ struct stats looks, gets, reorders, len;
+ struct stats ch_looks, ch_gets, ch_reorders;
+ char pr_buf[4][32];
+ VolumeHashChainHead *head;
+ Volume *vp, *np;
+ struct VLRUExtStats vlru_stats;
+
+ /* zero out stats */
+ memset(&looks, 0, sizeof(struct stats));
+ memset(&gets, 0, sizeof(struct stats));
+ memset(&reorders, 0, sizeof(struct stats));
+ memset(&len, 0, sizeof(struct stats));
+ memset(&ch_looks, 0, sizeof(struct stats));
+ memset(&ch_gets, 0, sizeof(struct stats));
+ memset(&ch_reorders, 0, sizeof(struct stats));
+
+ for (i = 0; i < VolumeHashTable.Size; i++) {
+ head = &VolumeHashTable.Table[i];
+
+ VHashWait_r(head);
+ VHashBeginExclusive_r(head);
+ VOL_UNLOCK;
-void
-VSetDiskUsage_r(void)
-{
- static int FifteenMinuteCounter = 0;
+ ch_looks.sum = UInt64ToDouble(&head->looks);
+ ch_gets.sum = UInt64ToDouble(&head->gets);
+ ch_reorders.sum = UInt64ToDouble(&head->reorders);
+
+ /* update global statistics */
+ {
+ looks.sum += ch_looks.sum;
+ gets.sum += ch_gets.sum;
+ reorders.sum += ch_reorders.sum;
+ len.sum += (double)head->len;
+ vol_sum += head->len;
+
+ if (i == 0) {
+ len.min = (double) head->len;
+ len.max = (double) head->len;
+ looks.min = ch_looks.sum;
+ looks.max = ch_looks.sum;
+ gets.min = ch_gets.sum;
+ gets.max = ch_gets.sum;
+ reorders.min = ch_reorders.sum;
+ reorders.max = ch_reorders.sum;
+ } else {
+ if (((double)head->len) < len.min)
+ len.min = (double) head->len;
+ if (((double)head->len) > len.max)
+ len.max = (double) head->len;
+ if (ch_looks.sum < looks.min)
+ looks.min = ch_looks.sum;
+ else if (ch_looks.sum > looks.max)
+ looks.max = ch_looks.sum;
+ if (ch_gets.sum < gets.min)
+ gets.min = ch_gets.sum;
+ else if (ch_gets.sum > gets.max)
+ gets.max = ch_gets.sum;
+ if (ch_reorders.sum < reorders.min)
+ reorders.min = ch_reorders.sum;
+ else if (ch_reorders.sum > reorders.max)
+ reorders.max = ch_reorders.sum;
+ }
+ }
- while (VInit < 2) {
- /* NOTE: Don't attempt to access the partitions list until the
- * initialization level indicates that all volumes are attached,
- * which implies that all partitions are initialized. */
-#ifdef AFS_PTHREAD_ENV
- sleep(10);
-#else /* AFS_PTHREAD_ENV */
- IOMGR_Sleep(10);
-#endif /* AFS_PTHREAD_ENV */
- }
+ if ((flags & VOL_STATS_PER_CHAIN2) && queue_IsNotEmpty(head)) {
+ /* compute detailed per-chain stats */
+ struct stats hdr_loads, hdr_gets;
+ double v_looks, v_loads, v_gets;
+
+ /* initialize stats with data from first element in chain */
+ vp = queue_First(head, Volume);
+ v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
+ v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
+ v_gets = UInt64ToDouble(&vp->stats.hdr_gets);
+ ch_gets.min = ch_gets.max = v_looks;
+ hdr_loads.min = hdr_loads.max = v_loads;
+ hdr_gets.min = hdr_gets.max = v_gets;
+ hdr_loads.sum = hdr_gets.sum = 0;
+
+ vp = queue_Next(vp, Volume);
+
+ /* pull in stats from remaining elements in chain */
+ for (queue_ScanFrom(head, vp, vp, np, Volume)) {
+ v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
+ v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
+ v_gets = UInt64ToDouble(&vp->stats.hdr_gets);
+
+ hdr_loads.sum += v_loads;
+ hdr_gets.sum += v_gets;
+
+ if (v_looks < ch_gets.min)
+ ch_gets.min = v_looks;
+ else if (v_looks > ch_gets.max)
+ ch_gets.max = v_looks;
+
+ if (v_loads < hdr_loads.min)
+ hdr_loads.min = v_loads;
+ else if (v_loads > hdr_loads.max)
+ hdr_loads.max = v_loads;
+
+ if (v_gets < hdr_gets.min)
+ hdr_gets.min = v_gets;
+ else if (v_gets > hdr_gets.max)
+ hdr_gets.max = v_gets;
+ }
- VResetDiskUsage_r();
- if (++FifteenMinuteCounter == 3) {
- FifteenMinuteCounter = 0;
- VScanUpdateList();
+ /* compute per-chain averages */
+ ch_gets.avg = ch_gets.sum / ((double)head->len);
+ hdr_loads.avg = hdr_loads.sum / ((double)head->len);
+ hdr_gets.avg = hdr_gets.sum / ((double)head->len);
+
+ /* dump per-chain stats */
+ Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n",
+ i, head->len,
+ DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1])));
+ Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(ch_gets.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(ch_gets.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(ch_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(ch_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+ Log("\tHDR gets : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(hdr_gets.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(hdr_gets.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(hdr_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(hdr_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+ Log("\tHDR loads : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(hdr_loads.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(hdr_loads.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(hdr_loads.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(hdr_loads.sum, pr_buf[3], sizeof(pr_buf[3])));
+ } else if (flags & VOL_STATS_PER_CHAIN) {
+ /* dump simple per-chain stats */
+ Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n",
+ i, head->len,
+ DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2])));
+ }
+
+ VOL_LOCK;
+ VHashEndExclusive_r(head);
}
-}
-void
-VSetDiskUsage(void)
-{
- VOL_LOCK;
- VSetDiskUsage_r();
VOL_UNLOCK;
-}
-/* The number of minutes that a volume hasn't been updated before the
- * "Dont salvage" flag in the volume header will be turned on */
-#define SALVAGE_INTERVAL (10*60)
+ /* compute global averages */
+ len.avg = len.sum / ((double)VolumeHashTable.Size);
+ looks.avg = looks.sum / ((double)VolumeHashTable.Size);
+ gets.avg = gets.sum / ((double)VolumeHashTable.Size);
+ reorders.avg = reorders.sum / ((double)VolumeHashTable.Size);
+
+ /* dump global stats */
+ Log("Volume hash summary: %d buckets\n", VolumeHashTable.Size);
+ Log(" chain length : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(len.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(len.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(len.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(len.sum, pr_buf[3], sizeof(pr_buf[3])));
+ Log(" looks : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(looks.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(looks.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(looks.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(looks.sum, pr_buf[3], sizeof(pr_buf[3])));
+ Log(" gets : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(gets.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(gets.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+ Log(" reorders : min=%s, max=%s, avg=%s, total=%s\n",
+ DoubleToPrintable(reorders.min, pr_buf[0], sizeof(pr_buf[0])),
+ DoubleToPrintable(reorders.max, pr_buf[1], sizeof(pr_buf[1])),
+ DoubleToPrintable(reorders.avg, pr_buf[2], sizeof(pr_buf[2])),
+ DoubleToPrintable(reorders.sum, pr_buf[3], sizeof(pr_buf[3])));
+
+ /* print extended disk related statistics */
+ {
+ struct DiskPartition64 * diskP;
+ afs_uint32 vol_count[VOLMAXPARTS+1];
+ byte part_exists[VOLMAXPARTS+1];
+ Device id;
+ int i;
-static VolumeId *UpdateList; /* Pointer to array of Volume ID's */
-static int nUpdatedVolumes; /* Updated with entry in UpdateList, salvage after crash flag on */
-static int updateSize; /* number of entries possible */
-#define UPDATE_LIST_SIZE 100 /* size increment */
+ memset(vol_count, 0, sizeof(vol_count));
+ memset(part_exists, 0, sizeof(part_exists));
-void
-VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
-{
- *ec = 0;
- vp->updateTime = FT_ApproxTime();
- if (V_dontSalvage(vp) == 0)
- return;
- V_dontSalvage(vp) = 0;
- VSyncVolume_r(ec, vp);
- if (*ec)
- return;
- if (!UpdateList) {
- updateSize = UPDATE_LIST_SIZE;
- UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize);
- } else {
- if (nUpdatedVolumes == updateSize) {
- updateSize += UPDATE_LIST_SIZE;
- UpdateList =
- (VolumeId *) realloc(UpdateList,
- sizeof(VolumeId) * updateSize);
+ VOL_LOCK;
+
+ for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+ id = diskP->index;
+ vol_count[id] = diskP->vol_list.len;
+ part_exists[id] = 1;
}
- }
- assert(UpdateList != NULL);
- UpdateList[nUpdatedVolumes++] = V_id(vp);
-}
-static void
-VScanUpdateList(void)
-{
- register int i, gap;
- register Volume *vp;
- Error error;
- afs_uint32 now = FT_ApproxTime();
- /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */
- for (i = gap = 0; i < nUpdatedVolumes; i++) {
- vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]);
- if (error) {
- gap++;
- } else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) {
- V_dontSalvage(vp) = DONT_SALVAGE;
- VUpdateVolume_r(&error, vp); /* No need to fsync--not critical */
- gap++;
+ VOL_UNLOCK;
+ for (i = 0; i <= VOLMAXPARTS; i++) {
+ if (part_exists[i]) {
+ /* XXX while this is currently safe, it is a violation
+ * of the VGetPartitionById_r interface contract. */
+ diskP = VGetPartitionById_r(i, 0);
+ if (diskP) {
+ Log("Partition %s has %d online volumes\n",
+ VPartitionPath(diskP), diskP->vol_list.len);
+ }
+ }
}
- if (vp)
- VPutVolume_r(vp);
-#ifndef AFS_PTHREAD_ENV
- IOMGR_Poll();
-#endif /* !AFS_PTHREAD_ENV */
+ VOL_LOCK;
}
- nUpdatedVolumes -= gap;
-}
-/***************************************************/
-/* Add on routines to manage a volume header cache */
-/***************************************************/
+ /* print extended VLRU statistics */
+ if (VVLRUExtStats_r(&vlru_stats, vol_sum) == 0) {
+ afs_uint32 idx, cur, lpos;
+ VolumeId line[5];
-static struct volHeader *volumeLRU;
+ VOL_UNLOCK;
-/* Allocate a bunch of headers; string them together */
-static void
-InitLRU(int howMany)
-{
- register struct volHeader *hp;
- if (programType != fileServer)
- return;
- hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));
- while (howMany--)
- ReleaseVolumeHeader(hp++);
-}
+ Log("VLRU State Dump:\n\n");
-/* Get a volume header from the LRU list; update the old one if necessary */
-/* Returns 1 if there was already a header, which is removed from the LRU list */
-static int
-GetVolumeHeader(register Volume * vp)
-{
- Error error;
- register struct volHeader *hd;
- int old;
- static int everLogged = 0;
+ for (idx = VLRU_QUEUE_NEW; idx < VLRU_QUEUE_INVALID; idx++) {
+ Log("\t%s:\n", vlru_idx_to_string(idx));
- old = (vp->header != 0); /* old == volume already has a header */
- if (programType != fileServer) {
- if (!vp->header) {
- hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
- assert(hd != 0);
- vp->header = hd;
- hd->back = vp;
- }
- } else {
- if (old) {
- hd = vp->header;
- if (volumeLRU == hd)
- volumeLRU = hd->next;
- assert(hd->back == vp);
- } else {
- if (volumeLRU)
- /* not currently in use and least recently used */
- hd = volumeLRU->prev;
- else {
- hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
- /* make it look like single elt LRU */
- hd->prev = hd->next = hd;
- if (!everLogged) {
- Log("****Allocated more volume headers, probably leak****\n");
- everLogged = 1;
+ lpos = 0;
+ for (cur = vlru_stats.queue_info[idx].start;
+ cur < vlru_stats.queue_info[idx].len;
+ cur++) {
+ line[lpos++] = vlru_stats.vec[cur].volid;
+ if (lpos==5) {
+ Log("\t\t%u, %u, %u, %u, %u,\n",
+ line[0], line[1], line[2], line[3], line[4]);
+ lpos = 0;
}
}
- if (hd->back) {
- if (hd->diskstuff.inUse) {
- WriteVolumeHeader_r(&error, hd->back);
- /* Ignore errors; catch them later */
+
+ if (lpos) {
+ while (lpos < 5) {
+ line[lpos++] = 0;
}
- hd->back->header = 0;
+ Log("\t\t%u, %u, %u, %u, %u\n",
+ line[0], line[1], line[2], line[3], line[4]);
}
- hd->back = vp;
- vp->header = hd;
+ Log("\n");
}
- if (hd->next) { /* hd->next != 0 --> in LRU chain (we zero it later) */
- hd->prev->next = hd->next; /* pull hd out of LRU list */
- hd->next->prev = hd->prev; /* if hd only element, this is noop */
- }
- hd->next = hd->prev = 0;
- /* if not in LRU chain, next test won't be true */
- if (hd == volumeLRU) /* last header item, turn into empty list */
- volumeLRU = NULL;
- }
- return old;
-}
-/* Put it at the top of the LRU chain */
-static void
-ReleaseVolumeHeader(register struct volHeader *hd)
-{
- if (programType != fileServer)
- return;
- if (!hd || hd->next) /* no header, or header already released */
- return;
- if (!volumeLRU) {
- hd->next = hd->prev = hd;
- } else {
- hd->prev = volumeLRU->prev;
- hd->next = volumeLRU;
- hd->prev->next = hd->next->prev = hd;
+ free(vlru_stats.vec);
+
+ VOL_LOCK;
}
- volumeLRU = hd;
}
-static void
-FreeVolumeHeader(register Volume * vp)
+void
+VPrintExtendedCacheStats(int flags)
{
- register struct volHeader *hd = vp->header;
- if (!hd)
- return;
- if (programType == fileServer) {
- ReleaseVolumeHeader(hd);
- hd->back = 0;
- } else {
- free(hd);
- }
- vp->header = 0;
+ VOL_LOCK;
+ VPrintExtendedCacheStats_r(flags);
+ VOL_UNLOCK;
}
+#endif /* AFS_DEMAND_ATTACH_FS */
-
-/***************************************************/
-/* Routines to add volume to hash chain, delete it */
-/***************************************************/
-
-static void
-AddVolumeToHashTable(register Volume * vp, int hashid)
+afs_int32
+VCanScheduleSalvage(void)
{
- int hash = VOLUME_HASH(hashid);
- vp->hashid = hashid;
- vp->hashNext = VolumeHashTable[hash];
- VolumeHashTable[hash] = vp;
- vp->vnodeHashOffset = VolumeHashOffset_r();
+ return vol_opts.canScheduleSalvage;
}
-static void
-DeleteVolumeFromHashTable(register Volume * vp)
+afs_int32
+VCanUseFSSYNC(void)
{
- int hash = VOLUME_HASH(vp->hashid);
- if (VolumeHashTable[hash] == vp)
- VolumeHashTable[hash] = vp->hashNext;
- else {
- Volume *tvp = VolumeHashTable[hash];
- if (tvp == NULL)
- return;
- while (tvp->hashNext && tvp->hashNext != vp)
- tvp = tvp->hashNext;
- if (tvp->hashNext == NULL)
- return;
- tvp->hashNext = vp->hashNext;
- }
- vp->hashid = 0;
+ return vol_opts.canUseFSSYNC;
}
-void
-VPrintCacheStats_r(void)
+afs_int32
+VCanUseSALVSYNC(void)
{
- register struct VnodeClassInfo *vcp;
- vcp = &VnodeClassInfo[vLarge];
- Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
- vcp = &VnodeClassInfo[vSmall];
- Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
- Log("Volume header cache, %d entries, %d gets, %d replacements\n",
- VolumeCacheSize, VolumeGets, VolumeReplacements);
+ return vol_opts.canUseSALVSYNC;
}
-void
-VPrintCacheStats(void)
+afs_int32
+VCanUnsafeAttach(void)
{
- VOL_LOCK;
- VPrintCacheStats_r();
- VOL_UNLOCK;
+ return vol_opts.unsafe_attach;
}