2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
18 #ifdef AFS_DEMAND_ATTACH_FS
26 #include <afs/afs_assert.h>
32 #include <sys/param.h>
33 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX_ENV)
36 #endif /* AFS_NT40_ENV */
38 #include <afs/afsutil.h>
41 #include <afs/afsint.h>
45 #include "viceinode.h"
47 #include "partition.h"
48 #include <afs/errors.h>
50 #define __VOL_VG_CACHE_IMPL 1
53 #include "vg_cache_impl.h"
/* Prototypes for the file-local (static) helpers, declared up front so they
 * can be called before their definitions appear. */
55 static int _VVGC_lookup(struct DiskPartition64 *,
57 VVGCache_entry_t ** entry,
58 VVGCache_hash_entry_t ** hentry);
59 static int _VVGC_entry_alloc(VVGCache_entry_t ** entry);
60 static int _VVGC_entry_free(VVGCache_entry_t * entry);
61 static int _VVGC_entry_get(VVGCache_entry_t * entry);
62 static int _VVGC_entry_put(struct DiskPartition64 *,
63 VVGCache_entry_t * entry);
64 static int _VVGC_entry_add(struct DiskPartition64 *,
67 VVGCache_hash_entry_t **);
68 static int _VVGC_entry_cl_add(VVGCache_entry_t *, VolumeId);
69 static int _VVGC_entry_cl_del(struct DiskPartition64 *, VVGCache_entry_t *,
71 static int _VVGC_entry_export(VVGCache_entry_t *, VVGCache_query_t *);
72 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry);
73 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry);
74 static int _VVGC_hash_entry_add(struct DiskPartition64 *,
77 VVGCache_hash_entry_t **);
78 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t * entry);
79 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * entry);
/* Hash table used for VG cache lookups. Its hash_buckets array is allocated
 * in VVGCache_PkgInit and released in VVGCache_PkgShutdown. */
81 VVGCache_hash_table_t VVGCache_hash_table;
85 * initialize volume group cache subsystem.
87 * @return operation status
/* Builds the bucket array of VVGCache_hash_table and resets every
 * per-partition slot of VVGCache.part[] to the INVALID state. */
91 VVGCache_PkgInit(void)
96 /* allocate hash table */
/* one rx_queue list head per VolumeHashTable.Size slot */
97 VVGCache_hash_table.hash_buckets =
98 malloc(VolumeHashTable.Size * sizeof(struct rx_queue));
99 if (VVGCache_hash_table.hash_buckets == NULL) {
104 /* setup hash chain heads */
105 for (i = 0; i < VolumeHashTable.Size; i++) {
106 queue_Init(&VVGCache_hash_table.hash_buckets[i]);
109 /* initialize per-partition VVGC state */
/* NOTE(review): the inclusive bound visits VOLMAXPARTS + 1 slots; confirm
 * that VVGCache.part[] is declared with at least that many elements. */
110 for (i = 0; i <= VOLMAXPARTS; i++) {
111 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
112 VVGCache.part[i].dlist_hash_buckets = NULL;
113 CV_INIT(&VVGCache.part[i].cv, "cache part", CV_DEFAULT, 0);
124 * shut down volume group cache subsystem.
126 * @return operation status
/* Releases the bucket array and returns every partition slot to the INVALID
 * state, destroying its condition variable.
 * NOTE(review): the lines shown here free only the bucket array itself, not
 * entries still linked into the buckets nor any dlist_hash_buckets; confirm
 * that partitions are flushed before shutdown. */
132 VVGCache_PkgShutdown(void)
138 /* free hash table */
139 free(VVGCache_hash_table.hash_buckets);
/* clear the pointer so a stale bucket array cannot be reused */
140 VVGCache_hash_table.hash_buckets = NULL;
142 /* destroy per-partition VVGC state */
143 for (i = 0; i <= VOLMAXPARTS; i++) {
144 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
145 CV_DESTROY(&VVGCache.part[i].cv);
152 * allocate a cache entry.
154 * @param[out] entry_out pointer to newly allocated entry
156 * @return operation status
/* The new entry is handed back through *entry_out and zero-filled, so it
 * starts with refcnt 0 and every children[] slot empty (0). */
162 _VVGC_entry_alloc(VVGCache_entry_t ** entry_out)
165 VVGCache_entry_t * ent;
167 *entry_out = ent = malloc(sizeof(VVGCache_entry_t));
/* NOTE(review): an allocation-failure check is expected between the malloc
 * and the memset; confirm the memset is never reached with a NULL pointer. */
173 memset(ent, 0, sizeof(*ent));
180 * free a cache entry.
182 * @param[in] entry cache entry
184 * @return operation status
/* Only unreferenced entries may be released; this is enforced by the
 * assertion on refcnt below. */
190 _VVGC_entry_free(VVGCache_entry_t * entry)
194 osi_Assert(entry->refcnt == 0);
201 * allocate and register an entry for a volume group.
203 * @param[in] dp disk partition object
204 * @param[in] volid volume id
205 * @param[out] entry_out vg cache object pointer
206 * @param[out] hash_out vg cache hash entry object pointer
208 * @pre - VOL_LOCK held
209 * - no such entry exists in hash table
211 * @return operation status
/* Allocates a fresh VG entry and registers it in the hash table under volid
 * via _VVGC_hash_entry_add. No reference is taken here. */
217 _VVGC_entry_add(struct DiskPartition64 * dp,
219 VVGCache_entry_t ** entry_out,
220 VVGCache_hash_entry_t ** hash_out)
223 VVGCache_entry_t * ent;
225 code = _VVGC_entry_alloc(&ent);
231 /* refcnt will be inc'd when a child is added */
234 code = _VVGC_hash_entry_add(dp, volid, ent, hash_out);
/* NOTE(review): presumably reached only when registration failed, so the
 * entry allocated above is not leaked - confirm against the full source. */
246 _VVGC_entry_free(ent);
/* Records volid in the first free slot of ent->children[] and takes one
 * reference on ent for it. A slot holding 0 is treated as free. The whole
 * table is always scanned, so an existing copy of volid is detected even
 * when a free slot precedes it. Duplicates are logged at level 1, a full
 * table at level 0. */
253 * add a volid to the entry's child list.
255 * @param[in] ent volume group object
256 * @param[in] volid volume id
258 * @return operation status
260 * @retval -1 child table is full
265 _VVGC_entry_cl_add(VVGCache_entry_t * ent,
271 /* search table to avoid duplicates */
272 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
273 if (ent->children[i] == volid) {
274 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
276 afs_printable_uint32_lu(volid),
277 afs_printable_uint32_lu(ent->rw)));
280 if (empty_idx == -1 && !ent->children[i]) {
282 /* don't break; make sure we go through all children so we don't
283 * add a duplicate entry */
287 /* verify table isn't full */
288 if (empty_idx == -1) {
290 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
291 "is full\n", afs_printable_uint32_lu(volid),
292 afs_printable_uint32_lu(ent->rw)));
/* claim the remembered free slot, then count the new child as a reference */
297 ent->children[empty_idx] = volid;
300 code = _VVGC_entry_get(ent);
307 * delete a volid from the entry's child list.
309 * @param[in] dp disk partition object
310 * @param[in] ent volume group object
311 * @param[in] volid volume id
313 * @return operation status
315 * @retval -1 no such entry found
/* Clears the children[] slot that holds volid (0 marks a free slot) and
 * drops a reference on ent through _VVGC_entry_put. Because that put may
 * free ent, callers must not use ent after this call.
 * NOTE(review): the not-found path is not visible here; confirm the
 * reference is dropped only when a slot was actually cleared. */
320 _VVGC_entry_cl_del(struct DiskPartition64 *dp,
321 VVGCache_entry_t * ent,
326 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
327 if (ent->children[i] == volid) {
328 ent->children[i] = 0;
336 code = _VVGC_entry_put(dp, ent);
343 * add a refcount to an entry.
345 * @param[in] entry cache entry
349 * @return operation status
/* NOTE(review): the body is not visible in this view; presumably it
 * increments entry->refcnt, mirroring the decrement in _VVGC_entry_put -
 * confirm against the full source. */
354 static int _VVGC_entry_get(VVGCache_entry_t * entry)
361 * put back a reference to an entry.
363 * @param[in] dp disk partition object
364 * @param[in] entry cache entry
368 * @warning do not attempt to deref pointer after calling this interface
370 * @return operation status
373 * @note dp is needed to lookup the RW hash entry to unlink, if we are
374 * putting back the final reference and freeing
/* Drops one reference. When the count reaches zero, the hash entry for
 * entry->rw is looked up on dp and unlinked, and then the entry itself is
 * freed. A missing RW hash entry (ENOENT) is logged and tolerated. */
379 _VVGC_entry_put(struct DiskPartition64 * dp, VVGCache_entry_t * entry)
/* putting back a reference that was never taken is a caller bug */
383 osi_Assert(entry->refcnt > 0);
/* last reference gone: tear down the hash linkage first, then the entry */
385 if (--entry->refcnt == 0) {
386 VVGCache_entry_t *nentry;
387 VVGCache_hash_entry_t *hentry;
389 /* first, try to delete the RW id hash entry pointing to this
391 code = _VVGC_lookup(dp, entry->rw, &nentry, &hentry);
393 if (nentry != entry) {
394 /* looking up the rw of this entry points to a different
395 * entry; should not happen */
396 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
397 "found different entry than was passed",
398 afs_printable_uint32_lu(entry->rw)));
401 code = _VVGC_hash_entry_unlink(hentry);
404 } else if (code == ENOENT) {
405 /* ignore ENOENT; this shouldn't happen, since the RW hash
406 * entry should always exist if the entry does... but we
407 * were going to delete it anyway, so try to continue */
408 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
409 "vol %lu, but RW hash entry doesn't exist; continuing "
410 "anyway...\n", afs_printable_uint32_lu(entry->rw)));
415 /* now, just free the entry itself */
417 code = _VVGC_entry_free(entry);
425 * export a volume group entry in the external object format.
427 * @param[in] ent internal-format volume group object
428 * @param[out] qry external-format volume group object
432 * @return operation status
/* Copies every children[] slot, free (0) slots included, from the cache
 * entry into the caller-supplied query object. */
438 _VVGC_entry_export(VVGCache_entry_t * ent, VVGCache_query_t * qry)
443 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
444 qry->children[i] = ent->children[i];
451 * allocate a hash table entry structure.
453 * @param[out] entry_out address in which to store newly allocated hash entry struct
455 * @return operation status
/* The new hash entry is returned through *entry_out.
 * NOTE(review): unlike _VVGC_entry_alloc, no zero-fill is visible here;
 * confirm that the caller assigns every field before the entry is used. */
461 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry_out)
464 VVGCache_hash_entry_t * ent;
466 *entry_out = ent = malloc(sizeof(VVGCache_hash_entry_t));
475 * free a hash table entry structure.
477 * @param[in] entry hash table entry structure to be freed
479 * @return operation status
/* NOTE(review): the body is not visible in this view; presumably it simply
 * releases the memory obtained by _VVGC_hash_entry_alloc - confirm. */
485 _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry)
495 * add an entry to the hash table.
497 * @param[in] dp disk partition object
498 * @param[in] volid volume id
499 * @param[in] ent volume group object
500 * @param[out] hash_out address in which to store pointer to hash entry
504 * @return operation status
506 * @retval EEXIST hash entry for volid already exists, and it points to
507 * a different VG entry
/* Looks volid up first. A hit that maps to a different VG entry is logged
 * at level 0 (the EEXIST case above); a hit on the same VG entry is logged
 * at level 1 and accepted as if it had been added. Otherwise a new hash
 * entry is allocated and appended to the bucket chosen by VVGC_HASH(volid).
 * On a hit, *hash_out is whatever _VVGC_lookup stored there. */
512 _VVGC_hash_entry_add(struct DiskPartition64 * dp,
514 VVGCache_entry_t * ent,
515 VVGCache_hash_entry_t ** hash_out)
518 VVGCache_hash_entry_t * hent;
519 int hash = VVGC_HASH(volid);
520 VVGCache_entry_t *nent;
522 code = _VVGC_lookup(dp, volid, &nent, hash_out);
525 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
526 " nonmatching entry for vol %lu: original "
527 "(%"AFS_PTR_FMT",%lu) new (%"AFS_PTR_FMT",%lu)\n",
528 afs_printable_uint32_lu(volid),
529 nent, afs_printable_uint32_lu(nent->rw),
530 ent, afs_printable_uint32_lu(ent->rw)));
533 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
534 "hash entry for vol %lu, VG %lu",
535 afs_printable_uint32_lu(volid),
536 afs_printable_uint32_lu(ent->rw)));
537 /* accept attempts to add matching duplicate entries; just
538 * pretend we added it */
542 code = _VVGC_hash_entry_alloc(&hent);
550 queue_Append(&VVGCache_hash_table.hash_buckets[hash],
561 * remove an entry from the hash table.
563 * @param[in] hent hash table entry
567 * @return operation status
/* Removes hent->volid from the child list of its VG entry and, unless hent
 * is the hash entry for the RW id of that VG, unlinks hent itself. */
573 _VVGC_hash_entry_del(VVGCache_hash_entry_t * hent)
/* evaluated before the child-list delete below, because that call may
 * release the VG entry and, for the RW id, the hash entry as well */
578 if (hent->entry->rw == hent->volid) {
582 code = _VVGC_entry_cl_del(hent->dp, hent->entry, hent->volid);
583 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
584 * if hent->entry->rw == hent->volid, it is possible for hent to
588 /* If we are the RW id, don't unlink, since we still need the
589 * hash entry to exist, so when we lookup children, they can
590 * look up the RW id hash chain, and they will all go to the
593 * If we are the last entry and the entry should be deleted,
594 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
596 res = _VVGC_hash_entry_unlink(hent);
606 * low-level interface to remove an entry from the hash table.
608 * Does not alter the refcount or worry about the children lists or
609 * anything like that; just removes the hash table entry, frees it, and
610 * that's all. You probably want @see _VVGC_hash_entry_del instead.
612 * @param[in] hent hash table entry
616 * @return operation status
/* Callers must not touch hent after this returns: the hash entry is handed
 * to _VVGC_hash_entry_free below. */
622 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * hent)
629 code = _VVGC_hash_entry_free(hent);
635 * lookup a vg cache entry given any member volume id.
637 * @param[in] dp disk partition object
638 * @param[in] volid vg member volume id
639 * @param[out] entry_out address in which to store volume group entry structure pointer
640 * @param[out] hash_out address in which to store hash entry pointer
644 * @warning - it is up to the caller to get a ref to entry_out, if needed
645 * - hash_out must not be referenced after dropping VOL_LOCK
647 * @return operation status
649 * @retval ENOENT volume id not found
650 * @retval EINVAL partition's VGC is invalid
/* Walks the bucket chosen by VVGC_HASH(volid). A hash entry matches only if
 * both its volume id and its partition pointer equal the arguments, so the
 * same id on another partition is a different entry.
 * NOTE(review): several callers in this file pass NULL for hash_out;
 * confirm the store through hash_out is guarded against NULL. */
655 _VVGC_lookup(struct DiskPartition64 * dp,
657 VVGCache_entry_t ** entry_out,
658 VVGCache_hash_entry_t ** hash_out)
661 int bucket = VVGC_HASH(volid);
662 struct VVGCache_hash_entry * ent, * nent;
/* no lookups are served while the cache for this partition is INVALID */
664 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
670 for (queue_Scan(&VVGCache_hash_table.hash_buckets[bucket],
673 VVGCache_hash_entry)) {
674 if (ent->volid == volid && ent->dp == dp) {
676 *entry_out = ent->entry;
688 * add an entry to the volume group cache.
690 * @param[in] dp disk partition object
691 * @param[in] parent parent volume id
692 * @param[in] child child volume id
693 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
694 * new VG, 0 if we added to an existing VG
698 * @return operation status
700 * @retval -1 parent and child are already registered in
/* Resolves parent and child independently, then acts on the combination:
 *  - both found: nothing to register, but it is an error if they map to
 *    different VG entries;
 *  - only child found: the existing VG entry is re-rooted at parent and a
 *    hash entry for parent is added;
 *  - neither found: a new VG entry is created for parent;
 *  - when a child hash entry still has to be created, child_ent must be
 *    unset (asserted) and the child is hashed to the VG entry of parent.
 * The child id is then recorded in the child list of the VG, and while the
 * partition is UPDATING the pair is taken off the to-delete list. */
704 VVGCache_entry_add_r(struct DiskPartition64 * dp,
710 VVGCache_entry_t * child_ent, * parent_ent;
716 /* check for existing entries */
717 res = _VVGC_lookup(dp, child, &child_ent, NULL);
718 if (res && res != ENOENT) {
723 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
724 if (res && res != ENOENT) {
730 * branch based upon existence of parent and child nodes
732 if (parent_ent && child_ent) {
733 /* both exist. we're done.
734 * if they point different places, then report the error. */
735 if (child_ent != parent_ent) {
738 if (parent == child) {
739 /* if we're adding the RW entry as a child, the RW id may
740 * not be in the child array yet, so make sure not to skip
745 } else if (!parent_ent && child_ent) {
747 * update vg root volid, and add hash entry. */
748 parent_ent = child_ent;
749 parent_ent->rw = parent;
751 code = _VVGC_hash_entry_add(dp,
756 } else if (!child_ent && !parent_ent) {
757 code = _VVGC_entry_add(dp,
767 if (child == parent) {
768 /* if we're the RW, skip over adding the child hash entry;
769 * we already added the hash entry when creating the entry */
770 child_ent = parent_ent;
775 osi_Assert(!child_ent);
776 child_ent = parent_ent;
777 code = _VVGC_hash_entry_add(dp,
786 code = _VVGC_entry_cl_add(child_ent, child);
/* EINVAL is deliberately not logged; _VVGC_lookup documents it as the
 * status for a partition whose VG cache is invalid */
789 if (code && code != EINVAL) {
790 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
791 " %lu on partition %s", code, afs_printable_uint32_lu(child),
792 afs_printable_uint32_lu(parent), VPartitionPath(dp)));
795 if (code == 0 && VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
796 /* we successfully added the entry; make sure it's not on the
797 * to-delete list, so it doesn't get deleted later */
798 code = _VVGC_dlist_del_r(dp, parent, child);
/* ENOENT only means the pair was never queued for deletion */
799 if (code && code != ENOENT) {
800 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
801 "%lu (parent %lu) from the to-delete list for part "
802 "%s.\n", code, afs_printable_uint32_lu(child),
803 afs_printable_uint32_lu(parent),
804 VPartitionPath(dp)));
814 * add an entry to the volume group cache.
816 * @param[in] dp disk partition object
817 * @param[in] parent parent volume id
818 * @param[in] child child volume id
819 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
820 * new VG, 0 if we added to an existing VG
822 * @return operation status
/* Wrapper that forwards to VVGCache_entry_add_r.
 * NOTE(review): presumably takes VOL_LOCK around the call - confirm. */
826 VVGCache_entry_add(struct DiskPartition64 * dp,
/* NOTE(review): the status returned by VVGCache_entry_add_r is not captured
 * on this line, whereas the other non-_r wrappers in this file assign the
 * result to code. Confirm that failures are actually reported to the
 * caller rather than being dropped. */
834 VVGCache_entry_add_r(dp, parent, child, newvg);
841 * delete an entry from the volume group cache.
843 * If partition is scanning, actually puts the entry on a list of entries
844 * to delete when the scan is done.
846 * @param[in] dp disk partition object
847 * @param[in] parent parent volume id
848 * @param[in] child child volume id
852 * @return operation status
/* While the partition is in the UPDATING state the request is queued with
 * _VVGC_dlist_add_r; otherwise the entry is purged immediately with
 * _VVGC_entry_purge_r. */
856 VVGCache_entry_del_r(struct DiskPartition64 * dp,
857 VolumeId parent, VolumeId child)
859 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
861 code = _VVGC_dlist_add_r(dp, parent, child);
866 return _VVGC_entry_purge_r(dp, parent, child);
870 * delete an entry from the volume group cache.
872 * @param[in] dp disk partition object
873 * @param[in] parent parent volume id
874 * @param[in] child child volume id
880 * @return operation status
/* Immediate removal, with no deferral to the to-delete list. Both ids must
 * resolve to the same VG entry; a mismatch is logged at level 0. The hash
 * entry of the child is then removed through _VVGC_hash_entry_del. */
884 _VVGC_entry_purge_r(struct DiskPartition64 * dp,
885 VolumeId parent, VolumeId child)
888 VVGCache_entry_t * parent_ent, * child_ent;
889 VVGCache_hash_entry_t * child_hent;
891 /* check mappings for each volid */
892 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
/* only the hash entry of the child is needed for the delete below */
897 res = _VVGC_lookup(dp, child, &child_ent, &child_hent);
903 /* if the mappings don't match, we have a serious error */
904 if (parent_ent != child_ent) {
905 ViceLog(0, ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
906 "but vol %lu points to VGC entry %"AFS_PTR_FMT" and VG %lu "
907 "points to VGC entry %"AFS_PTR_FMT"\n",
908 afs_printable_uint32_lu(child),
909 afs_printable_uint32_lu(parent),
910 afs_printable_uint32_lu(child),
911 child_ent, afs_printable_uint32_lu(parent), parent_ent));
916 code = _VVGC_hash_entry_del(child_hent);
923 * delete an entry from the volume group cache.
925 * @param[in] dp disk partition object
926 * @param[in] parent parent volume id
927 * @param[in] child child volume id
929 * @return operation status
/* Wrapper that forwards to VVGCache_entry_del_r and keeps its status.
 * NOTE(review): presumably takes VOL_LOCK around the call - confirm. */
933 VVGCache_entry_del(struct DiskPartition64 * dp,
934 VolumeId parent, VolumeId child)
939 code = VVGCache_entry_del_r(dp, parent, child);
946 * query a volume group by any member volume id.
948 * @param[in] dp disk partition object
949 * @param[in] volume volume id of a member of VG
950 * @param[out] res vg membership data
954 * @return operation status
956 * @retval EAGAIN partition needs to finish scanning
/* Starts a scan if the partition has no cache yet, refuses to answer while
 * a scan is in progress (UPDATING), and otherwise looks the volume up and
 * exports its VG entry into res. */
959 VVGCache_query_r(struct DiskPartition64 * dp,
961 VVGCache_query_t * res)
964 VVGCache_entry_t * ent;
966 /* If cache for this partition doesn't exist; start a scan */
967 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
968 code = VVGCache_scanStart_r(dp);
/* NOTE(review): -3 is a bare status value from VVGCache_scanStart_r; a
 * named constant would make this check self-explanatory. */
969 if (code == 0 || code == -3) {
970 /* -3 means another thread already started scanning */
975 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
979 code = _VVGC_lookup(dp, volume, &ent, NULL);
981 code = _VVGC_entry_export(ent, res);
988 * query a volume group by any member volume id.
990 * @param[in] dp disk partition object
991 * @param[in] volume volume id of a member of VG
992 * @param[out] res vg membership data
994 * @return operation status
/* Wrapper that forwards to VVGCache_query_r and keeps its status.
 * NOTE(review): presumably takes VOL_LOCK around the call - confirm. */
998 VVGCache_query(struct DiskPartition64 * dp,
999 VolumeId volume, VVGCache_query_t * res)
1004 code = VVGCache_query_r(dp, volume, res);
1011 * begin asynchronous scan of on-disk volume group metadata.
1013 * @param[in] dp disk partition object
1015 * @pre VOL_LOCK held
1017 * @return operation status
/* Two paths are visible: a scan of the single partition dp, and a loop that
 * starts a scan on every partition in DiskPartitionList.
 * NOTE(review): the condition selecting between them is not shown here
 * (presumably a NULL dp means all partitions) - confirm. Note that the loop
 * reuses the dp parameter as its iterator. */
1021 VVGCache_scanStart_r(struct DiskPartition64 * dp)
1026 code = _VVGC_scan_start(dp);
1028 /* start a scanner thread on each partition */
1029 for (dp = DiskPartitionList; dp; dp = dp->next) {
1030 res = _VVGC_scan_start(dp);
1041 * begin asynchronous scan of on-disk volume group metadata.
1043 * @param[in] dp disk partition object
1045 * @return operation status
/* Wrapper that forwards to VVGCache_scanStart_r, which requires VOL_LOCK.
 * NOTE(review): presumably acquires VOL_LOCK around the call - confirm. */
1049 VVGCache_scanStart(struct DiskPartition64 * dp)
1054 code = VVGCache_scanStart_r(dp);
1061 * wait for async on-disk VG metadata scan to complete.
1063 * @param[in] dp disk partition object
1065 * @pre VOL_LOCK held
1067 * @warning this routine must drop VOL_LOCK internally
1069 * @return operation status
/* Blocks on the condition variable of the partition until its state is no
 * longer UPDATING. The state is re-tested after every wakeup, so a wakeup
 * that arrives while the scan is still running just waits again. The cv is
 * broadcast by _VVGC_state_change whenever the state changes. */
1073 VVGCache_scanWait_r(struct DiskPartition64 * dp)
1077 while (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
1078 VOL_CV_WAIT(&VVGCache.part[dp->index].cv);
1085 * wait for async on-disk VG metadata scan to complete.
1087 * @param[in] dp disk partition object
1089 * @return operation status
/* Wrapper that forwards to VVGCache_scanWait_r, which requires VOL_LOCK.
 * NOTE(review): presumably acquires VOL_LOCK around the call - confirm. */
1093 VVGCache_scanWait(struct DiskPartition64 * dp)
1098 code = VVGCache_scanWait_r(dp);
1105 * flush all cache entries for a given disk partition.
1107 * @param[in] part disk partition object
1109 * @pre VOL_LOCK held
1111 * @return operation status
/* Visits every bucket of the shared hash table and deletes each hash entry
 * that belongs to part; entries of other partitions are left alone.
 * NOTE(review): _VVGC_hash_entry_del can also unlink the RW hash entry of
 * the same VG (through _VVGC_entry_put). Confirm that this entry can never
 * be the saved next element of the scan in progress. */
1117 _VVGC_flush_part_r(struct DiskPartition64 * part)
1121 VVGCache_hash_entry_t * ent, * nent;
1123 for (i = 0; i < VolumeHashTable.Size; i++) {
1124 for (queue_Scan(&VVGCache_hash_table.hash_buckets[i],
1127 VVGCache_hash_entry)) {
1128 if (ent->dp == part) {
/* copy the id first: ent may be gone once it has been deleted, and the
 * log message below still needs the id */
1129 VolumeId volid = ent->volid;
1130 res = _VVGC_hash_entry_del(ent);
1132 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1133 res, afs_printable_uint32_lu(volid)));
1144 * flush all cache entries for a given disk partition.
1146 * @param[in] part disk partition object
1148 * @return operation status
/* Wrapper that forwards to _VVGC_flush_part_r, which requires VOL_LOCK.
 * NOTE(review): presumably acquires VOL_LOCK around the call - confirm. */
1154 _VVGC_flush_part(struct DiskPartition64 * part)
1159 code = _VVGC_flush_part_r(part);
1167 * change VVGC partition state.
1169 * @param[in] part disk partition object
1170 * @param[in] state new state
1172 * @pre VOL_LOCK is held
/* Stores the new state for the partition and wakes every thread waiting on
 * its condition variable, but only when the state actually changed. */
1179 _VVGC_state_change(struct DiskPartition64 * part,
1180 VVGCache_part_state_t state)
1182 VVGCache_part_state_t old_state;
/* remember the previous state so a no-op change does not wake waiters */
1184 old_state = VVGCache.part[part->index].state;
1185 VVGCache.part[part->index].state = state;
1187 if (old_state != state) {
1188 CV_BROADCAST(&VVGCache.part[part->index].cv);
1194 #endif /* AFS_DEMAND_ATTACH_FS */