2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
20 #ifdef HAVE_SYS_FILE_H
24 #ifdef AFS_DEMAND_ATTACH_FS
26 #include <afs/afs_assert.h>
29 #include <afs/afsutil.h>
32 #include <afs/afsint.h>
36 #include "viceinode.h"
38 #include "partition.h"
39 #include <afs/errors.h>
41 #define __VOL_VG_CACHE_IMPL 1
44 #include "vg_cache_impl.h"
46 static int _VVGC_lookup(struct DiskPartition64 *,
48 VVGCache_entry_t ** entry,
49 VVGCache_hash_entry_t ** hentry);
50 static int _VVGC_entry_alloc(VVGCache_entry_t ** entry);
51 static int _VVGC_entry_free(VVGCache_entry_t * entry);
52 static int _VVGC_entry_get(VVGCache_entry_t * entry);
53 static int _VVGC_entry_put(struct DiskPartition64 *,
54 VVGCache_entry_t * entry);
55 static int _VVGC_entry_add(struct DiskPartition64 *,
58 VVGCache_hash_entry_t **);
59 static int _VVGC_entry_cl_add(VVGCache_entry_t *, VolumeId);
60 static int _VVGC_entry_cl_del(struct DiskPartition64 *, VVGCache_entry_t *,
62 static int _VVGC_entry_export(VVGCache_entry_t *, VVGCache_query_t *);
63 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry);
64 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry);
65 static int _VVGC_hash_entry_add(struct DiskPartition64 *,
68 VVGCache_hash_entry_t **);
69 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t * entry);
70 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * entry);
72 VVGCache_hash_table_t VVGCache_hash_table;
76 * initialize volume group cache subsystem.
78 * @return operation status
82 VVGCache_PkgInit(void)
87 /* allocate hash table */
88 VVGCache_hash_table.hash_buckets =
89 malloc(VolumeHashTable.Size * sizeof(struct rx_queue));
90 if (VVGCache_hash_table.hash_buckets == NULL) {
95 /* setup hash chain heads */
96 for (i = 0; i < VolumeHashTable.Size; i++) {
97 queue_Init(&VVGCache_hash_table.hash_buckets[i]);
100 /* initialize per-partition VVGC state */
101 for (i = 0; i <= VOLMAXPARTS; i++) {
102 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
103 VVGCache.part[i].dlist_hash_buckets = NULL;
104 CV_INIT(&VVGCache.part[i].cv, "cache part", CV_DEFAULT, 0);
115 * shut down volume group cache subsystem.
117 * @return operation status
123 VVGCache_PkgShutdown(void)
129 /* free hash table */
130 free(VVGCache_hash_table.hash_buckets);
131 VVGCache_hash_table.hash_buckets = NULL;
133 /* destroy per-partition VVGC state */
134 for (i = 0; i <= VOLMAXPARTS; i++) {
135 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
136 CV_DESTROY(&VVGCache.part[i].cv);
143 * allocate a cache entry.
145 * @param[out] entry_out pointer to newly allocated entry
147 * @return operation status
153 _VVGC_entry_alloc(VVGCache_entry_t ** entry_out)
156 VVGCache_entry_t * ent;
158 *entry_out = ent = malloc(sizeof(VVGCache_entry_t));
164 memset(ent, 0, sizeof(*ent));
171 * free a cache entry.
173 * @param[in] entry cache entry
175 * @return operation status
181 _VVGC_entry_free(VVGCache_entry_t * entry)
185 osi_Assert(entry->refcnt == 0);
192 * allocate and register an entry for a volume group.
194 * @param[in] dp disk partition object
195 * @param[in] volid volume id
196 * @param[out] entry_out vg cache object pointer
197 * @param[out] hash_out vg cache hash entry object pointer
199 * @pre - VOL_LOCK held
200 * - no such entry exists in hash table
202 * @return operation status
208 _VVGC_entry_add(struct DiskPartition64 * dp,
210 VVGCache_entry_t ** entry_out,
211 VVGCache_hash_entry_t ** hash_out)
214 VVGCache_entry_t * ent;
216 code = _VVGC_entry_alloc(&ent);
222 /* refcnt will be inc'd when a child is added */
225 code = _VVGC_hash_entry_add(dp, volid, ent, hash_out);
237 _VVGC_entry_free(ent);
244 * add a volid to the entry's child list.
246 * @param[in] ent volume group object
247 * @param[in] volid volume id
249 * @return operation status
251 * @retval -1 child table is full
256 _VVGC_entry_cl_add(VVGCache_entry_t * ent,
262 /* search table to avoid duplicates */
263 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
264 if (ent->children[i] == volid) {
265 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
267 afs_printable_uint32_lu(volid),
268 afs_printable_uint32_lu(ent->rw)));
271 if (empty_idx == -1 && !ent->children[i]) {
273 /* don't break; make sure we go through all children so we don't
274 * add a duplicate entry */
278 /* verify table isn't full */
279 if (empty_idx == -1) {
281 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
282 "is full\n", afs_printable_uint32_lu(volid),
283 afs_printable_uint32_lu(ent->rw)));
288 ent->children[empty_idx] = volid;
291 code = _VVGC_entry_get(ent);
298 * delete a volid from the entry's child list.
300 * @param[in] dp disk partition object
301 * @param[in] ent volume group object
302 * @param[in] volid volume id
304 * @return operation status
306 * @retval -1 no such entry found
311 _VVGC_entry_cl_del(struct DiskPartition64 *dp,
312 VVGCache_entry_t * ent,
317 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
318 if (ent->children[i] == volid) {
319 ent->children[i] = 0;
327 code = _VVGC_entry_put(dp, ent);
334 * add a refcount to an entry.
336 * @param[in] entry cache entry
340 * @return operation status
345 static int _VVGC_entry_get(VVGCache_entry_t * entry)
352 * put back a reference to an entry.
354 * @param[in] dp disk partition object
355 * @param[in] entry cache entry
359 * @warning do not attempt to deref pointer after calling this interface
361 * @return operation status
364 * @note dp is needed to look up the RW hash entry to unlink, if we are
365 * putting back the final reference and freeing
370 _VVGC_entry_put(struct DiskPartition64 * dp, VVGCache_entry_t * entry)
374 osi_Assert(entry->refcnt > 0);
376 if (--entry->refcnt == 0) {
377 VVGCache_entry_t *nentry;
378 VVGCache_hash_entry_t *hentry;
380 /* first, try to delete the RW id hash entry pointing to this
382 code = _VVGC_lookup(dp, entry->rw, &nentry, &hentry);
384 if (nentry != entry) {
385 /* looking up the rw of this entry points to a different
386 * entry; should not happen */
/* NOTE(review): the message below has no trailing newline escape, while the
 * ENOENT warning further down in this function ends with one. Confirm whether
 * the logger appends a newline itself; if not, this line will run into the
 * next log record. */
387 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
388 "found different entry than was passed",
389 afs_printable_uint32_lu(entry->rw)));
392 code = _VVGC_hash_entry_unlink(hentry);
395 } else if (code == ENOENT) {
396 /* ignore ENOENT; this shouldn't happen, since the RW hash
397 * entry should always exist if the entry does... but we
398 * were going to delete it anyway, so try to continue */
399 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
400 "vol %lu, but RW hash entry doesn't exist; continuing "
401 "anyway...\n", afs_printable_uint32_lu(entry->rw)));
406 /* now, just free the entry itself */
408 code = _VVGC_entry_free(entry);
416 * export a volume group entry in the external object format.
418 * @param[in] ent internal-format volume group object
419 * @param[out] qry external-format volume group object
423 * @return operation status
429 _VVGC_entry_export(VVGCache_entry_t * ent, VVGCache_query_t * qry)
434 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
435 qry->children[i] = ent->children[i];
442 * allocate a hash table entry structure.
444 * @param[out] entry_out address in which to store newly allocated hash entry struct
446 * @return operation status
452 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry_out)
455 VVGCache_hash_entry_t * ent;
457 *entry_out = ent = malloc(sizeof(VVGCache_hash_entry_t));
466 * free a hash table entry structure.
468 * @param[in] entry hash table entry structure to be freed
470 * @return operation status
476 _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry)
486 * add an entry to the hash table.
488 * @param[in] dp disk partition object
489 * @param[in] volid volume id
490 * @param[in] ent volume group object
491 * @param[out] hash_out address in which to store pointer to hash entry
495 * @return operation status
497 * @retval EEXIST hash entry for volid already exists, and it points to
498 * a different VG entry
503 _VVGC_hash_entry_add(struct DiskPartition64 * dp,
505 VVGCache_entry_t * ent,
506 VVGCache_hash_entry_t ** hash_out)
509 VVGCache_hash_entry_t * hent;
510 int hash = VVGC_HASH(volid);
511 VVGCache_entry_t *nent;
513 code = _VVGC_lookup(dp, volid, &nent, hash_out);
/* NOTE(review): the first literal below ends with a space and the second
 * begins with one, so the emitted text has two spaces between the words
 * duplicate and nonmatching. Looks accidental; confirm before changing. */
516 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
517 " nonmatching entry for vol %lu: original "
518 "(%"AFS_PTR_FMT",%lu) new (%"AFS_PTR_FMT",%lu)\n",
519 afs_printable_uint32_lu(volid),
520 nent, afs_printable_uint32_lu(nent->rw),
521 ent, afs_printable_uint32_lu(ent->rw)));
/* NOTE(review): unlike the level-0 message above, this level-1 message has
 * no trailing newline escape. Confirm whether the logger appends one. */
524 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
525 "hash entry for vol %lu, VG %lu",
526 afs_printable_uint32_lu(volid),
527 afs_printable_uint32_lu(ent->rw)));
528 /* accept attempts to add matching duplicate entries; just
529 * pretend we added it */
533 code = _VVGC_hash_entry_alloc(&hent);
541 queue_Append(&VVGCache_hash_table.hash_buckets[hash],
552 * remove an entry from the hash table.
554 * @param[in] hent hash table entry
558 * @return operation status
564 _VVGC_hash_entry_del(VVGCache_hash_entry_t * hent)
569 if (hent->entry->rw == hent->volid) {
573 code = _VVGC_entry_cl_del(hent->dp, hent->entry, hent->volid);
574 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
575 * if hent->entry->rw == hent->volid, it is possible for hent to
579 /* If we are the RW id, don't unlink, since we still need the
580 * hash entry to exist, so when we look up children, they can
581 * look up the RW id hash chain, and they will all go to the
584 * If we are the last entry and the entry should be deleted,
585 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
587 res = _VVGC_hash_entry_unlink(hent);
597 * low-level interface to remove an entry from the hash table.
599 * Does not alter the refcount or worry about the children lists or
600 * anything like that; just removes the hash table entry, frees it, and
601 * that's all. You probably want @see _VVGC_hash_entry_del instead.
603 * @param[in] hent hash table entry
607 * @return operation status
613 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * hent)
620 code = _VVGC_hash_entry_free(hent);
626 * lookup a vg cache entry given any member volume id.
628 * @param[in] dp disk partition object
629 * @param[in] volid vg member volume id
630 * @param[out] entry_out address in which to store volume group entry structure pointer
631 * @param[out] hash_out address in which to store hash entry pointer
635 * @warning - it is up to the caller to get a ref to entry_out, if needed
636 * - hash_out must not be referenced after dropping VOL_LOCK
638 * @return operation status
640 * @retval ENOENT volume id not found
641 * @retval EINVAL partition's VGC is invalid
646 _VVGC_lookup(struct DiskPartition64 * dp,
648 VVGCache_entry_t ** entry_out,
649 VVGCache_hash_entry_t ** hash_out)
652 int bucket = VVGC_HASH(volid);
653 struct VVGCache_hash_entry * ent, * nent;
655 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
661 for (queue_Scan(&VVGCache_hash_table.hash_buckets[bucket],
664 VVGCache_hash_entry)) {
665 if (ent->volid == volid && ent->dp == dp) {
667 *entry_out = ent->entry;
679 * add an entry to the volume group cache.
681 * @param[in] dp disk partition object
682 * @param[in] parent parent volume id
683 * @param[in] child child volume id
684 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
685 * new VG, 0 if we added to an existing VG
689 * @return operation status
691 * @retval -1 parent and child are already registered in
695 VVGCache_entry_add_r(struct DiskPartition64 * dp,
701 VVGCache_entry_t * child_ent, * parent_ent;
707 /* check for existing entries */
708 res = _VVGC_lookup(dp, child, &child_ent, NULL);
709 if (res && res != ENOENT) {
714 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
715 if (res && res != ENOENT) {
721 * branch based upon existence of parent and child nodes
723 if (parent_ent && child_ent) {
724 /* both exist. we're done.
725 * if they point different places, then report the error. */
726 if (child_ent != parent_ent) {
729 if (parent == child) {
730 /* if we're adding the RW entry as a child, the RW id may
731 * not be in the child array yet, so make sure not to skip
736 } else if (!parent_ent && child_ent) {
738 * update vg root volid, and add hash entry. */
739 parent_ent = child_ent;
740 parent_ent->rw = parent;
742 code = _VVGC_hash_entry_add(dp,
747 } else if (!child_ent && !parent_ent) {
748 code = _VVGC_entry_add(dp,
758 if (child == parent) {
759 /* if we're the RW, skip over adding the child hash entry;
760 * we already added the hash entry when creating the entry */
761 child_ent = parent_ent;
766 osi_Assert(!child_ent);
767 child_ent = parent_ent;
768 code = _VVGC_hash_entry_add(dp,
777 code = _VVGC_entry_cl_add(child_ent, child);
780 if (code && code != EINVAL) {
/* NOTE(review): this message has no trailing newline escape, while the
 * to-delete-list message later in this function ends with one. Confirm
 * whether the logger appends a newline itself. */
781 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
782 " %lu on partition %s", code, afs_printable_uint32_lu(child),
783 afs_printable_uint32_lu(parent), VPartitionPath(dp)));
786 if (code == 0 && VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
787 /* we successfully added the entry; make sure it's not on the
788 * to-delete list, so it doesn't get deleted later */
789 code = _VVGC_dlist_del_r(dp, parent, child);
790 if (code && code != ENOENT) {
791 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
792 "%lu (parent %lu) from the to-delete list for part "
793 "%s.\n", code, afs_printable_uint32_lu(child),
794 afs_printable_uint32_lu(parent),
795 VPartitionPath(dp)));
805 * add an entry to the volume group cache.
807 * @param[in] dp disk partition object
808 * @param[in] parent parent volume id
809 * @param[in] child child volume id
810 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
811 * new VG, 0 if we added to an existing VG
813 * @return operation status
817 VVGCache_entry_add(struct DiskPartition64 * dp,
/* NOTE(review): the status returned by VVGCache_entry_add_r is not captured
 * here. The other non-_r wrappers visible in this file (VVGCache_entry_del,
 * VVGCache_query, VVGCache_scanStart, VVGCache_scanWait) all assign the _r
 * result to code. If this wrapper goes on to return a local status variable,
 * failures from the _r call would be silently lost -- confirm against the
 * full function and assign the result if so. */
825 VVGCache_entry_add_r(dp, parent, child, newvg);
832 * delete an entry from the volume group cache.
834 * If partition is scanning, actually puts the entry on a list of entries
835 * to delete when the scan is done.
837 * @param[in] dp disk partition object
838 * @param[in] parent parent volume id
839 * @param[in] child child volume id
843 * @return operation status
847 VVGCache_entry_del_r(struct DiskPartition64 * dp,
848 VolumeId parent, VolumeId child)
850 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
852 code = _VVGC_dlist_add_r(dp, parent, child);
857 return _VVGC_entry_purge_r(dp, parent, child);
861 * delete an entry from the volume group cache.
863 * @param[in] dp disk partition object
864 * @param[in] parent parent volume id
865 * @param[in] child child volume id
871 * @return operation status
875 _VVGC_entry_purge_r(struct DiskPartition64 * dp,
876 VolumeId parent, VolumeId child)
879 VVGCache_entry_t * parent_ent, * child_ent;
880 VVGCache_hash_entry_t * child_hent;
882 /* check mappings for each volid */
883 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
888 res = _VVGC_lookup(dp, child, &child_ent, &child_hent);
894 /* if the mappings don't match, we have a serious error */
895 if (parent_ent != child_ent) {
896 ViceLog(0, ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
897 "but vol %lu points to VGC entry %"AFS_PTR_FMT" and VG %lu "
898 "points to VGC entry %"AFS_PTR_FMT"\n",
899 afs_printable_uint32_lu(child),
900 afs_printable_uint32_lu(parent),
901 afs_printable_uint32_lu(child),
902 child_ent, afs_printable_uint32_lu(parent), parent_ent));
907 code = _VVGC_hash_entry_del(child_hent);
914 * delete an entry from the volume group cache.
916 * @param[in] dp disk partition object
917 * @param[in] parent parent volume id
918 * @param[in] child child volume id
920 * @return operation status
924 VVGCache_entry_del(struct DiskPartition64 * dp,
925 VolumeId parent, VolumeId child)
930 code = VVGCache_entry_del_r(dp, parent, child);
937 * query a volume group by any member volume id.
939 * @param[in] dp disk partition object
940 * @param[in] volume volume id of a member of VG
941 * @param[out] res vg membership data
945 * @return operation status
947 * @retval EAGAIN partition needs to finish scanning
950 VVGCache_query_r(struct DiskPartition64 * dp,
952 VVGCache_query_t * res)
955 VVGCache_entry_t * ent;
957 /* If the cache for this partition doesn't exist, start a scan */
958 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
959 code = VVGCache_scanStart_r(dp);
960 if (code == 0 || code == -3) {
961 /* -3 means another thread already started scanning */
966 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
970 code = _VVGC_lookup(dp, volume, &ent, NULL);
972 code = _VVGC_entry_export(ent, res);
979 * query a volume group by any member volume id.
981 * @param[in] dp disk partition object
982 * @param[in] volume volume id of a member of VG
983 * @param[out] res vg membership data
985 * @return operation status
989 VVGCache_query(struct DiskPartition64 * dp,
990 VolumeId volume, VVGCache_query_t * res)
995 code = VVGCache_query_r(dp, volume, res);
1002 * begin asynchronous scan of on-disk volume group metadata.
1004 * @param[in] dp disk partition object
1006 * @pre VOL_LOCK held
1008 * @return operation status
1012 VVGCache_scanStart_r(struct DiskPartition64 * dp)
1017 code = _VVGC_scan_start(dp);
1019 /* start a scanner thread on each partition */
1020 for (dp = DiskPartitionList; dp; dp = dp->next) {
1021 res = _VVGC_scan_start(dp);
1032 * begin asynchronous scan of on-disk volume group metadata.
1034 * @param[in] dp disk partition object
1036 * @return operation status
1040 VVGCache_scanStart(struct DiskPartition64 * dp)
1045 code = VVGCache_scanStart_r(dp);
1052 * wait for async on-disk VG metadata scan to complete.
1054 * @param[in] dp disk partition object
1056 * @pre VOL_LOCK held
1058 * @warning this routine must drop VOL_LOCK internally
1060 * @return operation status
1064 VVGCache_scanWait_r(struct DiskPartition64 * dp)
1068 while (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
1069 VOL_CV_WAIT(&VVGCache.part[dp->index].cv);
1076 * wait for async on-disk VG metadata scan to complete.
1078 * @param[in] dp disk partition object
1080 * @return operation status
1084 VVGCache_scanWait(struct DiskPartition64 * dp)
1089 code = VVGCache_scanWait_r(dp);
1096 * flush all cache entries for a given disk partition.
1098 * @param[in] part disk partition object
1100 * @pre VOL_LOCK held
1102 * @return operation status
1108 _VVGC_flush_part_r(struct DiskPartition64 * part)
1112 VVGCache_hash_entry_t * ent, * nent;
1114 for (i = 0; i < VolumeHashTable.Size; i++) {
1115 for (queue_Scan(&VVGCache_hash_table.hash_buckets[i],
1118 VVGCache_hash_entry)) {
1119 if (ent->dp == part) {
1120 VolumeId volid = ent->volid;
1121 res = _VVGC_hash_entry_del(ent);
1123 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1124 res, afs_printable_uint32_lu(volid)));
1135 * flush all cache entries for a given disk partition.
1137 * @param[in] part disk partition object
1139 * @return operation status
1145 _VVGC_flush_part(struct DiskPartition64 * part)
1150 code = _VVGC_flush_part_r(part);
1158 * change VVGC partition state.
1160 * @param[in] part disk partition object
1161 * @param[in] state new state
1163 * @pre VOL_LOCK is held
1170 _VVGC_state_change(struct DiskPartition64 * part,
1171 VVGCache_part_state_t state)
1173 VVGCache_part_state_t old_state;
1175 old_state = VVGCache.part[part->index].state;
1176 VVGCache.part[part->index].state = state;
1178 if (old_state != state) {
1179 CV_BROADCAST(&VVGCache.part[part->index].cv);
1185 #endif /* AFS_DEMAND_ATTACH_FS */