2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
20 #ifdef HAVE_SYS_FILE_H
24 #ifdef AFS_DEMAND_ATTACH_FS
27 #include <rx/rx_queue.h>
30 #include <afs/afsutil.h>
32 #include <afs/afsint.h>
36 #include "viceinode.h"
38 #include "partition.h"
39 #include <afs/errors.h>
41 #define __VOL_VG_CACHE_IMPL 1
44 #include "vg_cache_impl.h"
/*
 * Forward declarations for the file-local (static) helper routines,
 * followed by the definition of the global VVGCache_hash_table.
 *
 * NOTE(review): the embedded numbering skips lines in this listing, so
 * several prototypes are shown without all of their parameter lines;
 * check the full signatures against the definitions further down.
 */
46 static int _VVGC_lookup(struct DiskPartition64 *,
48 VVGCache_entry_t ** entry,
49 VVGCache_hash_entry_t ** hentry);
50 static int _VVGC_entry_alloc(VVGCache_entry_t ** entry);
51 static int _VVGC_entry_free(VVGCache_entry_t * entry);
52 static int _VVGC_entry_get(VVGCache_entry_t * entry);
53 static int _VVGC_entry_put(struct DiskPartition64 *,
54 VVGCache_entry_t * entry);
55 static int _VVGC_entry_add(struct DiskPartition64 *,
58 VVGCache_hash_entry_t **);
59 static int _VVGC_entry_cl_add(VVGCache_entry_t *, VolumeId);
60 static int _VVGC_entry_cl_del(struct DiskPartition64 *, VVGCache_entry_t *,
62 static int _VVGC_entry_export(VVGCache_entry_t *, VVGCache_query_t *);
63 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry);
64 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry);
65 static int _VVGC_hash_entry_add(struct DiskPartition64 *,
68 VVGCache_hash_entry_t **);
69 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t * entry);
70 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * entry);
72 VVGCache_hash_table_t VVGCache_hash_table;
/*
 * VVGCache_PkgInit: one-time setup of the volume group cache package.
 *
 * Visible steps:
 *  - allocate VolumeHashTable.Size rx_queue heads for
 *    VVGCache_hash_table.hash_buckets and test the result for NULL;
 *  - queue_Init every bucket head;
 *  - for each partition slot, set the state to VVGC_PART_STATE_INVALID,
 *    clear dlist_hash_buckets and initialise the slot condition variable.
 *
 * NOTE(review): the body of the allocation-failure branch and the return
 * statement are not visible in this listing, so the exact error value
 * cannot be stated from here.
 * NOTE(review): the malloc size is an unchecked multiplication; confirm
 * that VolumeHashTable.Size is bounded so that it cannot overflow.
 */
76 * initialize volume group cache subsystem.
78 * @return operation status
82 VVGCache_PkgInit(void)
87 /* allocate hash table */
88 VVGCache_hash_table.hash_buckets =
89 malloc(VolumeHashTable.Size * sizeof(struct rx_queue));
90 if (VVGCache_hash_table.hash_buckets == NULL) {
95 /* setup hash chain heads */
96 for (i = 0; i < VolumeHashTable.Size; i++) {
97 queue_Init(&VVGCache_hash_table.hash_buckets[i]);
100 /* initialize per-partition VVGC state */
/* the bound is inclusive: slots 0 through VOLMAXPARTS are all set up */
101 for (i = 0; i <= VOLMAXPARTS; i++) {
102 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
103 VVGCache.part[i].dlist_hash_buckets = NULL;
104 CV_INIT(&VVGCache.part[i].cv, "cache part", CV_DEFAULT, 0);
/*
 * VVGCache_PkgShutdown: tear down the volume group cache package.
 *
 * Visible steps: free the bucket-head array and reset the pointer to
 * NULL, then for each partition slot (0 through VOLMAXPARTS inclusive)
 * mark the state VVGC_PART_STATE_INVALID and destroy its condition
 * variable.
 *
 * NOTE(review): the visible lines release only the array of bucket
 * heads; nothing here walks the chains or touches dlist_hash_buckets.
 * TODO confirm whether remaining entries are released elsewhere before
 * this is called.
 */
115 * shut down volume group cache subsystem.
117 * @return operation status
123 VVGCache_PkgShutdown(void)
129 /* free hash table */
130 free(VVGCache_hash_table.hash_buckets);
131 VVGCache_hash_table.hash_buckets = NULL;
133 /* destroy per-partition VVGC state */
134 for (i = 0; i <= VOLMAXPARTS; i++) {
135 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
136 CV_DESTROY(&VVGCache.part[i].cv);
/*
 * _VVGC_entry_alloc: allocate one zero-filled VVGCache_entry_t with
 * calloc and store the pointer through entry_out.  The result is tested
 * for NULL.
 *
 * NOTE(review): the statement executed on allocation failure and the
 * return are not visible in this listing.
 */
143 * allocate a cache entry.
145 * @param[out] entry_out pointer to newly allocated entry
147 * @return operation status
153 _VVGC_entry_alloc(VVGCache_entry_t ** entry_out)
155 *entry_out = calloc(1, sizeof(VVGCache_entry_t));
157 if (*entry_out == NULL)
/*
 * _VVGC_entry_free: release a cache entry.  The only visible statement
 * asserts that the reference count has already dropped to zero.
 *
 * NOTE(review): the statement that actually releases the memory is not
 * visible here; presumably a free of entry follows the assertion --
 * verify against the full file.
 */
164 * free a cache entry.
166 * @param[in] entry cache entry
168 * @return operation status
174 _VVGC_entry_free(VVGCache_entry_t * entry)
178 opr_Assert(entry->refcnt == 0);
/*
 * _VVGC_entry_add: create a new volume group entry and register it in
 * the hash table under volid.
 *
 * Visible flow: allocate the entry with _VVGC_entry_alloc, then call
 * _VVGC_hash_entry_add with the new entry, forwarding hash_out.  A call
 * to _VVGC_entry_free on the new entry is also visible; presumably it is
 * the cleanup for a failed hash insertion -- TODO confirm, the
 * surrounding condition is not shown.
 *
 * As the in-body comment states, the reference count is not raised here;
 * it is raised when a child is added.
 */
185 * allocate and register an entry for a volume group.
187 * @param[in] dp disk partition object
188 * @param[in] volid volume id
189 * @param[out] entry_out vg cache object pointer
190 * @param[out] hash_out vg cache hash entry object pointer
192 * @pre - VOL_LOCK held
193 * - no such entry exists in hash table
195 * @return operation status
201 _VVGC_entry_add(struct DiskPartition64 * dp,
203 VVGCache_entry_t ** entry_out,
204 VVGCache_hash_entry_t ** hash_out)
207 VVGCache_entry_t * ent;
209 code = _VVGC_entry_alloc(&ent);
215 /* refcnt will be inc'd when a child is added */
218 code = _VVGC_hash_entry_add(dp, volid, ent, hash_out);
230 _VVGC_entry_free(ent);
/*
 * _VVGC_entry_cl_add: record volid in the child table of a volume group
 * entry and take a reference on the entry.
 *
 * Visible flow:
 *  - walk all VOL_VG_MAX_VOLS slots of ent->children;
 *  - a slot already holding volid is reported at log level 1 as a
 *    duplicate;
 *  - the first slot holding 0 is remembered in empty_idx (a zero slot is
 *    treated as free);
 *  - the walk deliberately covers every slot so that a duplicate located
 *    after the first free slot is still noticed;
 *  - if empty_idx is still -1 afterwards the table is full and this is
 *    logged at level 0 (documented return value: -1);
 *  - otherwise volid is stored in the free slot and _VVGC_entry_get is
 *    called on the entry.
 *
 * NOTE(review): what is returned in the duplicate case is not visible in
 * this listing.
 */
237 * add a volid to the entry's child list.
239 * @param[in] ent volume group object
240 * @param[in] volid volume id
242 * @return operation status
244 * @retval -1 child table is full
249 _VVGC_entry_cl_add(VVGCache_entry_t * ent,
255 /* search table to avoid duplicates */
256 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
257 if (ent->children[i] == volid) {
258 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
260 afs_printable_uint32_lu(volid),
261 afs_printable_uint32_lu(ent->rw)));
264 if (empty_idx == -1 && !ent->children[i]) {
266 /* don't break; make sure we go through all children so we don't
267 * add a duplicate entry */
271 /* verify table isn't full */
272 if (empty_idx == -1) {
274 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
275 "is full\n", afs_printable_uint32_lu(volid),
276 afs_printable_uint32_lu(ent->rw)));
281 ent->children[empty_idx] = volid;
284 code = _VVGC_entry_get(ent);
/*
 * _VVGC_entry_cl_del: remove volid from the child table of a volume
 * group entry and give back the reference held for it.
 *
 * Visible flow: scan the VOL_VG_MAX_VOLS child slots, zero the slot that
 * holds volid, and call _VVGC_entry_put on the entry.
 *
 * NOTE(review): whether the put is skipped when no slot matched (the
 * documented -1 case) is not visible here.  The documentation of
 * _VVGC_entry_put warns that the entry must not be dereferenced after
 * the call, so callers should treat ent as gone once this returns.
 */
291 * delete a volid from the entry's child list.
293 * @param[in] dp disk partition object
294 * @param[in] ent volume group object
295 * @param[in] volid volume id
297 * @return operation status
299 * @retval -1 no such entry found
304 _VVGC_entry_cl_del(struct DiskPartition64 *dp,
305 VVGCache_entry_t * ent,
310 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
311 if (ent->children[i] == volid) {
312 ent->children[i] = 0;
320 code = _VVGC_entry_put(dp, ent);
/*
 * _VVGC_entry_get: take a reference on a cache entry.
 *
 * NOTE(review): only the signature is visible in this listing; the
 * description above comes from the existing documentation lines, not
 * from code that can be inspected here.
 */
327 * add a refcount to an entry.
329 * @param[in] entry cache entry
333 * @return operation status
338 static int _VVGC_entry_get(VVGCache_entry_t * entry)
/*
 * _VVGC_entry_put: drop one reference on a cache entry and destroy the
 * entry when the count reaches zero.
 *
 * Visible flow:
 *  - assert that the count is positive, then pre-decrement it;
 *  - when it reaches zero, look up the hash entry for the RW id of the
 *    entry (entry->rw) on partition dp;
 *  - if that lookup yields a different entry object, log an error;
 *  - unlink the hash entry that was found with _VVGC_hash_entry_unlink;
 *  - if the lookup reported ENOENT, log a warning and carry on;
 *  - finally release the entry with _VVGC_entry_free.
 *
 * Callers must not touch entry after this call; it may have been freed.
 */
345 * put back a reference to an entry.
347 * @param[in] dp disk partition object
348 * @param[in] entry cache entry
352 * @warning do not attempt to deref pointer after calling this interface
354 * @return operation status
357 * @note dp is needed to lookup the RW hash entry to unlink, if we are
358 * putting back the final reference and freeing
363 _VVGC_entry_put(struct DiskPartition64 * dp, VVGCache_entry_t * entry)
367 opr_Assert(entry->refcnt > 0);
369 if (--entry->refcnt == 0) {
370 VVGCache_entry_t *nentry;
371 VVGCache_hash_entry_t *hentry;
373 /* first, try to delete the RW id hash entry pointing to this
375 code = _VVGC_lookup(dp, entry->rw, &nentry, &hentry);
377 if (nentry != entry) {
378 /* looking up the rw of this entry points to a different
379 * entry; should not happen */
/* NOTE(review): this message has no trailing newline, unlike the ENOENT
 * warning further down; it will run into the next log line */
380 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
381 "found different entry than was passed",
382 afs_printable_uint32_lu(entry->rw)));
385 code = _VVGC_hash_entry_unlink(hentry);
388 } else if (code == ENOENT) {
389 /* ignore ENOENT; this shouldn't happen, since the RW hash
390 * entry should always exist if the entry does... but we
391 * were going to delete it anyway, so try to continue */
392 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
393 "vol %lu, but RW hash entry doesn't exist; continuing "
394 "anyway...\n", afs_printable_uint32_lu(entry->rw)));
399 /* now, just free the entry itself */
401 code = _VVGC_entry_free(entry);
/*
 * _VVGC_entry_export: copy an internal volume group entry into the
 * caller-facing query structure.
 *
 * The visible loop copies all VOL_VG_MAX_VOLS child ids from ent to qry,
 * including slots that hold 0.
 *
 * NOTE(review): any other fields that are copied (for instance the RW
 * id) are set in lines not shown in this listing.
 */
409 * export a volume group entry in the external object format.
411 * @param[in] ent internal-format volume group object
412 * @param[out] qry external-format volume group object
416 * @return operation status
422 _VVGC_entry_export(VVGCache_entry_t * ent, VVGCache_query_t * qry)
427 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
428 qry->children[i] = ent->children[i];
/*
 * _VVGC_hash_entry_alloc: allocate one VVGCache_hash_entry_t with malloc
 * and store the pointer both in *entry_out and in the local ent.
 *
 * malloc does not zero the memory, so the fields of the new structure
 * are indeterminate until the caller fills them in.
 *
 * NOTE(review): the NULL check on the allocation and the return are not
 * visible in this listing.
 */
435 * allocate a hash table entry structure.
437 * @param[out] entry_out address in which to store newly allocated hash entry struct
439 * @return operation status
445 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry_out)
448 VVGCache_hash_entry_t * ent;
450 *entry_out = ent = malloc(sizeof(VVGCache_hash_entry_t));
/*
 * _VVGC_hash_entry_free: release a hash table entry structure.
 *
 * NOTE(review): only the signature is visible in this listing; the
 * description relies on the existing documentation lines.
 */
459 * free a hash table entry structure.
461 * @param[in] entry hash table entry structure to be freed
463 * @return operation status
469 _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry)
/*
 * _VVGC_hash_entry_add: make volid on partition dp resolve to the volume
 * group entry ent.
 *
 * Visible flow:
 *  - compute the bucket index with VVGC_HASH(volid);
 *  - look for an existing mapping with _VVGC_lookup, handing hash_out
 *    straight through as the hash-entry output;
 *  - an existing mapping to a different entry is logged at level 0
 *    (documented result: EEXIST);
 *  - an existing mapping to the same entry is logged at level 1 and, per
 *    the in-body comment, accepted as if the add had happened;
 *  - otherwise a hash entry is allocated with _VVGC_hash_entry_alloc and
 *    appended to the bucket chain.
 *
 * NOTE(review): the lines that fill in the new hash entry and store it
 * through hash_out are not visible in this listing.
 */
479 * add an entry to the hash table.
481 * @param[in] dp disk partition object
482 * @param[in] volid volume id
483 * @param[in] ent volume group object
484 * @param[out] hash_out address in which to store pointer to hash entry
488 * @return operation status
490 * @retval EEXIST hash entry for volid already exists, and it points to
491 * a different VG entry
496 _VVGC_hash_entry_add(struct DiskPartition64 * dp,
498 VVGCache_entry_t * ent,
499 VVGCache_hash_entry_t ** hash_out)
502 VVGCache_hash_entry_t * hent;
503 int hash = VVGC_HASH(volid);
504 VVGCache_entry_t *nent;
506 code = _VVGC_lookup(dp, volid, &nent, hash_out);
509 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
510 " nonmatching entry for vol %lu: original "
511 "(%"AFS_PTR_FMT",%lu) new (%"AFS_PTR_FMT",%lu)\n",
512 afs_printable_uint32_lu(volid),
513 nent, afs_printable_uint32_lu(nent->rw),
514 ent, afs_printable_uint32_lu(ent->rw)));
/* NOTE(review): the message below has no trailing newline, unlike the
 * nonmatching-duplicate message above */
517 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
518 "hash entry for vol %lu, VG %lu",
519 afs_printable_uint32_lu(volid),
520 afs_printable_uint32_lu(ent->rw)));
521 /* accept attempts to add matching duplicate entries; just
522 * pretend we added it */
526 code = _VVGC_hash_entry_alloc(&hent);
534 queue_Append(&VVGCache_hash_table.hash_buckets[hash],
/*
 * _VVGC_hash_entry_del: remove one volume id from the cache, given its
 * hash entry.
 *
 * Visible flow:
 *  - test whether hent is the hash entry for the RW id of its group
 *    (hent->entry->rw == hent->volid);
 *  - remove the id from the child list of the group with
 *    _VVGC_entry_cl_del, which also drops a reference on the group;
 *  - call _VVGC_hash_entry_unlink on hent.  According to the in-body
 *    comment the unlink is skipped for the RW id so that the children
 *    can still be resolved through it; the guarding condition itself is
 *    not visible in this listing.
 *
 * The in-body comment also warns that hent->entry, and for the RW id
 * hent itself, may no longer be valid once _VVGC_entry_cl_del returns.
 */
545 * remove an entry from the hash table.
547 * @param[in] hent hash table entry
551 * @return operation status
557 _VVGC_hash_entry_del(VVGCache_hash_entry_t * hent)
562 if (hent->entry->rw == hent->volid) {
566 code = _VVGC_entry_cl_del(hent->dp, hent->entry, hent->volid);
567 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
568 * if hent->entry->rw == hent->volid, it is possible for hent to
572 /* If we are the RW id, don't unlink, since we still need the
573 * hash entry to exist, so when we lookup children, they can
574 * look up the RW id hash chain, and they will all go to the
577 * If we are the last entry and the entry should be deleted,
578 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
580 res = _VVGC_hash_entry_unlink(hent);
/*
 * _VVGC_hash_entry_unlink: low-level removal of a hash entry.
 *
 * The one visible statement releases hent through
 * _VVGC_hash_entry_free.
 *
 * NOTE(review): the statement that takes hent off its bucket chain is
 * not visible in this listing; the existing documentation says the entry
 * is removed from the table and then freed.
 */
590 * low-level interface to remove an entry from the hash table.
592 * Does not alter the refcount or worry about the children lists or
593 * anything like that; just removes the hash table entry, frees it, and
594 * that's all. You probably want @see _VVGC_hash_entry_del instead.
596 * @param[in] hent hash table entry
600 * @return operation status
606 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * hent)
613 code = _VVGC_hash_entry_free(hent);
/*
 * _VVGC_lookup: find the volume group entry that volid belongs to on
 * partition dp.
 *
 * Visible flow:
 *  - pick the bucket with VVGC_HASH(volid);
 *  - check whether the cache state for dp (indexed by dp->index) is
 *    VVGC_PART_STATE_INVALID (documented result: EINVAL);
 *  - scan the bucket chain for a hash entry whose volid and dp both
 *    match, and on a match store its group entry in *entry_out.
 *
 * Matching on dp as well as volid means the same id on two partitions
 * resolves independently.
 *
 * NOTE(review): some callers in this file pass NULL for hash_out; the
 * lines that store through hash_out are not visible here, so confirm
 * that NULL is tolerated.
 */
619 * lookup a vg cache entry given any member volume id.
621 * @param[in] dp disk partition object
622 * @param[in] volid vg member volume id
623 * @param[out] entry_out address in which to store volume group entry structure pointer
624 * @param[out] hash_out address in which to store hash entry pointer
628 * @warning - it is up to the caller to get a ref to entry_out, if needed
629 * - hash_out must not be referenced after dropping VOL_LOCK
631 * @return operation status
633 * @retval ENOENT volume id not found
634 * @retval EINVAL partition's VGC is invalid
639 _VVGC_lookup(struct DiskPartition64 * dp,
641 VVGCache_entry_t ** entry_out,
642 VVGCache_hash_entry_t ** hash_out)
645 int bucket = VVGC_HASH(volid);
646 struct VVGCache_hash_entry * ent, * nent;
648 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
654 for (queue_Scan(&VVGCache_hash_table.hash_buckets[bucket],
657 VVGCache_hash_entry)) {
658 if (ent->volid == volid && ent->dp == dp) {
660 *entry_out = ent->entry;
/*
 * VVGCache_entry_add_r: record that volume child belongs to the volume
 * group rooted at parent on partition dp.
 *
 * Visible flow:
 *  - look up both child and parent with _VVGC_lookup; a result of ENOENT
 *    is tolerated, any other nonzero result is treated as an error;
 *  - both found: nothing to create.  If they resolve to different
 *    entries that is an error.  When parent == child the in-body comment
 *    says the RW id may still be missing from the child array, so the
 *    child-list step is not skipped;
 *  - only the child found: the existing entry is re-rooted
 *    (parent_ent->rw = parent) and a hash entry is added;
 *  - neither found: a new entry is created with _VVGC_entry_add;
 *  - when child == parent no separate child hash entry is added, since
 *    creating the entry already added it; otherwise a hash entry for the
 *    child is added with _VVGC_hash_entry_add;
 *  - the child id is then stored with _VVGC_entry_cl_add;
 *  - a nonzero code other than EINVAL is logged;
 *  - on success while the partition state is VVGC_PART_STATE_UPDATING,
 *    the pair is taken off the to-delete list with _VVGC_dlist_del_r;
 *    ENOENT from that call is not logged.
 *
 * NOTE(review): the branch bodies are only partly visible in this
 * listing, and the retval -1 documentation line is cut off mid-sentence.
 */
672 * add an entry to the volume group cache.
674 * @param[in] dp disk partition object
675 * @param[in] parent parent volume id
676 * @param[in] child child volume id
677 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
678 * new VG, 0 if we added to an existing VG
682 * @return operation status
684 * @retval -1 parent and child are already registered in
688 VVGCache_entry_add_r(struct DiskPartition64 * dp,
694 VVGCache_entry_t * child_ent, * parent_ent;
700 /* check for existing entries */
701 res = _VVGC_lookup(dp, child, &child_ent, NULL);
702 if (res && res != ENOENT) {
707 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
708 if (res && res != ENOENT) {
714 * branch based upon existence of parent and child nodes
716 if (parent_ent && child_ent) {
717 /* both exist. we're done.
718 * if they point different places, then report the error. */
719 if (child_ent != parent_ent) {
722 if (parent == child) {
723 /* if we're adding the RW entry as a child, the RW id may
724 * not be in the child array yet, so make sure not to skip
729 } else if (!parent_ent && child_ent) {
731 * update vg root volid, and add hash entry. */
732 parent_ent = child_ent;
733 parent_ent->rw = parent;
735 code = _VVGC_hash_entry_add(dp,
740 } else if (!child_ent && !parent_ent) {
741 code = _VVGC_entry_add(dp,
751 if (child == parent) {
752 /* if we're the RW, skip over adding the child hash entry;
753 * we already added the hash entry when creating the entry */
754 child_ent = parent_ent;
759 opr_Assert(!child_ent);
760 child_ent = parent_ent;
761 code = _VVGC_hash_entry_add(dp,
770 code = _VVGC_entry_cl_add(child_ent, child);
773 if (code && code != EINVAL) {
/* NOTE(review): this message has no trailing newline, unlike the
 * to-delete-list message further down */
774 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
775 " %lu on partition %s", code, afs_printable_uint32_lu(child),
776 afs_printable_uint32_lu(parent), VPartitionPath(dp)));
779 if (code == 0 && VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
780 /* we successfully added the entry; make sure it's not on the
781 * to-delete list, so it doesn't get deleted later */
782 code = _VVGC_dlist_del_r(dp, parent, child);
783 if (code && code != ENOENT) {
784 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
785 "%lu (parent %lu) from the to-delete list for part "
786 "%s.\n", code, afs_printable_uint32_lu(child),
787 afs_printable_uint32_lu(parent),
788 VPartitionPath(dp)));
/*
 * VVGCache_entry_add: public wrapper that forwards its four arguments
 * to VVGCache_entry_add_r.
 *
 * NOTE(review): presumably this variant takes and releases VOL_LOCK
 * around the call, since it lacks the _r suffix; the locking lines are
 * not visible in this listing -- verify.
 */
798 * add an entry to the volume group cache.
800 * @param[in] dp disk partition object
801 * @param[in] parent parent volume id
802 * @param[in] child child volume id
803 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
804 * new VG, 0 if we added to an existing VG
806 * @return operation status
810 VVGCache_entry_add(struct DiskPartition64 * dp,
/* NOTE(review): the result of the call below is not captured, whereas
 * the other non-_r wrappers in this file assign the result of their _r
 * call to code.  If this function returns a local status variable, a
 * failure from VVGCache_entry_add_r never reaches the caller; the likely
 * fix is to assign the result -- confirm against the full function. */
818 VVGCache_entry_add_r(dp, parent, child, newvg);
/*
 * VVGCache_entry_del_r: remove the (parent, child) pair from the cache
 * for partition dp.
 *
 * Visible flow: when the partition state is VVGC_PART_STATE_UPDATING the
 * pair is queued with _VVGC_dlist_add_r instead of being removed now;
 * otherwise the result of _VVGC_entry_purge_r is returned directly.
 *
 * NOTE(review): what is returned after the deferred path is not visible
 * in this listing.
 */
825 * delete an entry from the volume group cache.
827 * If partition is scanning, actually puts the entry on a list of entries
828 * to delete when the scan is done.
830 * @param[in] dp disk partition object
831 * @param[in] parent parent volume id
832 * @param[in] child child volume id
836 * @return operation status
840 VVGCache_entry_del_r(struct DiskPartition64 * dp,
841 VolumeId parent, VolumeId child)
843 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
845 code = _VVGC_dlist_add_r(dp, parent, child);
850 return _VVGC_entry_purge_r(dp, parent, child);
/*
 * _VVGC_entry_purge_r: immediately remove child from the volume group
 * rooted at parent on partition dp.
 *
 * Visible flow:
 *  - look up child, obtaining both its group entry and its hash entry;
 *  - look up parent and compare the two group entries; if they differ,
 *    log the mismatch (the in-body comment calls this a serious error);
 *  - delete the hash entry of the child with _VVGC_hash_entry_del.
 *
 * NOTE(review): the handling of lookup failures and the condition that
 * guards the parent lookup are in lines not shown here.  The log text
 * names VVGCache_entry_del rather than this function.
 */
854 * delete an entry from the volume group cache.
856 * @param[in] dp disk partition object
857 * @param[in] parent parent volume id
858 * @param[in] child child volume id
864 * @return operation status
868 _VVGC_entry_purge_r(struct DiskPartition64 * dp,
869 VolumeId parent, VolumeId child)
872 VVGCache_entry_t * child_ent;
873 VVGCache_hash_entry_t * child_hent;
875 res = _VVGC_lookup(dp, child, &child_ent, &child_hent);
882 VVGCache_entry_t * parent_ent;
884 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
890 /* if the mappings don't match, we have a serious error */
891 if (parent_ent != child_ent) {
893 ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
894 "but vol %lu points to VGC entry %" AFS_PTR_FMT
895 " and VG %lu " "points to VGC entry %" AFS_PTR_FMT "\n",
896 afs_printable_uint32_lu(child),
897 afs_printable_uint32_lu(parent),
898 afs_printable_uint32_lu(child), child_ent,
899 afs_printable_uint32_lu(parent), parent_ent));
905 code = _VVGC_hash_entry_del(child_hent);
/*
 * VVGCache_entry_del: public wrapper around VVGCache_entry_del_r; the
 * result of the _r call is stored in code.
 *
 * NOTE(review): presumably VOL_LOCK is taken and released around the
 * call; the locking lines and the return are not visible in this
 * listing -- verify.
 */
912 * delete an entry from the volume group cache.
914 * @param[in] dp disk partition object
915 * @param[in] parent parent volume id
916 * @param[in] child child volume id
918 * @return operation status
922 VVGCache_entry_del(struct DiskPartition64 * dp,
923 VolumeId parent, VolumeId child)
928 code = VVGCache_entry_del_r(dp, parent, child);
/*
 * VVGCache_query_r: return the membership of the volume group that
 * contains volume on partition dp.
 *
 * Visible flow:
 *  - if the cache for dp is VVGC_PART_STATE_INVALID, start a scan with
 *    VVGCache_scanStart_r; a result of 0 or -3 is accepted (per the
 *    in-body comment, -3 means another thread already started one);
 *  - if the cache is VVGC_PART_STATE_UPDATING the query cannot be
 *    answered yet (documented result: EAGAIN);
 *  - otherwise look the volume up with _VVGC_lookup and copy the group
 *    into res with _VVGC_entry_export.
 *
 * NOTE(review): -3 is a bare literal here; a named constant shared with
 * the scan code would make the coupling explicit.
 */
935 * query a volume group by any member volume id.
937 * @param[in] dp disk partition object
938 * @param[in] volume volume id of a member of VG
939 * @param[out] res vg membership data
943 * @return operation status
945 * @retval EAGAIN partition needs to finish scanning
948 VVGCache_query_r(struct DiskPartition64 * dp,
950 VVGCache_query_t * res)
953 VVGCache_entry_t * ent;
955 /* If cache for this partition doesn't exist; start a scan */
956 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
957 code = VVGCache_scanStart_r(dp);
958 if (code == 0 || code == -3) {
959 /* -3 means another thread already started scanning */
964 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
968 code = _VVGC_lookup(dp, volume, &ent, NULL);
970 code = _VVGC_entry_export(ent, res);
/*
 * VVGCache_query: public wrapper around VVGCache_query_r; the result of
 * the _r call is stored in code.
 *
 * NOTE(review): presumably VOL_LOCK is taken and released around the
 * call; those lines are not visible in this listing -- verify.
 */
977 * query a volume group by any member volume id.
979 * @param[in] dp disk partition object
980 * @param[in] volume volume id of a member of VG
981 * @param[out] res vg membership data
983 * @return operation status
987 VVGCache_query(struct DiskPartition64 * dp,
988 VolumeId volume, VVGCache_query_t * res)
993 code = VVGCache_query_r(dp, volume, res);
/*
 * VVGCache_scanStart_r: kick off the asynchronous on-disk scan.
 *
 * Two paths are visible: a single call to _VVGC_scan_start for the
 * partition passed in, and a loop over DiskPartitionList that calls
 * _VVGC_scan_start on every partition.
 *
 * NOTE(review): the condition that selects between the two paths is not
 * visible in this listing; presumably a NULL dp means scan everything --
 * verify.  How the per-partition results (res) are folded into the
 * return value is also not shown.
 */
1000 * begin asynchronous scan of on-disk volume group metadata.
1002 * @param[in] dp disk partition object
1004 * @pre VOL_LOCK held
1006 * @return operation status
1010 VVGCache_scanStart_r(struct DiskPartition64 * dp)
1015 code = _VVGC_scan_start(dp);
1017 /* start a scanner thread on each partition */
1018 for (dp = DiskPartitionList; dp; dp = dp->next) {
1019 res = _VVGC_scan_start(dp);
/*
 * VVGCache_scanStart: public wrapper around VVGCache_scanStart_r; the
 * result of the _r call is stored in code.
 *
 * NOTE(review): presumably VOL_LOCK is taken and released around the
 * call; those lines are not visible in this listing -- verify.
 */
1030 * begin asynchronous scan of on-disk volume group metadata.
1032 * @param[in] dp disk partition object
1034 * @return operation status
1038 VVGCache_scanStart(struct DiskPartition64 * dp)
1043 code = VVGCache_scanStart_r(dp);
/*
 * VVGCache_scanWait_r: block until the cache for dp is no longer in
 * VVGC_PART_STATE_UPDATING.
 *
 * The state is re-tested in a loop after every wake-up on the
 * per-partition condition variable, so a wake-up that arrives while the
 * state is still UPDATING simply waits again.  As the existing warning
 * says, VOL_LOCK is dropped while waiting.
 */
1050 * wait for async on-disk VG metadata scan to complete.
1052 * @param[in] dp disk partition object
1054 * @pre VOL_LOCK held
1056 * @warning this routine must drop VOL_LOCK internally
1058 * @return operation status
1062 VVGCache_scanWait_r(struct DiskPartition64 * dp)
1066 while (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
1067 VOL_CV_WAIT(&VVGCache.part[dp->index].cv);
/*
 * VVGCache_scanWait: public wrapper around VVGCache_scanWait_r; the
 * result of the _r call is stored in code.
 *
 * NOTE(review): presumably VOL_LOCK is taken and released around the
 * call; those lines are not visible in this listing -- verify.
 */
1074 * wait for async on-disk VG metadata scan to complete.
1076 * @param[in] dp disk partition object
1078 * @return operation status
1082 VVGCache_scanWait(struct DiskPartition64 * dp)
1087 code = VVGCache_scanWait_r(dp);
/*
 * _VVGC_flush_part_r: drop every cache entry that belongs to one
 * partition.
 *
 * Walks all VolumeHashTable.Size buckets; for every hash entry whose dp
 * equals part it calls _VVGC_hash_entry_del and logs a level 0 message
 * if that reports an error.
 *
 * NOTE(review): deleting one hash entry can also unlink the RW hash
 * entry of its group (see _VVGC_entry_put).  Confirm that the scan
 * cursor cannot be left pointing at an entry that was just released.
 */
1094 * flush all cache entries for a given disk partition.
1096 * @param[in] part disk partition object
1098 * @pre VOL_LOCK held
1100 * @return operation status
1106 _VVGC_flush_part_r(struct DiskPartition64 * part)
1110 VVGCache_hash_entry_t * ent, * nent;
1112 for (i = 0; i < VolumeHashTable.Size; i++) {
1113 for (queue_Scan(&VVGCache_hash_table.hash_buckets[i],
1116 VVGCache_hash_entry)) {
1117 if (ent->dp == part) {
/* copy the id out first: it is still needed for the log message after
 * the delete, when ent presumably may no longer be valid -- verify */
1118 VolumeId volid = ent->volid;
1119 res = _VVGC_hash_entry_del(ent);
1121 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1122 res, afs_printable_uint32_lu(volid)));
/*
 * _VVGC_flush_part: wrapper around _VVGC_flush_part_r; the result of
 * the _r call is stored in code.
 *
 * NOTE(review): presumably VOL_LOCK is taken and released around the
 * call; those lines are not visible in this listing -- verify.
 */
1133 * flush all cache entries for a given disk partition.
1135 * @param[in] part disk partition object
1137 * @return operation status
1143 _VVGC_flush_part(struct DiskPartition64 * part)
1148 code = _VVGC_flush_part_r(part);
/*
 * _VVGC_state_change: set the cache state for a partition and wake any
 * waiters when the state actually changed.
 *
 * The previous state is read before the new one is stored; the
 * per-partition condition variable is broadcast only if the two differ,
 * so setting the same state again wakes nobody.  Threads blocked in
 * VVGCache_scanWait_r wait on this condition variable.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
1156 * change VVGC partition state.
1158 * @param[in] part disk partition object
1159 * @param[in] state new state
1161 * @pre VOL_LOCK is held
1168 _VVGC_state_change(struct DiskPartition64 * part,
1169 VVGCache_part_state_t state)
1171 VVGCache_part_state_t old_state;
1173 old_state = VVGCache.part[part->index].state;
1174 VVGCache.part[part->index].state = state;
1176 if (old_state != state) {
1177 CV_BROADCAST(&VVGCache.part[part->index].cv);
1183 #endif /* AFS_DEMAND_ATTACH_FS */