2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
20 #ifdef HAVE_SYS_FILE_H
24 #ifdef AFS_DEMAND_ATTACH_FS
27 #include <afs/afsutil.h>
30 #include <afs/afsint.h>
34 #include "viceinode.h"
36 #include "partition.h"
37 #include <afs/errors.h>
39 #define __VOL_VG_CACHE_IMPL 1
42 #include "vg_cache_impl.h"
44 static int _VVGC_lookup(struct DiskPartition64 *,
46 VVGCache_entry_t ** entry,
47 VVGCache_hash_entry_t ** hentry);
48 static int _VVGC_entry_alloc(VVGCache_entry_t ** entry);
49 static int _VVGC_entry_free(VVGCache_entry_t * entry);
50 static int _VVGC_entry_get(VVGCache_entry_t * entry);
51 static int _VVGC_entry_put(struct DiskPartition64 *,
52 VVGCache_entry_t * entry);
53 static int _VVGC_entry_add(struct DiskPartition64 *,
56 VVGCache_hash_entry_t **);
57 static int _VVGC_entry_cl_add(VVGCache_entry_t *, VolumeId);
58 static int _VVGC_entry_cl_del(struct DiskPartition64 *, VVGCache_entry_t *,
60 static int _VVGC_entry_export(VVGCache_entry_t *, VVGCache_query_t *);
61 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry);
62 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry);
63 static int _VVGC_hash_entry_add(struct DiskPartition64 *,
66 VVGCache_hash_entry_t **);
67 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t * entry);
68 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * entry);
70 VVGCache_hash_table_t VVGCache_hash_table;
74 * initialize volume group cache subsystem.
76 * @return operation status
80 VVGCache_PkgInit(void)
85 /* allocate hash table */
86 VVGCache_hash_table.hash_buckets =
87 malloc(VolumeHashTable.Size * sizeof(struct rx_queue));
88 if (VVGCache_hash_table.hash_buckets == NULL) {
93 /* setup hash chain heads */
94 for (i = 0; i < VolumeHashTable.Size; i++) {
95 queue_Init(&VVGCache_hash_table.hash_buckets[i]);
98 /* initialize per-partition VVGC state */
99 for (i = 0; i <= VOLMAXPARTS; i++) {
100 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
101 VVGCache.part[i].dlist_hash_buckets = NULL;
102 CV_INIT(&VVGCache.part[i].cv, "cache part", CV_DEFAULT, 0);
113 * shut down volume group cache subsystem.
115 * @return operation status
121 VVGCache_PkgShutdown(void)
127 /* free hash table */
128 free(VVGCache_hash_table.hash_buckets);
129 VVGCache_hash_table.hash_buckets = NULL;
131 /* destroy per-partition VVGC state */
132 for (i = 0; i <= VOLMAXPARTS; i++) {
133 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
134 CV_DESTROY(&VVGCache.part[i].cv);
141 * allocate a cache entry.
143 * @param[out] entry_out pointer to newly allocated entry
145 * @return operation status
151 _VVGC_entry_alloc(VVGCache_entry_t ** entry_out)
153 *entry_out = calloc(1, sizeof(VVGCache_entry_t));
155 if (*entry_out == NULL)
162 * free a cache entry.
164 * @param[in] entry cache entry
166 * @return operation status
172 _VVGC_entry_free(VVGCache_entry_t * entry)
176 osi_Assert(entry->refcnt == 0);
183 * allocate and register an entry for a volume group.
185 * @param[in] dp disk partition object
186 * @param[in] volid volume id
187 * @param[out] entry_out vg cache object pointer
188 * @param[out] hash_out vg cache hash entry object pointer
190 * @pre - VOL_LOCK held
191 * - no such entry exists in hash table
193 * @return operation status
199 _VVGC_entry_add(struct DiskPartition64 * dp,
201 VVGCache_entry_t ** entry_out,
202 VVGCache_hash_entry_t ** hash_out)
205 VVGCache_entry_t * ent;
207 code = _VVGC_entry_alloc(&ent);
213 /* refcnt will be inc'd when a child is added */
216 code = _VVGC_hash_entry_add(dp, volid, ent, hash_out);
228 _VVGC_entry_free(ent);
235 * add a volid to the entry's child list.
237 * @param[in] ent volume group object
238 * @param[in] volid volume id
240 * @return operation status
242 * @retval -1 child table is full
247 _VVGC_entry_cl_add(VVGCache_entry_t * ent,
253 /* search table to avoid duplicates */
254 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
255 if (ent->children[i] == volid) {
256 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
258 afs_printable_uint32_lu(volid),
259 afs_printable_uint32_lu(ent->rw)));
262 if (empty_idx == -1 && !ent->children[i]) {
264 /* don't break; make sure we go through all children so we don't
265 * add a duplicate entry */
269 /* verify table isn't full */
270 if (empty_idx == -1) {
272 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
273 "is full\n", afs_printable_uint32_lu(volid),
274 afs_printable_uint32_lu(ent->rw)));
279 ent->children[empty_idx] = volid;
282 code = _VVGC_entry_get(ent);
289 * delete a volid from the entry's child list.
291 * @param[in] dp disk partition object
292 * @param[in] ent volume group object
293 * @param[in] volid volume id
295 * @return operation status
297 * @retval -1 no such entry found
302 _VVGC_entry_cl_del(struct DiskPartition64 *dp,
303 VVGCache_entry_t * ent,
308 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
309 if (ent->children[i] == volid) {
310 ent->children[i] = 0;
318 code = _VVGC_entry_put(dp, ent);
325 * add a refcount to an entry.
327 * @param[in] entry cache entry
331 * @return operation status
336 static int _VVGC_entry_get(VVGCache_entry_t * entry)
343 * put back a reference to an entry.
345 * @param[in] dp disk partition object
346 * @param[in] entry cache entry
350 * @warning do not attempt to deref pointer after calling this interface
352 * @return operation status
355 * @note dp is needed to look up the RW hash entry to unlink, if we are
356 * putting back the final reference and freeing the entry
361 _VVGC_entry_put(struct DiskPartition64 * dp, VVGCache_entry_t * entry)
365 osi_Assert(entry->refcnt > 0);
367 if (--entry->refcnt == 0) {
368 VVGCache_entry_t *nentry;
369 VVGCache_hash_entry_t *hentry;
371 /* first, try to delete the RW id hash entry pointing to this
373 code = _VVGC_lookup(dp, entry->rw, &nentry, &hentry);
375 if (nentry != entry) {
376 /* looking up the rw of this entry points to a different
377 * entry; should not happen */
378 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
379 "found different entry than was passed",
380 afs_printable_uint32_lu(entry->rw)));
383 code = _VVGC_hash_entry_unlink(hentry);
386 } else if (code == ENOENT) {
387 /* ignore ENOENT; this shouldn't happen, since the RW hash
388 * entry should always exist if the entry does... but we
389 * were going to delete it anyway, so try to continue */
390 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
391 "vol %lu, but RW hash entry doesn't exist; continuing "
392 "anyway...\n", afs_printable_uint32_lu(entry->rw)));
397 /* now, just free the entry itself */
399 code = _VVGC_entry_free(entry);
407 * export a volume group entry in the external object format.
409 * @param[in] ent internal-format volume group object
410 * @param[out] qry external-format volume group object
414 * @return operation status
420 _VVGC_entry_export(VVGCache_entry_t * ent, VVGCache_query_t * qry)
425 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
426 qry->children[i] = ent->children[i];
433 * allocate a hash table entry structure.
435 * @param[out] entry_out address in which to store newly allocated hash entry struct
437 * @return operation status
443 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry_out)
446 VVGCache_hash_entry_t * ent;
448 *entry_out = ent = malloc(sizeof(VVGCache_hash_entry_t));
457 * free a hash table entry structure.
459 * @param[in] entry hash table entry structure to be freed
461 * @return operation status
467 _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry)
477 * add an entry to the hash table.
479 * @param[in] dp disk partition object
480 * @param[in] volid volume id
481 * @param[in] ent volume group object
482 * @param[out] hash_out address in which to store pointer to hash entry
486 * @return operation status
488 * @retval EEXIST hash entry for volid already exists, and it points to
489 * a different VG entry
494 _VVGC_hash_entry_add(struct DiskPartition64 * dp,
496 VVGCache_entry_t * ent,
497 VVGCache_hash_entry_t ** hash_out)
500 VVGCache_hash_entry_t * hent;
501 int hash = VVGC_HASH(volid);
502 VVGCache_entry_t *nent;
504 code = _VVGC_lookup(dp, volid, &nent, hash_out);
507 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
508 " nonmatching entry for vol %lu: original "
509 "(%"AFS_PTR_FMT",%lu) new (%"AFS_PTR_FMT",%lu)\n",
510 afs_printable_uint32_lu(volid),
511 nent, afs_printable_uint32_lu(nent->rw),
512 ent, afs_printable_uint32_lu(ent->rw)));
515 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
516 "hash entry for vol %lu, VG %lu",
517 afs_printable_uint32_lu(volid),
518 afs_printable_uint32_lu(ent->rw)));
519 /* accept attempts to add matching duplicate entries; just
520 * pretend we added it */
524 code = _VVGC_hash_entry_alloc(&hent);
532 queue_Append(&VVGCache_hash_table.hash_buckets[hash],
543 * remove an entry from the hash table.
545 * @param[in] hent hash table entry
549 * @return operation status
555 _VVGC_hash_entry_del(VVGCache_hash_entry_t * hent)
560 if (hent->entry->rw == hent->volid) {
564 code = _VVGC_entry_cl_del(hent->dp, hent->entry, hent->volid);
565 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
566 * if hent->entry->rw == hent->volid, it is possible for hent to
570 /* If we are the RW id, don't unlink, since we still need the
571 * hash entry to exist, so when we lookup children, they can
572 * look up the RW id hash chain, and they will all go to the
575 * If we are the last entry and the entry should be deleted,
576 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
578 res = _VVGC_hash_entry_unlink(hent);
588 * low-level interface to remove an entry from the hash table.
590 * Does not alter the refcount or worry about the children lists or
591 * anything like that; just removes the hash table entry, frees it, and
592 * that's all. You probably want @see _VVGC_hash_entry_del instead.
594 * @param[in] hent hash table entry
598 * @return operation status
604 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * hent)
611 code = _VVGC_hash_entry_free(hent);
617 * lookup a vg cache entry given any member volume id.
619 * @param[in] dp disk partition object
620 * @param[in] volid vg member volume id
621 * @param[out] entry_out address in which to store volume group entry structure pointer
622 * @param[out] hash_out address in which to store hash entry pointer
626 * @warning - it is up to the caller to get a ref to entry_out, if needed
627 * - hash_out must not be referenced after dropping VOL_LOCK
629 * @return operation status
631 * @retval ENOENT volume id not found
632 * @retval EINVAL partition's VGC is invalid
637 _VVGC_lookup(struct DiskPartition64 * dp,
639 VVGCache_entry_t ** entry_out,
640 VVGCache_hash_entry_t ** hash_out)
643 int bucket = VVGC_HASH(volid);
644 struct VVGCache_hash_entry * ent, * nent;
646 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
652 for (queue_Scan(&VVGCache_hash_table.hash_buckets[bucket],
655 VVGCache_hash_entry)) {
656 if (ent->volid == volid && ent->dp == dp) {
658 *entry_out = ent->entry;
670 * add an entry to the volume group cache.
672 * @param[in] dp disk partition object
673 * @param[in] parent parent volume id
674 * @param[in] child child volume id
675 * @param[out] newvg if non-NULL, *newvg is 1 if this call created a
676 * new VG, 0 if we added to an existing VG
680 * @return operation status
682 * @retval -1 parent and child are already registered in
686 VVGCache_entry_add_r(struct DiskPartition64 * dp,
692 VVGCache_entry_t * child_ent, * parent_ent;
698 /* check for existing entries */
699 res = _VVGC_lookup(dp, child, &child_ent, NULL);
700 if (res && res != ENOENT) {
705 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
706 if (res && res != ENOENT) {
712 * branch based upon existence of parent and child nodes
714 if (parent_ent && child_ent) {
715 /* both exist. we're done.
716 * if they point to different places, then report the error. */
717 if (child_ent != parent_ent) {
720 if (parent == child) {
721 /* if we're adding the RW entry as a child, the RW id may
722 * not be in the child array yet, so make sure not to skip
727 } else if (!parent_ent && child_ent) {
729 * update vg root volid, and add hash entry. */
730 parent_ent = child_ent;
731 parent_ent->rw = parent;
733 code = _VVGC_hash_entry_add(dp,
738 } else if (!child_ent && !parent_ent) {
739 code = _VVGC_entry_add(dp,
749 if (child == parent) {
750 /* if we're the RW, skip over adding the child hash entry;
751 * we already added the hash entry when creating the entry */
752 child_ent = parent_ent;
757 osi_Assert(!child_ent);
758 child_ent = parent_ent;
759 code = _VVGC_hash_entry_add(dp,
768 code = _VVGC_entry_cl_add(child_ent, child);
771 if (code && code != EINVAL) {
772 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
773 " %lu on partition %s", code, afs_printable_uint32_lu(child),
774 afs_printable_uint32_lu(parent), VPartitionPath(dp)));
777 if (code == 0 && VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
778 /* we successfully added the entry; make sure it's not on the
779 * to-delete list, so it doesn't get deleted later */
780 code = _VVGC_dlist_del_r(dp, parent, child);
781 if (code && code != ENOENT) {
782 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
783 "%lu (parent %lu) from the to-delete list for part "
784 "%s.\n", code, afs_printable_uint32_lu(child),
785 afs_printable_uint32_lu(parent),
786 VPartitionPath(dp)));
796 * add an entry to the volume group cache.
798 * @param[in] dp disk partition object
799 * @param[in] parent parent volume id
800 * @param[in] child child volume id
801 * @param[out] newvg if non-NULL, *newvg is 1 if this call created a
802 * new VG, 0 if we added to an existing VG
804 * @return operation status
808 VVGCache_entry_add(struct DiskPartition64 * dp,
816 VVGCache_entry_add_r(dp, parent, child, newvg);
823 * delete an entry from the volume group cache.
825 * If the partition is being scanned, the entry is instead put on a list
826 * of entries to delete once the scan is done.
828 * @param[in] dp disk partition object
829 * @param[in] parent parent volume id
830 * @param[in] child child volume id
834 * @return operation status
838 VVGCache_entry_del_r(struct DiskPartition64 * dp,
839 VolumeId parent, VolumeId child)
841 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
843 code = _VVGC_dlist_add_r(dp, parent, child);
848 return _VVGC_entry_purge_r(dp, parent, child);
852 * delete an entry from the volume group cache.
854 * @param[in] dp disk partition object
855 * @param[in] parent parent volume id
856 * @param[in] child child volume id
862 * @return operation status
866 _VVGC_entry_purge_r(struct DiskPartition64 * dp,
867 VolumeId parent, VolumeId child)
870 VVGCache_entry_t * parent_ent, * child_ent;
871 VVGCache_hash_entry_t * child_hent;
873 /* check mappings for each volid */
874 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
879 res = _VVGC_lookup(dp, child, &child_ent, &child_hent);
885 /* if the mappings don't match, we have a serious error */
886 if (parent_ent != child_ent) {
887 ViceLog(0, ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
888 "but vol %lu points to VGC entry %"AFS_PTR_FMT" and VG %lu "
889 "points to VGC entry %"AFS_PTR_FMT"\n",
890 afs_printable_uint32_lu(child),
891 afs_printable_uint32_lu(parent),
892 afs_printable_uint32_lu(child),
893 child_ent, afs_printable_uint32_lu(parent), parent_ent));
898 code = _VVGC_hash_entry_del(child_hent);
905 * delete an entry from the volume group cache.
907 * @param[in] dp disk partition object
908 * @param[in] parent parent volume id
909 * @param[in] child child volume id
911 * @return operation status
915 VVGCache_entry_del(struct DiskPartition64 * dp,
916 VolumeId parent, VolumeId child)
921 code = VVGCache_entry_del_r(dp, parent, child);
928 * query a volume group by any member volume id.
930 * @param[in] dp disk partition object
931 * @param[in] volume volume id of a member of VG
932 * @param[out] res vg membership data
936 * @return operation status
938 * @retval EAGAIN partition needs to finish scanning
941 VVGCache_query_r(struct DiskPartition64 * dp,
943 VVGCache_query_t * res)
946 VVGCache_entry_t * ent;
948 /* If the cache for this partition doesn't exist, start a scan */
949 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
950 code = VVGCache_scanStart_r(dp);
951 if (code == 0 || code == -3) {
952 /* -3 means another thread already started scanning */
957 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
961 code = _VVGC_lookup(dp, volume, &ent, NULL);
963 code = _VVGC_entry_export(ent, res);
970 * query a volume group by any member volume id.
972 * @param[in] dp disk partition object
973 * @param[in] volume volume id of a member of VG
974 * @param[out] res vg membership data
976 * @return operation status
980 VVGCache_query(struct DiskPartition64 * dp,
981 VolumeId volume, VVGCache_query_t * res)
986 code = VVGCache_query_r(dp, volume, res);
993 * begin asynchronous scan of on-disk volume group metadata.
995 * @param[in] dp disk partition object
999 * @return operation status
1003 VVGCache_scanStart_r(struct DiskPartition64 * dp)
1008 code = _VVGC_scan_start(dp);
1010 /* start a scanner thread on each partition */
1011 for (dp = DiskPartitionList; dp; dp = dp->next) {
1012 res = _VVGC_scan_start(dp);
1023 * begin asynchronous scan of on-disk volume group metadata.
1025 * @param[in] dp disk partition object
1027 * @return operation status
1031 VVGCache_scanStart(struct DiskPartition64 * dp)
1036 code = VVGCache_scanStart_r(dp);
1043 * wait for async on-disk VG metadata scan to complete.
1045 * @param[in] dp disk partition object
1047 * @pre VOL_LOCK held
1049 * @warning this routine must drop VOL_LOCK internally
1051 * @return operation status
1055 VVGCache_scanWait_r(struct DiskPartition64 * dp)
1059 while (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
1060 VOL_CV_WAIT(&VVGCache.part[dp->index].cv);
1067 * wait for async on-disk VG metadata scan to complete.
1069 * @param[in] dp disk partition object
1071 * @return operation status
1075 VVGCache_scanWait(struct DiskPartition64 * dp)
1080 code = VVGCache_scanWait_r(dp);
1087 * flush all cache entries for a given disk partition.
1089 * @param[in] part disk partition object
1091 * @pre VOL_LOCK held
1093 * @return operation status
1099 _VVGC_flush_part_r(struct DiskPartition64 * part)
1103 VVGCache_hash_entry_t * ent, * nent;
1105 for (i = 0; i < VolumeHashTable.Size; i++) {
1106 for (queue_Scan(&VVGCache_hash_table.hash_buckets[i],
1109 VVGCache_hash_entry)) {
1110 if (ent->dp == part) {
1111 VolumeId volid = ent->volid;
1112 res = _VVGC_hash_entry_del(ent);
1114 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1115 res, afs_printable_uint32_lu(volid)));
1126 * flush all cache entries for a given disk partition.
1128 * @param[in] part disk partition object
1130 * @return operation status
1136 _VVGC_flush_part(struct DiskPartition64 * part)
1141 code = _VVGC_flush_part_r(part);
1149 * change VVGC partition state.
1151 * @param[in] part disk partition object
1152 * @param[in] state new state
1154 * @pre VOL_LOCK is held
1161 _VVGC_state_change(struct DiskPartition64 * part,
1162 VVGCache_part_state_t state)
1164 VVGCache_part_state_t old_state;
1166 old_state = VVGCache.part[part->index].state;
1167 VVGCache.part[part->index].state = state;
1169 if (old_state != state) {
1170 CV_BROADCAST(&VVGCache.part[part->index].cv);
1176 #endif /* AFS_DEMAND_ATTACH_FS */