2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
18 #ifdef AFS_DEMAND_ATTACH_FS
26 #include <afs/assert.h>
29 #include <sys/param.h>
31 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX_ENV)
34 #include <afs/afsutil.h>
37 #include <afs/afsint.h>
41 #include "viceinode.h"
43 #include "partition.h"
44 #include <afs/errors.h>
46 #define __VOL_VG_CACHE_IMPL 1
49 #include "vg_cache_impl.h"
/* Forward declarations of the file-local (static) helpers that are defined
 * further down in this file. */
51 static int _VVGC_lookup(struct DiskPartition64 *,
53 VVGCache_entry_t ** entry,
54 VVGCache_hash_entry_t ** hentry);
/* volume group entry allocation, refcounting and child-list helpers */
55 static int _VVGC_entry_alloc(VVGCache_entry_t ** entry);
56 static int _VVGC_entry_free(VVGCache_entry_t * entry);
57 static int _VVGC_entry_get(VVGCache_entry_t * entry);
58 static int _VVGC_entry_put(struct DiskPartition64 *,
59 VVGCache_entry_t * entry);
60 static int _VVGC_entry_add(struct DiskPartition64 *,
63 VVGCache_hash_entry_t **);
64 static int _VVGC_entry_cl_add(VVGCache_entry_t *, VolumeId);
65 static int _VVGC_entry_cl_del(struct DiskPartition64 *, VVGCache_entry_t *,
67 static int _VVGC_entry_export(VVGCache_entry_t *, VVGCache_query_t *);
/* hash table entry helpers */
68 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry);
69 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry);
70 static int _VVGC_hash_entry_add(struct DiskPartition64 *,
73 VVGCache_hash_entry_t **);
74 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t * entry);
75 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * entry);
/* The cache's hash table object. Its hash_buckets array is allocated in
 * VVGCache_PkgInit and released in VVGCache_PkgShutdown. */
77 VVGCache_hash_table_t VVGCache_hash_table;
81 * initialize volume group cache subsystem.
83 * @return operation status
/* Package initialization: allocates VolumeHashTable.Size hash buckets,
 * passes each bucket to queue_Init, then for every per-partition slot sets
 * the state to VVGC_PART_STATE_INVALID, clears dlist_hash_buckets and
 * initializes the slot's condition variable. */
87 VVGCache_PkgInit(void)
92 /* allocate hash table */
/* the bucket count is taken from VolumeHashTable.Size, not from a
 * cache-specific constant */
93 VVGCache_hash_table.hash_buckets =
94 malloc(VolumeHashTable.Size * sizeof(struct rx_queue));
/* allocation failure is detected here; the handling itself is not visible
 * in this excerpt */
95 if (VVGCache_hash_table.hash_buckets == NULL) {
100 /* setup hash chain heads */
101 for (i = 0; i < VolumeHashTable.Size; i++) {
102 queue_Init(&VVGCache_hash_table.hash_buckets[i]);
105 /* initialize per-partition VVGC state */
/* note the inclusive upper bound: slots 0..VOLMAXPARTS are all touched */
106 for (i = 0; i <= VOLMAXPARTS; i++) {
107 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
108 VVGCache.part[i].dlist_hash_buckets = NULL;
109 code = pthread_cond_init(&VVGCache.part[i].cv, NULL);
120 * shut down volume group cache subsystem.
122 * @return operation status
/* Package shutdown: frees the hash bucket array and clears the pointer,
 * then marks every per-partition slot invalid and destroys its condition
 * variable. */
128 VVGCache_PkgShutdown(void)
134 /* free hash table */
135 free(VVGCache_hash_table.hash_buckets);
136 VVGCache_hash_table.hash_buckets = NULL;
138 /* destroy per-partition VVGC state */
/* same inclusive range (0..VOLMAXPARTS) that VVGCache_PkgInit set up */
139 for (i = 0; i <= VOLMAXPARTS; i++) {
140 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
/* the result of pthread_cond_destroy is not checked */
141 pthread_cond_destroy(&VVGCache.part[i].cv);
148 * allocate a cache entry.
150 * @param[out] entry_out pointer to newly allocated entry
152 * @return operation status
/* Allocates one VVGCache_entry_t with malloc, publishes the pointer through
 * *entry_out, and zero-fills the new object.
 * NOTE(review): the allocation-failure check is not visible in this excerpt. */
158 _VVGC_entry_alloc(VVGCache_entry_t ** entry_out)
161 VVGCache_entry_t * ent;
/* *entry_out is written before any failure check could run, so on failure
 * the caller's pointer holds malloc's NULL result */
163 *entry_out = ent = malloc(sizeof(VVGCache_entry_t));
/* zero every field, so refcnt and the children slots start at 0 */
169 memset(ent, 0, sizeof(*ent));
176 * free a cache entry.
178 * @param[in] entry cache entry
180 * @return operation status
/* Releases a cache entry. Asserts that no references remain; the
 * deallocation itself is not visible in this excerpt. */
186 _VVGC_entry_free(VVGCache_entry_t * entry)
/* freeing an entry that is still referenced is treated as a programming
 * error */
190 assert(entry->refcnt == 0);
197 * allocate and register an entry for a volume group.
199 * @param[in] dp disk partition object
200 * @param[in] volid volume id
201 * @param[out] entry_out vg cache object pointer
202 * @param[out] hash_out vg cache hash entry object pointer
204 * @pre - VOL_LOCK held
205 * - no such entry exists in hash table
207 * @return operation status
/* Creates a new volume group entry with _VVGC_entry_alloc and registers it
 * in the hash table under volid with _VVGC_hash_entry_add.
 * NOTE(review): the conditions between the visible calls are missing from
 * this excerpt; the _VVGC_entry_free call is presumably the error path. */
213 _VVGC_entry_add(struct DiskPartition64 * dp,
215 VVGCache_entry_t ** entry_out,
216 VVGCache_hash_entry_t ** hash_out)
219 VVGCache_entry_t * ent;
221 code = _VVGC_entry_alloc(&ent);
227 /* refcnt will be inc'd when a child is added */
/* the hash entry pointer is handed back to the caller through hash_out */
230 code = _VVGC_hash_entry_add(dp, volid, ent, hash_out);
/* the return value of _VVGC_entry_free is ignored here */
242 _VVGC_entry_free(ent);
249 * add a volid to the entry's child list.
251 * @param[in] ent volume group object
252 * @param[in] volid volume id
254 * @return operation status
256 * @retval -1 child table is full
/* Adds volid to ent->children. Scans all VOL_VG_MAX_VOLS slots, logging at
 * level 1 if volid is already present and tracking an empty slot in
 * empty_idx. If no empty slot was found the "VG is full" message is logged
 * at level 0; the visible success path stores volid at empty_idx and takes
 * a reference on ent with _VVGC_entry_get. */
261 _VVGC_entry_cl_add(VVGCache_entry_t * ent,
267 /* search table to avoid duplicates */
268 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
269 if (ent->children[i] == volid) {
270 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
272 afs_printable_uint32_lu(volid),
273 afs_printable_uint32_lu(ent->rw)));
/* a zero slot counts as empty; the empty_idx == -1 guard means only the
 * first empty slot seen is considered */
276 if (empty_idx == -1 && !ent->children[i]) {
278 /* don't break; make sure we go through all children so we don't
279 * add a duplicate entry */
283 /* verify table isn't full */
284 if (empty_idx == -1) {
286 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
287 "is full\n", afs_printable_uint32_lu(volid),
288 afs_printable_uint32_lu(ent->rw)));
293 ent->children[empty_idx] = volid;
/* each stored child holds one reference on the entry */
296 code = _VVGC_entry_get(ent);
303 * delete a volid from the entry's child list.
305 * @param[in] dp disk partition object
306 * @param[in] ent volume group object
307 * @param[in] volid volume id
309 * @return operation status
311 * @retval -1 no such entry found
/* Removes volid from ent->children by zeroing the matching slot, then drops
 * a reference on ent with _VVGC_entry_put. _VVGC_entry_put frees the entry
 * when its count reaches zero, so ent must not be used afterwards.
 * NOTE(review): how the "not found" case is detected is not visible here. */
316 _VVGC_entry_cl_del(struct DiskPartition64 *dp,
317 VVGCache_entry_t * ent,
322 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
323 if (ent->children[i] == volid) {
/* zero marks the slot as empty again (see _VVGC_entry_cl_add) */
324 ent->children[i] = 0;
332 code = _VVGC_entry_put(dp, ent);
339 * add a refcount to an entry.
341 * @param[in] entry cache entry
345 * @return operation status
/* Takes a reference on entry (counterpart of _VVGC_entry_put).
 * NOTE(review): the function body is not visible in this excerpt. */
350 static int _VVGC_entry_get(VVGCache_entry_t * entry)
357 * put back a reference to an entry.
359 * @param[in] dp disk partition object
360 * @param[in] entry cache entry
364 * @warning do not attempt to deref pointer after calling this interface
366 * @return operation status
369 * @note dp is needed to lookup the RW hash entry to unlink, if we are
370 * putting back the final reference and freeing
/* Drops one reference on entry. When the count reaches zero it looks up the
 * hash entry for entry->rw on dp, unlinks it with _VVGC_hash_entry_unlink
 * (an ENOENT from the lookup is logged and tolerated), and then frees the
 * entry with _VVGC_entry_free. */
375 _VVGC_entry_put(struct DiskPartition64 * dp, VVGCache_entry_t * entry)
/* putting back a reference that was never taken is a programming error */
379 assert(entry->refcnt > 0);
/* the block below runs only when the last reference is being dropped */
381 if (--entry->refcnt == 0) {
382 VVGCache_entry_t *nentry;
383 VVGCache_hash_entry_t *hentry;
385 /* first, try to delete the RW id hash entry pointing to this
387 code = _VVGC_lookup(dp, entry->rw, &nentry, &hentry);
389 if (nentry != entry) {
390 /* looking up the rw of this entry points to a different
391 * entry; should not happen */
/* NOTE(review): this message has no trailing "\n", unlike the warning
 * logged further down in this function */
392 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
393 "found different entry than was passed",
394 afs_printable_uint32_lu(entry->rw)));
397 code = _VVGC_hash_entry_unlink(hentry);
400 } else if (code == ENOENT) {
401 /* ignore ENOENT; this shouldn't happen, since the RW hash
402 * entry should always exist if the entry does... but we
403 * were going to delete it anyway, so try to continue */
404 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
405 "vol %lu, but RW hash entry doesn't exist; continuing "
406 "anyway...\n", afs_printable_uint32_lu(entry->rw)));
411 /* now, just free the entry itself */
413 code = _VVGC_entry_free(entry);
421 * export a volume group entry in the external object format.
423 * @param[in] ent internal-format volume group object
424 * @param[out] qry external-format volume group object
428 * @return operation status
/* Copies a volume group entry into the external query structure. The
 * visible loop copies all VOL_VG_MAX_VOLS child slots, empty (zero) ones
 * included.
 * NOTE(review): any other fields that are copied are not visible here. */
434 _VVGC_entry_export(VVGCache_entry_t * ent, VVGCache_query_t * qry)
439 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
440 qry->children[i] = ent->children[i];
447 * allocate a hash table entry structure.
449 * @param[out] entry_out address in which to store newly allocated hash entry struct
451 * @return operation status
/* Allocates one VVGCache_hash_entry_t with malloc and publishes the pointer
 * through *entry_out.
 * NOTE(review): the failure check is not visible in this excerpt, and no
 * zero-fill of the new object is visible either. */
457 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry_out)
460 VVGCache_hash_entry_t * ent;
462 *entry_out = ent = malloc(sizeof(VVGCache_hash_entry_t));
471 * free a hash table entry structure.
473 * @param[in] entry hash table entry structure to be freed
475 * @return operation status
/* Frees a hash table entry structure (counterpart of
 * _VVGC_hash_entry_alloc).
 * NOTE(review): the function body is not visible in this excerpt. */
481 _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry)
491 * add an entry to the hash table.
493 * @param[in] dp disk partition object
494 * @param[in] volid volume id
495 * @param[in] ent volume group object
496 * @param[out] hash_out address in which to store pointer to hash entry
500 * @return operation status
502 * @retval EEXIST hash entry for volid already exists, and it points to
503 * a different VG entry
/* Registers volid in the hash table as a member of the volume group ent.
 * It first looks up (dp, volid). Two duplicate cases are logged: a
 * "nonmatching" duplicate at level 0, and a matching duplicate at level 1,
 * which the inline comment says is accepted as if it had been added. The
 * visible remainder allocates a new hash entry and appends it to the chain
 * for volid's bucket.
 * NOTE(review): the conditions selecting between these paths are missing
 * from this excerpt. */
508 _VVGC_hash_entry_add(struct DiskPartition64 * dp,
510 VVGCache_entry_t * ent,
511 VVGCache_hash_entry_t ** hash_out)
514 VVGCache_hash_entry_t * hent;
/* bucket index for this volume id */
515 int hash = VVGC_HASH(volid);
516 VVGCache_entry_t *nent;
/* an existing hash entry, if any, is returned directly through hash_out */
518 code = _VVGC_lookup(dp, volid, &nent, hash_out);
521 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
522 " nonmatching entry for vol %lu: original "
523 "(%"AFS_PTR_FMT",%lu) new (%"AFS_PTR_FMT",%lu)\n",
524 afs_printable_uint32_lu(volid),
525 nent, afs_printable_uint32_lu(nent->rw),
526 ent, afs_printable_uint32_lu(ent->rw)));
/* NOTE(review): this message has no trailing "\n", unlike the one above */
529 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
530 "hash entry for vol %lu, VG %lu",
531 afs_printable_uint32_lu(volid),
532 afs_printable_uint32_lu(ent->rw)));
533 /* accept attempts to add matching duplicate entries; just
534 * pretend we added it */
/* allocate a new hash entry and append it to the chain for bucket `hash` */
538 code = _VVGC_hash_entry_alloc(&hent);
546 queue_Append(&VVGCache_hash_table.hash_buckets[hash],
557 * remove an entry from the hash table.
559 * @param[in] hent hash table entry
563 * @return operation status
/* Removes a hash table entry. It deletes hent->volid from the owning volume
 * group's child list with _VVGC_entry_cl_del, and calls
 * _VVGC_hash_entry_unlink(hent). Per the comments in the body, the unlink is
 * skipped when hent is the hash entry for the group's RW id, because
 * children are resolved through it; that entry is unlinked later, when the
 * last reference to the group is dropped. */
569 _VVGC_hash_entry_del(VVGCache_hash_entry_t * hent)
/* is this the hash entry for the volume group's own RW id? */
574 if (hent->entry->rw == hent->volid) {
/* this drops a reference on hent->entry and may free it */
578 code = _VVGC_entry_cl_del(hent->dp, hent->entry, hent->volid);
579 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
580 * if hent->entry->rw == hent->volid, it is possible for hent to
584 /* If we are the RW id, don't unlink, since we still need the
585 * hash entry to exist, so when we lookup children, they can
586 * look up the RW id hash chain, and they will all go to the
589 * If we are the last entry and the entry should be deleted,
590 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
592 res = _VVGC_hash_entry_unlink(hent);
602 * low-level interface to remove an entry from the hash table.
604 * Does not alter the refcount or worry about the children lists or
605 * anything like that; just removes the hash table entry, frees it, and
606 * that's all. You probably want @see _VVGC_hash_entry_del instead.
608 * @param[in] hent hash table entry
612 * @return operation status
/* Low-level removal of a hash table entry: frees hent with
 * _VVGC_hash_entry_free, so the caller must not touch hent afterwards.
 * NOTE(review): the step that takes hent off its hash chain is not visible
 * in this excerpt. */
618 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * hent)
625 code = _VVGC_hash_entry_free(hent);
631 * lookup a vg cache entry given any member volume id.
633 * @param[in] dp disk partition object
634 * @param[in] volid vg member volume id
635 * @param[out] entry_out address in which to store volume group entry structure pointer
636 * @param[out] hash_out address in which to store hash entry pointer
640 * @warning - it is up to the caller to get a ref to entry_out, if needed
641 * - hash_out must not be referenced after dropping VOL_LOCK
643 * @return operation status
645 * @retval ENOENT volume id not found
646 * @retval EINVAL partition's VGC is invalid
/* Finds the volume group entry that volid belongs to on partition dp. It
 * hashes volid to a bucket and walks that bucket's chain for a hash entry
 * whose volid and dp both match; on a match the owning entry is stored in
 * *entry_out.
 * NOTE(review): several callers in this file pass NULL for hash_out; the
 * code that handles hash_out is not visible in this excerpt. */
651 _VVGC_lookup(struct DiskPartition64 * dp,
653 VVGCache_entry_t ** entry_out,
654 VVGCache_hash_entry_t ** hash_out)
657 int bucket = VVGC_HASH(volid);
658 struct VVGCache_hash_entry * ent, * nent;
/* partitions whose cache state is INVALID are rejected up front (the header
 * comment documents EINVAL for this case) */
660 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
/* walk the chain belonging to this volume id's bucket */
666 for (queue_Scan(&VVGCache_hash_table.hash_buckets[bucket],
669 VVGCache_hash_entry)) {
/* buckets are shared across partitions, so the partition must match too */
670 if (ent->volid == volid && ent->dp == dp) {
672 *entry_out = ent->entry;
684 * add an entry to the volume group cache.
686 * @param[in] dp disk partition object
687 * @param[in] parent parent volume id
688 * @param[in] child child volume id
689 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
690 * new VG, 0 if we added to an existing VG
694 * @return operation status
696 * @retval -1 parent and child are already registered in
/* Adds the (parent, child) pair to the cache for partition dp. It looks up
 * both ids, then branches on which of them already has an entry:
 *   - both: nothing to create; a mismatch between the two is an error
 *   - child only: the child's entry becomes the group entry, its rw id is
 *     set to parent, and a hash entry is added
 *   - neither: a new entry is created with _VVGC_entry_add
 * The visible tail adds a hash entry for the child, puts the child in the
 * entry's child list, and, while the partition is UPDATING, removes the pair
 * from the to-delete list.
 * NOTE(review): many lines of this function are missing from this excerpt,
 * including the arguments of several calls and any parent-only branch. */
700 VVGCache_entry_add_r(struct DiskPartition64 * dp,
706 VVGCache_entry_t * child_ent, * parent_ent;
712 /* check for existing entries */
713 res = _VVGC_lookup(dp, child, &child_ent, NULL);
/* ENOENT just means "no entry yet"; only other non-zero results take this
 * branch (its body is not visible here) */
714 if (res && res != ENOENT) {
719 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
720 if (res && res != ENOENT) {
726 * branch based upon existence of parent and child nodes
728 if (parent_ent && child_ent) {
729 /* both exist. we're done.
730 * if they point different places, then report the error. */
731 if (child_ent != parent_ent) {
734 if (parent == child) {
735 /* if we're adding the RW entry as a child, the RW id may
736 * not be in the child array yet, so make sure not to skip
741 } else if (!parent_ent && child_ent) {
743 * update vg root volid, and add hash entry. */
744 parent_ent = child_ent;
745 parent_ent->rw = parent;
747 code = _VVGC_hash_entry_add(dp,
752 } else if (!child_ent && !parent_ent) {
753 code = _VVGC_entry_add(dp,
763 if (child == parent) {
764 /* if we're the RW, skip over adding the child hash entry;
765 * we already added the hash entry when creating the entry */
766 child_ent = parent_ent;
772 child_ent = parent_ent;
773 code = _VVGC_hash_entry_add(dp,
/* record the child in the entry's child list; this takes a reference */
782 code = _VVGC_entry_cl_add(child_ent, child);
/* EINVAL is deliberately excluded from logging.
 * NOTE(review): the message below has no trailing "\n", unlike the other
 * ViceLog call in this function */
785 if (code && code != EINVAL) {
786 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
787 " %lu on partition %s", code, afs_printable_uint32_lu(child),
788 afs_printable_uint32_lu(parent), VPartitionPath(dp)));
791 if (code == 0 && VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
792 /* we successfully added the entry; make sure it's not on the
793 * to-delete list, so it doesn't get deleted later */
794 code = _VVGC_dlist_del_r(dp, parent, child);
/* ENOENT (pair was not on the to-delete list) is not reported */
795 if (code && code != ENOENT) {
796 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
797 "%lu (parent %lu) from the to-delete list for part "
798 "%s.\n", code, afs_printable_uint32_lu(child),
799 afs_printable_uint32_lu(parent),
800 VPartitionPath(dp)));
810 * add an entry to the volume group cache.
812 * @param[in] dp disk partition object
813 * @param[in] parent parent volume id
814 * @param[in] child child volume id
815 * @param[out] newvg if non-NULL, *newvg is 1 if adding this added a
816 * new VG, 0 if we added to an existing VG
818 * @return operation status
/* Public wrapper around VVGCache_entry_add_r.
 * NOTE(review): presumably this variant takes VOL_LOCK around the call; the
 * locking lines are not visible in this excerpt - confirm. */
822 VVGCache_entry_add(struct DiskPartition64 * dp,
/* NOTE(review): the result of VVGCache_entry_add_r is discarded here. The
 * other wrappers in this file (e.g. VVGCache_entry_del, VVGCache_query)
 * assign it to `code`; if this function returns `code`, failures from the
 * _r variant are never reported to the caller. Likely should be
 * `code = VVGCache_entry_add_r(...)`. */
830 VVGCache_entry_add_r(dp, parent, child, newvg);
837 * delete an entry from the volume group cache.
839 * If partition is scanning, actually puts the entry on a list of entries
840 * to delete when the scan is done.
842 * @param[in] dp disk partition object
843 * @param[in] parent parent volume id
844 * @param[in] child child volume id
848 * @return operation status
/* Deletes the (parent, child) pair from the cache for partition dp. While
 * the partition is in the UPDATING state the deletion is queued with
 * _VVGC_dlist_add_r; the final visible statement purges the pair
 * immediately with _VVGC_entry_purge_r.
 * NOTE(review): how the UPDATING branch ends is not visible here. */
852 VVGCache_entry_del_r(struct DiskPartition64 * dp,
853 VolumeId parent, VolumeId child)
855 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
/* defer: put the pair on the partition's to-delete list */
857 code = _VVGC_dlist_add_r(dp, parent, child);
862 return _VVGC_entry_purge_r(dp, parent, child);
866 * delete an entry from the volume group cache.
868 * @param[in] dp disk partition object
869 * @param[in] parent parent volume id
870 * @param[in] child child volume id
876 * @return operation status
/* Removes child from parent's volume group right away. It looks up both
 * ids, logs at level 0 when they resolve to different cache entries, and
 * deletes the child's hash entry with _VVGC_hash_entry_del.
 * NOTE(review): the checks on the two lookup results are not visible in
 * this excerpt. */
880 _VVGC_entry_purge_r(struct DiskPartition64 * dp,
881 VolumeId parent, VolumeId child)
884 VVGCache_entry_t * parent_ent, * child_ent;
885 VVGCache_hash_entry_t * child_hent;
887 /* check mappings for each volid */
888 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
/* only the child lookup requests the hash entry, since that is what is
 * deleted below */
893 res = _VVGC_lookup(dp, child, &child_ent, &child_hent);
899 /* if the mappings don't match, we have a serious error */
900 if (parent_ent != child_ent) {
901 ViceLog(0, ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
902 "but vol %lu points to VGC entry %"AFS_PTR_FMT" and VG %lu "
903 "points to VGC entry %"AFS_PTR_FMT"\n",
904 afs_printable_uint32_lu(child),
905 afs_printable_uint32_lu(parent),
906 afs_printable_uint32_lu(child),
907 child_ent, afs_printable_uint32_lu(parent), parent_ent));
/* child_hent must not be used after this call: _VVGC_hash_entry_del can
 * unlink and free it */
912 code = _VVGC_hash_entry_del(child_hent);
919 * delete an entry from the volume group cache.
921 * @param[in] dp disk partition object
922 * @param[in] parent parent volume id
923 * @param[in] child child volume id
925 * @return operation status
/* Public wrapper around VVGCache_entry_del_r; the _r result is captured in
 * `code`.
 * NOTE(review): presumably this variant takes VOL_LOCK around the call; the
 * locking lines are not visible in this excerpt - confirm. */
929 VVGCache_entry_del(struct DiskPartition64 * dp,
930 VolumeId parent, VolumeId child)
935 code = VVGCache_entry_del_r(dp, parent, child);
942 * query a volume group by any member volume id.
944 * @param[in] dp disk partition object
945 * @param[in] volume volume id of a member of VG
946 * @param[out] res vg membership data
950 * @return operation status
952 * @retval EAGAIN partition needs to finish scanning
/* Looks up the volume group containing `volume` on partition dp and exports
 * it into res. If the partition's cache is INVALID a scan is started first;
 * the UPDATING state is checked before the lookup (the header comment
 * documents EAGAIN while a scan is still running). */
955 VVGCache_query_r(struct DiskPartition64 * dp,
957 VVGCache_query_t * res)
960 VVGCache_entry_t * ent;
962 /* If the cache for this partition doesn't exist, start a scan */
963 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
964 code = VVGCache_scanStart_r(dp);
/* both "scan started" and "scan already running" are handled by the same
 * branch */
965 if (code == 0 || code == -3) {
966 /* -3 means another thread already started scanning */
971 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
975 code = _VVGC_lookup(dp, volume, &ent, NULL);
/* convert the internal entry into the caller-visible format */
977 code = _VVGC_entry_export(ent, res);
984 * query a volume group by any member volume id.
986 * @param[in] dp disk partition object
987 * @param[in] volume volume id of a member of VG
988 * @param[out] res vg membership data
990 * @return operation status
/* Public wrapper around VVGCache_query_r; the _r result is captured in
 * `code`.
 * NOTE(review): presumably this variant takes VOL_LOCK around the call; the
 * locking lines are not visible in this excerpt - confirm. */
994 VVGCache_query(struct DiskPartition64 * dp,
995 VolumeId volume, VVGCache_query_t * res)
1000 code = VVGCache_query_r(dp, volume, res);
1007 * begin asynchronous scan of on-disk volume group metadata.
1009 * @param[in] dp disk partition object
1011 * @pre VOL_LOCK held
1013 * @return operation status
/* Starts a scan of on-disk volume group metadata with _VVGC_scan_start. Two
 * paths are visible: a single call for the given dp, and a loop over every
 * partition on DiskPartitionList.
 * NOTE(review): the condition choosing between the two paths (presumably
 * whether dp is NULL) is not visible in this excerpt. */
1017 VVGCache_scanStart_r(struct DiskPartition64 * dp)
1022 code = _VVGC_scan_start(dp);
1024 /* start a scanner thread on each partition */
/* the dp parameter is reused as the list iterator here */
1025 for (dp = DiskPartitionList; dp; dp = dp->next) {
1026 res = _VVGC_scan_start(dp);
1037 * begin asynchronous scan of on-disk volume group metadata.
1039 * @param[in] dp disk partition object
1041 * @return operation status
/* Public wrapper around VVGCache_scanStart_r; the _r result is captured in
 * `code`.
 * NOTE(review): presumably this variant takes VOL_LOCK around the call; the
 * locking lines are not visible in this excerpt - confirm. */
1045 VVGCache_scanStart(struct DiskPartition64 * dp)
1050 code = VVGCache_scanStart_r(dp);
1057 * wait for async on-disk VG metadata scan to complete.
1059 * @param[in] dp disk partition object
1061 * @pre VOL_LOCK held
1063 * @warning this routine must drop VOL_LOCK internally
1065 * @return operation status
/* Blocks until partition dp is no longer in the UPDATING state, waiting on
 * the partition's condition variable (the one _VVGC_state_change
 * broadcasts on). */
1069 VVGCache_scanWait_r(struct DiskPartition64 * dp)
/* the state is re-checked after every wake-up, so the loop only exits once
 * the partition has actually left UPDATING */
1073 while (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
1074 VOL_CV_WAIT(&VVGCache.part[dp->index].cv);
1081 * wait for async on-disk VG metadata scan to complete.
1083 * @param[in] dp disk partition object
1085 * @return operation status
/* Public wrapper around VVGCache_scanWait_r; the _r result is captured in
 * `code`.
 * NOTE(review): presumably this variant takes VOL_LOCK around the call; the
 * locking lines are not visible in this excerpt - confirm. */
1089 VVGCache_scanWait(struct DiskPartition64 * dp)
1094 code = VVGCache_scanWait_r(dp);
1101 * flush all cache entries for a given disk partition.
1103 * @param[in] part disk partition object
1105 * @pre VOL_LOCK held
1107 * @return operation status
/* Removes every cache entry that belongs to partition `part`. It walks all
 * VolumeHashTable.Size buckets and calls _VVGC_hash_entry_del on each hash
 * entry whose dp is `part`; a failed deletion is logged at level 0.
 * NOTE(review): the condition guarding the log call is not visible here. */
1113 _VVGC_flush_part_r(struct DiskPartition64 * part)
1117 VVGCache_hash_entry_t * ent, * nent;
/* the table is shared by all partitions, so every bucket must be visited */
1119 for (i = 0; i < VolumeHashTable.Size; i++) {
1120 for (queue_Scan(&VVGCache_hash_table.hash_buckets[i],
1123 VVGCache_hash_entry)) {
1124 if (ent->dp == part) {
/* copy the id first: _VVGC_hash_entry_del can free ent, and the id is still
 * needed for the log message */
1125 VolumeId volid = ent->volid;
1126 res = _VVGC_hash_entry_del(ent);
1128 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1129 res, afs_printable_uint32_lu(volid)));
1140 * flush all cache entries for a given disk partition.
1142 * @param[in] part disk partition object
1144 * @return operation status
/* Wrapper around _VVGC_flush_part_r; the _r result is captured in `code`.
 * NOTE(review): presumably this variant takes VOL_LOCK around the call; the
 * locking lines are not visible in this excerpt - confirm. */
1150 _VVGC_flush_part(struct DiskPartition64 * part)
1155 code = _VVGC_flush_part_r(part);
1163 * change VVGC partition state.
1165 * @param[in] part disk partition object
1166 * @param[in] state new state
1168 * @pre VOL_LOCK is held
/* Sets the cache state of partition `part` to `state` and, if that is an
 * actual change, broadcasts on the partition's condition variable. */
1175 _VVGC_state_change(struct DiskPartition64 * part,
1176 VVGCache_part_state_t state)
1178 VVGCache_part_state_t old_state;
/* remember the previous state so a no-op change can be detected below */
1180 old_state = VVGCache.part[part->index].state;
1181 VVGCache.part[part->index].state = state;
/* wake all waiters (e.g. VVGCache_scanWait_r) only on a real transition */
1183 if (old_state != state) {
1184 pthread_cond_broadcast(&VVGCache.part[part->index].cv);
1190 #endif /* AFS_DEMAND_ATTACH_FS */