2 * Copyright 2009-2010, Sine Nomine Associates and others.
5 * This software has been released under the terms of the IBM Public
6 * License. For details, see the LICENSE file in the top-level source
7 * directory or online at http://www.openafs.org/dl/license10.html
12 * volume group membership cache
15 #include <afsconfig.h>
16 #include <afs/param.h>
18 #ifdef AFS_DEMAND_ATTACH_FS
26 #include <afs/assert.h>
29 #include <sys/param.h>
31 #if defined(AFS_SUN5_ENV) || defined(AFS_HPUX_ENV)
34 #include <afs/afsutil.h>
37 #include <afs/afsint.h>
41 #include "viceinode.h"
43 #include "partition.h"
44 #include <afs/errors.h>
46 #define __VOL_VG_CACHE_IMPL 1
49 #include "vg_cache_impl.h"
51 static int _VVGC_lookup(struct DiskPartition64 *,
53 VVGCache_entry_t ** entry,
54 VVGCache_hash_entry_t ** hentry);
55 static int _VVGC_entry_alloc(VVGCache_entry_t ** entry);
56 static int _VVGC_entry_free(VVGCache_entry_t * entry);
57 static int _VVGC_entry_get(VVGCache_entry_t * entry);
58 static int _VVGC_entry_put(struct DiskPartition64 *,
59 VVGCache_entry_t * entry);
60 static int _VVGC_entry_add(struct DiskPartition64 *,
63 VVGCache_hash_entry_t **);
64 static int _VVGC_entry_cl_add(VVGCache_entry_t *, VolumeId);
65 static int _VVGC_entry_cl_del(struct DiskPartition64 *, VVGCache_entry_t *,
67 static int _VVGC_entry_export(VVGCache_entry_t *, VVGCache_query_t *);
68 static int _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry);
69 static int _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry);
70 static int _VVGC_hash_entry_add(struct DiskPartition64 *,
73 VVGCache_hash_entry_t **);
74 static int _VVGC_hash_entry_del(VVGCache_hash_entry_t * entry);
75 static int _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * entry);
77 VVGCache_hash_table_t VVGCache_hash_table;
81 * initialize volume group cache subsystem.
83 * @return operation status
87 VVGCache_PkgInit(void)
92 /* allocate hash table */
93 VVGCache_hash_table.hash_buckets =
94 malloc(VolumeHashTable.Size * sizeof(struct rx_queue));
95 if (VVGCache_hash_table.hash_buckets == NULL) {
100 /* setup hash chain heads */
101 for (i = 0; i < VolumeHashTable.Size; i++) {
102 queue_Init(&VVGCache_hash_table.hash_buckets[i]);
105 /* initialize per-partition VVGC state */
106 for (i = 0; i <= VOLMAXPARTS; i++) {
107 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
108 VVGCache.part[i].dlist_hash_buckets = NULL;
109 code = pthread_cond_init(&VVGCache.part[i].cv, NULL);
120 * shut down volume group cache subsystem.
122 * @return operation status
128 VVGCache_PkgShutdown(void)
134 /* free hash table */
135 free(VVGCache_hash_table.hash_buckets);
136 VVGCache_hash_table.hash_buckets = NULL;
138 /* destroy per-partition VVGC state */
139 for (i = 0; i <= VOLMAXPARTS; i++) {
140 VVGCache.part[i].state = VVGC_PART_STATE_INVALID;
141 pthread_cond_destroy(&VVGCache.part[i].cv);
148 * allocate a cache entry.
150 * @param[out] entry_out pointer to newly allocated entry
152 * @return operation status
158 _VVGC_entry_alloc(VVGCache_entry_t ** entry_out)
161 VVGCache_entry_t * ent;
163 *entry_out = ent = malloc(sizeof(VVGCache_entry_t));
169 memset(ent, 0, sizeof(*ent));
176 * free a cache entry.
178 * @param[in] entry cache entry
180 * @return operation status
186 _VVGC_entry_free(VVGCache_entry_t * entry)
190 assert(entry->refcnt == 0);
197 * allocate and register an entry for a volume group.
199 * @param[in] dp disk partition object
200 * @param[in] volid volume id
201 * @param[out] entry_out vg cache object pointer
202 * @param[out] hash_out vg cache hash entry object pointer
204 * @pre - VOL_LOCK held
205 * - no such entry exists in hash table
207 * @return operation status
213 _VVGC_entry_add(struct DiskPartition64 * dp,
215 VVGCache_entry_t ** entry_out,
216 VVGCache_hash_entry_t ** hash_out)
219 VVGCache_entry_t * ent;
221 code = _VVGC_entry_alloc(&ent);
227 /* refcnt will be inc'd when a child is added */
230 code = _VVGC_hash_entry_add(dp, volid, ent, hash_out);
242 _VVGC_entry_free(ent);
249 * add a volid to the entry's child list.
251 * @param[in] ent volume group object
252 * @param[in] volid volume id
254 * @return operation status
256 * @retval -1 child table is full
261 _VVGC_entry_cl_add(VVGCache_entry_t * ent,
264 int code = 0, i, empty_found, empty_idx;
266 /* search table to avoid duplicates */
267 for (i = 0, empty_found = 0;
270 if (ent->children[i] == volid) {
271 ViceLog(1, ("VVGC_entry_cl_add: tried to add duplicate vol "
273 afs_printable_uint32_lu(volid),
274 afs_printable_uint32_lu(ent->rw)));
277 if (!empty_found && !ent->children[i]) {
280 /* don't break; make sure we go through all children so we don't
281 * add a duplicate entry */
285 /* verify table isn't full */
288 ViceLog(0, ("VVGC_entry_cl_add: tried to add vol %lu to VG %lu, but VG "
289 "is full\n", afs_printable_uint32_lu(volid),
290 afs_printable_uint32_lu(ent->rw)));
295 ent->children[empty_idx] = volid;
298 code = _VVGC_entry_get(ent);
305 * delete a volid from the entry's child list.
307 * @param[in] dp disk partition object
308 * @param[in] ent volume group object
309 * @param[in] volid volume id
311 * @return operation status
313 * @retval -1 no such entry found
318 _VVGC_entry_cl_del(struct DiskPartition64 *dp,
319 VVGCache_entry_t * ent,
324 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
325 if (ent->children[i] == volid) {
326 ent->children[i] = 0;
334 code = _VVGC_entry_put(dp, ent);
341 * add a refcount to an entry.
343 * @param[in] entry cache entry
347 * @return operation status
352 static int _VVGC_entry_get(VVGCache_entry_t * entry)
359 * put back a reference to an entry.
361 * @param[in] dp disk partition object
362 * @param[in] entry cache entry
366 * @warning do not dereference the entry pointer after calling this interface
368 * @return operation status
371 * @note dp is needed to look up the RW hash entry to unlink, if we are
372 * putting back the final reference and freeing the entry
377 _VVGC_entry_put(struct DiskPartition64 * dp, VVGCache_entry_t * entry)
381 assert(entry->refcnt > 0);
383 if (--entry->refcnt == 0) {
384 VVGCache_entry_t *nentry;
385 VVGCache_hash_entry_t *hentry;
387 /* first, try to delete the RW id hash entry pointing to this
389 code = _VVGC_lookup(dp, entry->rw, &nentry, &hentry);
391 if (nentry != entry) {
392 /* looking up the rw of this entry points to a different
393 * entry; should not happen */
394 ViceLog(0, ("VVGC_entry_put: error: entry lookup for entry %lu "
395 "found different entry than was passed",
396 afs_printable_uint32_lu(entry->rw)));
399 code = _VVGC_hash_entry_unlink(hentry);
402 } else if (code == ENOENT) {
403 /* ignore ENOENT; this shouldn't happen, since the RW hash
404 * entry should always exist if the entry does... but we
405 * were going to delete it anyway, so try to continue */
406 ViceLog(0, ("VVGC_entry_put: warning: tried to unlink entry for "
407 "vol %lu, but RW hash entry doesn't exist; continuing "
408 "anyway...\n", afs_printable_uint32_lu(entry->rw)));
413 /* now, just free the entry itself */
415 code = _VVGC_entry_free(entry);
423 * export a volume group entry in the external object format.
425 * @param[in] ent internal-format volume group object
426 * @param[out] qry external-format volume group object
430 * @return operation status
436 _VVGC_entry_export(VVGCache_entry_t * ent, VVGCache_query_t * qry)
441 for (i = 0; i < VOL_VG_MAX_VOLS; i++) {
442 qry->children[i] = ent->children[i];
449 * allocate a hash table entry structure.
451 * @param[out] entry_out address in which to store newly allocated hash entry struct
453 * @return operation status
459 _VVGC_hash_entry_alloc(VVGCache_hash_entry_t ** entry_out)
462 VVGCache_hash_entry_t * ent;
464 *entry_out = ent = malloc(sizeof(VVGCache_hash_entry_t));
473 * free a hash table entry structure.
475 * @param[in] entry hash table entry structure to be freed
477 * @return operation status
483 _VVGC_hash_entry_free(VVGCache_hash_entry_t * entry)
493 * add an entry to the hash table.
495 * @param[in] dp disk partition object
496 * @param[in] volid volume id
497 * @param[in] ent volume group object
498 * @param[out] hash_out address in which to store pointer to hash entry
502 * @return operation status
504 * @retval EEXIST hash entry for volid already exists, and it points to
505 * a different VG entry
510 _VVGC_hash_entry_add(struct DiskPartition64 * dp,
512 VVGCache_entry_t * ent,
513 VVGCache_hash_entry_t ** hash_out)
516 VVGCache_hash_entry_t * hent;
517 int hash = VVGC_HASH(volid);
518 VVGCache_entry_t *nent;
520 code = _VVGC_lookup(dp, volid, &nent, hash_out);
523 ViceLog(0, ("_VVGC_hash_entry_add: tried to add a duplicate "
524 " nonmatching entry for vol %lu: original "
525 "(%"AFS_PTR_FMT",%lu) new (%"AFS_PTR_FMT",%lu)\n",
526 afs_printable_uint32_lu(volid),
527 nent, afs_printable_uint32_lu(nent->rw),
528 ent, afs_printable_uint32_lu(ent->rw)));
531 ViceLog(1, ("_VVGC_hash_entry_add: tried to add duplicate "
532 "hash entry for vol %lu, VG %lu",
533 afs_printable_uint32_lu(volid),
534 afs_printable_uint32_lu(ent->rw)));
535 /* accept attempts to add matching duplicate entries; just
536 * pretend we added it */
540 code = _VVGC_hash_entry_alloc(&hent);
548 queue_Append(&VVGCache_hash_table.hash_buckets[hash],
559 * remove an entry from the hash table.
561 * @param[in] hent hash table entry
565 * @return operation status
571 _VVGC_hash_entry_del(VVGCache_hash_entry_t * hent)
576 if (hent->entry->rw == hent->volid) {
580 code = _VVGC_entry_cl_del(hent->dp, hent->entry, hent->volid);
581 /* note: hent->entry is possibly NULL after _VVGC_entry_cl_del, and
582 * if hent->entry->rw == hent->volid, it is possible for hent to
586 /* If we are the RW id, don't unlink, since we still need the
587 * hash entry to exist, so when we lookup children, they can
588 * look up the RW id hash chain, and they will all go to the
591 * If we are the last entry and the entry should be deleted,
592 * _VVGC_entry_cl_del will take care of unlinking the RW hash entry.
594 res = _VVGC_hash_entry_unlink(hent);
604 * low-level interface to remove an entry from the hash table.
606 * Does not alter the refcount or worry about the children lists or
607 * anything like that; just removes the hash table entry, frees it, and
608 * that's all. You probably want @see _VVGC_hash_entry_del instead.
610 * @param[in] hent hash table entry
614 * @return operation status
620 _VVGC_hash_entry_unlink(VVGCache_hash_entry_t * hent)
627 code = _VVGC_hash_entry_free(hent);
633 * look up a vg cache entry given any member volume id.
635 * @param[in] dp disk partition object
636 * @param[in] volid vg member volume id
637 * @param[out] entry_out address in which to store volume group entry structure pointer
638 * @param[out] hash_out address in which to store hash entry pointer
642 * @warning - it is up to the caller to get a ref to entry_out, if needed
643 * - hash_out must not be referenced after dropping VOL_LOCK
645 * @return operation status
647 * @retval ENOENT volume id not found
648 * @retval EINVAL partition's VGC is invalid
653 _VVGC_lookup(struct DiskPartition64 * dp,
655 VVGCache_entry_t ** entry_out,
656 VVGCache_hash_entry_t ** hash_out)
659 int bucket = VVGC_HASH(volid);
660 struct VVGCache_hash_entry * ent, * nent;
662 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
668 for (queue_Scan(&VVGCache_hash_table.hash_buckets[bucket],
671 VVGCache_hash_entry)) {
672 if (ent->volid == volid && ent->dp == dp) {
674 *entry_out = ent->entry;
686 * add an entry to the volume group cache.
688 * @param[in] dp disk partition object
689 * @param[in] parent parent volume id
690 * @param[in] child child volume id
691 * @param[out] newvg if non-NULL, *newvg is 1 if this call created a
692 * new VG, 0 if the volume was added to an existing VG
696 * @return operation status
698 * @retval -1 parent and child are already registered in
702 VVGCache_entry_add_r(struct DiskPartition64 * dp,
708 VVGCache_entry_t * child_ent, * parent_ent;
714 /* check for existing entries */
715 res = _VVGC_lookup(dp, child, &child_ent, NULL);
716 if (res && res != ENOENT) {
721 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
722 if (res && res != ENOENT) {
728 * branch based upon existence of parent and child nodes
730 if (parent_ent && child_ent) {
731 /* both exist, so we're done.
732 * if they point to different places, then report the error. */
733 if (child_ent != parent_ent) {
736 if (parent == child) {
737 /* if we're adding the RW entry as a child, the RW id may
738 * not be in the child array yet, so make sure not to skip
743 } else if (!parent_ent && child_ent) {
745 * update vg root volid, and add hash entry. */
746 parent_ent = child_ent;
747 parent_ent->rw = parent;
749 code = _VVGC_hash_entry_add(dp,
754 } else if (!child_ent && !parent_ent) {
755 code = _VVGC_entry_add(dp,
765 if (child == parent) {
766 /* if we're the RW, skip over adding the child hash entry;
767 * we already added the hash entry when creating the entry */
768 child_ent = parent_ent;
774 child_ent = parent_ent;
775 code = _VVGC_hash_entry_add(dp,
784 code = _VVGC_entry_cl_add(child_ent, child);
787 if (code && code != EINVAL) {
788 ViceLog(0, ("VVGCache_entry_add: error %d trying to add vol %lu to VG"
789 " %lu on partition %s", code, afs_printable_uint32_lu(child),
790 afs_printable_uint32_lu(parent), VPartitionPath(dp)));
793 if (code == 0 && VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
794 /* we successfully added the entry; make sure it's not on the
795 * to-delete list, so it doesn't get deleted later */
796 code = _VVGC_dlist_del_r(dp, parent, child);
797 if (code && code != ENOENT) {
798 ViceLog(0, ("VVGCache_entry_add: error %d trying to remove vol "
799 "%lu (parent %lu) from the to-delete list for part "
800 "%s.\n", code, afs_printable_uint32_lu(child),
801 afs_printable_uint32_lu(parent),
802 VPartitionPath(dp)));
812 * add an entry to the volume group cache.
814 * @param[in] dp disk partition object
815 * @param[in] parent parent volume id
816 * @param[in] child child volume id
817 * @param[out] newvg if non-NULL, *newvg is 1 if this call created a
818 * new VG, 0 if the volume was added to an existing VG
820 * @return operation status
824 VVGCache_entry_add(struct DiskPartition64 * dp,
832 VVGCache_entry_add_r(dp, parent, child, newvg);
839 * delete an entry from the volume group cache.
841 * If the partition is scanning, the entry is instead put on a list of
842 * entries to delete when the scan is done.
844 * @param[in] dp disk partition object
845 * @param[in] parent parent volume id
846 * @param[in] child child volume id
850 * @return operation status
854 VVGCache_entry_del_r(struct DiskPartition64 * dp,
855 VolumeId parent, VolumeId child)
857 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
859 code = _VVGC_dlist_add_r(dp, parent, child);
864 return _VVGC_entry_purge_r(dp, parent, child);
868 * delete an entry from the volume group cache.
870 * @param[in] dp disk partition object
871 * @param[in] parent parent volume id
872 * @param[in] child child volume id
878 * @return operation status
882 _VVGC_entry_purge_r(struct DiskPartition64 * dp,
883 VolumeId parent, VolumeId child)
886 VVGCache_entry_t * parent_ent, * child_ent;
887 VVGCache_hash_entry_t * child_hent;
889 /* check mappings for each volid */
890 res = _VVGC_lookup(dp, parent, &parent_ent, NULL);
895 res = _VVGC_lookup(dp, child, &child_ent, &child_hent);
901 /* if the mappings don't match, we have a serious error */
902 if (parent_ent != child_ent) {
903 ViceLog(0, ("VVGCache_entry_del: trying to delete vol %lu from VG %lu, "
904 "but vol %lu points to VGC entry %"AFS_PTR_FMT" and VG %lu "
905 "points to VGC entry %"AFS_PTR_FMT"\n",
906 afs_printable_uint32_lu(child),
907 afs_printable_uint32_lu(parent),
908 afs_printable_uint32_lu(child),
909 child_ent, afs_printable_uint32_lu(parent), parent_ent));
914 code = _VVGC_hash_entry_del(child_hent);
921 * delete an entry from the volume group cache.
923 * @param[in] dp disk partition object
924 * @param[in] parent parent volume id
925 * @param[in] child child volume id
927 * @return operation status
931 VVGCache_entry_del(struct DiskPartition64 * dp,
932 VolumeId parent, VolumeId child)
937 code = VVGCache_entry_del_r(dp, parent, child);
944 * query a volume group by any member volume id.
946 * @param[in] dp disk partition object
947 * @param[in] volume volume id of a member of VG
948 * @param[out] res vg membership data
952 * @return operation status
954 * @retval EAGAIN partition needs to finish scanning
957 VVGCache_query_r(struct DiskPartition64 * dp,
959 VVGCache_query_t * res)
962 VVGCache_entry_t * ent;
964 /* If the cache for this partition doesn't exist, start a scan */
965 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_INVALID) {
966 code = VVGCache_scanStart_r(dp);
967 if (code == 0 || code == -3) {
968 /* -3 means another thread already started scanning */
973 if (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
977 code = _VVGC_lookup(dp, volume, &ent, NULL);
979 code = _VVGC_entry_export(ent, res);
986 * query a volume group by any member volume id.
988 * @param[in] dp disk partition object
989 * @param[in] volume volume id of a member of VG
990 * @param[out] res vg membership data
992 * @return operation status
996 VVGCache_query(struct DiskPartition64 * dp,
997 VolumeId volume, VVGCache_query_t * res)
1002 code = VVGCache_query_r(dp, volume, res);
1009 * begin asynchronous scan of on-disk volume group metadata.
1011 * @param[in] dp disk partition object
1013 * @pre VOL_LOCK held
1015 * @return operation status
1019 VVGCache_scanStart_r(struct DiskPartition64 * dp)
1024 code = _VVGC_scan_start(dp);
1026 /* start a scanner thread on each partition */
1027 for (dp = DiskPartitionList; dp; dp = dp->next) {
1028 res = _VVGC_scan_start(dp);
1039 * begin asynchronous scan of on-disk volume group metadata.
1041 * @param[in] dp disk partition object
1043 * @return operation status
1047 VVGCache_scanStart(struct DiskPartition64 * dp)
1052 code = VVGCache_scanStart_r(dp);
1059 * wait for async on-disk VG metadata scan to complete.
1061 * @param[in] dp disk partition object
1063 * @pre VOL_LOCK held
1065 * @warning this routine must drop VOL_LOCK internally
1067 * @return operation status
1071 VVGCache_scanWait_r(struct DiskPartition64 * dp)
1075 while (VVGCache.part[dp->index].state == VVGC_PART_STATE_UPDATING) {
1076 VOL_CV_WAIT(&VVGCache.part[dp->index].cv);
1083 * wait for async on-disk VG metadata scan to complete.
1085 * @param[in] dp disk partition object
1087 * @return operation status
1091 VVGCache_scanWait(struct DiskPartition64 * dp)
1096 code = VVGCache_scanWait_r(dp);
1103 * flush all cache entries for a given disk partition.
1105 * @param[in] part disk partition object
1107 * @pre VOL_LOCK held
1109 * @return operation status
1115 _VVGC_flush_part_r(struct DiskPartition64 * part)
1119 VVGCache_hash_entry_t * ent, * nent;
1121 for (i = 0; i < VolumeHashTable.Size; i++) {
1122 for (queue_Scan(&VVGCache_hash_table.hash_buckets[i],
1125 VVGCache_hash_entry)) {
1126 if (ent->dp == part) {
1127 VolumeId volid = ent->volid;
1128 res = _VVGC_hash_entry_del(ent);
1130 ViceLog(0, ("_VVGC_flush_part_r: error %d deleting hash entry for %lu\n",
1131 res, afs_printable_uint32_lu(volid)));
1142 * flush all cache entries for a given disk partition.
1144 * @param[in] part disk partition object
1146 * @return operation status
1152 _VVGC_flush_part(struct DiskPartition64 * part)
1157 code = _VVGC_flush_part_r(part);
1165 * change VVGC partition state.
1167 * @param[in] part disk partition object
1168 * @param[in] state new state
1170 * @pre VOL_LOCK is held
1177 _VVGC_state_change(struct DiskPartition64 * part,
1178 VVGCache_part_state_t state)
1180 VVGCache_part_state_t old_state;
1182 old_state = VVGCache.part[part->index].state;
1183 VVGCache.part[part->index].state = state;
1185 if (old_state != state) {
1186 pthread_cond_broadcast(&VVGCache.part[part->index].cv);
1192 #endif /* AFS_DEMAND_ATTACH_FS */