From: Sergey Cheremencev Date: Thu, 31 Aug 2023 16:12:51 +0000 (+0200) Subject: LU-16356 hsm: store crh in rhashtable instead of list X-Git-Tag: 2.15.62~246 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=dc13a56187f6787c23e7ba66c6187a9a6220c5f6;p=fs%2Flustre-release.git LU-16356 hsm: store crh in rhashtable instead of list Store coordinator restore handles in rhashtable instead of list. Search in a list with above a million entries takes too much time causing to wait a lot of tasks due to contention on cdt_restore_lock. As cdt_restore_lock is not needed anymore to protect cdt_restore_handle_list, this patch also solves the problem with parallel restore requests(LU-15132). Add regression test sanity-hsm 409b. Fixes: 66b3e74bc ("LU-15132 hsm: Protect against parallel HSM restore requests") Test-Parameters: testlist=sanity-hsm env=ONLY=409b,ONLY_REPEAT=20 HPE-bug-id: LUS-11055 Change-Id: I3bb8788f6a0ce4c3fe4a3be85804df1c6845c313 Signed-off-by: Sergey Cheremencev Signed-off-by: Etienne AUJAMES Signed-off-by: Nikitas Angelinas Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49284 Reviewed-by: Oleg Drokin Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo --- diff --git a/lustre/include/lustre_fid.h b/lustre/include/lustre_fid.h index a7ce2da..c2bea87 100644 --- a/lustre/include/lustre_fid.h +++ b/lustre/include/lustre_fid.h @@ -764,6 +764,8 @@ static inline __u32 fid_hash(const struct lu_fid *f, int bits) return cfs_hash_long(fid_flatten64(f), bits); } +u32 lu_fid_hash(const void *data, u32 len, u32 seed); + static inline int lu_fid_diff(const struct lu_fid *fid1, const struct lu_fid *fid2) { diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c index 2c8cb88..7410207 100644 --- a/lustre/mdt/mdt_coordinator.c +++ b/lustre/mdt/mdt_coordinator.c @@ -453,6 +453,44 @@ static int mdt_coordinator_cb(const struct lu_env *env, } } +static void cdt_crh_free(struct rcu_head *head) +{ + struct cdt_restore_handle *crh; + + crh = container_of(head, struct cdt_restore_handle, crh_rcu); + OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); +} + +static void +cdt_crh_put(struct cdt_restore_handle *crh, struct mdt_thread_info *cdt_mti) +{ + if (atomic_dec_and_test(&crh->crh_refc)) { + /* XXX We pass a NULL object since the restore handle does not + * keep a reference on the object being restored. + */ + if (lustre_handle_is_used(&crh->crh_lh.mlh_reg_lh)) + mdt_object_unlock(cdt_mti, NULL, &crh->crh_lh, 1); + call_rcu(&crh->crh_rcu, cdt_crh_free); + } +} + +static void crh_free_hash(void *vcrh, void *vcdt_mti) +{ + struct cdt_restore_handle *crh = vcrh; + struct mdt_thread_info *cdt_mti = vcdt_mti; + + /* put last reference */ + cdt_crh_put(crh, cdt_mti); +} + +static const struct rhashtable_params crh_hash_params = { + .key_len = sizeof(struct lu_fid), + .key_offset = offsetof(struct cdt_restore_handle, crh_fid), + .head_offset = offsetof(struct cdt_restore_handle, crh_hash), + .hashfn = lu_fid_hash, + .automatic_shrinking = true, +}; + /* Release the ressource used by the coordinator. Called when the * coordinator is stopping. */ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt) @@ -460,7 +498,6 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt) struct coordinator *cdt = &mdt->mdt_coordinator; struct cdt_agent_req *car, *tmp1; struct hsm_agent *ha, *tmp2; - struct cdt_restore_handle *crh, *tmp3; struct mdt_thread_info *cdt_mti; /* start cleaning */ @@ -486,15 +523,9 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt) up_write(&cdt->cdt_agent_lock); cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key); - mutex_lock(&cdt->cdt_restore_lock); - list_for_each_entry_safe(crh, tmp3, &cdt->cdt_restore_handle_list, - crh_list) { - list_del(&crh->crh_list); - /* give back layout lock */ - mdt_object_unlock(cdt_mti, NULL, &crh->crh_lh, 1); - OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); - } - mutex_unlock(&cdt->cdt_restore_lock); + rhashtable_free_and_destroy(&cdt->cdt_restore_hash, crh_free_hash, + cdt_mti); + rcu_barrier(); } /* @@ -864,56 +895,52 @@ int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt, */ crh->crh_extent.start = 0; crh->crh_extent.end = he->length; + atomic_set(&crh->crh_refc, 2); - mutex_lock(&cdt->cdt_restore_lock); - if (cdt_restore_handle_find(cdt, fid) != NULL) - GOTO(out_crl, rc = 1); - - list_add_tail(&crh->crh_list, &cdt->cdt_restore_handle_list); - mutex_unlock(&cdt->cdt_restore_lock); + rc = rhashtable_lookup_insert_fast(&cdt->cdt_restore_hash, + &crh->crh_hash, crh_hash_params); + if (rc) { + OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); + RETURN(rc); + } /* get the layout lock */ obj = mdt_object_find_lock(mti, &crh->crh_fid, &crh->crh_lh, MDS_INODELOCK_LAYOUT, LCK_EX); - if (IS_ERR(obj)) - GOTO(out_ldel, rc = PTR_ERR(obj)); + if (IS_ERR(obj)) { + rc = rhashtable_remove_fast(&cdt->cdt_restore_hash, + &crh->crh_hash, crh_hash_params); + /* rc < 0 means it has been removed in a parallel thread. + * This shouldn't happen by design as at current stage record + * hasn't been added in llog yet. + */ + if (!rc) + cdt_crh_put(crh, mti); + cdt_crh_put(crh, mti); + + RETURN(PTR_ERR(obj)); + } /* We do not keep a reference on the object during the restore * which can be very long. */ mdt_object_put(mti->mti_env, obj); - - RETURN(0); -out_ldel: - mutex_lock(&cdt->cdt_restore_lock); - list_del(&crh->crh_list); -out_crl: - mutex_unlock(&cdt->cdt_restore_lock); - OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); - - return rc; + cdt_crh_put(crh, mti); + RETURN(rc); } /** * lookup a restore handle by FID - * caller needs to hold cdt_restore_lock * \param cdt [IN] coordinator * \param fid [IN] FID - * \retval cdt_restore_handle found - * \retval NULL not found + * \retval true cdt_restore_handle found + * \retval false not found */ -struct cdt_restore_handle *cdt_restore_handle_find(struct coordinator *cdt, - const struct lu_fid *fid) +bool cdt_restore_handle_exists(struct coordinator *cdt, + const struct lu_fid *fid) { - struct cdt_restore_handle *crh; - ENTRY; - - list_for_each_entry(crh, &cdt->cdt_restore_handle_list, crh_list) { - if (lu_fid_eq(&crh->crh_fid, fid)) - RETURN(crh); - } - - RETURN(NULL); + return rhashtable_lookup_fast(&cdt->cdt_restore_hash, fid, + crh_hash_params); } void cdt_restore_handle_del(struct mdt_thread_info *mti, @@ -922,19 +949,19 @@ void cdt_restore_handle_del(struct mdt_thread_info *mti, struct cdt_restore_handle *crh; /* give back layout lock */ - mutex_lock(&cdt->cdt_restore_lock); - crh = cdt_restore_handle_find(cdt, fid); - if (crh != NULL) - list_del(&crh->crh_list); - mutex_unlock(&cdt->cdt_restore_lock); - + rcu_read_lock(); + crh = rhashtable_lookup(&cdt->cdt_restore_hash, fid, crh_hash_params); + if (crh && + rhashtable_remove_fast(&cdt->cdt_restore_hash, &crh->crh_hash, + crh_hash_params)) + crh = NULL; + rcu_read_unlock(); + + /* crh has been removed in a parallel thread */ if (crh == NULL) return; - /* XXX We pass a NULL object since the restore handle does not - * keep a reference on the object being restored. */ - mdt_object_unlock(mti, NULL, &crh->crh_lh, 1); - OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem); + cdt_crh_put(crh, mti); } /** @@ -996,8 +1023,11 @@ static int hsm_restore_cb(const struct lu_env *env, } rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid, &hai->hai_extent); - if (rc == 1) + if (rc == -EEXIST) { + CWARN("%s: duplicate restore record for fid="DFID" found in the llog: rc = %d\n", + mdt_obd_name(mti->mti_mdt), PFID(&hai->hai_fid), rc); rc = 0; + } out: RETURN(rc); } @@ -1069,13 +1099,11 @@ int mdt_hsm_cdt_init(struct mdt_device *mdt) init_rwsem(&cdt->cdt_llog_lock); init_rwsem(&cdt->cdt_agent_lock); init_rwsem(&cdt->cdt_request_lock); - mutex_init(&cdt->cdt_restore_lock); mutex_init(&cdt->cdt_state_lock); set_cdt_state(cdt, CDT_STOPPED); INIT_LIST_HEAD(&cdt->cdt_request_list); INIT_LIST_HEAD(&cdt->cdt_agents); - INIT_LIST_HEAD(&cdt->cdt_restore_handle_list); cdt->cdt_request_cookie_hash = cfs_hash_create("REQUEST_COOKIE_HASH", CFS_HASH_BITS_MIN, @@ -1210,6 +1238,13 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt) cdt->cdt_user_request_mask = (1UL << HSMA_RESTORE); cdt->cdt_group_request_mask = (1UL << HSMA_RESTORE); cdt->cdt_other_request_mask = (1UL << HSMA_RESTORE); + rc = rhashtable_init(&cdt->cdt_restore_hash, &crh_hash_params); + if (rc) { + CERROR("%s: failed to create cdt_restore hash: rc = %d\n", + mdt_obd_name(mdt), rc); + set_cdt_state(cdt, CDT_STOPPED); + RETURN(rc); + } /* to avoid deadlock when start is made through sysfs * sysfs entries are created by the coordinator thread diff --git a/lustre/mdt/mdt_hsm_cdt_client.c b/lustre/mdt/mdt_hsm_cdt_client.c index 33cce3d..fceedcf 100644 --- a/lustre/mdt/mdt_hsm_cdt_client.c +++ b/lustre/mdt/mdt_hsm_cdt_client.c @@ -370,11 +370,8 @@ static int mdt_hsm_register_hal(struct mdt_thread_info *mti, rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid, &hai->hai_extent); - if (rc == 1) { - rc = 0; + if (rc == -EEXIST) continue; - } - if (rc < 0) GOTO(out, rc); } @@ -472,9 +469,7 @@ bool mdt_hsm_restore_is_running(struct mdt_thread_info *mti, if (!cdt_getref_try(cdt)) return false; - mutex_lock(&cdt->cdt_restore_lock); - is_running = (cdt_restore_handle_find(cdt, fid) != NULL); - mutex_unlock(&cdt->cdt_restore_lock); + is_running = cdt_restore_handle_exists(cdt, fid); cdt_putref(cdt); diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index c92ae96..7aecce4 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -123,7 +123,6 @@ static inline char *cdt_mdt_state2str(int state) * cdt_llog_lock * cdt_agent_lock * cdt_counter_lock - * cdt_restore_lock * cdt_request_lock */ struct coordinator { @@ -147,8 +146,6 @@ struct coordinator { struct rw_semaphore cdt_agent_lock; /**< protect agent list */ struct rw_semaphore cdt_request_lock; /**< protect request * list */ - struct mutex cdt_restore_lock; /**< protect restore - * list */ timeout_t cdt_loop_period; /**< llog scan period */ timeout_t cdt_grace_delay; /**< request grace * delay */ @@ -171,7 +168,9 @@ struct coordinator { struct list_head cdt_request_list; struct list_head cdt_agents; /**< list of register * agents */ - struct list_head cdt_restore_handle_list; + struct rhashtable cdt_restore_hash; /* rhashtable for + * restore requests + */ /* Hash of cookies to locations of record locations in agent * request log. */ @@ -652,10 +651,12 @@ struct hsm_agent { }; struct cdt_restore_handle { - struct list_head crh_list; /**< to chain the handle */ + struct rhash_head crh_hash; /**< link to cdt_restore_hash */ struct lu_fid crh_fid; /**< fid of the object */ struct ldlm_extent crh_extent; /**< extent of the restore */ struct mdt_lock_handle crh_lh; /**< lock handle */ + atomic_t crh_refc; /**< crh ref counter */ + struct rcu_head crh_rcu; /**< crh rcu head */ }; extern struct kmem_cache *mdt_hsm_cdt_kmem; /** restore handle slab cache */ @@ -1145,7 +1146,7 @@ void mdt_hsm_dump_hal(int level, const char *prefix, int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt, const struct lu_fid *fid, const struct hsm_extent *he); -struct cdt_restore_handle *cdt_restore_handle_find(struct coordinator *cdt, +bool cdt_restore_handle_exists(struct coordinator *cdt, const struct lu_fid *fid); void cdt_restore_handle_del(struct mdt_thread_info *mti, struct coordinator *cdt, const struct lu_fid *fid); diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index ec175e7..a56994862 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -110,7 +110,7 @@ MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); static void lu_object_free(const struct lu_env *env, struct lu_object *o); static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); -static u32 lu_fid_hash(const void *data, u32 len, u32 seed) +u32 lu_fid_hash(const void *data, u32 len, u32 seed) { const struct lu_fid *fid = data; @@ -118,6 +118,7 @@ static u32 lu_fid_hash(const void *data, u32 len, u32 seed) seed ^= cfs_hash_64(fid->f_seq, 32); return seed; } +EXPORT_SYMBOL(lu_fid_hash); static const struct rhashtable_params obj_hash_params = { .key_len = sizeof(struct lu_fid), diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 8c1ca71..bd0e42f 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -5495,6 +5495,37 @@ test_409a() { } run_test 409a "Coordinator should not stop when in use" +test_409b() +{ + mkdir_on_mdt0 $DIR/$tdir + + local f=$DIR/$tdir/$tfile + local fid=$(create_empty_file "$f") + + copytool setup + + $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f + wait_request_state $fid ARCHIVE SUCCEED + + $LFS hsm_release $f || error "cannot release $f" + check_hsm_flags $f "0x0000000d" + + kill_copytools + wait_copytools || error "copytools failed to stop" + + # Remount fs to clear cached file attributes and free restore hash + stack_trap "cdt_check_state enabled" EXIT + stack_trap "cdt_set_mount_state enabled" EXIT + cdt_set_mount_state shutdown + cdt_check_state stopped + + fail mds1 + + # getattr should work even if CDT is stopped + stat $f || error "cannot stat file" +} +run_test 409b "getattr released file with CDT stopped after remount" + test_500() { [ "$MDS1_VERSION" -lt $(version_code 2.6.92) ] && diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index fca6821..3d2b29c 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -5048,7 +5048,6 @@ stopall() { } cleanup_echo_devs () { - trap 0 local dev local devs=$($LCTL dl | grep echo | awk '{print $4}')