Whamcloud - gitweb
LU-16356 hsm: store crh in rhashtable instead of list 84/49284/28
authorSergey Cheremencev <scherementsev@ddn.com>
Thu, 31 Aug 2023 16:12:51 +0000 (18:12 +0200)
committerOleg Drokin <green@whamcloud.com>
Fri, 23 Feb 2024 06:59:37 +0000 (06:59 +0000)
Store coordinator restore handles in rhashtable instead of list.
Search in a list with above a million entries takes too much time
causing to wait a lot of tasks due to contention on cdt_restore_lock.
As cdt_restore_lock is not needed anymore to protect
cdt_restore_handle_list, this patch also solves the problem with
parallel restore requests(LU-15132).

Add regression test sanity-hsm 409b.

Fixes: 66b3e74bc ("LU-15132 hsm: Protect against parallel HSM restore requests")
Test-Parameters: testlist=sanity-hsm env=ONLY=409b,ONLY_REPEAT=20
HPE-bug-id: LUS-11055
Change-Id: I3bb8788f6a0ce4c3fe4a3be85804df1c6845c313
Signed-off-by: Sergey Cheremencev <scherementsev@ddn.com>
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Signed-off-by: Nikitas Angelinas <nikitas.angelinas@hpe.com>
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49284
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
lustre/include/lustre_fid.h
lustre/mdt/mdt_coordinator.c
lustre/mdt/mdt_hsm_cdt_client.c
lustre/mdt/mdt_internal.h
lustre/obdclass/lu_object.c
lustre/tests/sanity-hsm.sh
lustre/tests/test-framework.sh

index a7ce2da..c2bea87 100644 (file)
@@ -764,6 +764,8 @@ static inline __u32 fid_hash(const struct lu_fid *f, int bits)
        return cfs_hash_long(fid_flatten64(f), bits);
 }
 
+u32 lu_fid_hash(const void *data, u32 len, u32 seed);
+
 static inline int
 lu_fid_diff(const struct lu_fid *fid1, const struct lu_fid *fid2)
 {
index 2c8cb88..7410207 100644 (file)
@@ -453,6 +453,44 @@ static int mdt_coordinator_cb(const struct lu_env *env,
        }
 }
 
+static void cdt_crh_free(struct rcu_head *head)
+{
+       struct cdt_restore_handle *crh;
+
+       crh = container_of(head, struct cdt_restore_handle, crh_rcu);
+       OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+}
+
+static void
+cdt_crh_put(struct cdt_restore_handle *crh, struct mdt_thread_info *cdt_mti)
+{
+       if (atomic_dec_and_test(&crh->crh_refc)) {
+               /* XXX We pass a NULL object since the restore handle does not
+                * keep a reference on the object being restored.
+                */
+               if (lustre_handle_is_used(&crh->crh_lh.mlh_reg_lh))
+                       mdt_object_unlock(cdt_mti, NULL, &crh->crh_lh, 1);
+               call_rcu(&crh->crh_rcu, cdt_crh_free);
+       }
+}
+
+static void crh_free_hash(void *vcrh, void *vcdt_mti)
+{
+       struct cdt_restore_handle *crh = vcrh;
+       struct mdt_thread_info *cdt_mti = vcdt_mti;
+
+       /* put last reference */
+       cdt_crh_put(crh, cdt_mti);
+}
+
+static const struct rhashtable_params crh_hash_params = {
+       .key_len        = sizeof(struct lu_fid),
+       .key_offset     = offsetof(struct cdt_restore_handle, crh_fid),
+       .head_offset    = offsetof(struct cdt_restore_handle, crh_hash),
+       .hashfn         = lu_fid_hash,
+       .automatic_shrinking = true,
+};
+
 /* Release the ressource used by the coordinator. Called when the
  * coordinator is stopping. */
 static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt)
@@ -460,7 +498,6 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt)
        struct coordinator              *cdt = &mdt->mdt_coordinator;
        struct cdt_agent_req            *car, *tmp1;
        struct hsm_agent                *ha, *tmp2;
-       struct cdt_restore_handle       *crh, *tmp3;
        struct mdt_thread_info          *cdt_mti;
 
        /* start cleaning */
@@ -486,15 +523,9 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt)
        up_write(&cdt->cdt_agent_lock);
 
        cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
-       mutex_lock(&cdt->cdt_restore_lock);
-       list_for_each_entry_safe(crh, tmp3, &cdt->cdt_restore_handle_list,
-                                crh_list) {
-               list_del(&crh->crh_list);
-               /* give back layout lock */
-               mdt_object_unlock(cdt_mti, NULL, &crh->crh_lh, 1);
-               OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
-       }
-       mutex_unlock(&cdt->cdt_restore_lock);
+       rhashtable_free_and_destroy(&cdt->cdt_restore_hash, crh_free_hash,
+                                   cdt_mti);
+       rcu_barrier();
 }
 
 /*
@@ -864,56 +895,52 @@ int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt,
         */
        crh->crh_extent.start = 0;
        crh->crh_extent.end = he->length;
+       atomic_set(&crh->crh_refc, 2);
 
-       mutex_lock(&cdt->cdt_restore_lock);
-       if (cdt_restore_handle_find(cdt, fid) != NULL)
-               GOTO(out_crl, rc = 1);
-
-       list_add_tail(&crh->crh_list, &cdt->cdt_restore_handle_list);
-       mutex_unlock(&cdt->cdt_restore_lock);
+       rc = rhashtable_lookup_insert_fast(&cdt->cdt_restore_hash,
+                                          &crh->crh_hash, crh_hash_params);
+       if (rc) {
+               OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+               RETURN(rc);
+       }
 
        /* get the layout lock */
        obj = mdt_object_find_lock(mti, &crh->crh_fid, &crh->crh_lh,
                                   MDS_INODELOCK_LAYOUT, LCK_EX);
-       if (IS_ERR(obj))
-               GOTO(out_ldel, rc = PTR_ERR(obj));
+       if (IS_ERR(obj)) {
+               rc = rhashtable_remove_fast(&cdt->cdt_restore_hash,
+                                           &crh->crh_hash, crh_hash_params);
+               /* rc < 0 means it has been removed in a parallel thread.
+                * This shouldn't happen by design as at current stage record
+                * hasn't been added in llog yet.
+                */
+               if (!rc)
+                       cdt_crh_put(crh, mti);
+               cdt_crh_put(crh, mti);
+
+               RETURN(PTR_ERR(obj));
+       }
 
        /* We do not keep a reference on the object during the restore
         * which can be very long.
         */
        mdt_object_put(mti->mti_env, obj);
-
-       RETURN(0);
-out_ldel:
-       mutex_lock(&cdt->cdt_restore_lock);
-       list_del(&crh->crh_list);
-out_crl:
-       mutex_unlock(&cdt->cdt_restore_lock);
-       OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
-
-       return rc;
+       cdt_crh_put(crh, mti);
+       RETURN(rc);
 }
 
 /**
  * lookup a restore handle by FID
- * caller needs to hold cdt_restore_lock
  * \param cdt [IN] coordinator
  * \param fid [IN] FID
- * \retval cdt_restore_handle found
- * \retval NULL not found
+ * \retval true cdt_restore_handle found
+ * \retval false not found
  */
-struct cdt_restore_handle *cdt_restore_handle_find(struct coordinator *cdt,
-                                                  const struct lu_fid *fid)
+bool cdt_restore_handle_exists(struct coordinator *cdt,
+                              const struct lu_fid *fid)
 {
-       struct cdt_restore_handle *crh;
-       ENTRY;
-
-       list_for_each_entry(crh, &cdt->cdt_restore_handle_list, crh_list) {
-               if (lu_fid_eq(&crh->crh_fid, fid))
-                       RETURN(crh);
-       }
-
-       RETURN(NULL);
+       return rhashtable_lookup_fast(&cdt->cdt_restore_hash, fid,
+                                     crh_hash_params);
 }
 
 void cdt_restore_handle_del(struct mdt_thread_info *mti,
@@ -922,19 +949,19 @@ void cdt_restore_handle_del(struct mdt_thread_info *mti,
        struct cdt_restore_handle *crh;
 
        /* give back layout lock */
-       mutex_lock(&cdt->cdt_restore_lock);
-       crh = cdt_restore_handle_find(cdt, fid);
-       if (crh != NULL)
-               list_del(&crh->crh_list);
-       mutex_unlock(&cdt->cdt_restore_lock);
-
+       rcu_read_lock();
+       crh = rhashtable_lookup(&cdt->cdt_restore_hash, fid, crh_hash_params);
+       if (crh &&
+           rhashtable_remove_fast(&cdt->cdt_restore_hash, &crh->crh_hash,
+                                  crh_hash_params))
+               crh = NULL;
+       rcu_read_unlock();
+
+       /* crh has been removed in a parallel thread */
        if (crh == NULL)
                return;
 
-       /* XXX We pass a NULL object since the restore handle does not
-        * keep a reference on the object being restored. */
-       mdt_object_unlock(mti, NULL, &crh->crh_lh, 1);
-       OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+       cdt_crh_put(crh, mti);
 }
 
 /**
@@ -996,8 +1023,11 @@ static int hsm_restore_cb(const struct lu_env *env,
        }
 
        rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid, &hai->hai_extent);
-       if (rc == 1)
+       if (rc == -EEXIST) {
+               CWARN("%s: duplicate restore record for fid="DFID" found in the llog: rc = %d\n",
+                     mdt_obd_name(mti->mti_mdt), PFID(&hai->hai_fid), rc);
                rc = 0;
+       }
 out:
        RETURN(rc);
 }
@@ -1069,13 +1099,11 @@ int mdt_hsm_cdt_init(struct mdt_device *mdt)
        init_rwsem(&cdt->cdt_llog_lock);
        init_rwsem(&cdt->cdt_agent_lock);
        init_rwsem(&cdt->cdt_request_lock);
-       mutex_init(&cdt->cdt_restore_lock);
        mutex_init(&cdt->cdt_state_lock);
        set_cdt_state(cdt, CDT_STOPPED);
 
        INIT_LIST_HEAD(&cdt->cdt_request_list);
        INIT_LIST_HEAD(&cdt->cdt_agents);
-       INIT_LIST_HEAD(&cdt->cdt_restore_handle_list);
 
        cdt->cdt_request_cookie_hash = cfs_hash_create("REQUEST_COOKIE_HASH",
                                                       CFS_HASH_BITS_MIN,
@@ -1210,6 +1238,13 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt)
        cdt->cdt_user_request_mask = (1UL << HSMA_RESTORE);
        cdt->cdt_group_request_mask = (1UL << HSMA_RESTORE);
        cdt->cdt_other_request_mask = (1UL << HSMA_RESTORE);
+       rc = rhashtable_init(&cdt->cdt_restore_hash, &crh_hash_params);
+       if (rc) {
+               CERROR("%s: failed to create cdt_restore hash: rc = %d\n",
+                      mdt_obd_name(mdt), rc);
+               set_cdt_state(cdt, CDT_STOPPED);
+               RETURN(rc);
+       }
 
        /* to avoid deadlock when start is made through sysfs
         * sysfs entries are created by the coordinator thread
index 33cce3d..fceedcf 100644 (file)
@@ -370,11 +370,8 @@ static int mdt_hsm_register_hal(struct mdt_thread_info *mti,
 
                        rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid,
                                                    &hai->hai_extent);
-                       if (rc == 1) {
-                               rc = 0;
+                       if (rc == -EEXIST)
                                continue;
-                       }
-
                        if (rc < 0)
                                GOTO(out, rc);
                }
@@ -472,9 +469,7 @@ bool mdt_hsm_restore_is_running(struct mdt_thread_info *mti,
        if (!cdt_getref_try(cdt))
                return false;
 
-       mutex_lock(&cdt->cdt_restore_lock);
-       is_running = (cdt_restore_handle_find(cdt, fid) != NULL);
-       mutex_unlock(&cdt->cdt_restore_lock);
+       is_running = cdt_restore_handle_exists(cdt, fid);
 
        cdt_putref(cdt);
 
index c92ae96..7aecce4 100644 (file)
@@ -123,7 +123,6 @@ static inline char *cdt_mdt_state2str(int state)
  * cdt_llog_lock
  * cdt_agent_lock
  * cdt_counter_lock
- * cdt_restore_lock
  * cdt_request_lock
  */
 struct coordinator {
@@ -147,8 +146,6 @@ struct coordinator {
        struct rw_semaphore      cdt_agent_lock;      /**< protect agent list */
        struct rw_semaphore      cdt_request_lock;    /**< protect request
                                                       * list */
-       struct mutex             cdt_restore_lock;    /**< protect restore
-                                                      * list */
        timeout_t                cdt_loop_period;     /**< llog scan period */
        timeout_t                cdt_grace_delay;     /**< request grace
                                                       * delay */
@@ -171,7 +168,9 @@ struct coordinator {
        struct list_head         cdt_request_list;
        struct list_head         cdt_agents;          /**< list of register
                                                       * agents */
-       struct list_head         cdt_restore_handle_list;
+       struct rhashtable        cdt_restore_hash;    /* rhashtable for
+                                                      * restore requests
+                                                      */
 
        /* Hash of cookies to locations of record locations in agent
         * request log. */
@@ -652,10 +651,12 @@ struct hsm_agent {
 };
 
 struct cdt_restore_handle {
-       struct list_head        crh_list;       /**< to chain the handle */
+       struct rhash_head       crh_hash;       /**< link to cdt_restore_hash */
        struct lu_fid           crh_fid;        /**< fid of the object */
        struct ldlm_extent      crh_extent;     /**< extent of the restore */
        struct mdt_lock_handle  crh_lh;         /**< lock handle */
+       atomic_t                crh_refc;       /**< crh ref counter */
+       struct rcu_head         crh_rcu;        /**< crh rcu head */
 };
 extern struct kmem_cache *mdt_hsm_cdt_kmem;    /** restore handle slab cache */
 
@@ -1145,7 +1146,7 @@ void mdt_hsm_dump_hal(int level, const char *prefix,
 int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt,
                           const struct lu_fid *fid,
                           const struct hsm_extent *he);
-struct cdt_restore_handle *cdt_restore_handle_find(struct coordinator *cdt,
+bool cdt_restore_handle_exists(struct coordinator *cdt,
                                                   const struct lu_fid *fid);
 void cdt_restore_handle_del(struct mdt_thread_info *mti,
                            struct coordinator *cdt, const struct lu_fid *fid);
index ec175e7..a569948 100644 (file)
@@ -110,7 +110,7 @@ MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache");
 static void lu_object_free(const struct lu_env *env, struct lu_object *o);
 static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx);
 
-static u32 lu_fid_hash(const void *data, u32 len, u32 seed)
+u32 lu_fid_hash(const void *data, u32 len, u32 seed)
 {
        const struct lu_fid *fid = data;
 
@@ -118,6 +118,7 @@ static u32 lu_fid_hash(const void *data, u32 len, u32 seed)
        seed ^= cfs_hash_64(fid->f_seq, 32);
        return seed;
 }
+EXPORT_SYMBOL(lu_fid_hash);
 
 static const struct rhashtable_params obj_hash_params = {
        .key_len        = sizeof(struct lu_fid),
index 8c1ca71..bd0e42f 100755 (executable)
@@ -5495,6 +5495,37 @@ test_409a() {
 }
 run_test 409a "Coordinator should not stop when in use"
 
+test_409b()
+{
+       mkdir_on_mdt0 $DIR/$tdir
+
+       local f=$DIR/$tdir/$tfile
+       local fid=$(create_empty_file "$f")
+
+       copytool setup
+
+       $LFS hsm_archive --archive $HSM_ARCHIVE_NUMBER $f
+       wait_request_state $fid ARCHIVE SUCCEED
+
+       $LFS hsm_release $f || error "cannot release $f"
+       check_hsm_flags $f "0x0000000d"
+
+       kill_copytools
+       wait_copytools || error "copytools failed to stop"
+
+       # Remount fs to clear cached file attributes and free restore hash
+       stack_trap "cdt_check_state enabled" EXIT
+       stack_trap "cdt_set_mount_state enabled" EXIT
+       cdt_set_mount_state shutdown
+       cdt_check_state stopped
+
+       fail mds1
+
+       # getattr should work even if CDT is stopped
+       stat $f || error "cannot stat file"
+}
+run_test 409b "getattr released file with CDT stopped after remount"
+
 test_500()
 {
        [ "$MDS1_VERSION" -lt $(version_code 2.6.92) ] &&
index fca6821..3d2b29c 100755 (executable)
@@ -5048,7 +5048,6 @@ stopall() {
 }
 
 cleanup_echo_devs () {
-       trap 0
        local dev
        local devs=$($LCTL dl | grep echo | awk '{print $4}')