Whamcloud - gitweb
LU-15132 hsm: Protect against parallel HSM restore requests
[fs/lustre-release.git] / lustre / mdt / mdt_coordinator.c
index 02efdb7..b321d94 100644 (file)
@@ -47,8 +47,6 @@
 #include <lustre_kernelcomm.h>
 #include "mdt_internal.h"
 
-static struct lprocfs_vars lprocfs_mdt_hsm_vars[];
-
 /**
  * get obj and HSM attributes on a fid
  * \param mti [IN] context
@@ -142,6 +140,9 @@ struct hsm_scan_data {
         * for new work?
         */
        bool                     hsd_housekeeping;
+       bool                     hsd_one_restore;
+       u32                      hsd_start_cat_idx;
+       u32                      hsd_start_rec_idx;
        int                      hsd_action_count;
        int                      hsd_request_len; /* array alloc len */
        int                      hsd_request_count; /* array used count */
@@ -159,24 +160,25 @@ static int mdt_cdt_waiting_cb(const struct lu_env *env,
        struct hsm_action_item *hai;
        size_t hai_size;
        u32 archive_id;
+       bool wrapped;
        int i;
 
        /* Are agents full? */
+       if (atomic_read(&cdt->cdt_request_count) >= cdt->cdt_max_requests)
+               RETURN(hsd->hsd_housekeeping ? 0 : LLOG_PROC_BREAK);
+
        if (hsd->hsd_action_count + atomic_read(&cdt->cdt_request_count) >=
            cdt->cdt_max_requests) {
-               if (hsd->hsd_housekeeping) {
-                       /* Unknown request and no more room for a new
-                        * request. Continue to scan to find other
-                        * entries for already existing requests. */
-                       RETURN(0);
-               } else {
-                       /* We cannot send and more requests, stop
-                        * here. There might be more known requests
-                        * that could be merged, but this avoid
-                        * analyzing too many llogs for minor
-                        * gains. */
-                       RETURN(LLOG_PROC_BREAK);
-               }
+               /* We cannot send any more requests
+                *
+                *                     *** SPECIAL CASE ***
+                *
+                * Restore requests are too important not to schedule at least
+                * one, every time we can.
+                */
+               if (larr->arr_hai.hai_action != HSMA_RESTORE ||
+                   hsd->hsd_one_restore)
+                       RETURN(hsd->hsd_housekeeping ? 0 : LLOG_PROC_BREAK);
        }
 
        hai_size = cfs_size_round(larr->arr_hai.hai_len);
@@ -193,17 +195,55 @@ static int mdt_cdt_waiting_cb(const struct lu_env *env,
                }
        }
 
-       if (!request) {
-               struct hsm_action_list *hal;
+       /* Are we trying to force-schedule a request? */
+       if (hsd->hsd_action_count + atomic_read(&cdt->cdt_request_count) >=
+           cdt->cdt_max_requests) {
+               /* Is there really no compatible hsm_scan_request? */
+               if (!request) {
+                       for (i -= 1; i >= 0; i--) {
+                               if (hsd->hsd_request[i].hal->hal_archive_id ==
+                                   archive_id) {
+                                       request = &hsd->hsd_request[i];
+                                       break;
+                               }
+                       }
+               }
+
+               /* Make room for the hai */
+               if (request) {
+                       /* Discard the last hai until there is enough space */
+                       do {
+                               request->hal->hal_count--;
+
+                               hai = hai_first(request->hal);
+                               for (i = 0; i < request->hal->hal_count; i++)
+                                       hai = hai_next(hai);
+                               request->hal_used_sz -=
+                                       cfs_size_round(hai->hai_len);
+                               hsd->hsd_action_count--;
+                       } while (request->hal_used_sz + hai_size >
+                                LDLM_MAXREQSIZE);
+               } else if (hsd->hsd_housekeeping) {
+                       struct hsm_scan_request *tmp;
+
+                       /* Discard the (whole) last hal */
+                       hsd->hsd_request_count--;
+                       LASSERT(hsd->hsd_request_count >= 0);
+                       tmp = &hsd->hsd_request[hsd->hsd_request_count];
+                       hsd->hsd_action_count -= tmp->hal->hal_count;
+                       LASSERT(hsd->hsd_action_count >= 0);
+                       OBD_FREE(tmp->hal, tmp->hal_sz);
+               } else {
+                       /* Bailing out, this code path is too hot */
+                       RETURN(LLOG_PROC_BREAK);
 
-               if (hsd->hsd_request_count == hsd->hsd_request_len) {
-                       /* Logic as above. */
-                       if (hsd->hsd_housekeeping)
-                               RETURN(0);
-                       else
-                               RETURN(LLOG_PROC_BREAK);
                }
+       }
 
+       if (!request) {
+               struct hsm_action_list *hal;
+
+               LASSERT(hsd->hsd_request_count < hsd->hsd_request_len);
                request = &hsd->hsd_request[hsd->hsd_request_count];
 
                /* allocates hai vector size just needs to be large
@@ -246,15 +286,32 @@ static int mdt_cdt_waiting_cb(const struct lu_env *env,
 
        memcpy(hai, &larr->arr_hai, larr->arr_hai.hai_len);
 
-       request->hal_used_sz += cfs_size_round(hai->hai_len);
+       request->hal_used_sz += hai_size;
        request->hal->hal_count++;
 
        hsd->hsd_action_count++;
 
-       if (hai->hai_action != HSMA_CANCEL)
+       switch (hai->hai_action) {
+       case HSMA_CANCEL:
+               break;
+       case HSMA_RESTORE:
+               hsd->hsd_one_restore = true;
+               fallthrough;
+       default:
                cdt_agent_record_hash_add(cdt, hai->hai_cookie,
                                          llh->lgh_hdr->llh_cat_idx,
                                          larr->arr_hdr.lrh_index);
+       }
+
+       wrapped = llh->lgh_hdr->llh_cat_idx >= llh->lgh_last_idx &&
+                 llh->lgh_hdr->llh_count > 1;
+       if ((!wrapped && llh->lgh_hdr->llh_cat_idx > hsd->hsd_start_cat_idx) ||
+           (wrapped && llh->lgh_hdr->llh_cat_idx < hsd->hsd_start_cat_idx) ||
+           (llh->lgh_hdr->llh_cat_idx == hsd->hsd_start_cat_idx &&
+            larr->arr_hdr.lrh_index > hsd->hsd_start_rec_idx)) {
+               hsd->hsd_start_cat_idx = llh->lgh_hdr->llh_cat_idx;
+               hsd->hsd_start_rec_idx = larr->arr_hdr.lrh_index;
+       }
 
        RETURN(0);
 }
@@ -270,7 +327,7 @@ static int mdt_cdt_started_cb(const struct lu_env *env,
        struct cdt_agent_req *car;
        time64_t now = ktime_get_real_seconds();
        time64_t last;
-       int cl_flags;
+       enum changelog_rec_flags clf_flags;
        int rc;
 
        if (!hsd->hsd_housekeeping)
@@ -302,30 +359,30 @@ static int mdt_cdt_started_cb(const struct lu_env *env,
        }
 
        /* Emit a changelog record for the failed action.*/
-       cl_flags = 0;
-       hsm_set_cl_error(&cl_flags, ECANCELED);
+       clf_flags = 0;
+       hsm_set_cl_error(&clf_flags, ECANCELED);
 
        switch (hai->hai_action) {
        case HSMA_ARCHIVE:
-               hsm_set_cl_event(&cl_flags, HE_ARCHIVE);
+               hsm_set_cl_event(&clf_flags, HE_ARCHIVE);
                break;
        case HSMA_RESTORE:
-               hsm_set_cl_event(&cl_flags, HE_RESTORE);
+               hsm_set_cl_event(&clf_flags, HE_RESTORE);
                break;
        case HSMA_REMOVE:
-               hsm_set_cl_event(&cl_flags, HE_REMOVE);
+               hsm_set_cl_event(&clf_flags, HE_REMOVE);
                break;
        case HSMA_CANCEL:
-               hsm_set_cl_event(&cl_flags, HE_CANCEL);
+               hsm_set_cl_event(&clf_flags, HE_CANCEL);
                break;
        default:
                /* Unknown record type, skip changelog. */
-               cl_flags = 0;
+               clf_flags = 0;
                break;
        }
 
-       if (cl_flags != 0)
-               mo_changelog(env, CL_HSM, cl_flags, mdt->mdt_child,
+       if (clf_flags != 0)
+               mo_changelog(env, CL_HSM, clf_flags, mdt->mdt_child,
                             &hai->hai_fid);
 
        if (hai->hai_action == HSMA_RESTORE)
@@ -395,55 +452,6 @@ static int mdt_coordinator_cb(const struct lu_env *env,
        }
 }
 
-/**
- * create /proc entries for coordinator
- * \param mdt [IN]
- * \retval 0 success
- * \retval -ve failure
- */
-int hsm_cdt_procfs_init(struct mdt_device *mdt)
-{
-       struct coordinator      *cdt = &mdt->mdt_coordinator;
-       int                      rc = 0;
-       ENTRY;
-
-       /* init /proc entries, failure is not critical */
-       cdt->cdt_proc_dir = lprocfs_register("hsm",
-                                            mdt2obd_dev(mdt)->obd_proc_entry,
-                                            lprocfs_mdt_hsm_vars, mdt);
-       if (IS_ERR(cdt->cdt_proc_dir)) {
-               rc = PTR_ERR(cdt->cdt_proc_dir);
-               CERROR("%s: Cannot create 'hsm' directory in mdt proc dir,"
-                      " rc=%d\n", mdt_obd_name(mdt), rc);
-               cdt->cdt_proc_dir = NULL;
-               RETURN(rc);
-       }
-
-       RETURN(0);
-}
-
-/**
- * remove /proc entries for coordinator
- * \param mdt [IN]
- */
-void hsm_cdt_procfs_fini(struct mdt_device *mdt)
-{
-       struct coordinator *cdt = &mdt->mdt_coordinator;
-
-       if (cdt->cdt_proc_dir != NULL)
-               lprocfs_remove(&cdt->cdt_proc_dir);
-}
-
-/**
- * get vector of hsm cdt /proc vars
- * \param none
- * \retval var vector
- */
-struct lprocfs_vars *hsm_cdt_get_proc_vars(void)
-{
-       return lprocfs_mdt_hsm_vars;
-}
-
 /* Release the ressource used by the coordinator. Called when the
  * coordinator is stopping. */
 static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt)
@@ -469,6 +477,9 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt)
        down_write(&cdt->cdt_agent_lock);
        list_for_each_entry_safe(ha, tmp2, &cdt->cdt_agents, ha_list) {
                list_del(&ha->ha_list);
+               if (ha->ha_archive_cnt != 0)
+                       OBD_FREE_PTR_ARRAY(ha->ha_archive_id,
+                                          ha->ha_archive_cnt);
                OBD_FREE_PTR(ha);
        }
        up_write(&cdt->cdt_agent_lock);
@@ -477,6 +488,9 @@ static void mdt_hsm_cdt_cleanup(struct mdt_device *mdt)
        mutex_lock(&cdt->cdt_restore_lock);
        list_for_each_entry_safe(crh, tmp3, &cdt->cdt_restore_handle_list,
                                 crh_list) {
+               /* not locked yet, cleanup by cdt_restore_handle_add() */
+               if (crh->crh_lh.mlh_type == MDT_NUL_LOCK)
+                       continue;
                list_del(&crh->crh_list);
                /* give back layout lock */
                mdt_object_unlock(cdt_mti, NULL, &crh->crh_lh, 1);
@@ -538,7 +552,31 @@ static int set_cdt_state(struct coordinator *cdt, enum cdt_states new_state)
        return rc;
 }
 
+static int mdt_hsm_pending_restore(struct mdt_thread_info *mti);
+
+static void cdt_start_pending_restore(struct mdt_device *mdt,
+                                     struct coordinator *cdt)
+{
+       struct mdt_thread_info *cdt_mti;
+       unsigned int i = 0;
+       int rc;
+
+       /* wait, for at most obd_timeout sleeps, until MDD sets MDT_FL_CFGLOG */
+       while (!test_bit(MDT_FL_CFGLOG, &mdt->mdt_state) && i < obd_timeout) {
+               schedule_timeout_interruptible(cfs_time_seconds(1));
+               i++;
+       }
+       if (!test_bit(MDT_FL_CFGLOG, &mdt->mdt_state))
+               CWARN("%s: trying to init HSM before MDD\n", mdt_obd_name(mdt));
+
+       /* set up list of started restore requests; a failure is only logged */
+       cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
+       rc = mdt_hsm_pending_restore(cdt_mti);
+       if (rc)
+               CERROR("%s: cannot take the layout locks needed for registered restore: %d\n",
+                      mdt_obd_name(mdt), rc);
 
+}
 
 /**
  * coordinator thread
@@ -558,7 +596,7 @@ static int mdt_coordinator(void *data)
        ENTRY;
 
        CDEBUG(D_HSM, "%s: coordinator thread starting, pid=%d\n",
-              mdt_obd_name(mdt), current_pid());
+              mdt_obd_name(mdt), current->pid);
 
        hsd.hsd_mti = mti;
        obd_uuid2fsname(hsd.hsd_fsname, mdt_obd_name(mdt),
@@ -567,13 +605,16 @@ static int mdt_coordinator(void *data)
        set_cdt_state(cdt, CDT_RUNNING);
 
        /* Inform mdt_hsm_cdt_start(). */
-       wake_up_all(&cdt->cdt_waitq);
+       wake_up(&cdt->cdt_waitq);
+       cdt_start_pending_restore(mdt, cdt);
 
        while (1) {
                int i;
                int update_idx = 0;
                int updates_sz;
                int updates_cnt;
+               u32 start_cat_idx;
+               u32 start_rec_idx;
                struct hsm_record_update *updates;
 
                /* Limit execution of the expensive requests traversal
@@ -607,8 +648,12 @@ static int mdt_coordinator(void *data)
                    ktime_get_real_seconds()) {
                        last_housekeeping = ktime_get_real_seconds();
                        hsd.hsd_housekeeping = true;
+                       start_cat_idx = 0;
+                       start_rec_idx = 0;
                } else if (cdt->cdt_event) {
                        hsd.hsd_housekeeping = false;
+                       start_cat_idx = hsd.hsd_start_cat_idx;
+                       start_rec_idx = hsd.hsd_start_rec_idx;
                } else {
                        continue;
                }
@@ -643,9 +688,11 @@ static int mdt_coordinator(void *data)
 
                hsd.hsd_action_count = 0;
                hsd.hsd_request_count = 0;
+               hsd.hsd_one_restore = false;
 
                rc = cdt_llog_process(mti->mti_env, mdt, mdt_coordinator_cb,
-                                     &hsd, 0, 0, WRITE);
+                                     &hsd, start_cat_idx, start_rec_idx,
+                                     WRITE);
                if (rc < 0)
                        goto clean_cb_alloc;
 
@@ -655,6 +702,9 @@ static int mdt_coordinator(void *data)
                if (list_empty(&cdt->cdt_agents)) {
                        CDEBUG(D_HSM, "no agent available, "
                                      "coordinator sleeps\n");
+                       /* reset HSM scanning index range. */
+                       hsd.hsd_start_cat_idx = start_cat_idx;
+                       hsd.hsd_start_rec_idx = start_rec_idx;
                        goto clean_cb_alloc;
                }
 
@@ -672,10 +722,10 @@ static int mdt_coordinator(void *data)
                updates_sz = updates_cnt * sizeof(*updates);
                OBD_ALLOC_LARGE(updates, updates_sz);
                if (updates == NULL) {
-                       CERROR("%s: Cannot allocate memory (%d o) "
-                              "for %d updates\n",
+                       CERROR("%s: Cannot allocate memory (%d bytes) "
+                               "for %d updates. Too many HSM requests?\n",
                               mdt_obd_name(mdt), updates_sz, updates_cnt);
-                       continue;
+                       goto clean_cb_alloc;
                }
 
                /* here hsd contains a list of requests to be started */
@@ -707,11 +757,19 @@ static int mdt_coordinator(void *data)
                                hai = hai_next(hai);
                                update_idx++;
                        }
+
+                       /* TODO: narrow down the HSM action range that was
+                        * already scanned according to the cookies when a
+                        * failure occurs.
+                        */
+                       if (rc) {
+                               hsd.hsd_start_cat_idx = start_cat_idx;
+                               hsd.hsd_start_rec_idx = start_rec_idx;
+                       }
                }
 
                if (update_idx) {
-                       rc = mdt_agent_record_update(mti->mti_env, mdt,
-                                                    updates, update_idx);
+                       rc = mdt_agent_record_update(mti, updates, update_idx);
                        if (rc)
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "rc=%d, cannot update records "
@@ -737,19 +795,30 @@ clean_cb_alloc:
 
        if (rc != 0)
                CERROR("%s: coordinator thread exiting, process=%d, rc=%d\n",
-                      mdt_obd_name(mdt), current_pid(), rc);
+                      mdt_obd_name(mdt), current->pid, rc);
        else
                CDEBUG(D_HSM, "%s: coordinator thread exiting, process=%d,"
                              " no error\n",
-                      mdt_obd_name(mdt), current_pid());
+                      mdt_obd_name(mdt), current->pid);
 
        RETURN(rc);
 }
 
+/**
+ * register a new HSM restore handle for a file and take EX lock on the layout
+ * \param mti [IN] thread info
+ * \param cdt [IN] coordinator
+ * \param fid [IN] fid of the file to restore
+ * \param he  [IN] HSM extent
+ * \retval 0 success
+ * \retval 1 restore handle already exists for the fid
+ * \retval -ve failure
+ */
 int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt,
                           const struct lu_fid *fid,
                           const struct hsm_extent *he)
 {
+       struct mdt_lock_handle lh = { 0 };
        struct cdt_restore_handle *crh;
        struct mdt_object *obj;
        int rc;
@@ -766,31 +835,48 @@ int cdt_restore_handle_add(struct mdt_thread_info *mti, struct coordinator *cdt,
         */
        crh->crh_extent.start = 0;
        crh->crh_extent.end = he->length;
+       crh->crh_lh.mlh_type = MDT_NUL_LOCK;
+
+       mutex_lock(&cdt->cdt_restore_lock);
+       if (cdt_restore_handle_find(cdt, fid) != NULL)
+               GOTO(out_crl, rc = 1);
+
+       if (unlikely(cdt->cdt_state == CDT_STOPPED ||
+                    cdt->cdt_state == CDT_STOPPING))
+               GOTO(out_crl, rc = -EAGAIN);
+
+       list_add_tail(&crh->crh_list, &cdt->cdt_restore_handle_list);
+       mutex_unlock(&cdt->cdt_restore_lock);
+
        /* get the layout lock */
-       mdt_lock_reg_init(&crh->crh_lh, LCK_EX);
-       obj = mdt_object_find_lock(mti, &crh->crh_fid, &crh->crh_lh,
+       mdt_lock_reg_init(&lh, LCK_EX);
+       obj = mdt_object_find_lock(mti, &crh->crh_fid, &lh,
                                   MDS_INODELOCK_LAYOUT);
-       if (IS_ERR(obj))
-               GOTO(out_crh, rc = PTR_ERR(obj));
+       if (IS_ERR(obj)) {
+               mutex_lock(&cdt->cdt_restore_lock);
+               GOTO(out_ldel, rc = PTR_ERR(obj));
+       }
 
        /* We do not keep a reference on the object during the restore
-        * which can be very long. */
+        * which can be very long.
+        */
        mdt_object_put(mti->mti_env, obj);
 
        mutex_lock(&cdt->cdt_restore_lock);
        if (unlikely(cdt->cdt_state == CDT_STOPPED ||
-                    cdt->cdt_state == CDT_STOPPING)) {
-               mutex_unlock(&cdt->cdt_restore_lock);
+                    cdt->cdt_state == CDT_STOPPING))
                GOTO(out_lh, rc = -EAGAIN);
-       }
 
-       list_add_tail(&crh->crh_list, &cdt->cdt_restore_handle_list);
+       crh->crh_lh = lh;
        mutex_unlock(&cdt->cdt_restore_lock);
 
        RETURN(0);
 out_lh:
        mdt_object_unlock(mti, NULL, &crh->crh_lh, 1);
-out_crh:
+out_ldel:
+       list_del(&crh->crh_list);
+out_crl:
+       mutex_unlock(&cdt->cdt_restore_lock);
        OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
 
        return rc;
@@ -875,9 +961,10 @@ static int hsm_restore_cb(const struct lu_env *env,
 
        larr = (struct llog_agent_req_rec *)hdr;
        hai = &larr->arr_hai;
-       if (hai->hai_cookie > cdt->cdt_last_cookie)
+       if (hai->hai_cookie >= cdt->cdt_last_cookie) {
                /* update the cookie to avoid collision */
                cdt->cdt_last_cookie = hai->hai_cookie + 1;
+       }
 
        if (hai->hai_action != HSMA_RESTORE ||
            agent_req_in_final_state(larr->arr_status))
@@ -897,6 +984,8 @@ static int hsm_restore_cb(const struct lu_env *env,
        }
 
        rc = cdt_restore_handle_add(mti, cdt, &hai->hai_fid, &hai->hai_extent);
+       if (rc == 1)
+               rc = 0;
 out:
        RETURN(rc);
 }
@@ -920,10 +1009,9 @@ static int mdt_hsm_pending_restore(struct mdt_thread_info *mti)
        RETURN(rc);
 }
 
-static int hsm_init_ucred(struct lu_ucred *uc)
+int hsm_init_ucred(struct lu_ucred *uc)
 {
        ENTRY;
-
        uc->uc_valid = UCRED_OLD;
        uc->uc_o_uid = 0;
        uc->uc_o_gid = 0;
@@ -935,7 +1023,7 @@ static int hsm_init_ucred(struct lu_ucred *uc)
        uc->uc_fsgid = 0;
        uc->uc_suppgids[0] = -1;
        uc->uc_suppgids[1] = -1;
-       uc->uc_cap = CFS_CAP_FS_MASK;
+       uc->uc_cap = cap_combine(CAP_FS_SET, CAP_NFSD_SET);
        uc->uc_umask = 0777;
        uc->uc_ginfo = NULL;
        uc->uc_identity = NULL;
@@ -1014,7 +1102,7 @@ int mdt_hsm_cdt_init(struct mdt_device *mdt)
 
        hsm_init_ucred(mdt_ucred(cdt_mti));
 
-       /* default values for /proc tunnables
+       /* default values for sysfs tunables
         * can be override by MGS conf */
        cdt->cdt_default_archive_id = 1;
        cdt->cdt_grace_delay = 60;
@@ -1071,11 +1159,11 @@ int  mdt_hsm_cdt_fini(struct mdt_device *mdt)
  */
 static int mdt_hsm_cdt_start(struct mdt_device *mdt)
 {
-       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       struct coordinator *cdt = &mdt->mdt_coordinator;
        struct mdt_thread_info *cdt_mti;
-       int                      rc;
-       void                    *ptr;
-       struct task_struct      *task;
+       int rc;
+       void *ptr;
+       struct task_struct *task;
        ENTRY;
 
        /* functions defined but not yet used
@@ -1090,7 +1178,7 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt)
                RETURN(-EALREADY);
        }
 
-       CLASSERT(1 << (CDT_POLICY_SHIFT_COUNT - 1) == CDT_POLICY_LAST);
+       BUILD_BUG_ON(BIT(CDT_POLICY_SHIFT_COUNT - 1) != CDT_POLICY_LAST);
        cdt->cdt_policy = CDT_DEFAULT_POLICY;
 
        /* just need to be larger than previous one */
@@ -1104,20 +1192,13 @@ static int mdt_hsm_cdt_start(struct mdt_device *mdt)
        cdt->cdt_group_request_mask = (1UL << HSMA_RESTORE);
        cdt->cdt_other_request_mask = (1UL << HSMA_RESTORE);
 
-       /* to avoid deadlock when start is made through /proc
-        * /proc entries are created by the coordinator thread */
-
-       /* set up list of started restore requests */
-       cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
-       rc = mdt_hsm_pending_restore(cdt_mti);
-       if (rc)
-               CERROR("%s: cannot take the layout locks needed"
-                      " for registered restore: %d\n",
-                      mdt_obd_name(mdt), rc);
-
+       /* to avoid deadlock when start is made through sysfs
+        * sysfs entries are created by the coordinator thread
+        */
        if (mdt->mdt_bottom->dd_rdonly)
                RETURN(0);
 
+       cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
        task = kthread_run(mdt_coordinator, cdt_mti, "hsm_cdtr");
        if (IS_ERR(task)) {
                rc = PTR_ERR(task);
@@ -1217,8 +1298,7 @@ int mdt_hsm_add_hal(struct mdt_thread_info *mti,
                                .status = ARS_CANCELED,
                        };
 
-                       rc = mdt_agent_record_update(mti->mti_env, mti->mti_mdt,
-                                                    &update, 1);
+                       rc = mdt_agent_record_update(mti, &update, 1);
                        if (rc) {
                                CERROR("%s: mdt_agent_record_update() failed, "
                                       "rc=%d, cannot update status to %s "
@@ -1317,7 +1397,15 @@ static int hsm_swap_layouts(struct mdt_thread_info *mti,
                                     mdt_object_child(obj),
                                     mdt_object_child(dobj),
                                     SWAP_LAYOUTS_MDS_HSM);
-
+       if (rc == 0) {
+               rc = mdt_lsom_downgrade(mti, obj);
+               if (rc)
+                       CDEBUG(D_INODE,
+                              "%s: File fid="DFID" SOM "
+                              "downgrade failed, rc = %d\n",
+                              mdt_obd_name(mti->mti_mdt),
+                              PFID(mdt_object_fid(obj)), rc);
+       }
 out_dobj:
        mdt_object_unlock_put(mti, dobj, dlh, 1);
 out:
@@ -1336,16 +1424,17 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                                     const struct cdt_agent_req *car,
                                     enum agent_req_status *status)
 {
-       const struct lu_env     *env = mti->mti_env;
-       struct mdt_device       *mdt = mti->mti_mdt;
-       struct coordinator      *cdt = &mdt->mdt_coordinator;
-       struct mdt_object       *obj = NULL;
-       int                      cl_flags = 0, rc = 0;
-       struct md_hsm            mh;
-       bool                     is_mh_changed;
-       bool                     need_changelog = true;
-       ENTRY;
+       const struct lu_env *env = mti->mti_env;
+       struct mdt_device *mdt = mti->mti_mdt;
+       struct coordinator *cdt = &mdt->mdt_coordinator;
+       struct mdt_object *obj = NULL;
+       enum changelog_rec_flags clf_flags = 0;
+       struct md_hsm mh;
+       bool is_mh_changed;
+       bool need_changelog = true;
+       int rc = 0;
 
+       ENTRY;
        /* default is to retry */
        *status = ARS_WAITING;
 
@@ -1395,25 +1484,24 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                               mdt_obd_name(mdt),
                               pgs->hpk_cookie, PFID(&pgs->hpk_fid),
                               pgs->hpk_errval);
-                       hsm_set_cl_error(&cl_flags,
-                                        CLF_HSM_ERROVERFLOW);
+                       hsm_set_cl_error(&clf_flags, CLF_HSM_ERROVERFLOW);
                        rc = -EINVAL;
                } else {
-                       hsm_set_cl_error(&cl_flags, pgs->hpk_errval);
+                       hsm_set_cl_error(&clf_flags, pgs->hpk_errval);
                }
 
                switch (car->car_hai->hai_action) {
                case HSMA_ARCHIVE:
-                       hsm_set_cl_event(&cl_flags, HE_ARCHIVE);
+                       hsm_set_cl_event(&clf_flags, HE_ARCHIVE);
                        break;
                case HSMA_RESTORE:
-                       hsm_set_cl_event(&cl_flags, HE_RESTORE);
+                       hsm_set_cl_event(&clf_flags, HE_RESTORE);
                        break;
                case HSMA_REMOVE:
-                       hsm_set_cl_event(&cl_flags, HE_REMOVE);
+                       hsm_set_cl_event(&clf_flags, HE_REMOVE);
                        break;
                case HSMA_CANCEL:
-                       hsm_set_cl_event(&cl_flags, HE_CANCEL);
+                       hsm_set_cl_event(&clf_flags, HE_CANCEL);
                        CERROR("%s: Failed request %#llx on "DFID
                               " cannot be a CANCEL\n",
                               mdt_obd_name(mdt),
@@ -1433,7 +1521,7 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                *status = ARS_SUCCEED;
                switch (car->car_hai->hai_action) {
                case HSMA_ARCHIVE:
-                       hsm_set_cl_event(&cl_flags, HE_ARCHIVE);
+                       hsm_set_cl_event(&clf_flags, HE_ARCHIVE);
                        /* set ARCHIVE keep EXIST and clear LOST and
                         * DIRTY */
                        mh.mh_arch_ver = pgs->hpk_data_version;
@@ -1442,7 +1530,7 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                        is_mh_changed = true;
                        break;
                case HSMA_RESTORE:
-                       hsm_set_cl_event(&cl_flags, HE_RESTORE);
+                       hsm_set_cl_event(&clf_flags, HE_RESTORE);
 
                        /* do not clear RELEASED and DIRTY here
                         * this will occur in hsm_swap_layouts()
@@ -1454,13 +1542,13 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                        is_mh_changed = true;
                        break;
                case HSMA_REMOVE:
-                       hsm_set_cl_event(&cl_flags, HE_REMOVE);
+                       hsm_set_cl_event(&clf_flags, HE_REMOVE);
                        /* clear ARCHIVED EXISTS and LOST */
                        mh.mh_flags &= ~(HS_ARCHIVED | HS_EXISTS | HS_LOST);
                        is_mh_changed = true;
                        break;
                case HSMA_CANCEL:
-                       hsm_set_cl_event(&cl_flags, HE_CANCEL);
+                       hsm_set_cl_event(&clf_flags, HE_CANCEL);
                        CERROR("%s: Successful request %#llx on "DFID" cannot be a CANCEL\n",
                               mdt_obd_name(mdt),
                               pgs->hpk_cookie,
@@ -1482,7 +1570,7 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
         * filled
         */
        if (rc == 0 && !IS_ERR(obj))
-               hsm_set_cl_flags(&cl_flags,
+               hsm_set_cl_flags(&clf_flags,
                                 mh.mh_flags & HS_DIRTY ? CLF_HSM_DIRTY : 0);
 
        /* unlock is done later, after layout lock management */
@@ -1493,6 +1581,8 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
         * if no retry will be attempted and if object is still alive,
         * in other cases we just unlock the object */
        if (car->car_hai->hai_action == HSMA_RESTORE) {
+               struct mdt_lock_handle *lh;
+
                /* restore in data FID done, we swap the layouts
                 * only if restore is successful */
                if (pgs->hpk_errval == 0 && !IS_ERR(obj)) {
@@ -1511,11 +1601,18 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
                /* restore special case, need to create ChangeLog record
                 * before to give back layout lock to avoid concurrent
                 * file updater to post out of order ChangeLog */
-               mo_changelog(env, CL_HSM, cl_flags, mdt->mdt_child,
+               mo_changelog(env, CL_HSM, clf_flags, mdt->mdt_child,
                             &car->car_hai->hai_fid);
                need_changelog = false;
 
                cdt_restore_handle_del(mti, cdt, &car->car_hai->hai_fid);
+               if (!IS_ERR_OR_NULL(obj)) {
+                       /* flush UPDATE lock so attributes are updated */
+                       lh = &mti->mti_lh[MDT_LH_OLD];
+                       mdt_lock_reg_init(lh, LCK_EX);
+                       mdt_object_lock(mti, obj, lh, MDS_INODELOCK_UPDATE);
+                       mdt_object_unlock(mti, obj, lh, 1);
+               }
        }
 
        GOTO(out, rc);
@@ -1523,7 +1620,7 @@ static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
 out:
        /* always add a ChangeLog record */
        if (need_changelog)
-               mo_changelog(env, CL_HSM, cl_flags, mdt->mdt_child,
+               mo_changelog(env, CL_HSM, clf_flags, mdt->mdt_child,
                             &car->car_hai->hai_fid);
 
        if (!IS_ERR(obj))
@@ -1625,8 +1722,7 @@ int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
                update.cookie = pgs->hpk_cookie;
                update.status = status;
 
-               rc1 = mdt_agent_record_update(mti->mti_env, mdt,
-                                             &update, 1);
+               rc1 = mdt_agent_record_update(mti, &update, 1);
                if (rc1)
                        CERROR("%s: mdt_agent_record_update() failed,"
                               " rc=%d, cannot update status to %s"
@@ -1660,20 +1756,12 @@ out:
 
 
 /**
- * data passed to llog_cat_process() callback
- * to cancel requests
- */
-struct hsm_cancel_all_data {
-       struct mdt_device       *mdt;
-};
-
-/**
  *  llog_cat_process() callback, used to:
  *  - purge all requests
  * \param env [IN] environment
  * \param llh [IN] llog handle
  * \param hdr [IN] llog record
- * \param data [IN] cb data = struct hsm_cancel_all_data
+ * \param data [IN] cb data = struct mdt_thread_info
  * \retval 0 success
  * \retval -ve failure
  */
@@ -1681,18 +1769,28 @@ static int mdt_cancel_all_cb(const struct lu_env *env,
                             struct llog_handle *llh,
                             struct llog_rec_hdr *hdr, void *data)
 {
-       struct llog_agent_req_rec       *larr;
-       struct hsm_cancel_all_data      *hcad;
-       int                              rc = 0;
+       struct llog_agent_req_rec *larr = (struct llog_agent_req_rec *)hdr;
+       struct hsm_action_item *hai = &larr->arr_hai;
+       struct mdt_thread_info  *mti = data;
+       struct coordinator *cdt = &mti->mti_mdt->mdt_coordinator;
+       int rc;
        ENTRY;
 
-       larr = (struct llog_agent_req_rec *)hdr;
-       hcad = data;
-       if (larr->arr_status == ARS_WAITING ||
-           larr->arr_status == ARS_STARTED) {
-               larr->arr_status = ARS_CANCELED;
-               larr->arr_req_change = ktime_get_real_seconds();
-               rc = llog_write(env, llh, hdr, hdr->lrh_index);
+       if (larr->arr_status != ARS_WAITING &&
+           larr->arr_status != ARS_STARTED)
+               RETURN(0);
+
+       /* Unlock the EX layout lock */
+       if (hai->hai_action == HSMA_RESTORE)
+               cdt_restore_handle_del(mti, cdt, &hai->hai_fid);
+
+       larr->arr_status = ARS_CANCELED;
+       larr->arr_req_change = ktime_get_real_seconds();
+       rc = llog_write(env, llh, hdr, hdr->lrh_index);
+       if (rc < 0) {
+               CERROR("%s: cannot update agent log: rc = %d\n",
+                      mdt_obd_name(mti->mti_mdt), rc);
+               rc = LLOG_DEL_RECORD;
        }
 
        RETURN(rc);
@@ -1711,7 +1809,6 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
        struct cdt_agent_req            *car;
        struct hsm_action_list          *hal = NULL;
        struct hsm_action_item          *hai;
-       struct hsm_cancel_all_data       hcad;
        int                              hal_sz = 0, hal_len, rc;
        enum cdt_states                  old_state;
        ENTRY;
@@ -1809,10 +1906,8 @@ static int hsm_cancel_all_actions(struct mdt_device *mdt)
                OBD_FREE(hal, hal_sz);
 
        /* cancel all on-disk records */
-       hcad.mdt = mdt;
-
        rc = cdt_llog_process(mti->mti_env, mti->mti_mdt, mdt_cancel_all_cb,
-                             &hcad, 0, 0, WRITE);
+                             (void *)mti, 0, 0, WRITE);
 out_cdt_state:
        /* Enable coordinator, unless the coordinator was stopping. */
        set_cdt_state_locked(cdt, old_state);
@@ -1882,7 +1977,7 @@ bool mdt_hsm_is_action_compat(const struct hsm_action_item *hai,
 }
 
 /*
- * /proc interface used to get/set HSM behaviour (cdt->cdt_policy)
+ * sysfs interface used to get/set HSM behaviour (cdt->cdt_policy)
  */
 static const struct {
        __u64            bit;
@@ -2041,46 +2136,149 @@ out:
        OBD_FREE(buf, count + 1);
        RETURN(rc);
 }
-LPROC_SEQ_FOPS(mdt_hsm_policy);
-
-#define GENERATE_PROC_METHOD(VAR)                                      \
-static int mdt_hsm_##VAR##_seq_show(struct seq_file *m, void *data)    \
-{                                                                      \
-       struct mdt_device       *mdt = m->private;                      \
-       struct coordinator      *cdt = &mdt->mdt_coordinator;           \
-       ENTRY;                                                          \
-                                                                       \
-       seq_printf(m, "%llu\n", (__u64)cdt->VAR);                       \
-       RETURN(0);                                                      \
-}                                                                      \
-static ssize_t                                                         \
-mdt_hsm_##VAR##_seq_write(struct file *file, const char __user *buffer,        \
-                         size_t count, loff_t *off)                    \
-                                                                       \
-{                                                                      \
-       struct seq_file         *m = file->private_data;                \
-       struct mdt_device       *mdt = m->private;                      \
-       struct coordinator      *cdt = &mdt->mdt_coordinator;           \
-       unsigned int val;                                               \
-       int rc;                                                         \
-                                                                       \
-       ENTRY;                                                          \
-       rc = kstrtouint_from_user(buffer, count, 0, &val);              \
-       if (rc)                                                         \
-               RETURN(rc);                                             \
-                                                                       \
-       if (val !=  0) {                                                \
-               cdt->VAR = val;                                         \
-               RETURN(count);                                          \
-       }                                                               \
-       RETURN(-EINVAL);                                                \
-}                                                                      \
-
-GENERATE_PROC_METHOD(cdt_loop_period)
-GENERATE_PROC_METHOD(cdt_grace_delay)
-GENERATE_PROC_METHOD(cdt_active_req_timeout)
-GENERATE_PROC_METHOD(cdt_max_requests)
-GENERATE_PROC_METHOD(cdt_default_archive_id)
+LDEBUGFS_SEQ_FOPS(mdt_hsm_policy);
+
+ssize_t loop_period_show(struct kobject *kobj, struct attribute *attr,
+                        char *buf)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", cdt->cdt_loop_period);
+}
+
+ssize_t loop_period_store(struct kobject *kobj, struct attribute *attr,
+                         const char *buffer, size_t count)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val != 0)
+               cdt->cdt_loop_period = val;
+
+       return val ? count : -EINVAL;
+}
+LUSTRE_RW_ATTR(loop_period);
+
+ssize_t grace_delay_show(struct kobject *kobj, struct attribute *attr,
+                        char *buf)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", cdt->cdt_grace_delay);
+}
+
+ssize_t grace_delay_store(struct kobject *kobj, struct attribute *attr,
+                         const char *buffer, size_t count)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val != 0)
+               cdt->cdt_grace_delay = val;
+
+       return val ? count : -EINVAL;
+}
+LUSTRE_RW_ATTR(grace_delay);
+
+ssize_t active_request_timeout_show(struct kobject *kobj,
+                                   struct attribute *attr,
+                                   char *buf)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%d\n", cdt->cdt_active_req_timeout);
+}
+
+ssize_t active_request_timeout_store(struct kobject *kobj,
+                                    struct attribute *attr,
+                                    const char *buffer, size_t count)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val != 0)
+               cdt->cdt_active_req_timeout = val;
+
+       return val ? count : -EINVAL;
+}
+LUSTRE_RW_ATTR(active_request_timeout);
+
+ssize_t max_requests_show(struct kobject *kobj, struct attribute *attr,
+                         char *buf)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%llu\n", cdt->cdt_max_requests);
+}
+
+ssize_t max_requests_store(struct kobject *kobj, struct attribute *attr,
+                          const char *buffer, size_t count)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+       unsigned long long val;
+       int rc;
+
+       rc = kstrtoull(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val != 0)
+               cdt->cdt_max_requests = val;
+
+       return val ? count : -EINVAL;
+}
+LUSTRE_RW_ATTR(max_requests);
+
+ssize_t default_archive_id_show(struct kobject *kobj, struct attribute *attr,
+                               char *buf)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+
+       return scnprintf(buf, PAGE_SIZE, "%u\n", cdt->cdt_default_archive_id);
+}
+
+ssize_t default_archive_id_store(struct kobject *kobj, struct attribute *attr,
+                                const char *buffer, size_t count)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+       unsigned int val;
+       int rc;
+
+       rc = kstrtouint(buffer, 0, &val);
+       if (rc)
+               return rc;
+
+       if (val != 0)
+               cdt->cdt_default_archive_id = val;
+
+       return val ? count : -EINVAL;
+}
+LUSTRE_RW_ATTR(default_archive_id);
 
 /*
  * procfs write method for MDT/hsm_control
@@ -2093,58 +2291,55 @@ GENERATE_PROC_METHOD(cdt_default_archive_id)
 #define CDT_HELP_CMD     "help"
 #define CDT_MAX_CMD_LEN  10
 
-ssize_t
-mdt_hsm_cdt_control_seq_write(struct file *file, const char __user *buffer,
-                             size_t count, loff_t *off)
+ssize_t hsm_control_store(struct kobject *kobj, struct attribute *attr,
+                         const char *buffer, size_t count)
 {
-       struct seq_file         *m = file->private_data;
-       struct obd_device       *obd = m->private;
-       struct mdt_device       *mdt = mdt_dev(obd->obd_lu_dev);
-       struct coordinator      *cdt = &(mdt->mdt_coordinator);
-       int                      rc, usage = 0;
-       char                     kernbuf[CDT_MAX_CMD_LEN];
-       ENTRY;
-
-       if (count == 0 || count >= sizeof(kernbuf))
-               RETURN(-EINVAL);
-
-       if (copy_from_user(kernbuf, buffer, count))
-               RETURN(-EFAULT);
-       kernbuf[count] = 0;
+       struct obd_device *obd = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+       struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev);
+       struct coordinator *cdt = &(mdt->mdt_coordinator);
+       int usage = 0;
+       int rc = 0;
 
-       if (kernbuf[count - 1] == '\n')
-               kernbuf[count - 1] = 0;
+       if (count == 0 || count >= CDT_MAX_CMD_LEN)
+               return -EINVAL;
 
-       rc = 0;
-       if (strcmp(kernbuf, CDT_ENABLE_CMD) == 0) {
+       if (strncmp(buffer, CDT_ENABLE_CMD, strlen(CDT_ENABLE_CMD)) == 0) {
                if (cdt->cdt_state == CDT_DISABLE) {
                        rc = set_cdt_state(cdt, CDT_RUNNING);
                        mdt_hsm_cdt_event(cdt);
                        wake_up(&cdt->cdt_waitq);
+               } else if (cdt->cdt_state == CDT_RUNNING) {
+                       rc = 0;
                } else {
                        rc = mdt_hsm_cdt_start(mdt);
                }
-       } else if (strcmp(kernbuf, CDT_STOP_CMD) == 0) {
-               if ((cdt->cdt_state == CDT_STOPPING) ||
-                   (cdt->cdt_state == CDT_STOPPED)) {
-                       CERROR("%s: Coordinator already stopped\n",
+       } else if (strncmp(buffer, CDT_STOP_CMD, strlen(CDT_STOP_CMD)) == 0) {
+               if (cdt->cdt_state == CDT_STOPPING) {
+                       CERROR("%s: Coordinator is already stopping\n",
                               mdt_obd_name(mdt));
                        rc = -EALREADY;
+               } else if (cdt->cdt_state == CDT_STOPPED) {
+                       rc = 0;
                } else {
                        rc = mdt_hsm_cdt_stop(mdt);
                }
-       } else if (strcmp(kernbuf, CDT_DISABLE_CMD) == 0) {
+       } else if (strncmp(buffer, CDT_DISABLE_CMD,
+                          strlen(CDT_DISABLE_CMD)) == 0) {
                if ((cdt->cdt_state == CDT_STOPPING) ||
                    (cdt->cdt_state == CDT_STOPPED)) {
-                       CERROR("%s: Coordinator is stopped\n",
-                              mdt_obd_name(mdt));
-                       rc = -EINVAL;
+                       /* exit gracefully if coordinator is being stopped
+                        * or stopped already.
+                        */
+                       rc = 0;
                } else {
                        rc = set_cdt_state(cdt, CDT_DISABLE);
                }
-       } else if (strcmp(kernbuf, CDT_PURGE_CMD) == 0) {
+       } else if (strncmp(buffer, CDT_PURGE_CMD,
+                          strlen(CDT_PURGE_CMD)) == 0) {
                rc = hsm_cancel_all_actions(mdt);
-       } else if (strcmp(kernbuf, CDT_HELP_CMD) == 0) {
+       } else if (strncmp(buffer, CDT_HELP_CMD,
+                          strlen(CDT_HELP_CMD)) == 0) {
                usage = 1;
        } else {
                usage = 1;
@@ -2163,17 +2358,17 @@ mdt_hsm_cdt_control_seq_write(struct file *file, const char __user *buffer,
        RETURN(count);
 }
 
-int mdt_hsm_cdt_control_seq_show(struct seq_file *m, void *data)
+ssize_t hsm_control_show(struct kobject *kobj, struct attribute *attr,
+                        char *buf)
 {
-       struct obd_device       *obd = m->private;
-       struct coordinator      *cdt;
-       ENTRY;
+       struct obd_device *obd = container_of(kobj, struct obd_device,
+                                             obd_kset.kobj);
+       struct coordinator *cdt;
 
        cdt = &(mdt_dev(obd->obd_lu_dev)->mdt_coordinator);
 
-       seq_printf(m, "%s\n", cdt_mdt_state2str(cdt->cdt_state));
-
-       RETURN(0);
+       return scnprintf(buf, PAGE_SIZE, "%s\n",
+                        cdt_mdt_state2str(cdt->cdt_state));
 }
 
 static int
@@ -2321,99 +2516,82 @@ mdt_hsm_other_request_mask_seq_write(struct file *file, const char __user *buf,
                                           &cdt->cdt_other_request_mask);
 }
 
-static int mdt_hsm_cdt_raolu_seq_show(struct seq_file *m, void *data)
+static ssize_t remove_archive_on_last_unlink_show(struct kobject *kobj,
+                                                 struct attribute *attr,
+                                                 char *buf)
 {
-       struct mdt_device *mdt = m->private;
-       struct coordinator *cdt = &mdt->mdt_coordinator;
-       ENTRY;
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
 
-       seq_printf(m, "%d\n", (int)cdt->cdt_remove_archive_on_last_unlink);
-       RETURN(0);
+       return scnprintf(buf, PAGE_SIZE, "%u\n",
+                        cdt->cdt_remove_archive_on_last_unlink);
 }
 
-static ssize_t
-mdt_hsm_cdt_raolu_seq_write(struct file *file, const char __user *buffer,
-                           size_t count, loff_t *off)
-
+static ssize_t remove_archive_on_last_unlink_store(struct kobject *kobj,
+                                                  struct attribute *attr,
+                                                  const char *buffer,
+                                                  size_t count)
 {
-       struct seq_file *m = file->private_data;
-       struct mdt_device *mdt = m->private;
-       struct coordinator *cdt = &mdt->mdt_coordinator;
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
        bool val;
        int rc;
 
-       ENTRY;
-       rc = kstrtobool_from_user(buffer, count, &val);
+       rc = kstrtobool(buffer, &val);
        if (rc < 0)
-               RETURN(rc);
+               return rc;
 
        cdt->cdt_remove_archive_on_last_unlink = val;
-       RETURN(count);
+       return count;
 }
+LUSTRE_RW_ATTR(remove_archive_on_last_unlink);
+
+LDEBUGFS_SEQ_FOPS(mdt_hsm_user_request_mask);
+LDEBUGFS_SEQ_FOPS(mdt_hsm_group_request_mask);
+LDEBUGFS_SEQ_FOPS(mdt_hsm_other_request_mask);
 
-LPROC_SEQ_FOPS(mdt_hsm_cdt_loop_period);
-LPROC_SEQ_FOPS(mdt_hsm_cdt_grace_delay);
-LPROC_SEQ_FOPS(mdt_hsm_cdt_active_req_timeout);
-LPROC_SEQ_FOPS(mdt_hsm_cdt_max_requests);
-LPROC_SEQ_FOPS(mdt_hsm_cdt_default_archive_id);
-LPROC_SEQ_FOPS(mdt_hsm_user_request_mask);
-LPROC_SEQ_FOPS(mdt_hsm_group_request_mask);
-LPROC_SEQ_FOPS(mdt_hsm_other_request_mask);
-LPROC_SEQ_FOPS(mdt_hsm_cdt_raolu);
-
-/* Read-only proc files for request counters */
-static int mdt_hsm_cdt_archive_count_seq_show(struct seq_file *m, void *data)
+/* Read-only sysfs files for request counters */
+static ssize_t archive_count_show(struct kobject *kobj, struct attribute *attr,
+                                 char *buf)
 {
-       struct mdt_device *mdt = m->private;
-       struct coordinator *cdt = &mdt->mdt_coordinator;
-       ENTRY;
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
 
-       seq_printf(m, "%d\n", atomic_read(&cdt->cdt_archive_count));
-       RETURN(0);
+       return scnprintf(buf, PAGE_SIZE, "%d\n",
+                        atomic_read(&cdt->cdt_archive_count));
 }
+LUSTRE_RO_ATTR(archive_count);
 
-static int mdt_hsm_cdt_restore_count_seq_show(struct seq_file *m, void *data)
+static ssize_t restore_count_show(struct kobject *kobj, struct attribute *attr,
+                                 char *buf)
 {
-       struct mdt_device *mdt = m->private;
-       struct coordinator *cdt = &mdt->mdt_coordinator;
-       ENTRY;
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
 
-       seq_printf(m, "%d\n", atomic_read(&cdt->cdt_restore_count));
-       RETURN(0);
+       return scnprintf(buf, PAGE_SIZE, "%d\n",
+                        atomic_read(&cdt->cdt_restore_count));
 }
+LUSTRE_RO_ATTR(restore_count);
 
-static int mdt_hsm_cdt_remove_count_seq_show(struct seq_file *m, void *data)
+static ssize_t remove_count_show(struct kobject *kobj, struct attribute *attr,
+                                char *buf)
 {
-       struct mdt_device *mdt = m->private;
-       struct coordinator *cdt = &mdt->mdt_coordinator;
-       ENTRY;
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
 
-       seq_printf(m, "%d\n", atomic_read(&cdt->cdt_remove_count));
-       RETURN(0);
+       return scnprintf(buf, PAGE_SIZE, "%d\n",
+                        atomic_read(&cdt->cdt_remove_count));
 }
+LUSTRE_RO_ATTR(remove_count);
 
-LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_archive_count);
-LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_restore_count);
-LPROC_SEQ_FOPS_RO(mdt_hsm_cdt_remove_count);
-
-static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = {
+static struct ldebugfs_vars ldebugfs_mdt_hsm_vars[] = {
        { .name =       "agents",
          .fops =       &mdt_hsm_agent_fops                     },
        { .name =       "actions",
          .fops =       &mdt_hsm_actions_fops,
          .proc_mode =  0444                                    },
-       { .name =       "default_archive_id",
-         .fops =       &mdt_hsm_cdt_default_archive_id_fops    },
-       { .name =       "grace_delay",
-         .fops =       &mdt_hsm_cdt_grace_delay_fops           },
-       { .name =       "loop_period",
-         .fops =       &mdt_hsm_cdt_loop_period_fops           },
-       { .name =       "max_requests",
-         .fops =       &mdt_hsm_cdt_max_requests_fops          },
        { .name =       "policy",
          .fops =       &mdt_hsm_policy_fops                    },
-       { .name =       "active_request_timeout",
-         .fops =       &mdt_hsm_cdt_active_req_timeout_fops    },
        { .name =       "active_requests",
          .fops =       &mdt_hsm_active_requests_fops           },
        { .name =       "user_request_mask",
@@ -2422,13 +2600,76 @@ static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = {
          .fops =       &mdt_hsm_group_request_mask_fops,       },
        { .name =       "other_request_mask",
          .fops =       &mdt_hsm_other_request_mask_fops,       },
-       { .name =       "remove_archive_on_last_unlink",
-         .fops =       &mdt_hsm_cdt_raolu_fops,                },
-       { .name =       "archive_count",
-         .fops =       &mdt_hsm_cdt_archive_count_fops,        },
-       { .name =       "restore_count",
-         .fops =       &mdt_hsm_cdt_restore_count_fops,        },
-       { .name =       "remove_count",
-         .fops =       &mdt_hsm_cdt_remove_count_fops,         },
        { 0 }
 };
+
+static struct attribute *hsm_attrs[] = {
+       &lustre_attr_loop_period.attr,
+       &lustre_attr_grace_delay.attr,
+       &lustre_attr_active_request_timeout.attr,
+       &lustre_attr_max_requests.attr,
+       &lustre_attr_default_archive_id.attr,
+       &lustre_attr_remove_archive_on_last_unlink.attr,
+       &lustre_attr_archive_count.attr,
+       &lustre_attr_restore_count.attr,
+       &lustre_attr_remove_count.attr,
+       NULL,
+};
+
+static void hsm_kobj_release(struct kobject *kobj)
+{
+       struct coordinator *cdt = container_of(kobj, struct coordinator,
+                                              cdt_hsm_kobj);
+
+       debugfs_remove_recursive(cdt->cdt_debugfs_dir);
+       cdt->cdt_debugfs_dir = NULL;
+
+       complete(&cdt->cdt_kobj_unregister);
+}
+
+static struct kobj_type hsm_ktype = {
+       .default_attrs  = hsm_attrs,
+       .sysfs_ops      = &lustre_sysfs_ops,
+       .release        = hsm_kobj_release,
+};
+
+/**
+ * create sysfs and debugfs entries for coordinator
+ * \param mdt [IN]
+ * \retval 0 success
+ * \retval -ve failure
+ */
+int hsm_cdt_tunables_init(struct mdt_device *mdt)
+{
+       struct coordinator *cdt = &mdt->mdt_coordinator;
+       struct obd_device *obd = mdt2obd_dev(mdt);
+       int rc;
+
+       init_completion(&cdt->cdt_kobj_unregister);
+       rc = kobject_init_and_add(&cdt->cdt_hsm_kobj, &hsm_ktype,
+                                 &obd->obd_kset.kobj, "%s", "hsm");
+       if (rc) {
+               kobject_put(&cdt->cdt_hsm_kobj);
+               return rc;
+       }
+
+       /* init debugfs entries, failure is not critical */
+       cdt->cdt_debugfs_dir = debugfs_create_dir("hsm",
+                                                 obd->obd_debugfs_entry);
+       ldebugfs_add_vars(cdt->cdt_debugfs_dir, ldebugfs_mdt_hsm_vars, mdt);
+
+       return 0;
+}
+
+/**
+ * remove sysfs and debugfs entries for coordinator
+ *
+ * \param mdt [IN]
+ */
+void hsm_cdt_tunables_fini(struct mdt_device *mdt)
+{
+       struct coordinator *cdt = &mdt->mdt_coordinator;
+
+       kobject_put(&cdt->cdt_hsm_kobj);
+       wait_for_completion(&cdt->cdt_kobj_unregister);
+}