LU-3343 mdt: HSM coordinator main thread

author jcl <jacques-charles.lafoucriere@cea.fr>

Sat, 6 Jul 2013 12:57:08 +0000 (14:57 +0200)

committer Oleg Drokin <oleg.drokin@intel.com>

Wed, 7 Aug 2013 20:20:52 +0000 (20:20 +0000)
author jcl <jacques-charles.lafoucriere@cea.fr>
Sat, 6 Jul 2013 12:57:08 +0000 (14:57 +0200)
committer Oleg Drokin <oleg.drokin@intel.com>
Wed, 7 Aug 2013 20:20:52 +0000 (20:20 +0000)
diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h

index c856390..045ff2b 100644 (file)
--- a/lustre/include/lustre/lustre_user.h
+++ b/lustre/include/lustre/lustre_user.h
@@ -1071,7 +1071,7 @@ struct hsm_action_item {
   * \param len [IN] max buffer len
   * \retval buffer
   */
-static inline char *hai_dump_data_field(struct hsm_action_item *hai,
+static inline char *hai_dump_data_field(const struct hsm_action_item *hai,
                                          char *buffer, int len)
  {
          int i, sz, data_len;
diff --git a/lustre/include/md_object.h b/lustre/include/md_object.h

index 6b83789..304646c 100644 (file)
--- a/lustre/include/md_object.h
+++ b/lustre/include/md_object.h
@@ -849,7 +849,7 @@ struct lu_local_obj_desc {
  
  int lustre_buf2som(void *buf, int rc, struct md_som_data *msd);
  int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh);
-void lustre_hsm2buf(void *buf, struct md_hsm *mh);
+void lustre_hsm2buf(void *buf, const struct md_hsm *mh);
  
  enum {
         UCRED_INVALID   = -1,
diff --git a/lustre/mdt/Makefile.in b/lustre/mdt/Makefile.in

index a6b55e5..2b23a67 100644 (file)
--- a/lustre/mdt/Makefile.in
+++ b/lustre/mdt/Makefile.in
@@ -6,5 +6,6 @@ mdt-objs += mdt_hsm_cdt_actions.o
  mdt-objs += mdt_hsm_cdt_requests.o
  mdt-objs += mdt_hsm_cdt_client.o
  mdt-objs += mdt_hsm_cdt_agent.o
+mdt-objs += mdt_coordinator.o
  
  @INCLUDE_RULES@
diff --git a/lustre/mdt/mdt_coordinator.c b/lustre/mdt/mdt_coordinator.c

new file mode 100644 (file)

index 0000000..a48bc36
--- /dev/null
+++ b/lustre/mdt/mdt_coordinator.c
@@ -0,0 +1,1999 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License version 2 for more details.  A copy is
+ * included in the COPYING file that accompanied this code.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2013, Intel Corporation.
+ * Use is subject to license terms.
+ * Copyright (c) 2011, 2012 Commissariat a l'energie atomique et aux energies
+ *                          alternatives
+ */
+/*
+ * lustre/mdt/mdt_coordinator.c
+ *
+ * Lustre HSM Coordinator
+ *
+ * Author: Jacques-Charles Lafoucriere <jacques-charles.lafoucriere@cea.fr>
+ * Author: Aurelien Degremont <aurelien.degremont@cea.fr>
+ * Author: Thomas Leibovici <thomas.leibovici@cea.fr>
+ */
+
+#define DEBUG_SUBSYSTEM S_MDS
+
+#include <obd_support.h>
+#include <lustre_net.h>
+#include <lustre_export.h>
+#include <obd.h>
+#include <obd_lov.h>
+#include <lprocfs_status.h>
+#include <lustre_log.h>
+#include "mdt_internal.h"
+
+static struct lprocfs_vars lprocfs_mdt_hsm_vars[];
+
+/**
+ * look up an object by fid and fetch its HSM attributes
+ * on success the caller owns the returned object reference
+ * \param mti [IN] context, mti->mti_attr is used as scratch area
+ * \param fid [IN] object fid
+ * \param hsm [OUT] HSM meta data, zeroed if the object has none
+ * \retval obj on success
+ * \retval ERR_PTR(-ve) on failure (-ENOENT if the object does not exist)
+ */
+struct mdt_object *mdt_hsm_get_md_hsm(struct mdt_thread_info *mti,
+                                     const struct lu_fid *fid,
+                                     struct md_hsm *hsm)
+{
+       struct md_attr          *attr = &mti->mti_attr;
+       struct mdt_object       *mo;
+       int                      err;
+       ENTRY;
+
+       /* only HSM attributes are requested */
+       attr->ma_need = MA_HSM;
+       attr->ma_valid = 0;
+
+       mo = mdt_object_find(mti->mti_env, mti->mti_mdt, fid);
+       if (IS_ERR(mo))
+               RETURN(mo);
+
+       /* a missing object and a failed attribute read share the same
+        * release path */
+       if (mdt_object_exists(mo))
+               err = mdt_attr_get_complex(mti, mo, attr);
+       else
+               err = -ENOENT;
+
+       if (err != 0) {
+               mdt_object_put(mti->mti_env, mo);
+               RETURN(ERR_PTR(err));
+       }
+
+       if ((attr->ma_valid & MA_HSM) == 0)
+               memset(hsm, 0, sizeof(*hsm));
+       else
+               *hsm = attr->ma_hsm;
+
+       /* leave the shared attribute area clean */
+       attr->ma_valid = 0;
+       RETURN(mo);
+}
+
+/**
+ * dump an HSM action list (header, then every item) through CDEBUG()
+ * \param level [IN] debug mask handed to CDEBUG()
+ * \param prefix [IN] text printed at the start of every line
+ * \param hal [IN] action list to dump
+ */
+void mdt_hsm_dump_hal(int level, const char *prefix,
+                     struct hsm_action_list *hal)
+{
+       struct hsm_action_item  *item;
+       char                     data_str[12];
+       int                      idx, data_len;
+
+       CDEBUG(level, "%s: HAL header: version %X count %d compound "LPX64
+                     " archive_id %d flags "LPX64"\n",
+              prefix, hal->hal_version, hal->hal_count,
+              hal->hal_compound_id, hal->hal_archive_id, hal->hal_flags);
+
+       for (idx = 0, item = hai_zero(hal); idx < hal->hal_count;
+            idx++, item = hai_next(item)) {
+               /* what remains once the item structure itself is removed */
+               data_len = item->hai_len - sizeof(*item);
+               CDEBUG(level, "%s %d: fid="DFID" dfid="DFID
+                      " compound/cookie="LPX64"/"LPX64
+                      " action=%s extent="LPX64"-"LPX64" gid="LPX64
+                      " datalen=%d data=[%s]\n",
+                      prefix, idx,
+                      PFID(&item->hai_fid), PFID(&item->hai_dfid),
+                      hal->hal_compound_id, item->hai_cookie,
+                      hsm_copytool_action2name(item->hai_action),
+                      item->hai_extent.offset,
+                      item->hai_extent.length,
+                      item->hai_gid, data_len,
+                      hai_dump_data_field(item, data_str,
+                                          sizeof(data_str)));
+       }
+}
+
+/**
+ * data passed to llog_cat_process() callback
+ * to scan requests and take actions
+ *
+ * the coordinator thread owns and (re)initializes this structure,
+ * the callback fills request[] and cookies[] during a llog scan
+ */
+struct hsm_scan_data {
+       /* thread context, the callback reaches the mdt device through it */
+       struct mdt_thread_info          *mti;
+       /* file system name, copied into hal_fsname of every list built */
+       char                             fs_name[MTI_NAME_MAXLEN+1];
+       /* requests to be sent to agents */
+       int                              request_sz;    /** allocated size of request[], in bytes */
+       int                              max_request;   /** vector size, in entries */
+       int                              request_cnt;   /** used count */
+       struct {
+               /* allocated size of hal, in bytes */
+               int                      hal_sz;
+               /* bytes of hal currently in use */
+               int                      hal_used_sz;
+               /* action list being built, NULL when the slot is free */
+               struct hsm_action_list  *hal;
+       } *request;
+       /* records to be canceled */
+       int                              max_cookie;    /** vector size, in entries */
+       int                              cookie_cnt;    /** used count */
+       __u64                           *cookies;
+};
+
+/**
+ *  llog_cat_process() callback, used to:
+ *  - find waiting requests and gather them, one action list per
+ *    compound id, in hsd->request[] so the caller can start them
+ *  - find started requests which timed out, complete them with an error
+ *    and store their cookie in hsd->cookies[] so the caller can cancel
+ *    the llog records
+ *  - purge failed, canceled and done requests older than cdt_delay
+ * \param env [IN] environment
+ * \param llh [IN] llog handle
+ * \param hdr [IN] llog record
+ * \param data [IN/OUT] cb data = struct hsm_scan_data
+ * \retval 0 success
+ * \retval LLOG_DEL_RECORD the record is in a final state and too old
+ * \retval -ve failure
+ */
+static int mdt_coordinator_cb(const struct lu_env *env,
+                             struct llog_handle *llh,
+                             struct llog_rec_hdr *hdr,
+                             void *data)
+{
+       const struct llog_agent_req_rec *larr;
+       struct hsm_scan_data            *hsd;
+       struct hsm_action_item          *hai;
+       struct mdt_device               *mdt;
+       struct coordinator              *cdt;
+       int                              rc;
+       ENTRY;
+
+       hsd = data;
+       mdt = hsd->mti->mti_mdt;
+       cdt = &mdt->mdt_coordinator;
+
+       larr = (struct llog_agent_req_rec *)hdr;
+       dump_llog_agent_req_rec("mdt_coordinator_cb(): ", larr);
+       switch (larr->arr_status) {
+       case ARS_WAITING: {
+               int i, empty_slot, found;
+
+               /* Are agents full? */
+               if (atomic_read(&cdt->cdt_request_count) ==
+                   cdt->cdt_max_request)
+                       break;
+
+               /* first search if the request is known in the list we have
+                * built and if there is room in the request vector */
+               empty_slot = -1;
+               found = -1;
+               for (i = 0; i < hsd->max_request &&
+                           (empty_slot == -1 || found == -1); i++) {
+                       if (hsd->request[i].hal == NULL) {
+                               empty_slot = i;
+                               continue;
+                       }
+                       if (hsd->request[i].hal->hal_compound_id ==
+                               larr->arr_compound_id) {
+                               found = i;
+                               continue;
+                       }
+               }
+               if ((found == -1) && (empty_slot == -1))
+                       /* unknown request and no more room for a new
+                        * request, continue the scan to find other entries
+                        * for already found requests
+                        */
+                       RETURN(0);
+
+               if (found == -1) {
+                       struct hsm_action_list *hal;
+
+                       /* request is not already known */
+                       /* allocate the hai vector, the size just needs to
+                        * be large enough */
+                       hsd->request[empty_slot].hal_sz =
+                                    sizeof(*hsd->request[empty_slot].hal) +
+                                    cfs_size_round(MTI_NAME_MAXLEN+1) +
+                                    2 * cfs_size_round(larr->arr_hai.hai_len);
+                       OBD_ALLOC(hal, hsd->request[empty_slot].hal_sz);
+                       if (!hal) {
+                               /* report the slot we tried to fill; the
+                                * search index i may be past the end of
+                                * the vector at this point */
+                               CERROR("%s: Cannot allocate memory (%d o)"
+                                      "for compound "LPX64"\n",
+                                      mdt_obd_name(mdt),
+                                      hsd->request[empty_slot].hal_sz,
+                                      larr->arr_compound_id);
+                               RETURN(-ENOMEM);
+                       }
+                       hal->hal_version = HAL_VERSION;
+                       strncpy(hal->hal_fsname, hsd->fs_name,
+                               MTI_NAME_MAXLEN);
+                       hal->hal_fsname[MTI_NAME_MAXLEN] = '\0';
+                       hal->hal_compound_id = larr->arr_compound_id;
+                       hal->hal_archive_id = larr->arr_archive_id;
+                       hal->hal_flags = larr->arr_flags;
+                       hal->hal_count = 0;
+                       hsd->request[empty_slot].hal_used_sz = hal_size(hal);
+                       hsd->request[empty_slot].hal = hal;
+                       hsd->request_cnt++;
+                       found = empty_slot;
+                       hai = hai_zero(hal);
+               } else {
+                       int need;
+
+                       /* request is known */
+                       /* we check if record archive num is the same as the
+                        * known request, if not we will serve it in multiple
+                        * time because we do not know if the agent can serve
+                        * multiple backend
+                        * a use case is a compound made of multiple restore
+                        * where the files are not archived in the same backend
+                        */
+                       if (larr->arr_archive_id !=
+                           hsd->request[found].hal->hal_archive_id)
+                               RETURN(0);
+
+                       /* bytes required once this item is appended */
+                       need = hsd->request[found].hal_used_sz +
+                              cfs_size_round(larr->arr_hai.hai_len);
+                       if (hsd->request[found].hal_sz < need) {
+                               /* Not enough room, need an extension */
+                               void *hal_buffer;
+                               int sz;
+
+                               /* double the buffer, but never allocate
+                                * less than what the new item requires:
+                                * hai_len is variable, so doubling alone
+                                * may not be enough */
+                               sz = 2 * hsd->request[found].hal_sz;
+                               if (sz < need)
+                                       sz = need;
+                               OBD_ALLOC(hal_buffer, sz);
+                               if (!hal_buffer) {
+                                       CERROR("%s: Cannot allocate memory "
+                                              "(%d o) for compound "LPX64"\n",
+                                              mdt_obd_name(mdt), sz,
+                                              larr->arr_compound_id);
+                                       RETURN(-ENOMEM);
+                               }
+                               memcpy(hal_buffer, hsd->request[found].hal,
+                                      hsd->request[found].hal_used_sz);
+                               OBD_FREE(hsd->request[found].hal,
+                                        hsd->request[found].hal_sz);
+                               hsd->request[found].hal = hal_buffer;
+                               hsd->request[found].hal_sz = sz;
+                       }
+                       /* walk to the first free item of the list */
+                       hai = hai_zero(hsd->request[found].hal);
+                       for (i = 0; i < hsd->request[found].hal->hal_count;
+                            i++)
+                               hai = hai_next(hai);
+               }
+               memcpy(hai, &larr->arr_hai, larr->arr_hai.hai_len);
+               hai->hai_cookie = larr->arr_hai.hai_cookie;
+               hai->hai_gid = larr->arr_hai.hai_gid;
+
+               hsd->request[found].hal_used_sz +=
+                                                  cfs_size_round(hai->hai_len);
+               hsd->request[found].hal->hal_count++;
+               break;
+       }
+       case ARS_STARTED: {
+               struct cdt_agent_req *car;
+               cfs_time_t last;
+
+               /* we search for a running request
+                * error may happen if coordinator crashes or stopped
+                * with running request
+                */
+               car = mdt_cdt_find_request(cdt, larr->arr_hai.hai_cookie, NULL);
+               if (car == NULL) {
+                       last = larr->arr_req_create;
+               } else {
+                       last = car->car_req_update;
+                       mdt_cdt_put_request(car);
+               }
+
+               /* test if request too long, if yes cancel it
+                * the same way the copy tool acknowledge a cancel request */
+               if ((last + cdt->cdt_timeout) < cfs_time_current_sec()) {
+                       struct hsm_progress_kernel pgs;
+
+                       dump_llog_agent_req_rec("mdt_coordinator_cb(): "
+                                               "request timeouted, start "
+                                               "cleaning", larr);
+                       /* a too old cancel request just needs to be removed
+                        * this can happen, if copy tool does not support cancel
+                        * for other requests, we have to remove the running
+                        * request and notify the copytool
+                        */
+                       pgs.hpk_fid = larr->arr_hai.hai_fid;
+                       pgs.hpk_cookie = larr->arr_hai.hai_cookie;
+                       pgs.hpk_extent = larr->arr_hai.hai_extent;
+                       pgs.hpk_flags = HP_FLAG_COMPLETED;
+                       pgs.hpk_errval = ENOSYS;
+                       pgs.hpk_data_version = 0;
+                       /* update request state, but do not record in llog, to
+                        * avoid deadlock on cdt_llog_lock
+                        */
+                       rc = mdt_hsm_update_request_state(hsd->mti, &pgs, 0);
+                       if (rc)
+                               CERROR("%s: Cannot cleanup timeouted request: "
+                                      DFID" for cookie "LPX64" action=%s\n",
+                                      mdt_obd_name(mdt),
+                                      PFID(&pgs.hpk_fid), pgs.hpk_cookie,
+                                      hsm_copytool_action2name(
+                                                    larr->arr_hai.hai_action));
+
+                       /* add the cookie to the list of record to be
+                        * canceled by caller
+                        * cookies[] holds max_cookie entries, so it has to
+                        * grow as soon as cookie_cnt reaches max_cookie,
+                        * i.e. before the store below */
+                       if (hsd->cookie_cnt >= hsd->max_cookie) {
+                               __u64 *ptr, *old_ptr;
+                               int old_sz, new_sz, new_cnt;
+
+                               /* need to increase vector size */
+                               old_sz = sizeof(__u64) * hsd->max_cookie;
+                               old_ptr = hsd->cookies;
+
+                               new_cnt = 2 * hsd->max_cookie;
+                               new_sz = sizeof(__u64) * new_cnt;
+
+                               OBD_ALLOC(ptr, new_sz);
+                               if (!ptr) {
+                                       CERROR("%s: Cannot allocate memory "
+                                              "(%d o) for cookie vector\n",
+                                              mdt_obd_name(mdt), new_sz);
+                                       RETURN(-ENOMEM);
+                               }
+                               memcpy(ptr, hsd->cookies, old_sz);
+                               hsd->cookies = ptr;
+                               hsd->max_cookie = new_cnt;
+                               OBD_FREE(old_ptr, old_sz);
+                       }
+                       hsd->cookies[hsd->cookie_cnt] =
+                                                      larr->arr_hai.hai_cookie;
+                       hsd->cookie_cnt++;
+               }
+               break;
+       }
+       case ARS_FAILED:
+       case ARS_CANCELED:
+       case ARS_SUCCEED:
+               /* final states: the record is dropped once it has been
+                * kept for cdt_delay */
+               if ((larr->arr_req_change + cdt->cdt_delay) <
+                   cfs_time_current_sec())
+                       RETURN(LLOG_DEL_RECORD);
+               break;
+       }
+       RETURN(0);
+}
+
+/**
+ * register the "hsm" /proc directory of the coordinator
+ * on failure cdt_proc_dir is reset to NULL
+ * \param mdt [IN] device owning the coordinator
+ * \retval 0 success
+ * \retval -ve failure
+ */
+static int hsm_cdt_procfs_init(struct mdt_device *mdt)
+{
+       struct coordinator      *coord = &mdt->mdt_coordinator;
+       int                      err;
+       ENTRY;
+
+       /* init /proc entries, failure is not critical */
+       coord->cdt_proc_dir =
+               lprocfs_register("hsm", mdt2obd_dev(mdt)->obd_proc_entry,
+                                lprocfs_mdt_hsm_vars, mdt);
+       if (!IS_ERR(coord->cdt_proc_dir))
+               RETURN(0);
+
+       err = PTR_ERR(coord->cdt_proc_dir);
+       CERROR("%s: Cannot create 'hsm' directory in mdt proc dir,"
+              " rc=%d\n", mdt_obd_name(mdt), err);
+       /* do not keep an error pointer around */
+       coord->cdt_proc_dir = NULL;
+       RETURN(err);
+}
+
+/**
+ * coordinator thread
+ *
+ * marks itself SVC_RUNNING, then loops: wait for cdt_loop_period seconds
+ * or for a SVC_STOPPING/SVC_EVENT flag, scan the llog with
+ * mdt_coordinator_cb(), set the timed out records to ARS_CANCELED and
+ * send every gathered action list to the agents
+ * \param data [IN] thread context (struct mdt_thread_info)
+ * \retval 0 success
+ * \retval -ve failure
+ */
+static int mdt_coordinator(void *data)
+{
+       struct mdt_thread_info  *mti = data;
+       struct mdt_device       *mdt = mti->mti_mdt;
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       struct hsm_scan_data     hsd = { 0 };
+       int                      rc = 0;
+       ENTRY;
+
+       /* tell whoever waits on t_ctl_waitq that the thread is running */
+       cdt->cdt_thread.t_flags = SVC_RUNNING;
+       cfs_waitq_signal(&cdt->cdt_thread.t_ctl_waitq);
+
+       CDEBUG(D_HSM, "%s: coordinator thread starting, pid=%d\n",
+              mdt_obd_name(mdt), cfs_curproc_pid());
+
+       /*
+        * create /proc entries for coordinator
+        * the return code is ignored, failure is not critical
+        */
+       hsm_cdt_procfs_init(mdt);
+       /* timeouted cookie vector initialization */
+       hsd.max_cookie = 0;
+       hsd.cookie_cnt = 0;
+       hsd.cookies = NULL;
+       /* we use a copy of cdt_max_request in the cb, so if cdt_max_request
+        * increases due to a change from /proc we do not overflow the
+        * hsd.request[] vector
+        */
+       hsd.max_request = cdt->cdt_max_request;
+       hsd.request_sz = hsd.max_request * sizeof(*hsd.request);
+       OBD_ALLOC(hsd.request, hsd.request_sz);
+       if (!hsd.request)
+               GOTO(out, rc = -ENOMEM);
+
+       hsd.mti = mti;
+       obd_uuid2fsname(hsd.fs_name, mdt_obd_name(mdt), MTI_NAME_MAXLEN);
+
+       while (1) {
+               struct l_wait_info lwi;
+               int i;
+
+               /* sleep until the loop period expires or a flag is raised */
+               lwi = LWI_TIMEOUT(cfs_time_seconds(cdt->cdt_loop_period),
+                                 NULL, NULL);
+               l_wait_event(cdt->cdt_thread.t_ctl_waitq,
+                            (cdt->cdt_thread.t_flags &
+                             (SVC_STOPPING|SVC_EVENT)),
+                            &lwi);
+
+               CDEBUG(D_HSM, "coordinator resumes\n");
+
+               /* stop requested, either by thread flag or by state */
+               if ((cdt->cdt_thread.t_flags & SVC_STOPPING) ||
+                   (cdt->cdt_state == CDT_STOPPING)) {
+                       cdt->cdt_thread.t_flags &= ~SVC_STOPPING;
+                       rc = 0;
+                       break;
+               }
+
+               /* wake up before timeout, new work arrives */
+               if (cdt->cdt_thread.t_flags & SVC_EVENT)
+                       cdt->cdt_thread.t_flags &= ~SVC_EVENT;
+
+               /* if coordinator is suspended continue to wait */
+               if (cdt->cdt_state == CDT_DISABLE) {
+                       CDEBUG(D_HSM, "disable state, coordinator sleeps\n");
+                       continue;
+               }
+
+               CDEBUG(D_HSM, "coordinator starts reading llog\n");
+
+               if (hsd.max_request != cdt->cdt_max_request) {
+                       /* cdt_max_request has changed,
+                        * we need to allocate a new buffer
+                        */
+                       OBD_FREE(hsd.request, hsd.request_sz);
+                       hsd.max_request = cdt->cdt_max_request;
+                       hsd.request_sz =
+                                  hsd.max_request * sizeof(*hsd.request);
+                       OBD_ALLOC(hsd.request, hsd.request_sz);
+                       if (!hsd.request) {
+                               rc = -ENOMEM;
+                               break;
+                       }
+               }
+
+               /* create canceled cookie vector for an arbitrary size
+                * if needed, vector will grow during llog scan
+                */
+               hsd.max_cookie = 10;
+               hsd.cookie_cnt = 0;
+               OBD_ALLOC(hsd.cookies, hsd.max_cookie * sizeof(__u64));
+               if (!hsd.cookies) {
+                       rc = -ENOMEM;
+                       goto clean_cb_alloc;
+               }
+               hsd.request_cnt = 0;
+
+               /* scan the llog, the callback fills hsd.request[] and
+                * hsd.cookies[] */
+               rc = cdt_llog_process(mti->mti_env, mdt,
+                                     mdt_coordinator_cb, &hsd);
+               if (rc < 0)
+                       goto clean_cb_alloc;
+
+               CDEBUG(D_HSM, "Found %d requests to send and %d"
+                             " requests to cancel\n",
+                      hsd.request_cnt, hsd.cookie_cnt);
+               /* first we cancel llog records of the timeouted requests */
+               if (hsd.cookie_cnt > 0) {
+                       rc = mdt_agent_record_update(mti->mti_env, mdt,
+                                                    hsd.cookies,
+                                                    hsd.cookie_cnt,
+                                                    ARS_CANCELED);
+                       if (rc)
+                               CERROR("%s: mdt_agent_record_update() failed, "
+                                      "rc=%d, cannot update status to %s "
+                                      "for %d cookies\n",
+                                      mdt_obd_name(mdt), rc,
+                                      agent_req_status2name(ARS_CANCELED),
+                                      hsd.cookie_cnt);
+               }
+
+               if (list_empty(&cdt->cdt_agents)) {
+                       CDEBUG(D_HSM, "no agent available, "
+                                     "coordinator sleeps\n");
+                       goto clean_cb_alloc;
+               }
+
+               /* here hsd contains a list of requests to be started */
+               for (i = 0; i < hsd.max_request; i++) {
+                       struct hsm_action_list  *hal;
+                       struct hsm_action_item  *hai;
+                       __u64                   *cookies;
+                       int                      sz, j;
+                       enum agent_req_status    status;
+
+                       /* still room for work ? */
+                       if (atomic_read(&cdt->cdt_request_count) ==
+                           cdt->cdt_max_request)
+                               break;
+
+                       /* free slot, nothing was gathered here */
+                       if (hsd.request[i].hal == NULL)
+                               continue;
+
+                       /* found a request, we start it */
+                       /* kuc payload allocation so we avoid an additional
+                        * allocation in mdt_hsm_agent_send()
+                        */
+                       hal = kuc_alloc(hsd.request[i].hal_used_sz,
+                                       KUC_TRANSPORT_HSM, HMT_ACTION_LIST);
+                       if (IS_ERR(hal)) {
+                               CERROR("%s: Cannot allocate memory (%d o) "
+                                      "for compound "LPX64"\n",
+                                      mdt_obd_name(mdt),
+                                      hsd.request[i].hal_used_sz,
+                                      hsd.request[i].hal->hal_compound_id);
+                               continue;
+                       }
+                       memcpy(hal, hsd.request[i].hal,
+                              hsd.request[i].hal_used_sz);
+
+                       rc = mdt_hsm_agent_send(mti, hal, 0);
+                       /* if failure, we suppose it is temporary
+                        * if the copy tool failed to do the request
+                        * it has to use hsm_progress
+                        */
+                       status = (rc ? ARS_WAITING : ARS_STARTED);
+
+                       /* set up cookie vector to set records status
+                        * after copy tools start or failed
+                        */
+                       sz = hsd.request[i].hal->hal_count * sizeof(__u64);
+                       OBD_ALLOC(cookies, sz);
+                       if (cookies == NULL) {
+                               CERROR("%s: Cannot allocate memory (%d o) "
+                                      "for cookies vector "LPX64"\n",
+                                      mdt_obd_name(mdt), sz,
+                                      hsd.request[i].hal->hal_compound_id);
+                               kuc_free(hal, hsd.request[i].hal_used_sz);
+                               continue;
+                       }
+                       /* collect the cookie of every item of the list */
+                       hai = hai_zero(hal);
+                       for (j = 0; j < hsd.request[i].hal->hal_count; j++) {
+                               cookies[j] = hai->hai_cookie;
+                               hai = hai_next(hai);
+                       }
+
+                       rc = mdt_agent_record_update(mti->mti_env, mdt, cookies,
+                                               hsd.request[i].hal->hal_count,
+                                               status);
+                       if (rc)
+                               CERROR("%s: mdt_agent_record_update() failed, "
+                                      "rc=%d, cannot update status to %s "
+                                      "for %d cookies\n",
+                                      mdt_obd_name(mdt), rc,
+                                      agent_req_status2name(status),
+                                      hsd.request[i].hal->hal_count);
+
+                       OBD_FREE(cookies, sz);
+                       kuc_free(hal, hsd.request[i].hal_used_sz);
+               }
+clean_cb_alloc:
+               /* end of one scan: everything allocated for this pass is
+                * released before the thread goes back to sleep */
+               /* free cookie vector allocated for/by callback */
+               if (hsd.cookies) {
+                       OBD_FREE(hsd.cookies, hsd.max_cookie * sizeof(__u64));
+                       hsd.max_cookie = 0;
+                       hsd.cookie_cnt = 0;
+                       hsd.cookies = NULL;
+               }
+
+               /* free hal allocated by callback */
+               for (i = 0; i < hsd.max_request; i++) {
+                       if (hsd.request[i].hal) {
+                               OBD_FREE(hsd.request[i].hal,
+                                        hsd.request[i].hal_sz);
+                               hsd.request[i].hal_sz = 0;
+                               hsd.request[i].hal = NULL;
+                               hsd.request_cnt--;
+                       }
+               }
+               LASSERT(hsd.request_cnt == 0);
+
+               /* reset callback data */
+               memset(hsd.request, 0, hsd.request_sz);
+       }
+       EXIT;
+out:
+       /* thread exit path, also reached when the first allocation fails */
+       if (hsd.request)
+               OBD_FREE(hsd.request, hsd.request_sz);
+
+       if (hsd.cookies)
+               OBD_FREE(hsd.cookies, hsd.max_cookie * sizeof(__u64));
+
+       if (cdt->cdt_state == CDT_STOPPING) {
+               /* request comes from /proc path, so we need to clean cdt
+                * struct */
+                mdt_hsm_cdt_stop(mdt);
+                mdt->mdt_opts.mo_coordinator = 0;
+       } else {
+               /* request comes from a thread event, generated
+                * by mdt_stop_coordinator(), we have to ack
+                * and cdt cleaning will be done by event sender
+                */
+               cdt->cdt_thread.t_flags = SVC_STOPPED;
+               cfs_waitq_signal(&cdt->cdt_thread.t_ctl_waitq);
+       }
+
+       if (rc != 0)
+               CERROR("%s: coordinator thread exiting, process=%d, rc=%d\n",
+                      mdt_obd_name(mdt), cfs_curproc_pid(), rc);
+       else
+               CDEBUG(D_HSM, "%s: coordinator thread exiting, process=%d,"
+                             " no error\n",
+                      mdt_obd_name(mdt), cfs_curproc_pid());
+
+       return rc;
+}
+
+/**
+ * search the coordinator restore handle list for a given FID
+ * caller needs to hold cdt_restore_lock
+ * \param cdt [IN] coordinator
+ * \param fid [IN] FID
+ * \retval cdt_restore_handle found
+ * \retval NULL not found
+ */
+static struct cdt_restore_handle *hsm_restore_hdl_find(struct coordinator *cdt,
+                                                      const struct lu_fid *fid)
+{
+       struct cdt_restore_handle       *hdl;
+       struct cdt_restore_handle       *match = NULL;
+       ENTRY;
+
+       list_for_each_entry(hdl, &cdt->cdt_restore_hdl, crh_list) {
+               if (!lu_fid_eq(&hdl->crh_fid, fid))
+                       continue;
+               /* first handle with this FID wins */
+               match = hdl;
+               break;
+       }
+       RETURN(match);
+}
+
+/**
+ * data passed to llog_cat_process() callback
+ * to scan requests and take actions
+ */
+struct hsm_restore_data {
+       /* thread context, hsm_restore_cb() uses it to reach the
+        * coordinator and to look up and lock objects */
+       struct mdt_thread_info  *hrd_mti;
+};
+
+/**
+ *  llog_cat_process() callback, used to:
+ *  - find restore requests which are not in a final state
+ *  - allocate a restore handle for each of them, take the layout lock
+ *    and link the handle on cdt_restore_hdl
+ *  records for other actions, or in a final state, are skipped
+ * \param env [IN] environment
+ * \param llh [IN] llog handle
+ * \param hdr [IN] llog record
+ * \param data [IN/OUT] cb data = struct hsm_restore_data
+ * \retval 0 success (record handled or skipped)
+ * \retval -ve failure
+ */
+static int hsm_restore_cb(const struct lu_env *env,
+                         struct llog_handle *llh,
+                         struct llog_rec_hdr *hdr, void *data)
+{
+       struct llog_agent_req_rec       *larr;
+       struct hsm_restore_data         *hrd;
+       struct cdt_restore_handle       *crh;
+       struct hsm_action_item          *hai;
+       struct mdt_thread_info          *mti;
+       struct coordinator              *cdt;
+       struct mdt_object               *child;
+       int rc;
+       ENTRY;
+
+       hrd = data;
+       mti = hrd->hrd_mti;
+       cdt = &mti->mti_mdt->mdt_coordinator;
+
+       larr = (struct llog_agent_req_rec *)hdr;
+       hai = &larr->arr_hai;
+       if ((hai->hai_action != HSMA_RESTORE) ||
+            agent_req_in_final_state(larr->arr_status))
+               RETURN(0);
+
+       /* restore request not in a final state */
+
+       OBD_SLAB_ALLOC_PTR(crh, mdt_hsm_cdt_kmem);
+       if (crh == NULL)
+               RETURN(-ENOMEM);
+
+       crh->crh_fid = hai->hai_fid;
+       /* in V1 the whole file is restored, so the extent of the request
+        * (hai->hai_extent) is not used */
+       crh->crh_extent.start = 0;
+       crh->crh_extent.end = OBD_OBJECT_EOF;
+       /* get the layout lock */
+       mdt_lock_reg_init(&crh->crh_lh, LCK_EX);
+       child = mdt_object_find_lock(mti, &crh->crh_fid, &crh->crh_lh,
+                                    MDS_INODELOCK_LAYOUT);
+       if (IS_ERR(child))
+               GOTO(out_free, rc = PTR_ERR(child));
+
+       /* we choose to not keep a reference
+        * on the object during the restore time which can be very long */
+       mdt_object_put(mti->mti_env, child);
+
+       mutex_lock(&cdt->cdt_restore_lock);
+       list_add_tail(&crh->crh_list, &cdt->cdt_restore_hdl);
+       mutex_unlock(&cdt->cdt_restore_lock);
+
+       RETURN(0);
+
+out_free:
+       /* the handle was never linked on cdt_restore_hdl, nobody else
+        * can free it */
+       OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+       RETURN(rc);
+}
+
+/**
+ * rebuild the list of restore handles when the coordinator starts
+ * the llog is scanned with hsm_restore_cb() which takes a layout lock
+ * for each registered restore request
+ * \param mti [IN] context
+ * \retval value returned by cdt_llog_process()
+ */
+static int mdt_hsm_pending_restore(struct mdt_thread_info *mti)
+{
+       struct hsm_restore_data  cb_data = { .hrd_mti = mti };
+       ENTRY;
+
+       RETURN(cdt_llog_process(mti->mti_env, mti->mti_mdt,
+                               hsm_restore_cb, &cb_data));
+}
+
+/**
+ * fill a lu_ucred with the root like identity used for HSM operations
+ * every uid/gid field is 0, both supplementary groups are -1,
+ * no capability is set
+ * \param uc [OUT] credentials to initialize
+ * \retval 0 always
+ */
+static int hsm_init_ucred(struct lu_ucred *uc)
+{
+       ENTRY;
+
+       /* no identity nor group info attached */
+       uc->uc_identity = NULL;
+       uc->uc_ginfo = NULL;
+
+       /* uid/gid and fsuid/fsgid */
+       uc->uc_uid = 0;
+       uc->uc_gid = 0;
+       uc->uc_fsuid = 0;
+       uc->uc_fsgid = 0;
+
+       /* uc_o_* counterparts */
+       uc->uc_o_uid = 0;
+       uc->uc_o_gid = 0;
+       uc->uc_o_fsuid = 0;
+       uc->uc_o_fsgid = 0;
+
+       /* no supplementary group */
+       uc->uc_suppgids[0] = -1;
+       uc->uc_suppgids[1] = -1;
+
+       uc->uc_cap = 0;
+       uc->uc_umask = 0777;
+       uc->uc_valid = UCRED_OLD;
+
+       RETURN(0);
+}
+
+/**
+ * wake up coordinator thread
+ * t_flags of the coordinator thread is overwritten with SVC_EVENT and
+ * the thread control wait queue is signaled
+ * \param mdt [IN] device
+ * \retval 0 success
+ * \retval -ESRCH coordinator state is CDT_STOPPED, nothing is signaled
+ */
+int mdt_hsm_cdt_wakeup(struct mdt_device *mdt)
+{
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       ENTRY;
+
+       if (cdt->cdt_state == CDT_STOPPED)
+               RETURN(-ESRCH);
+
+       /* wake up coordinator: the event flag is set before the signal */
+       cdt->cdt_thread.t_flags = SVC_EVENT;
+       cfs_waitq_signal(&cdt->cdt_thread.t_ctl_waitq);
+
+       RETURN(0);
+}
+
+/**
+ * set up the coordinator embedded in an MDT device
+ * the state is set to CDT_STOPPED, locks, wait queue and lists are
+ * initialized, then the coordinator environment and its session
+ * context are created and given a root like ucred
+ * \param mdt [IN] device
+ * \retval 0 success
+ * \retval -ve failure of lu_env_init() or lu_context_init()
+ */
+int mdt_hsm_cdt_init(struct mdt_device *mdt)
+{
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       struct mdt_thread_info  *info;
+       int                      rc;
+       ENTRY;
+
+       cdt->cdt_state = CDT_STOPPED;
+
+       /* wait queue and locks */
+       cfs_waitq_init(&cdt->cdt_thread.t_ctl_waitq);
+       mutex_init(&cdt->cdt_llog_lock);
+       init_rwsem(&cdt->cdt_agent_lock);
+       init_rwsem(&cdt->cdt_request_lock);
+       mutex_init(&cdt->cdt_restore_lock);
+
+       /* in-memory lists, all empty at this point */
+       CFS_INIT_LIST_HEAD(&cdt->cdt_requests);
+       CFS_INIT_LIST_HEAD(&cdt->cdt_agents);
+       CFS_INIT_LIST_HEAD(&cdt->cdt_restore_hdl);
+
+       rc = lu_env_init(&cdt->cdt_env, LCT_MD_THREAD);
+       if (rc < 0)
+               RETURN(rc);
+
+       /* for mdt_ucred(), lu_ucred stored in lu_ucred_key */
+       rc = lu_context_init(&cdt->cdt_session, LCT_SESSION);
+       if (rc != 0) {
+               /* the environment is of no use without its session */
+               lu_env_fini(&cdt->cdt_env);
+               RETURN(rc);
+       }
+       lu_context_enter(&cdt->cdt_session);
+       cdt->cdt_env.le_ses = &cdt->cdt_session;
+
+       info = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
+       LASSERT(info != NULL);
+
+       info->mti_env = &cdt->cdt_env;
+       info->mti_mdt = mdt;
+
+       hsm_init_ucred(mdt_ucred(info));
+
+       RETURN(0);
+}
+
+/**
+ * free a coordinator thread
+ * the session context attached to cdt_env is exited and finalized,
+ * then cdt_env itself is finalized
+ * \param mdt [IN] device
+ * \retval 0 always
+ */
+int  mdt_hsm_cdt_fini(struct mdt_device *mdt)
+{
+       struct coordinator *cdt = &mdt->mdt_coordinator;
+       ENTRY;
+
+       lu_context_exit(cdt->cdt_env.le_ses);
+       lu_context_fini(cdt->cdt_env.le_ses);
+
+       lu_env_fini(&cdt->cdt_env);
+
+       RETURN(0);
+}
+
+/**
+ * start a coordinator thread
+ * default tunables are set, the layout locks of the registered restore
+ * requests are taken again, then the coordinator thread is created
+ * and waited for until it flags itself SVC_RUNNING
+ * a failure to take the restore layout locks is only logged,
+ * it does not prevent the start
+ * \param mdt [IN] device
+ * \retval 0 success
+ * \retval -EALREADY coordinator state is not CDT_STOPPED
+ * \retval -ve failure to create the coordinator thread
+ */
+int mdt_hsm_cdt_start(struct mdt_device *mdt)
+{
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       int                      rc;
+       void                    *ptr;
+       struct mdt_thread_info  *cdt_mti;
+       cfs_task_t              *task;
+       ENTRY;
+
+       /* functions defined but not yet used
+        * this avoid compilation warning
+        */
+       ptr = dump_requests;
+
+       if (cdt->cdt_state != CDT_STOPPED) {
+               CERROR("%s: Coordinator already started\n",
+                      mdt_obd_name(mdt));
+               RETURN(-EALREADY);
+       }
+
+       cdt->cdt_policy = CDT_DEFAULT_POLICY;
+       cdt->cdt_state = CDT_INIT;
+
+       cfs_atomic_set(&cdt->cdt_compound_id, cfs_time_current_sec());
+       /* just need to be larger than previous one */
+       /* cdt_last_cookie is protected by cdt_llog_lock */
+       cdt->cdt_last_cookie = cfs_time_current_sec();
+       cdt->cdt_loop_period = 10;
+       cdt->cdt_delay = 60;
+       cdt->cdt_timeout = 3600;
+       cdt->cdt_max_request = 3;
+       atomic_set(&cdt->cdt_request_count, 0);
+
+       /* to avoid deadlock when start is made through /proc
+        * /proc entries are created by the coordinator thread */
+
+       /* set up list of started restore requests */
+       cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
+       rc = mdt_hsm_pending_restore(cdt_mti);
+       if (rc)
+               CERROR("%s: cannot take the layout locks needed"
+                      " for registered restore: %d\n",
+                      mdt_obd_name(mdt), rc);
+
+       task = kthread_run(mdt_coordinator, cdt_mti, "hsm_cdtr");
+       if (IS_ERR(task)) {
+               rc = PTR_ERR(task);
+               /* no thread is running, a later start has to be possible */
+               cdt->cdt_state = CDT_STOPPED;
+               CERROR("%s: error starting coordinator thread: %d\n",
+                      mdt_obd_name(mdt), rc);
+               RETURN(rc);
+       }
+
+       CDEBUG(D_HSM, "%s: coordinator thread started\n",
+              mdt_obd_name(mdt));
+
+       /* the thread signals t_ctl_waitq once it is running */
+       cfs_wait_event(cdt->cdt_thread.t_ctl_waitq,
+                      (cdt->cdt_thread.t_flags & SVC_RUNNING));
+
+       cdt->cdt_state = CDT_RUNNING;
+       mdt->mdt_opts.mo_coordinator = 1;
+       RETURN(0);
+}
+
+/**
+ * stop a coordinator thread
+ * proc entries are removed, the thread is asked to stop and waited for
+ * (unless the state is already CDT_STOPPING), the state is set to
+ * CDT_STOPPED, then the in-memory requests, the registered agents and
+ * the restore handles are freed
+ * NOTE(review): when mdt_object_find() fails for a restore handle, the
+ * handle is freed without mdt_object_unlock_put() being called, to be
+ * confirmed that the layout lock is not leaked in that case
+ * \param mdt [IN] device
+ * \retval 0 success
+ * \retval -EALREADY coordinator state is CDT_STOPPED
+ */
+int mdt_hsm_cdt_stop(struct mdt_device *mdt)
+{
+       struct coordinator              *cdt = &mdt->mdt_coordinator;
+       struct cdt_agent_req            *car, *tmp1;
+       struct hsm_agent                *ha, *tmp2;
+       struct cdt_restore_handle       *crh, *tmp3;
+       struct mdt_thread_info          *cdt_mti;
+       ENTRY;
+
+       if (cdt->cdt_state == CDT_STOPPED) {
+               CERROR("%s: Coordinator already stopped\n",
+                      mdt_obd_name(mdt));
+               RETURN(-EALREADY);
+       }
+
+       /* remove proc entries */
+       if (cdt->cdt_proc_dir != NULL)
+               lprocfs_remove(&cdt->cdt_proc_dir);
+
+       if (cdt->cdt_state != CDT_STOPPING) {
+               /* stop coordinator thread before cleaning
+                * the thread acks with SVC_STOPPED on t_ctl_waitq */
+               cdt->cdt_thread.t_flags = SVC_STOPPING;
+               cfs_waitq_signal(&cdt->cdt_thread.t_ctl_waitq);
+               cfs_wait_event(cdt->cdt_thread.t_ctl_waitq,
+                              cdt->cdt_thread.t_flags & SVC_STOPPED);
+       }
+       cdt->cdt_state = CDT_STOPPED;
+
+       /* start cleaning: in-memory requests first, then agents,
+        * then restore handles, each list under its own lock */
+       down_write(&cdt->cdt_request_lock);
+       list_for_each_entry_safe(car, tmp1, &cdt->cdt_requests,
+                                car_request_list) {
+               list_del(&car->car_request_list);
+               mdt_cdt_free_request(car);
+       }
+       up_write(&cdt->cdt_request_lock);
+
+       down_write(&cdt->cdt_agent_lock);
+       list_for_each_entry_safe(ha, tmp2, &cdt->cdt_agents, ha_list) {
+               list_del(&ha->ha_list);
+               OBD_FREE_PTR(ha);
+       }
+       up_write(&cdt->cdt_agent_lock);
+
+       cdt_mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
+       mutex_lock(&cdt->cdt_restore_lock);
+       list_for_each_entry_safe(crh, tmp3, &cdt->cdt_restore_hdl, crh_list) {
+               struct mdt_object       *child;
+
+               /* give back layout lock, this is skipped when the
+                * object cannot be found */
+               child = mdt_object_find(&cdt->cdt_env, mdt, &crh->crh_fid);
+               if (!IS_ERR(child))
+                       mdt_object_unlock_put(cdt_mti, child, &crh->crh_lh, 1);
+
+               list_del(&crh->crh_list);
+
+               OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+       }
+       mutex_unlock(&cdt->cdt_restore_lock);
+
+       mdt->mdt_opts.mo_coordinator = 0;
+
+       RETURN(0);
+}
+
+/**
+ * register all requests from an hal in the memory list
+ * CANCEL items only mark the llog record and the running request
+ * they target as canceled, they are not added to the memory list
+ * for ARCHIVE items, HS_EXISTS and the archive id are first set in the
+ * HSM attributes of the file, items whose object lookup returns
+ * -ENOENT are skipped
+ * NOTE(review): a failure of mdt_cdt_add_request() frees the request
+ * but does not leave the loop, so rc may be overwritten by a later
+ * item, to be confirmed that this is intended
+ * \param mti [IN] context
+ * \param hal [IN] request
+ * \param uuid [OUT] in case of CANCEL, the uuid of the agent
+ *  which is running the CT
+ * \retval 0 success
+ * \retval -ve failure
+ */
+int mdt_hsm_add_hal(struct mdt_thread_info *mti,
+                   struct hsm_action_list *hal, struct obd_uuid *uuid)
+{
+       struct mdt_device       *mdt = mti->mti_mdt;
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       struct hsm_action_item  *hai;
+       int                      rc = 0, i;
+       ENTRY;
+
+       /* register request in memory list */
+       hai = hai_zero(hal);
+       for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) {
+               struct cdt_agent_req *car;
+
+               /* in case of a cancel request, we first mark the ondisk
+                * record of the request we want to stop as canceled
+                * this does not change the cancel record
+                * it will be done when updating the request status
+                */
+               if (hai->hai_action == HSMA_CANCEL) {
+                       rc = mdt_agent_record_update(mti->mti_env, mti->mti_mdt,
+                                                    &hai->hai_cookie,
+                                                    1, ARS_CANCELED);
+                       if (rc) {
+                               CERROR("%s: mdt_agent_record_update() failed, "
+                                      "rc=%d, cannot update status to %s "
+                                      "for cookie "LPX64"\n",
+                                      mdt_obd_name(mdt), rc,
+                                      agent_req_status2name(ARS_CANCELED),
+                                      hai->hai_cookie);
+                               GOTO(out, rc);
+                       }
+
+                       /* find the running request to set it canceled
+                        * the reference got here is dropped below */
+                       car = mdt_cdt_find_request(cdt, hai->hai_cookie, NULL);
+                       if (car != NULL) {
+                               car->car_canceled = 1;
+                               /* uuid has to be changed to the one running the
+                               * request to cancel */
+                               *uuid = car->car_uuid;
+                               mdt_cdt_put_request(car);
+                       }
+                       /* no need to memorize cancel request
+                        * this also avoid a deadlock when we receive
+                        * a purge all requests command
+                        */
+                       continue;
+               }
+
+               if (hai->hai_action == HSMA_ARCHIVE) {
+                       struct mdt_object *obj;
+                       struct md_hsm hsm;
+
+                       obj = mdt_hsm_get_md_hsm(mti, &hai->hai_fid, &hsm);
+                       if (IS_ERR(obj) && (PTR_ERR(obj) == -ENOENT))
+                               continue;
+                       if (IS_ERR(obj))
+                               GOTO(out, rc = PTR_ERR(obj));
+
+                       hsm.mh_flags |= HS_EXISTS;
+                       hsm.mh_arch_id = hal->hal_archive_id;
+                       rc = mdt_hsm_attr_set(mti, obj, &hsm);
+                       mdt_object_put(mti->mti_env, obj);
+                       if (rc)
+                               GOTO(out, rc);
+               }
+
+               car = mdt_cdt_alloc_request(hal->hal_compound_id,
+                                           hal->hal_archive_id, hal->hal_flags,
+                                           uuid, hai);
+               if (IS_ERR(car))
+                       GOTO(out, rc = PTR_ERR(car));
+
+               rc = mdt_cdt_add_request(cdt, car);
+               if (rc != 0)
+                       mdt_cdt_free_request(car);
+       }
+out:
+       RETURN(rc);
+}
+
+/**
+ * swap layouts between 2 fids
+ * \param mti [IN] context
+ * \param fid [IN] FID of the file, no lock is taken on it here
+ * \param dfid [IN] data FID, an EX layout lock is taken on it for
+ *  the time of the swap
+ * \retval 0 success
+ * \retval -ENOENT object of dfid does not exist anymore
+ * \retval -ve other failure
+ */
+static int hsm_swap_layouts(struct mdt_thread_info *mti,
+                           const lustre_fid *fid, const lustre_fid *dfid)
+{
+       struct mdt_device       *mdt = mti->mti_mdt;
+       struct mdt_object       *child1, *child2;
+       struct mdt_lock_handle  *lh2;
+       int                      rc;
+       ENTRY;
+
+       child1 = mdt_object_find(mti->mti_env, mdt, fid);
+       if (IS_ERR(child1))
+               GOTO(out, rc = PTR_ERR(child1));
+
+       /* we already have layout lock on FID so take only
+        * on dfid */
+       lh2 = &mti->mti_lh[MDT_LH_OLD];
+       mdt_lock_reg_init(lh2, LCK_EX);
+       child2 = mdt_object_find_lock(mti, dfid, lh2, MDS_INODELOCK_LAYOUT);
+       if (IS_ERR(child2))
+               GOTO(out_child1, rc = PTR_ERR(child2));
+
+       /* if copy tool closes the volatile before sending the final
+        * progress through llapi_hsm_copy_end(), all the objects
+        * are removed and mdd_swap_layout LBUG */
+       if (mdt_object_exists(child2)) {
+               rc = mo_swap_layouts(mti->mti_env, mdt_object_child(child1),
+                                    mdt_object_child(child2), 0);
+       } else {
+               CERROR("%s: Copytool has closed volatile file "DFID"\n",
+                      mdt_obd_name(mti->mti_mdt), PFID(dfid));
+               rc = -ENOENT;
+       }
+
+       mdt_object_unlock_put(mti, child2, lh2, 1);
+out_child1:
+       mdt_object_put(mti->mti_env, child1);
+out:
+       RETURN(rc);
+}
+
+/**
+ * update status of a completed request
+ * computes the new status of the llog record, updates the HSM
+ * attributes of the file when the action succeeded, gives back the
+ * layout lock of a finished restore and emits a CL_HSM changelog
+ * record when the object still exists
+ * \param mti [IN] context
+ * \param pgs [IN/OUT] progress of the copy tool, hpk_errval may be
+ *  rewritten (ENOSYS becomes ECANCELED, layout swap error)
+ * \param car [IN] request the progress applies to
+ * \param status [OUT] new status for the llog record of the request
+ * \retval 0 success
+ * \retval -ve failure
+ */
+static int hsm_cdt_request_completed(struct mdt_thread_info *mti,
+                                    struct hsm_progress_kernel *pgs,
+                                    const struct cdt_agent_req *car,
+                                    enum agent_req_status *status)
+{
+       const struct lu_env     *env = mti->mti_env;
+       struct mdt_device       *mdt = mti->mti_mdt;
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       struct mdt_object       *obj = NULL;
+       int                      cl_flags = 0, rc = 0;
+       struct md_hsm            mh;
+       bool                     is_mh_changed;
+       ENTRY;
+
+       /* default is to retry */
+       *status = ARS_WAITING;
+
+       /* find object by FID */
+       obj = mdt_hsm_get_md_hsm(mti, &car->car_hai->hai_fid, &mh);
+       /* we will update MD HSM only if needed */
+       is_mh_changed = false;
+       if (IS_ERR(obj)) {
+               /* object removed */
+               *status = ARS_SUCCEED;
+               goto unlock;
+       }
+
+       /* no need to change mh->mh_arch_id
+        * mdt_hsm_get_md_hsm() got it from disk and it is still valid
+        */
+       if (pgs->hpk_errval != 0) {
+               switch (pgs->hpk_errval) {
+               case ENOSYS:
+                       /* the copy tool does not support cancel
+                        * so the cancel request is failed
+                        * As we cannot distinguish a cancel progress
+                        * from another action progress (they have the
+                        * same cookie), we suppose here the CT returns
+                        * ENOSYS only if does not support cancel
+                        */
+                       /* this can also happen when cdt calls it to
+                        * for a timeouted request */
+                       *status = ARS_FAILED;
+                       /* to have a cancel event in changelog */
+                       pgs->hpk_errval = ECANCELED;
+                       break;
+               case ECANCELED:
+                       /* the request record has already been set to
+                        * ARS_CANCELED, this set the cancel request
+                        * to ARS_SUCCEED */
+                       *status = ARS_SUCCEED;
+                       break;
+               default:
+                       *status = (((cdt->cdt_policy &
+                                  CDT_NORETRY_ACTION) ||
+                                  !(pgs->hpk_flags & HP_FLAG_RETRY)) ?
+                                  ARS_FAILED : ARS_WAITING);
+                       break;
+               }
+
+               if (pgs->hpk_errval > CLF_HSM_MAXERROR) {
+                       CERROR("%s: Request "LPX64" on "DFID
+                              " failed, error code %d too large\n",
+                              mdt_obd_name(mdt),
+                              pgs->hpk_cookie, PFID(&pgs->hpk_fid),
+                              pgs->hpk_errval);
+                       hsm_set_cl_error(&cl_flags,
+                                        CLF_HSM_ERROVERFLOW);
+                       rc = -EINVAL;
+               } else {
+                       hsm_set_cl_error(&cl_flags, pgs->hpk_errval);
+               }
+
+               switch (car->car_hai->hai_action) {
+               case HSMA_ARCHIVE:
+                       hsm_set_cl_event(&cl_flags, HE_ARCHIVE);
+                       break;
+               case HSMA_RESTORE:
+                       hsm_set_cl_event(&cl_flags, HE_RESTORE);
+                       break;
+               case HSMA_REMOVE:
+                       hsm_set_cl_event(&cl_flags, HE_REMOVE);
+                       break;
+               case HSMA_CANCEL:
+                       hsm_set_cl_event(&cl_flags, HE_CANCEL);
+                       CERROR("%s: Failed request "LPX64" on "DFID
+                              " cannot be a CANCEL\n",
+                              mdt_obd_name(mdt),
+                              pgs->hpk_cookie,
+                              PFID(&pgs->hpk_fid));
+                       break;
+               default:
+                       CERROR("%s: Failed request "LPX64" on "DFID
+                              " %d is an unknown action\n",
+                              mdt_obd_name(mdt),
+                              pgs->hpk_cookie, PFID(&pgs->hpk_fid),
+                              car->car_hai->hai_action);
+                       rc = -EINVAL;
+                       break;
+               }
+       } else {
+               *status = ARS_SUCCEED;
+               switch (car->car_hai->hai_action) {
+               case HSMA_ARCHIVE:
+                       hsm_set_cl_event(&cl_flags, HE_ARCHIVE);
+                       /* set ARCHIVE keep EXIST and clear LOST and
+                        * DIRTY */
+                       mh.mh_arch_ver = pgs->hpk_data_version;
+                       mh.mh_flags |= HS_ARCHIVED;
+                       mh.mh_flags &= ~(HS_LOST|HS_DIRTY);
+                       is_mh_changed = true;
+                       break;
+               case HSMA_RESTORE:
+                       hsm_set_cl_event(&cl_flags, HE_RESTORE);
+
+                       /* clear RELEASED and DIRTY */
+                       mh.mh_flags &= ~(HS_RELEASED | HS_DIRTY);
+                       /* Restoring has changed the file version on
+                        * disk. */
+                       mh.mh_arch_ver = pgs->hpk_data_version;
+                       is_mh_changed = true;
+                       break;
+               case HSMA_REMOVE:
+                       hsm_set_cl_event(&cl_flags, HE_REMOVE);
+                       /* clear ARCHIVED EXISTS and LOST */
+                       mh.mh_flags &= ~(HS_ARCHIVED | HS_EXISTS | HS_LOST);
+                       is_mh_changed = true;
+                       break;
+               case HSMA_CANCEL:
+                       hsm_set_cl_event(&cl_flags, HE_CANCEL);
+                       CERROR("%s: Successful request "LPX64
+                              " on "DFID
+                              " cannot be a CANCEL\n",
+                              mdt_obd_name(mdt),
+                              pgs->hpk_cookie,
+                              PFID(&pgs->hpk_fid));
+                       break;
+               default:
+                       CERROR("%s: Successful request "LPX64
+                              " on "DFID
+                              " %d is an unknown action\n",
+                              mdt_obd_name(mdt),
+                              pgs->hpk_cookie, PFID(&pgs->hpk_fid),
+                              car->car_hai->hai_action);
+                       rc = -EINVAL;
+                       break;
+               }
+       }
+
+       /* rc != 0 means error when analysing action, it may come from
+        * a crazy CT no need to manage DIRTY
+        */
+       if (rc == 0)
+               hsm_set_cl_flags(&cl_flags, ((mh.mh_flags & HS_DIRTY) ?
+                                            CLF_HSM_DIRTY : 0));
+
+       /* unlock is done later, after layout lock management */
+       if (is_mh_changed)
+               rc = mdt_hsm_attr_set(mti, obj, &mh);
+
+unlock:
+       /* we give back layout lock only if restore was successful or
+        * if restore was canceled or if policy is to not retry
+        * in other cases we just unlock the object */
+       if ((car->car_hai->hai_action == HSMA_RESTORE) &&
+           ((pgs->hpk_errval == 0) || (pgs->hpk_errval == ECANCELED) ||
+            (cdt->cdt_policy & CDT_NORETRY_ACTION))) {
+               struct cdt_restore_handle       *crh;
+
+               /* restore in data FID done, we swap the layouts
+                * only if restore is successful */
+               if (pgs->hpk_errval == 0) {
+                       rc = hsm_swap_layouts(mti, &car->car_hai->hai_fid,
+                                             &car->car_hai->hai_dfid);
+                       if (rc) {
+                               if (cdt->cdt_policy & CDT_NORETRY_ACTION)
+                                       *status = ARS_FAILED;
+                               pgs->hpk_errval = -rc;
+                       }
+               }
+               /* we have to retry, so keep layout lock */
+               if (*status == ARS_WAITING)
+                       GOTO(out, rc);
+
+               /* give back layout lock */
+               mutex_lock(&cdt->cdt_restore_lock);
+               crh = hsm_restore_hdl_find(cdt, &car->car_hai->hai_fid);
+               if (crh != NULL)
+                       list_del(&crh->crh_list);
+               mutex_unlock(&cdt->cdt_restore_lock);
+               /* just give back layout lock, we keep
+                * the reference which is given back
+                * later with the lock for HSM flags
+                * no handle may be registered for this FID, in which
+                * case there is no lock handle to give back */
+               if (!IS_ERR(obj) && crh != NULL)
+                       mdt_object_unlock(mti, obj, &crh->crh_lh, 1);
+               if (crh != NULL)
+                       OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
+       }
+
+       GOTO(out, rc);
+
+out:
+       /* the changelog record is emitted only if the object was found */
+       if ((obj != NULL) && !IS_ERR(obj)) {
+               mo_changelog(env, CL_HSM, cl_flags,
+                            mdt_object_child(obj));
+               mdt_object_put(mti->mti_env, obj);
+       }
+
+       RETURN(rc);
+}
+
+/**
+ * update status of a request
+ * the progress is checked against the running request found by
+ * mdt_cdt_update_request(); when HP_FLAG_COMPLETED is set the request
+ * is completed, removed from the memory list, its llog record is
+ * updated if asked and the coordinator is woken up
+ * \param mti [IN] context
+ * \param pgs [IN] progress of the copy tool
+ * \param update_record [IN] update llog record
+ * \retval 0 success
+ * \retval -EAGAIN coordinator state is CDT_STOPPED
+ * \retval -EINVAL progress does not match the request or is not coherent
+ * \retval -ECANCELED progress received for a canceled request
+ * \retval -ve other failure
+ */
+int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
+                                struct hsm_progress_kernel *pgs,
+                                const int update_record)
+{
+       struct mdt_device       *mdt = mti->mti_mdt;
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       struct cdt_agent_req    *car;
+       int                      rc = 0;
+       ENTRY;
+
+       /* no coordinator started, so we cannot serve requests */
+       if (cdt->cdt_state == CDT_STOPPED)
+               RETURN(-EAGAIN);
+
+       /* first do sanity checks */
+       car = mdt_cdt_update_request(cdt, pgs);
+       if (IS_ERR(car)) {
+               CERROR("%s: Cannot find running request for cookie "LPX64
+                      " on fid="DFID"\n",
+                      mdt_obd_name(mdt),
+                      pgs->hpk_cookie, PFID(&pgs->hpk_fid));
+               RETURN(PTR_ERR(car));
+       }
+
+       CDEBUG(D_HSM, "Progress received for fid="DFID" cookie="LPX64
+                     " action=%s flags=%d err=%d fid="DFID" dfid="DFID"\n",
+                     PFID(&pgs->hpk_fid), pgs->hpk_cookie,
+                     hsm_copytool_action2name(car->car_hai->hai_action),
+                     pgs->hpk_flags, pgs->hpk_errval,
+                     PFID(&car->car_hai->hai_fid),
+                     PFID(&car->car_hai->hai_dfid));
+
+       /* progress is done on FID or data FID depending of the action and
+        * of the copy progress */
+       /* for restore progress is used to send back the data FID to cdt */
+       if ((car->car_hai->hai_action == HSMA_RESTORE) &&
+           (lu_fid_eq(&car->car_hai->hai_fid, &car->car_hai->hai_dfid)))
+               car->car_hai->hai_dfid = pgs->hpk_fid;
+
+       if (((car->car_hai->hai_action == HSMA_RESTORE) ||
+            (car->car_hai->hai_action == HSMA_ARCHIVE)) &&
+           (!lu_fid_eq(&pgs->hpk_fid, &car->car_hai->hai_dfid) &&
+            !lu_fid_eq(&pgs->hpk_fid, &car->car_hai->hai_fid))) {
+               CERROR("%s: Progress on "DFID" for cookie "LPX64
+                      " does not match request FID "DFID" nor data FID "
+                      DFID"\n",
+                      mdt_obd_name(mdt),
+                      PFID(&pgs->hpk_fid), pgs->hpk_cookie,
+                      PFID(&car->car_hai->hai_fid),
+                      PFID(&car->car_hai->hai_dfid));
+               GOTO(out, rc = -EINVAL);
+       }
+
+       if (pgs->hpk_errval != 0 && !(pgs->hpk_flags & HP_FLAG_COMPLETED)) {
+               CERROR("%s: Progress on "DFID" for cookie "LPX64" action=%s"
+                      " is not coherent (err=%d and not completed"
+                      " (flags=%d))\n",
+                      mdt_obd_name(mdt),
+                      PFID(&pgs->hpk_fid), pgs->hpk_cookie,
+                      hsm_copytool_action2name(car->car_hai->hai_action),
+                      pgs->hpk_errval, pgs->hpk_flags);
+               GOTO(out, rc = -EINVAL);
+       }
+
+       /* now progress is valid */
+
+       /* we use a root like ucred */
+       hsm_init_ucred(mdt_ucred(mti));
+
+       if (pgs->hpk_flags & HP_FLAG_COMPLETED) {
+               enum agent_req_status    status;
+
+               rc = hsm_cdt_request_completed(mti, pgs, car, &status);
+
+               /* remove request from memory list
+                * car is still used below, the reference got from
+                * mdt_cdt_update_request() is dropped at out: only */
+               mdt_cdt_remove_request(cdt, pgs->hpk_cookie);
+
+               CDEBUG(D_HSM, "Updating record: fid="DFID" cookie="LPX64
+                             " action=%s status=%s\n", PFID(&pgs->hpk_fid),
+                      pgs->hpk_cookie,
+                      hsm_copytool_action2name(car->car_hai->hai_action),
+                      agent_req_status2name(status));
+
+               if (update_record) {
+                       int rc1;
+
+                       rc1 = mdt_agent_record_update(mti->mti_env, mdt,
+                                                    &pgs->hpk_cookie, 1,
+                                                    status);
+                       if (rc1)
+                               CERROR("%s: mdt_agent_record_update() failed,"
+                                      " rc=%d, cannot update status to %s"
+                                      " for cookie "LPX64"\n",
+                                      mdt_obd_name(mdt), rc1,
+                                      agent_req_status2name(status),
+                                      pgs->hpk_cookie);
+                       rc = (rc != 0 ? rc : rc1);
+               }
+               /* ct has completed a request, so a slot is available, wakeup
+                * cdt to find new work */
+               mdt_hsm_cdt_wakeup(mdt);
+       } else {
+               /* if copytool send a progress on a canceled request
+                * we inform copytool it should stop
+                */
+               if (car->car_canceled == 1)
+                       rc = -ECANCELED;
+       }
+       GOTO(out, rc);
+
+out:
+       /* remove ref got from mdt_cdt_update_request() */
+       mdt_cdt_put_request(car);
+
+       return rc;
+}
+
+
+/**
+ * data passed to llog_cat_process() callback
+ * to cancel requests
+ * mdt is the device mdt_cancel_all_cb() hands to
+ * mdt_agent_llog_update_rec() when it rewrites a record
+ */
+struct hsm_cancel_all_data {
+       struct mdt_device       *mdt;
+};
+
+/**
+ *  llog_cat_process() callback, used to:
+ *  - purge all requests
+ *  records in ARS_WAITING or ARS_STARTED state are rewritten as
+ *  ARS_CANCELED with a new change time, other records are left as is
+ * \param env [IN] environment
+ * \param llh [IN] llog handle
+ * \param hdr [IN] llog record
+ * \param data [IN] cb data = struct hsm_cancel_all_data
+ * \retval 0 record not concerned
+ * \retval LLOG_DEL_RECORD record has been updated
+ * \retval -ve failure to update the record
+ */
+static int mdt_cancel_all_cb(const struct lu_env *env,
+                            struct llog_handle *llh,
+                            struct llog_rec_hdr *hdr, void *data)
+{
+       struct hsm_cancel_all_data      *cb_data = data;
+       struct llog_agent_req_rec       *rec;
+       int                              rc;
+       ENTRY;
+
+       rec = (struct llog_agent_req_rec *)hdr;
+
+       /* only waiting and started requests are canceled */
+       if ((rec->arr_status != ARS_WAITING) &&
+           (rec->arr_status != ARS_STARTED))
+               RETURN(0);
+
+       rec->arr_status = ARS_CANCELED;
+       rec->arr_req_change = cfs_time_current_sec();
+       rc = mdt_agent_llog_update_rec(env, cb_data->mdt, llh, rec);
+       if (rc != 0)
+               RETURN(rc);
+
+       RETURN(LLOG_DEL_RECORD);
+}
+
+/**
+ * cancel all actions:
+ * - the coordinator is switched to CDT_DISABLE for the duration of the call
+ * - a HSMA_CANCEL is sent for every request of cdt->cdt_requests which is
+ *   not already a cancel
+ * - all on-disk records are processed with mdt_cancel_all_cb()
+ * \param mdt [IN] MDT device
+ * \retval -ENOMEM the action list buffer cannot be allocated
+ * \retval otherwise the return code of cdt_llog_process()
+ */
+static int hsm_cancel_all_actions(struct mdt_device *mdt)
+{
+       struct mdt_thread_info          *mti;
+       struct coordinator              *cdt = &mdt->mdt_coordinator;
+       struct cdt_agent_req            *car;
+       struct hsm_action_list          *hal = NULL;
+       struct hsm_action_item          *hai;
+       struct hsm_cancel_all_data       hcad;
+       int                              hal_sz = 0, hal_len, rc;
+       enum cdt_states                  save_state;
+       ENTRY;
+
+       /* retrieve coordinator context */
+       mti = lu_context_key_get(&cdt->cdt_env.le_ctx, &mdt_thread_key);
+
+       /* disable coordinator */
+       /* previous state is saved, it is restored at the out label */
+       save_state = cdt->cdt_state;
+       cdt->cdt_state = CDT_DISABLE;
+
+       /* send cancel to all running requests */
+       down_read(&cdt->cdt_request_lock);
+       list_for_each_entry(car, &cdt->cdt_requests, car_request_list) {
+               mdt_cdt_get_request(car);
+               /* request is not yet removed from list, it will be done
+                * when copytool will return progress
+                */
+
+               /* a cancel request is never canceled itself */
+               if (car->car_hai->hai_action == HSMA_CANCEL) {
+                       mdt_cdt_put_request(car);
+                       continue;
+               }
+
+               /* needed size */
+               hal_len = sizeof(*hal) + cfs_size_round(MTI_NAME_MAXLEN + 1) +
+                         cfs_size_round(car->car_hai->hai_len);
+
+               /* the buffer is reused between iterations, it is only
+                * reallocated when the current request needs more room */
+               if ((hal_len > hal_sz) && (hal_sz > 0)) {
+                       /* not enough room, free old buffer */
+                       OBD_FREE(hal, hal_sz);
+                       hal = NULL;
+               }
+
+               /* empty buffer, allocate one */
+               if (hal == NULL) {
+                       hal_sz = hal_len;
+                       OBD_ALLOC(hal, hal_sz);
+                       if (hal == NULL) {
+                               /* drop the reference and the lock taken
+                                * above before leaving */
+                               mdt_cdt_put_request(car);
+                               up_read(&cdt->cdt_request_lock);
+                               GOTO(out, rc = -ENOMEM);
+                       }
+               }
+
+               /* build an action list holding a single item */
+               hal->hal_version = HAL_VERSION;
+               obd_uuid2fsname(hal->hal_fsname, mdt_obd_name(mdt),
+                               MTI_NAME_MAXLEN);
+               hal->hal_fsname[MTI_NAME_MAXLEN] = '\0';
+               hal->hal_compound_id = car->car_compound_id;
+               hal->hal_archive_id = car->car_archive_id;
+               hal->hal_flags = car->car_flags;
+               hal->hal_count = 0;
+
+               /* the item is a copy of the running request with its
+                * action replaced by HSMA_CANCEL */
+               hai = hai_zero(hal);
+               memcpy(hai, car->car_hai, car->car_hai->hai_len);
+               hai->hai_action = HSMA_CANCEL;
+               hal->hal_count = 1;
+
+               /* it is possible to safely call mdt_hsm_agent_send()
+                * (ie without a deadlock on cdt_request_lock), because the
+                * write lock is taken only if we are not in purge mode
+                * (mdt_hsm_agent_send() does not call mdt_cdt_add_request()
+                *   nor mdt_cdt_remove_request())
+                */
+               /* no conflict with cdt thread because cdt is disable and we
+                * have the request lock */
+               /* NOTE(review): the return code of mdt_hsm_agent_send() is
+                * ignored, presumably on purpose (best effort) - confirm */
+               mdt_hsm_agent_send(mti, hal, 1);
+
+               mdt_cdt_put_request(car);
+       }
+       up_read(&cdt->cdt_request_lock);
+
+       if (hal != NULL)
+               OBD_FREE(hal, hal_sz);
+
+       /* cancel all on-disk records */
+       hcad.mdt = mdt;
+
+       rc = cdt_llog_process(mti->mti_env, mti->mti_mdt,
+                             mdt_cancel_all_cb, &hcad);
+out:
+       /* enable coordinator */
+       cdt->cdt_state = save_state;
+
+       RETURN(rc);
+}
+
+/**
+ * check if a request is compatible with the HSM status of a file
+ * \param hai [IN] request description
+ * \param hal_an [IN] request archive number (not used)
+ * \param rq_flags [IN] request flags (only used in the debug message)
+ * \param hsm [IN] file HSM metadata
+ * \retval true the action is compatible with the file HSM flags
+ * \retval false the action is not compatible, or is not a known action
+ */
+bool mdt_hsm_is_action_compat(const struct hsm_action_item *hai,
+                             const int hal_an, const __u64 rq_flags,
+                             const struct md_hsm *hsm)
+{
+       int      flags;
+       int      compat = false;
+       ENTRY;
+
+       flags = hsm->mh_flags;
+       switch (hai->hai_action) {
+       case HSMA_ARCHIVE:
+               /* archive must not be forbidden, and the file has to be
+                * dirty or not archived yet */
+               compat = !(flags & HS_NOARCHIVE) &&
+                        ((flags & HS_DIRTY) || !(flags & HS_ARCHIVED));
+               break;
+       case HSMA_RESTORE:
+               /* file has to be released, archived, not dirty and
+                * not lost */
+               compat = !(flags & HS_DIRTY) && (flags & HS_RELEASED) &&
+                        (flags & HS_ARCHIVED) && !(flags & HS_LOST);
+               break;
+       case HSMA_REMOVE:
+               /* file must not be released, and has to be archived or to
+                * exist in the archive */
+               compat = !(flags & HS_RELEASED) &&
+                        (flags & (HS_ARCHIVED | HS_EXISTS));
+               break;
+       case HSMA_CANCEL:
+               /* cancel is always accepted */
+               compat = true;
+               break;
+       }
+
+       CDEBUG(D_HSM, "fid="DFID" action=%s flags="LPX64
+                     " extent="LPX64"-"LPX64" hsm_flags=%.8X %s\n",
+                     PFID(&hai->hai_fid),
+                     hsm_copytool_action2name(hai->hai_action), rq_flags,
+                     hai->hai_extent.offset, hai->hai_extent.length,
+                     hsm->mh_flags,
+                     (compat ? "compatible" : "uncompatible"));
+
+       RETURN(compat);
+}
+
+/*
+ * /proc interface used to get/set HSM behaviour (cdt->cdt_policy)
+ *
+ * Table giving, for each policy bit, its long name and its nickname.
+ * The table is terminated by an entry with bit == 0.
+ */
+static const struct {
+       /* policy bit */
+       __u64            bit;
+       /* long name, printed by hsm_policy_bit2str() */
+       char            *name;
+       /* short name, printed by hsm_policy_bit2str() and matched by
+        * hsm_policy_str2bit() */
+       char            *nickname;
+} hsm_policy_names[] = {
+       { CDT_NONBLOCKING_RESTORE,      "non_blocking_restore", "nbr"},
+       { CDT_NORETRY_ACTION,           "no_retry_action",      "nra"},
+       { 0 },
+};
+
+/**
+ * convert a policy nickname to a bit
+ * only the nickname column of hsm_policy_names[] is searched
+ * \param name [IN] policy nickname
+ * \retval 0 unknown
+ * \retval   policy bit
+ */
+static __u64 hsm_policy_str2bit(const char *name)
+{
+       int      idx = 0;
+
+       /* the table ends with an entry whose bit is 0 */
+       while (hsm_policy_names[idx].bit != 0) {
+               if (strcmp(hsm_policy_names[idx].nickname, name) == 0)
+                       return hsm_policy_names[idx].bit;
+               idx++;
+       }
+       return 0;
+}
+
+/**
+ * convert a policy bit field to a string
+ * the string starts with the mask value between parentheses, followed by
+ * "name(nickname) " for every bit of the mask found in hsm_policy_names[],
+ * in increasing bit order
+ * \param mask [IN] policy bit field
+ * \param buffer [OUT] string
+ * \param count [IN] size of buffer
+ * \retval size filled in buffer (sum of the snprintf() return values)
+ */
+static int hsm_policy_bit2str(const __u64 mask, char *buffer, int count)
+{
+       int      shift, idx;
+       int      used;
+       __u64    cur;
+       ENTRY;
+
+       /* mask value first */
+       used = snprintf(buffer, count, "("LPX64") ", mask);
+
+       for (shift = 0; shift < (sizeof(mask) * 8); shift++) {
+               cur = (1ULL << shift);
+               if ((cur & mask) == 0)
+                       continue;
+
+               /* search the table entry describing this bit */
+               idx = 0;
+               while (hsm_policy_names[idx].bit != 0 &&
+                      hsm_policy_names[idx].bit != cur)
+                       idx++;
+
+               /* bits without a table entry are not printed */
+               if (hsm_policy_names[idx].bit == 0)
+                       continue;
+
+               used += snprintf(buffer + used, count - used, "%s(%s) ",
+                                hsm_policy_names[idx].name,
+                                hsm_policy_names[idx].nickname);
+       }
+       RETURN(used);
+}
+
+/* methods to read/write HSM policy flags */
+/**
+ * procfs read method: dump cdt->cdt_policy with hsm_policy_bit2str()
+ * and append a newline
+ * \param page [OUT] output buffer
+ * \param count [IN] size of page
+ * \param eof [OUT] always set to 1
+ * \param data [IN] struct mdt_device
+ * \retval number of characters written in page, newline included
+ */
+static int lprocfs_rd_hsm_policy(char *page, char **start, off_t off,
+                                int count, int *eof, void *data)
+{
+       struct mdt_device       *mdt = data;
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       int                      len;
+       ENTRY;
+
+       len = hsm_policy_bit2str(cdt->cdt_policy, page, count);
+       /* append the newline and terminate the string again */
+       page[len++] = '\n';
+       page[len] = '\0';
+       *eof = 1;
+       RETURN(len);
+}
+
+/**
+ * procfs write method for the HSM policy flags (cdt->cdt_policy)
+ *
+ * The input is a list of policy nicknames separated by a space. Each
+ * nickname may be prefixed by '+' (the bit is added to the policy) or
+ * '-' (the bit is removed from the policy); without prefix the policy is
+ * replaced by the bit. Parsing stops at the first unknown nickname.
+ * An input starting with "help" only logs the supported policies.
+ * \param file [IN] proc file (not used)
+ * \param buffer [IN] user space buffer
+ * \param count [IN] size of buffer
+ * \param data [IN] struct mdt_device
+ * \retval count success
+ * \retval -ENOMEM a working buffer cannot be allocated
+ * \retval -EFAULT the user buffer cannot be read
+ */
+static int lprocfs_wr_hsm_policy(struct file *file, const char *buffer,
+                                unsigned long count, void *data)
+{
+       struct mdt_device       *mdt = data;
+       struct coordinator      *cdt = &mdt->mdt_coordinator;
+       int                      sz;
+       char                    *start, *end;
+       __u64                    policy;
+       int                      set;
+       char                    *buf;
+       char                    *help;
+       int                      rc;
+       ENTRY;
+
+       OBD_ALLOC(buf, count + 1);
+       if (buf == NULL)
+               RETURN(-ENOMEM);
+
+       /* buffer is a user space pointer: it is copied before any access,
+        * and buf is released on every exit path below */
+       if (copy_from_user(buf, buffer, count))
+               GOTO(out, rc = -EFAULT);
+
+       buf[count] = '\0';
+
+       if (strncmp(buf, "help", 4) == 0) {
+               sz = PAGE_SIZE;
+               OBD_ALLOC(help, sz);
+               if (help == NULL)
+                       GOTO(out, rc = -ENOMEM);
+
+               hsm_policy_bit2str(CDT_POLICY_MASK, help, sz);
+               CWARN("Supported policies are: %s\n", help);
+               OBD_FREE(help, sz);
+               GOTO(out, rc = count);
+       }
+
+       start = buf;
+       do {
+               /* isolate the next space separated word */
+               end = strchr(start, ' ');
+               if (end != NULL)
+                       *end = '\0';
+
+               /* 0: remove the bit, 1: add the bit, 2: replace the policy */
+               switch (*start) {
+               case '-':
+                       start++;
+                       set = 0;
+                       break;
+               case '+':
+                       start++;
+                       set = 1;
+                       break;
+               default:
+                       set = 2;
+                       break;
+               }
+
+               /* an unknown nickname stops the parsing, policies already
+                * applied are kept */
+               policy = hsm_policy_str2bit(start);
+               if (!policy)
+                       break;
+
+               switch (set) {
+               case 0:
+                       cdt->cdt_policy &= ~policy;
+                       break;
+               case 1:
+                       cdt->cdt_policy |= policy;
+                       break;
+               case 2:
+                       cdt->cdt_policy = policy;
+                       break;
+               }
+
+               /* move to the next word only when there is one */
+               if (end != NULL)
+                       start = end + 1;
+       } while (end != NULL);
+
+       rc = count;
+out:
+       OBD_FREE(buf, count + 1);
+       RETURN(rc);
+}
+
+/**
+ * generate the procfs methods lprocfs_rd_hsm_<VAR>() and
+ * lprocfs_wr_hsm_<VAR>() giving access to the coordinator field cdt-><VAR>
+ * (data is the struct mdt_device)
+ * - the read method prints the field, cast to __u64, followed by a newline
+ * - the write method parses the input with lprocfs_write_helper() into an
+ *   int, stores it in the field and returns count when the value is > 0,
+ *   and returns -EINVAL for any other value
+ */
+#define GENERATE_PROC_METHOD(VAR)                                      \
+static int lprocfs_rd_hsm_##VAR(char *page, char **start, off_t off,   \
+                               int count, int *eof, void *data)        \
+{                                                                      \
+       struct mdt_device       *mdt = data;                            \
+       struct coordinator      *cdt = &mdt->mdt_coordinator;           \
+       int                      sz;                                    \
+       ENTRY;                                                          \
+                                                                       \
+       sz = snprintf(page, count, LPU64"\n", (__u64)cdt->VAR);         \
+       *eof = 1;                                                       \
+       RETURN(sz);                                                     \
+}                                                                      \
+static int lprocfs_wr_hsm_##VAR(struct file *file, const char *buffer, \
+                               unsigned long count, void *data)        \
+                                                                       \
+{                                                                      \
+       struct mdt_device       *mdt = data;                            \
+       struct coordinator      *cdt = &mdt->mdt_coordinator;           \
+       int                      val;                                   \
+       int                      rc;                                    \
+       ENTRY;                                                          \
+                                                                       \
+       rc = lprocfs_write_helper(buffer, count, &val);                 \
+       if (rc)                                                         \
+               RETURN(rc);                                             \
+       if (val > 0) {                                                  \
+               cdt->VAR = val;                                         \
+               RETURN(count);                                          \
+       }                                                               \
+       RETURN(-EINVAL);                                                \
+}
+
+GENERATE_PROC_METHOD(cdt_loop_period)
+GENERATE_PROC_METHOD(cdt_delay)
+GENERATE_PROC_METHOD(cdt_timeout)
+GENERATE_PROC_METHOD(cdt_max_request)
+
+/*
+ * procfs write method for MDT/hsm_control
+ * proc entry is in mdt directory so data is mdt obd_device pointer
+ */
+#define CDT_ENABLE_CMD   "enabled"
+#define CDT_STOP_CMD     "shutdown"
+#define CDT_DISABLE_CMD  "disabled"
+#define CDT_PURGE_CMD    "purge"
+#define CDT_HELP_CMD     "help"
+/* size of the kernel copy of a command, larger than any command above */
+#define CDT_CMD_BUFSZ    16
+
+/**
+ * change the coordinator state, or purge all requests, according to the
+ * command written (only the beginning of the input has to match a command)
+ * - "enabled": a disabled coordinator is set to CDT_RUNNING and woken up,
+ *   otherwise mdt_hsm_cdt_start() is called
+ * - "shutdown": state is set to CDT_STOPPING
+ * - "disabled": state is set to CDT_DISABLE
+ * - "purge": hsm_cancel_all_actions() is called
+ * - "help": the list of commands is logged
+ * \param file [IN] proc file (not used)
+ * \param buffer [IN] user space buffer
+ * \param count [IN] size of buffer
+ * \param data [IN] struct obd_device of the MDT
+ * \retval count success
+ * \retval -EFAULT the user buffer cannot be read
+ * \retval -EINVAL unknown command
+ * \retval -ve failure of the command
+ */
+int lprocfs_wr_hsm_cdt_control(struct file *file, const char *buffer,
+                              unsigned long count, void *data)
+{
+       struct obd_device       *obd = data;
+       struct mdt_device       *mdt = mdt_dev(obd->obd_lu_dev);
+       struct coordinator      *cdt = &(mdt->mdt_coordinator);
+       char                     cmd[CDT_CMD_BUFSZ];
+       unsigned long            len;
+       int                      rc, usage = 0;
+       ENTRY;
+
+       /* buffer is a user space pointer: the comparisons are done on a
+        * NUL terminated kernel copy, truncated to the size of cmd, so
+        * nothing is read past the count bytes written */
+       len = count < sizeof(cmd) - 1 ? count : sizeof(cmd) - 1;
+       if (copy_from_user(cmd, buffer, len))
+               RETURN(-EFAULT);
+       cmd[len] = '\0';
+
+       rc = 0;
+       if (strncmp(cmd, CDT_ENABLE_CMD, strlen(CDT_ENABLE_CMD)) == 0) {
+               if (cdt->cdt_state == CDT_DISABLE) {
+                       cdt->cdt_state = CDT_RUNNING;
+                       mdt_hsm_cdt_wakeup(mdt);
+               } else {
+                       rc = mdt_hsm_cdt_start(mdt);
+               }
+       } else if (strncmp(cmd, CDT_STOP_CMD, strlen(CDT_STOP_CMD)) == 0) {
+               cdt->cdt_state = CDT_STOPPING;
+       } else if (strncmp(cmd, CDT_DISABLE_CMD,
+                          strlen(CDT_DISABLE_CMD)) == 0) {
+               cdt->cdt_state = CDT_DISABLE;
+       } else if (strncmp(cmd, CDT_PURGE_CMD, strlen(CDT_PURGE_CMD)) == 0) {
+               rc = hsm_cancel_all_actions(mdt);
+       } else if (strncmp(cmd, CDT_HELP_CMD, strlen(CDT_HELP_CMD)) == 0) {
+               usage = 1;
+       } else {
+               usage = 1;
+               rc = -EINVAL;
+       }
+
+       if (usage == 1)
+               CERROR("%s: Valid coordinator control commands are: "
+                      "%s %s %s %s %s\n", mdt_obd_name(mdt),
+                      CDT_ENABLE_CMD, CDT_STOP_CMD, CDT_DISABLE_CMD,
+                      CDT_PURGE_CMD, CDT_HELP_CMD);
+
+       if (rc)
+               RETURN(rc);
+
+       RETURN(count);
+}
+
+/**
+ * procfs read method for MDT/hsm_control: print the coordinator state
+ * as a word followed by a newline
+ * \param page [OUT] output buffer
+ * \param count [IN] size of page
+ * \param eof [OUT] always set to 1
+ * \param data [IN] struct obd_device of the MDT
+ * \retval return value of snprintf()
+ */
+int lprocfs_rd_hsm_cdt_control(char *page, char **start, off_t off,
+                              int count, int *eof, void *data)
+{
+       struct obd_device       *obd = data;
+       struct mdt_device       *mdt = mdt_dev(obd->obd_lu_dev);
+       struct coordinator      *cdt = &(mdt->mdt_coordinator);
+       int                      sz;
+       ENTRY;
+
+       *eof = 1;
+
+       switch (cdt->cdt_state) {
+       case CDT_INIT:
+               sz = snprintf(page, count, "init\n");
+               break;
+       case CDT_RUNNING:
+               sz = snprintf(page, count, "enabled\n");
+               break;
+       case CDT_STOPPING:
+               sz = snprintf(page, count, "stopping\n");
+               break;
+       case CDT_STOPPED:
+               sz = snprintf(page, count, "stopped\n");
+               break;
+       case CDT_DISABLE:
+               sz = snprintf(page, count, "disabled\n");
+               break;
+       default:
+               /* any state without a name above */
+               sz = snprintf(page, count, "unknown\n");
+               break;
+       }
+
+       RETURN(sz);
+}
+
+/*
+ * procfs entries of the coordinator, the list is ended by a zeroed entry
+ * Each entry starts with the entry name, followed by the read and write
+ * methods for the tunables (grace_delay, loop_period, max_requests, policy,
+ * request_timeout); agents, agent_actions and requests have no read/write
+ * method and give a file operations table instead.
+ * NOTE(review): the meaning of the remaining fields (presumably private
+ * data and file mode) comes from struct lprocfs_vars - confirm there
+ */
+static struct lprocfs_vars lprocfs_mdt_hsm_vars[] = {
+       { "agents",             NULL, NULL, NULL, &mdt_hsm_agent_fops, 0 },
+       { "agent_actions",      NULL, NULL, NULL,
+                               &mdt_agent_actions_fops, 0444 },
+       { "grace_delay",        lprocfs_rd_hsm_cdt_delay,
+                               lprocfs_wr_hsm_cdt_delay,
+                               NULL, NULL, 0 },
+       { "loop_period",        lprocfs_rd_hsm_cdt_loop_period,
+                               lprocfs_wr_hsm_cdt_loop_period,
+                               NULL, NULL, 0 },
+       { "max_requests",       lprocfs_rd_hsm_cdt_max_request,
+                               lprocfs_wr_hsm_cdt_max_request,
+                               NULL, NULL, 0 },
+       { "policy",             lprocfs_rd_hsm_policy, lprocfs_wr_hsm_policy,
+                               NULL, NULL, 0 },
+       { "request_timeout",    lprocfs_rd_hsm_cdt_timeout,
+                               lprocfs_wr_hsm_cdt_timeout,
+                               NULL, NULL, 0 },
+       { "requests",           NULL, NULL, NULL, &mdt_hsm_request_fops, 0 },
+       { 0 }
+};
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c

index d997413..9ec0404 100644 (file)
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -100,6 +100,12 @@ static const struct lu_object_operations mdt_obj_ops;
  /* Slab for MDT object allocation */
  static struct kmem_cache *mdt_object_kmem;
  
+/* For HSM restore handles */
+struct kmem_cache *mdt_hsm_cdt_kmem;
+
+/* For HSM request handles */
+struct kmem_cache *mdt_hsm_car_kmem;
+
  static struct lu_kmem_descr mdt_caches[] = {
         {
                 .ckd_cache = &mdt_object_kmem,
@@ -107,6 +113,16 @@ static struct lu_kmem_descr mdt_caches[] = {
                 .ckd_size  = sizeof(struct mdt_object)
         },
         {
+               .ckd_cache      = &mdt_hsm_cdt_kmem,
+               .ckd_name       = "mdt_cdt_restore_handle",
+               .ckd_size       = sizeof(struct cdt_restore_handle)
+       },
+       {
+               .ckd_cache      = &mdt_hsm_car_kmem,
+               .ckd_name       = "mdt_cdt_agent_req",
+               .ckd_size       = sizeof(struct cdt_agent_req)
+       },
+       {
                 .ckd_cache = NULL
         }
  };
@@ -4957,13 +4973,15 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
          cfs_timer_init(&m->mdt_ck_timer, mdt_ck_timer_callback, m);
  
         rc = mdt_hsm_cdt_init(m);
-       if (rc != 0)
-               CERROR("%s: Cannot init coordinator, rc %d\n",
+       if (rc != 0) {
+               CERROR("%s: error initializing coordinator, rc %d\n",
                        mdt_obd_name(m), rc);
+                GOTO(err_free_ns, rc);
+       }
  
          rc = mdt_ck_thread_start(m);
          if (rc)
-                GOTO(err_free_ns, rc);
+                GOTO(err_free_hsm, rc);
  
         rc = tgt_init(env, &m->mdt_lut, obd, m->mdt_bottom, mdt_common_slice,
                       OBD_FAIL_MDS_ALL_REQUEST_NET,
@@ -5053,6 +5071,8 @@ err_tgt:
  err_capa:
         cfs_timer_disarm(&m->mdt_ck_timer);
         mdt_ck_thread_stop(m);
+err_free_hsm:
+       mdt_hsm_cdt_fini(m);
  err_free_ns:
         ldlm_namespace_free(m->mdt_namespace, NULL, 0);
         obd->obd_namespace = m->mdt_namespace = NULL;
diff --git a/lustre/mdt/mdt_hsm.c b/lustre/mdt/mdt_hsm.c

index bbebf98..46511fe 100644 (file)
--- a/lustre/mdt/mdt_hsm.c
+++ b/lustre/mdt/mdt_hsm.c
@@ -55,7 +55,7 @@
   * Update on-disk HSM attributes.
   */
  int mdt_hsm_attr_set(struct mdt_thread_info *info, struct mdt_object *obj,
-                    struct md_hsm *mh)
+                    const struct md_hsm *mh)
  {
         struct md_object        *next = mdt_object_child(obj);
         struct lu_buf           *buf = &info->mti_buf;
@@ -505,9 +505,6 @@ int mdt_hsm_request(struct mdt_thread_info *info)
         }
  
         rc = mdt_hsm_add_actions(info, hal, &compound_id);
-       /* ENODATA error code is needed only for implicit requests */
-       if (rc == -ENODATA)
-               rc = 0;
  
         MDT_HSM_FREE(hal, hal_size);
  
diff --git a/lustre/mdt/mdt_hsm_cdt_actions.c b/lustre/mdt/mdt_hsm_cdt_actions.c

index 5ee61bc..8a08005 100644 (file)
--- a/lustre/mdt/mdt_hsm_cdt_actions.c
+++ b/lustre/mdt/mdt_hsm_cdt_actions.c
@@ -43,7 +43,8 @@
  #include <lustre_log.h>
  #include "mdt_internal.h"
  
-void dump_llog_agent_req_rec(char *prefix, struct llog_agent_req_rec *larr)
+void dump_llog_agent_req_rec(const char *prefix,
+                            const struct llog_agent_req_rec *larr)
  {
         char    buf[12];
         int     sz;
@@ -95,7 +96,7 @@ int cdt_llog_process(const struct lu_env *env, struct mdt_device *mdt,
         if ((lctxt == NULL) || (lctxt->loc_handle == NULL))
                 RETURN(-ENOENT);
  
-       down(&cdt->cdt_llog_lock);
+       mutex_lock(&cdt->cdt_llog_lock);
  
         rc = llog_cat_process(env, lctxt->loc_handle, cb, data, 0, 0);
         if (rc < 0)
@@ -105,7 +106,7 @@ int cdt_llog_process(const struct lu_env *env, struct mdt_device *mdt,
                 rc = 0;
  
         llog_ctxt_put(lctxt);
-       up(&cdt->cdt_llog_lock);
+       mutex_unlock(&cdt->cdt_llog_lock);
         RETURN(rc);
  }
  
@@ -150,7 +151,7 @@ int mdt_agent_record_add(const struct lu_env *env,
         if ((lctxt == NULL) || (lctxt->loc_handle == NULL))
                 GOTO(free, rc = -ENOENT);
  
-       down(&cdt->cdt_llog_lock);
+       mutex_lock(&cdt->cdt_llog_lock);
  
         /* in case of cancel request, the cookie is already set to the
          * value of the request cookie to be cancelled
@@ -164,7 +165,7 @@ int mdt_agent_record_add(const struct lu_env *env,
         if (rc > 0)
                 rc = 0;
  
-       up(&cdt->cdt_llog_lock);
+       mutex_unlock(&cdt->cdt_llog_lock);
         llog_ctxt_put(lctxt);
  
         EXIT;
diff --git a/lustre/mdt/mdt_hsm_cdt_agent.c b/lustre/mdt/mdt_hsm_cdt_agent.c

index f5c60eb..b7c4c93 100644 (file)
--- a/lustre/mdt/mdt_hsm_cdt_agent.c
+++ b/lustre/mdt/mdt_hsm_cdt_agent.c
@@ -367,30 +367,35 @@ int mdt_hsm_agent_send(struct mdt_thread_info *mti,
                         struct mdt_object *obj;
                         struct md_hsm hsm;
  
-                       obj = mdt_hsm_get_md_hsm(mti, &hai->hai_fid, &hsm,
-                                                NULL);
-                       if (IS_ERR(obj) && (hai->hai_action == HSMA_REMOVE))
-                               continue;
+                       obj = mdt_hsm_get_md_hsm(mti, &hai->hai_fid, &hsm);
+                       if (!IS_ERR(obj)) {
+                               mdt_object_put(mti->mti_env, obj);
+                       } else {
+                               if (hai->hai_action == HSMA_REMOVE)
+                                       continue;
  
-                       if (IS_ERR(obj) && (PTR_ERR(obj) == -ENOENT)) {
-                               fail_request = true;
-                               rc = mdt_agent_record_update(mti->mti_env, mdt,
+                               if (PTR_ERR(obj) == -ENOENT) {
+                                       fail_request = true;
+                                       rc = mdt_agent_record_update(
+                                                            mti->mti_env, mdt,
                                                              &hai->hai_cookie,
                                                              1, ARS_FAILED);
-                               if (rc) {
-                                       CERROR("%s: mdt_agent_record_update() "
+                                       if (rc) {
+                                               CERROR(
+                                             "%s: mdt_agent_record_update() "
                                               "failed, rc=%d, cannot update "
                                               "status to %s for cookie "
                                               LPX64": rc = %d\n",
                                               mdt_obd_name(mdt), rc,
                                               agent_req_status2name(ARS_FAILED),
                                               hai->hai_cookie, rc);
-                                       GOTO(out_buf, rc);
+                                               GOTO(out_buf, rc);
+                                       }
+                                       continue;
                                 }
-                               continue;
-                       }
-                       if (IS_ERR(obj))
                                 GOTO(out_buf, rc = PTR_ERR(obj));
+                       }
+
  
                         if (!mdt_hsm_is_action_compat(hai, hal->hal_archive_id,
                                                       hal->hal_flags, &hsm)) {
diff --git a/lustre/mdt/mdt_hsm_cdt_client.c b/lustre/mdt/mdt_hsm_cdt_client.c

index af2ddf4..b47acea 100644 (file)
--- a/lustre/mdt/mdt_hsm_cdt_client.c
+++ b/lustre/mdt/mdt_hsm_cdt_client.c
@@ -326,7 +326,7 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti,
                         goto record;
  
                 /* get HSM attributes */
-               obj = mdt_hsm_get_md_hsm(mti, &hai->hai_fid, &mh, NULL);
+               obj = mdt_hsm_get_md_hsm(mti, &hai->hai_fid, &mh);
                 if (IS_ERR(obj)) {
                         /* in case of archive remove, Lustre file
                          * is not mandatory */
@@ -334,6 +334,7 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti,
                                 goto record;
                         GOTO(out, rc = PTR_ERR(obj));
                 }
+               mdt_object_put(mti->mti_env, obj);
  
                 /* Check if an action is needed, compare request
                  * and HSM flags status */
@@ -364,7 +365,7 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti,
                         struct cdt_restore_handle       *crh;
                         struct mdt_object               *child;
  
-                       OBD_ALLOC_PTR(crh);
+                       OBD_SLAB_ALLOC_PTR(crh, mdt_hsm_cdt_kmem);
                         if (crh == NULL)
                                 GOTO(out, rc = -ENOMEM);
  
@@ -386,7 +387,7 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti,
                                 CERROR("%s: cannot take layout lock for "
                                        DFID": rc = %d\n", mdt_obd_name(mdt),
                                        PFID(&crh->crh_fid), rc);
-                               OBD_FREE_PTR(crh);
+                               OBD_SLAB_FREE_PTR(crh, mdt_hsm_cdt_kmem);
                                 GOTO(out, rc);
                         }
                         /* we choose to not keep a keep a reference
@@ -394,10 +395,10 @@ int mdt_hsm_add_actions(struct mdt_thread_info *mti,
                          * very long */
                         mdt_object_put(mti->mti_env, child);
  
-                       down(&cdt->cdt_restore_lock);
+                       mutex_lock(&cdt->cdt_restore_lock);
                         cfs_list_add_tail(&crh->crh_list,
                                           &cdt->cdt_restore_hdl);
-                       up(&cdt->cdt_restore_lock);
+                       mutex_unlock(&cdt->cdt_restore_lock);
                 }
  record:
                 /* record request */
@@ -411,7 +412,8 @@ record:
                 rc = -ENODATA;
         else
                 rc = 0;
-       EXIT;
+
+       GOTO(out, rc);
  out:
         /* if work has been added, wake up coordinator */
         if ((rc == 0) || (rc == -ENODATA))
@@ -444,7 +446,7 @@ int mdt_hsm_get_running(struct mdt_thread_info *mti,
                         RETURN(-EINVAL);
  
                 car = mdt_cdt_find_request(cdt, 0, &hai->hai_fid);
-               if (IS_ERR(car)) {
+               if (car == NULL) {
                         hai->hai_cookie = 0;
                         hai->hai_action = HSMA_NONE;
                 } else {
@@ -477,7 +479,7 @@ bool mdt_hsm_restore_is_running(struct mdt_thread_info *mti,
         if (!fid_is_sane(fid))
                 RETURN(rc);
  
-       down(&cdt->cdt_restore_lock);
+       mutex_lock(&cdt->cdt_restore_lock);
         cfs_list_for_each_safe(pos, tmp, &cdt->cdt_restore_hdl) {
                 crh = cfs_list_entry(pos, struct cdt_restore_handle, crh_list);
                 if (lu_fid_eq(&crh->crh_fid, fid)) {
@@ -485,7 +487,7 @@ bool mdt_hsm_restore_is_running(struct mdt_thread_info *mti,
                         break;
                 }
         }
-       up(&cdt->cdt_restore_lock);
+       mutex_unlock(&cdt->cdt_restore_lock);
         RETURN(rc);
  }
  
@@ -533,7 +535,7 @@ int mdt_hsm_get_actions(struct mdt_thread_info *mti,
                 struct cdt_agent_req *car;
  
                 car = mdt_cdt_find_request(cdt, hai->hai_cookie, NULL);
-               if (IS_ERR(car)) {
+               if (car == NULL) {
                         hai->hai_cookie = 0;
                 } else {
                         __u64 data_moved;
diff --git a/lustre/mdt/mdt_hsm_cdt_requests.c b/lustre/mdt/mdt_hsm_cdt_requests.c

index 9d20164..7029827 100644 (file)
--- a/lustre/mdt/mdt_hsm_cdt_requests.c
+++ b/lustre/mdt/mdt_hsm_cdt_requests.c
@@ -153,8 +153,8 @@ out:
  /**
   * update data moved information during a request
   */
-static int mdt_cdt_update_work(struct cdt_req_progress *crp,
-                              struct hsm_extent *extent)
+static int hsm_update_work(struct cdt_req_progress *crp,
+                          const struct hsm_extent *extent)
  {
         int                       rc, osz, nsz;
         struct interval_node    **new_vv;
@@ -239,11 +239,11 @@ struct cdt_agent_req *mdt_cdt_alloc_request(__u64 compound_id, __u32 archive_id,
         struct cdt_agent_req *car;
         ENTRY;
  
-       OBD_ALLOC_PTR(car);
+       OBD_SLAB_ALLOC_PTR(car, mdt_hsm_car_kmem);
         if (car == NULL)
                 RETURN(ERR_PTR(-ENOMEM));
  
-       cfs_atomic_set(&car->car_refcount, 0);
+       cfs_atomic_set(&car->car_refcount, 1);
         car->car_compound_id = compound_id;
         car->car_archive_id = archive_id;
         car->car_flags = flags;
@@ -253,7 +253,7 @@ struct cdt_agent_req *mdt_cdt_alloc_request(__u64 compound_id, __u32 archive_id,
         car->car_uuid = *uuid;
         OBD_ALLOC(car->car_hai, hai->hai_len);
         if (car->car_hai == NULL) {
-               OBD_FREE_PTR(car);
+               OBD_SLAB_FREE_PTR(car, mdt_hsm_car_kmem);
                 RETURN(ERR_PTR(-ENOMEM));
         }
         memcpy(car->car_hai, hai, hai->hai_len);
@@ -271,7 +271,7 @@ void mdt_cdt_free_request(struct cdt_agent_req *car)
  {
         mdt_cdt_free_request_tree(&car->car_progress);
         OBD_FREE(car->car_hai, car->car_hai->hai_len);
-       OBD_FREE_PTR(car);
+       OBD_SLAB_FREE_PTR(car, mdt_hsm_car_kmem);
  }
  
  /**
@@ -290,6 +290,7 @@ void mdt_cdt_get_request(struct cdt_agent_req *car)
   */
  void mdt_cdt_put_request(struct cdt_agent_req *car)
  {
+       LASSERT(cfs_atomic_read(&car->car_refcount) > 0);
         if (cfs_atomic_dec_and_test(&car->car_refcount))
                 mdt_cdt_free_request(car);
  }
@@ -306,25 +307,20 @@ static struct cdt_agent_req *cdt_find_request_nolock(struct coordinator *cdt,
                                                      __u64 cookie,
                                                      const struct lu_fid *fid)
  {
-       cfs_list_t              *pos;
-       struct cdt_agent_req    *car;
+       struct cdt_agent_req *car;
+       struct cdt_agent_req *found = NULL;
         ENTRY;
  
-       if (cfs_list_empty(&cdt->cdt_requests))
-               goto notfound;
-
-       cfs_list_for_each(pos, &cdt->cdt_requests) {
-               car = cfs_list_entry(pos, struct cdt_agent_req,
-                                    car_request_list);
+       cfs_list_for_each_entry(car, &cdt->cdt_requests, car_request_list) {
                 if ((car->car_hai->hai_cookie == cookie) ||
                     ((fid != NULL) && lu_fid_eq(fid, &car->car_hai->hai_fid))) {
                         mdt_cdt_get_request(car);
-                       RETURN(car);
+                       found = car;
+                       break;
                 }
         }
  
-notfound:
-       RETURN(ERR_PTR(-ENOENT));
+       RETURN(found);
  }
  
  /**
@@ -343,23 +339,19 @@ int mdt_cdt_add_request(struct coordinator *cdt, struct cdt_agent_req *new_car)
         LASSERT(new_car->car_hai->hai_action != HSMA_CANCEL);
  
         down_write(&cdt->cdt_request_lock);
-
         car = cdt_find_request_nolock(cdt, new_car->car_hai->hai_cookie, NULL);
-       if (!IS_ERR(car)) {
+       if (car != NULL) {
                 mdt_cdt_put_request(car);
                 up_write(&cdt->cdt_request_lock);
                 RETURN(-EEXIST);
         }
  
-       mdt_cdt_get_request(new_car);
         cfs_list_add_tail(&new_car->car_request_list, &cdt->cdt_requests);
         up_write(&cdt->cdt_request_lock);
  
         mdt_hsm_agent_update_statistics(cdt, 0, 0, 1, &new_car->car_uuid);
  
-       down(&cdt->cdt_counter_lock);
-       cdt->cdt_request_count++;
-       up(&cdt->cdt_counter_lock);
+       atomic_inc(&cdt->cdt_request_count);
  
         RETURN(0);
  }
@@ -372,16 +364,14 @@ int mdt_cdt_add_request(struct coordinator *cdt, struct cdt_agent_req *new_car)
   * \retval request pointer
   */
  struct cdt_agent_req *mdt_cdt_find_request(struct coordinator *cdt,
-                                          __u64 cookie,
+                                          const __u64 cookie,
                                            const struct lu_fid *fid)
  {
         struct cdt_agent_req    *car;
         ENTRY;
  
         down_read(&cdt->cdt_request_lock);
-
         car = cdt_find_request_nolock(cdt, cookie, fid);
-
         up_read(&cdt->cdt_request_lock);
  
         RETURN(car);
@@ -395,20 +385,23 @@ struct cdt_agent_req *mdt_cdt_find_request(struct coordinator *cdt,
   */
  int mdt_cdt_remove_request(struct coordinator *cdt, __u64 cookie)
  {
-       struct cdt_agent_req    *car;
+       struct cdt_agent_req *car;
         ENTRY;
  
         down_write(&cdt->cdt_request_lock);
-
         car = cdt_find_request_nolock(cdt, cookie, NULL);
-       if (!IS_ERR(car)) {
+       if (car != NULL) {
                 cfs_list_del(&car->car_request_list);
-               mdt_cdt_put_request(car);
                 up_write(&cdt->cdt_request_lock);
  
-               down(&cdt->cdt_counter_lock);
-               cdt->cdt_request_count--;
-               up(&cdt->cdt_counter_lock);
+               /* reference from cdt_requests list */
+               mdt_cdt_put_request(car);
+
+               /* reference from cdt_find_request_nolock() */
+               mdt_cdt_put_request(car);
+
+               LASSERT(atomic_read(&cdt->cdt_request_count) > 0);
+               atomic_dec(&cdt->cdt_request_count);
  
                 RETURN(0);
         }
@@ -426,21 +419,21 @@ int mdt_cdt_remove_request(struct coordinator *cdt, __u64 cookie)
   * \retval -ve failure
   */
  struct cdt_agent_req *mdt_cdt_update_request(struct coordinator *cdt,
-                                            struct hsm_progress_kernel *pgs)
+                                         const struct hsm_progress_kernel *pgs)
  {
         struct cdt_agent_req    *car;
         int                      rc;
         ENTRY;
  
         car = mdt_cdt_find_request(cdt, pgs->hpk_cookie, NULL);
-       if (IS_ERR(car))
-               RETURN(car);
+       if (car == NULL)
+               RETURN(ERR_PTR(-ENOENT));
  
         car->car_req_update = cfs_time_current_sec();
  
         /* update progress done by copy tool */
         if (pgs->hpk_errval == 0 && pgs->hpk_extent.length != 0) {
-               rc = mdt_cdt_update_work(&car->car_progress, &pgs->hpk_extent);
+               rc = hsm_update_work(&car->car_progress, &pgs->hpk_extent);
                 if (rc) {
                         mdt_cdt_put_request(car);
                         RETURN(ERR_PTR(rc));
diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h

index 42ff0b6..e45190a 100644 (file)
--- a/lustre/mdt/mdt_internal.h
+++ b/lustre/mdt/mdt_internal.h
@@ -92,7 +92,7 @@ struct mdt_file_data {
  /* when adding a new policy, do not forget to update
   * lustre/mdt/mdt_coordinator.c::hsm_policy_names[]
   */
-#define CDT_DEFAULT_POLICY             0x0000000000000000ULL
+#define CDT_DEFAULT_POLICY             CDT_NORETRY_ACTION
  
  enum cdt_states { CDT_STOPPED = 0,
                   CDT_INIT,
@@ -108,27 +108,26 @@ enum cdt_states { CDT_STOPPED = 0,
   * cdt_request_lock
   */
  struct coordinator {
-       struct ptlrpc_thread    *cdt_thread;        /**< coordinator thread */
+       struct ptlrpc_thread     cdt_thread;        /**< coordinator thread */
         struct lu_env            cdt_env;           /**< coordinator lustre
                                                      * env */
+       struct lu_context        cdt_session;       /**< session for lu_ucred */
         struct proc_dir_entry   *cdt_proc_dir;      /**< cdt /proc directory */
         __u64                    cdt_policy;        /**< flags to defined
                                                      * policy */
         enum cdt_states          cdt_state;         /**< state */
-       cfs_atomic_t             cdt_compound_id;   /**< compound id counter */
+       atomic_t                 cdt_compound_id;   /**< compound id counter */
         __u64                    cdt_last_cookie;   /**< last cookie allocated */
-       struct semaphore         cdt_counter_lock;  /**< protect request
-                                                    * counter */
-       struct semaphore         cdt_llog_lock;     /**< protect llog access */
+       struct mutex             cdt_llog_lock;     /**< protect llog access */
         struct rw_semaphore      cdt_agent_lock;    /**< protect agent list */
         struct rw_semaphore      cdt_request_lock;  /**< protect request list */
-       struct semaphore         cdt_restore_lock;  /**< protect restore list */
+       struct mutex             cdt_restore_lock;  /**< protect restore list */
         cfs_time_t               cdt_loop_period;   /**< llog scan period */
         cfs_time_t               cdt_delay;         /**< request grace delay */
         cfs_time_t               cdt_timeout;       /**< request timeout */
         __u64                    cdt_max_request;   /**< max count of started
                                                      * requests */
-       __u64                    cdt_request_count; /**< current count of
+       atomic_t                 cdt_request_count; /**< current count of
                                                      * started requests */
         cfs_list_t               cdt_requests;      /**< list of started
                                                      * requests */
@@ -297,13 +296,13 @@ struct mdt_lock_handle {
  };
  
  enum {
-       MDT_LH_PARENT, /* parent lockh */
-       MDT_LH_CHILD,  /* child lockh */
-       MDT_LH_OLD,    /* old lockh for rename */
+       MDT_LH_PARENT,  /* parent lockh */
+       MDT_LH_CHILD,   /* child lockh */
+       MDT_LH_OLD,     /* old lockh for rename */
         MDT_LH_LAYOUT = MDT_LH_OLD, /* layout lock */
-       MDT_LH_NEW,    /* new lockh for rename */
-       MDT_LH_RMT,    /* used for return lh to caller */
-       MDT_LH_LOCAL,  /* local lock never return to client */
+       MDT_LH_NEW,     /* new lockh for rename */
+       MDT_LH_RMT,     /* used for return lh to caller */
+       MDT_LH_LOCAL,   /* local lock never return to client */
         MDT_LH_NR
  };
  
@@ -590,6 +589,7 @@ struct cdt_agent_req {
         struct cdt_req_progress  car_progress;     /**< track data mvt
                                                     *   progress */
  };
+extern struct kmem_cache *mdt_hsm_car_kmem;
  
  struct hsm_agent {
         cfs_list_t       ha_list;               /**< to chain the agents */
@@ -609,6 +609,7 @@ struct cdt_restore_handle {
         struct ldlm_extent       crh_extent;    /**< extent of the restore */
         struct mdt_lock_handle   crh_lh;        /**< lock handle */
  };
+extern struct kmem_cache *mdt_hsm_cdt_kmem;    /** restore handle slab cache */
  
  static inline const struct md_device_operations *
  mdt_child_ops(struct mdt_device * m)
@@ -877,7 +878,7 @@ extern struct lprocfs_vars lprocfs_mds_module_vars[];
  extern struct lprocfs_vars lprocfs_mds_obd_vars[];
  
  int mdt_hsm_attr_set(struct mdt_thread_info *info, struct mdt_object *obj,
-                    struct md_hsm *mh);
+                    const struct md_hsm *mh);
  
  struct mdt_handler *mdt_handler_find(__u32 opc,
                                      struct mdt_opc_slice *supported);
@@ -935,7 +936,8 @@ int mdt_hsm_ct_unregister(struct mdt_thread_info *info);
  int mdt_hsm_request(struct mdt_thread_info *info);
  /* mdt/mdt_hsm_cdt_actions.c */
  extern const struct file_operations mdt_agent_actions_fops;
-void dump_llog_agent_req_rec(char *prefix, struct llog_agent_req_rec *larr);
+void dump_llog_agent_req_rec(const char *prefix,
+                            const struct llog_agent_req_rec *larr);
  int cdt_llog_process(const struct lu_env *env, struct mdt_device *mdt,
                      llog_cb_t cb, void *data);
  int mdt_agent_record_add(const struct lu_env *env, struct mdt_device *mdt,
@@ -976,7 +978,6 @@ int mdt_hsm_get_running(struct mdt_thread_info *mti,
                         struct hsm_action_list *hal);
  bool mdt_hsm_restore_is_running(struct mdt_thread_info *mti,
                                 const struct lu_fid *fid);
-
  /* mdt/mdt_hsm_cdt_requests.c */
  extern const struct file_operations mdt_hsm_request_fops;
  void dump_requests(char *prefix, struct coordinator *cdt);
@@ -986,75 +987,42 @@ struct cdt_agent_req *mdt_cdt_alloc_request(__u64 compound_id, __u32 archive_id,
  void mdt_cdt_free_request(struct cdt_agent_req *car);
  int mdt_cdt_add_request(struct coordinator *cdt, struct cdt_agent_req *new_car);
  struct cdt_agent_req *mdt_cdt_find_request(struct coordinator *cdt,
-                                          __u64 cookie,
+                                          const __u64 cookie,
                                            const struct lu_fid *fid);
  void mdt_cdt_get_work_done(struct cdt_agent_req *car, __u64 *done_sz);
  void mdt_cdt_get_request(struct cdt_agent_req *car);
  void mdt_cdt_put_request(struct cdt_agent_req *car);
  struct cdt_agent_req *mdt_cdt_update_request(struct coordinator *cdt,
-                                            struct hsm_progress_kernel *pgs);
+                                        const struct hsm_progress_kernel *pgs);
  int mdt_cdt_remove_request(struct coordinator *cdt, __u64 cookie);
-
-/* fake functions, will be remove with patch LU-3343 */
-static inline struct mdt_object *mdt_hsm_get_md_hsm(struct mdt_thread_info *mti,
-                                                   struct lu_fid *fid,
-                                                   struct md_hsm *hsm,
-                                                   struct mdt_lock_handle *lh)
-{
-       return ERR_PTR(-EINVAL);
-}
-static inline bool mdt_hsm_is_action_compat(struct hsm_action_item *hai,
-                                           int hal_an, __u64 rq_flags,
-                                           struct md_hsm *hsm)
-{
-       return false;
-}
-static inline int mdt_hsm_cdt_init(struct mdt_device *mdt)
-{
-       struct coordinator      *cdt = &mdt->mdt_coordinator;
-
-       /* minimal init before final patch landing */
-       sema_init(&cdt->cdt_llog_lock, 1);
-       init_rwsem(&cdt->cdt_agent_lock);
-       init_rwsem(&cdt->cdt_request_lock);
-       sema_init(&cdt->cdt_restore_lock, 1);
-
-       CFS_INIT_LIST_HEAD(&cdt->cdt_requests);
-       CFS_INIT_LIST_HEAD(&cdt->cdt_agents);
-       CFS_INIT_LIST_HEAD(&cdt->cdt_restore_hdl);
-
-       cdt->cdt_state = CDT_STOPPED;
-       return 0;
-}
-static inline int mdt_hsm_cdt_start(struct mdt_device *mdt)
-{
-       return 0;
-}
-static inline int mdt_hsm_cdt_stop(struct mdt_device *mdt)
-{
-       return 0;
-}
-static inline int mdt_hsm_cdt_fini(struct mdt_device *mdt)
-{
-       return 0;
-}
-static inline int mdt_hsm_cdt_wakeup(struct mdt_device *mdt)
-{
-       return 0;
-}
-static inline int mdt_hsm_add_hal(struct mdt_thread_info *mti,
-                                 struct hsm_action_list *hal,
-                                 const struct obd_uuid *uuid)
-{
-       return 0;
-}
-static inline int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
-                                              struct hsm_progress_kernel *pgs,
-                                              bool update_record)
-{
-       return 0;
-}
-/* end of fake functions */
+/* mdt/mdt_coordinator.c */
+void mdt_hsm_dump_hal(int level, const char *prefix,
+                     struct hsm_action_list *hal);
+/* coordinator management */
+int mdt_hsm_cdt_init(struct mdt_device *mdt);
+int mdt_hsm_cdt_start(struct mdt_device *mdt);
+int mdt_hsm_cdt_stop(struct mdt_device *mdt);
+int mdt_hsm_cdt_fini(struct mdt_device *mdt);
+int mdt_hsm_cdt_wakeup(struct mdt_device *mdt);
+
+/* coordinator control /proc interface */
+int lprocfs_wr_hsm_cdt_control(struct file *file, const char *buffer,
+                              unsigned long count, void *data);
+int lprocfs_rd_hsm_cdt_control(char *page, char **start, off_t off,
+                              int count, int *eof, void *data);
+/* md_hsm helpers */
+struct mdt_object *mdt_hsm_get_md_hsm(struct mdt_thread_info *mti,
+                                     const struct lu_fid *fid,
+                                     struct md_hsm *hsm);
+/* actions/request helpers */
+int mdt_hsm_add_hal(struct mdt_thread_info *mti,
+                   struct hsm_action_list *hal, struct obd_uuid *uuid);
+bool mdt_hsm_is_action_compat(const struct hsm_action_item *hai,
+                             const int hal_an, const __u64 rq_flags,
+                             const struct md_hsm *hsm);
+int mdt_hsm_update_request_state(struct mdt_thread_info *mti,
+                                struct hsm_progress_kernel *pgs,
+                                const int update_record);
  
  extern struct lu_context_key       mdt_thread_key;
  /* debug issues helper starts here*/
diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c

index 17aa9e0..fd2b92d 100644 (file)
--- a/lustre/mdt/mdt_lproc.c
+++ b/lustre/mdt/mdt_lproc.c
@@ -949,50 +949,79 @@ static int lprocfs_wr_enable_remote_dir_gid(struct file *file,
  }
  
  static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
-        { "uuid",                       lprocfs_rd_uuid,                 0, 0 },
-        { "recovery_status",            lprocfs_obd_rd_recovery_status,  0, 0 },
-        { "num_exports",                lprocfs_rd_num_exports,          0, 0 },
-        { "identity_expire",            lprocfs_rd_identity_expire,
-                                        lprocfs_wr_identity_expire,         0 },
-        { "identity_acquire_expire",    lprocfs_rd_identity_acquire_expire,
-                                        lprocfs_wr_identity_acquire_expire, 0 },
-        { "identity_upcall",            lprocfs_rd_identity_upcall,
-                                        lprocfs_wr_identity_upcall,         0 },
-        { "identity_flush",             0, lprocfs_wr_identity_flush,       0 },
-        { "identity_info",              0, lprocfs_wr_identity_info,        0 },
-        { "capa",                       lprocfs_rd_capa,
-                                        lprocfs_wr_capa,                    0 },
-        { "capa_timeout",               lprocfs_rd_capa_timeout,
-                                        lprocfs_wr_capa_timeout,            0 },
-        { "capa_key_timeout",           lprocfs_rd_ck_timeout,
-                                        lprocfs_wr_ck_timeout,              0 },
-        { "capa_count",                 lprocfs_rd_capa_count,           0, 0 },
-        { "site_stats",                 lprocfs_rd_site_stats,           0, 0 },
-        { "evict_client",               0, lprocfs_mdt_wr_evict_client,     0 },
-        { "hash_stats",                 lprocfs_obd_rd_hash,    0, 0 },
-        { "sec_level",                  lprocfs_rd_sec_level,
-                                        lprocfs_wr_sec_level,               0 },
-        { "commit_on_sharing",          lprocfs_rd_cos, lprocfs_wr_cos, 0 },
-        { "root_squash",                lprocfs_rd_root_squash,
-                                        lprocfs_wr_root_squash,             0 },
-        { "nosquash_nids",              lprocfs_rd_nosquash_nids,
-                                        lprocfs_wr_nosquash_nids,           0 },
-        { "som",                        lprocfs_rd_mdt_som,
-                                        lprocfs_wr_mdt_som, 0 },
-        { "instance",                   lprocfs_target_rd_instance,         0 },
-        { "ir_factor",                  lprocfs_obd_rd_ir_factor,
-                                        lprocfs_obd_wr_ir_factor,           0 },
+       { "uuid",                       lprocfs_rd_uuid, NULL,
+                                       NULL, NULL, 0 },
+       { "recovery_status",            lprocfs_obd_rd_recovery_status, NULL,
+                                       NULL, NULL, 0 },
+       { "num_exports",                lprocfs_rd_num_exports, NULL,
+                                       NULL, NULL, 0 },
+       { "identity_expire",            lprocfs_rd_identity_expire,
+                                       lprocfs_wr_identity_expire,
+                                       NULL, NULL, 0 },
+       { "identity_acquire_expire",    lprocfs_rd_identity_acquire_expire,
+                                       lprocfs_wr_identity_acquire_expire,
+                                       NULL, NULL, 0 },
+       { "identity_upcall",            lprocfs_rd_identity_upcall,
+                                       lprocfs_wr_identity_upcall,
+                                       NULL, NULL, 0 },
+       { "identity_flush",             NULL, lprocfs_wr_identity_flush,
+                                       NULL, NULL, 0 },
+       { "identity_info",              NULL, lprocfs_wr_identity_info,
+                                       NULL, NULL, 0 },
+       { "capa",                       lprocfs_rd_capa,
+                                       lprocfs_wr_capa,
+                                       NULL, NULL, 0 },
+       { "capa_timeout",               lprocfs_rd_capa_timeout,
+                                       lprocfs_wr_capa_timeout,
+                                       NULL, NULL, 0 },
+       { "capa_key_timeout",           lprocfs_rd_ck_timeout,
+                                       lprocfs_wr_ck_timeout,
+                                       NULL, NULL, 0 },
+       { "capa_count",                 lprocfs_rd_capa_count, NULL,
+                                       NULL, NULL, 0 },
+       { "site_stats",                 lprocfs_rd_site_stats, NULL,
+                                       NULL, NULL, 0 },
+       { "evict_client",               NULL, lprocfs_mdt_wr_evict_client,
+                                       NULL, NULL, 0 },
+       { "hash_stats",                 lprocfs_obd_rd_hash, NULL,
+                                       NULL, NULL, 0 },
+       { "sec_level",                  lprocfs_rd_sec_level,
+                                       lprocfs_wr_sec_level,
+                                       NULL, NULL, 0 },
+       { "commit_on_sharing",          lprocfs_rd_cos, lprocfs_wr_cos,
+                                       NULL, NULL, 0 },
+       { "root_squash",                lprocfs_rd_root_squash,
+                                       lprocfs_wr_root_squash,
+                                       NULL, NULL, 0 },
+       { "nosquash_nids",              lprocfs_rd_nosquash_nids,
+                                       lprocfs_wr_nosquash_nids,
+                                       NULL, NULL, 0 },
+       { "som",                        lprocfs_rd_mdt_som,
+                                       lprocfs_wr_mdt_som,
+                                       NULL, NULL, 0 },
+       { "instance",                   lprocfs_target_rd_instance, NULL,
+                                       NULL, NULL, 0},
+       { "ir_factor",                  lprocfs_obd_rd_ir_factor,
+                                       lprocfs_obd_wr_ir_factor,
+                                       NULL, NULL, 0 },
         { "job_cleanup_interval",       lprocfs_rd_job_interval,
-                                       lprocfs_wr_job_interval, 0 },
+                                       lprocfs_wr_job_interval,
+                                       NULL, NULL, 0 },
         { "enable_remote_dir",          lprocfs_rd_enable_remote_dir,
-                                       lprocfs_wr_enable_remote_dir,       0},
+                                       lprocfs_wr_enable_remote_dir,
+                                       NULL, NULL, 0},
         { "enable_remote_dir_gid",      lprocfs_rd_enable_remote_dir_gid,
-                                       lprocfs_wr_enable_remote_dir_gid,   0},
+                                       lprocfs_wr_enable_remote_dir_gid,
+                                       NULL, NULL, 0},
+       { "hsm_control",                lprocfs_rd_hsm_cdt_control,
+                                       lprocfs_wr_hsm_cdt_control,
+                                       NULL, NULL, 0 },
         { 0 }
  };
  
  static struct lprocfs_vars lprocfs_mdt_module_vars[] = {
-        { "num_refs",                   lprocfs_rd_numrefs,              0, 0 },
+       { "num_refs",                   lprocfs_rd_numrefs, NULL,
+                                       NULL, NULL, 0 },
          { 0 }
  };
  
@@ -1003,12 +1032,12 @@ void lprocfs_mdt_init_vars(struct lprocfs_static_vars *lvars)
  }
  
  struct lprocfs_vars lprocfs_mds_obd_vars[] = {
-       { "uuid",        lprocfs_rd_uuid,       0, 0 },
+       { "uuid",       lprocfs_rd_uuid, NULL, NULL, NULL, 0 },
         { 0 }
  };
  
  struct lprocfs_vars lprocfs_mds_module_vars[] = {
-       { "num_refs",     lprocfs_rd_numrefs,     0, 0 },
+       { "num_refs",   lprocfs_rd_numrefs, NULL, NULL, NULL, 0 },
         { 0 }
  };
  
diff --git a/lustre/obdclass/md_attrs.c b/lustre/obdclass/md_attrs.c

index 111d707..f996438 100644 (file)
--- a/lustre/obdclass/md_attrs.c
+++ b/lustre/obdclass/md_attrs.c
@@ -185,7 +185,7 @@ EXPORT_SYMBOL(lustre_buf2hsm);
   * \param buf - is the output buffer where to pack the on-disk HSM xattr.
   * \param mh  - is the md_hsm structure to pack.
   */
-void lustre_hsm2buf(void *buf, struct md_hsm *mh)
+void lustre_hsm2buf(void *buf, const struct md_hsm *mh)
  {
         struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
         ENTRY;
author	jcl <jacques-charles.lafoucriere@cea.fr>
	Sat, 6 Jul 2013 12:57:08 +0000 (14:57 +0200)
committer	Oleg Drokin <oleg.drokin@intel.com>
	Wed, 7 Aug 2013 20:20:52 +0000 (20:20 +0000)
lustre/include/lustre/lustre_user.h		patch \| blob \| history
lustre/include/md_object.h		patch \| blob \| history
lustre/mdt/Makefile.in		patch \| blob \| history
lustre/mdt/mdt_coordinator.c	[new file with mode: 0644]	patch \| blob
lustre/mdt/mdt_handler.c		patch \| blob \| history
lustre/mdt/mdt_hsm.c		patch \| blob \| history
lustre/mdt/mdt_hsm_cdt_actions.c		patch \| blob \| history
lustre/mdt/mdt_hsm_cdt_agent.c		patch \| blob \| history
lustre/mdt/mdt_hsm_cdt_client.c		patch \| blob \| history
lustre/mdt/mdt_hsm_cdt_requests.c		patch \| blob \| history
lustre/mdt/mdt_internal.h		patch \| blob \| history
lustre/mdt/mdt_lproc.c		patch \| blob \| history
lustre/obdclass/md_attrs.c		patch \| blob \| history